1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
|
From 7c7c35d8ea9f65f081564b3ad1bfe9f0db33dd69 Mon Sep 17 00:00:00 2001
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Date: Sat, 26 Aug 2023 15:08:59 +0200
Subject: [PATCH 1/6] Expose line and column information for use in PHP
---
source/lexbor/dom/interfaces/node.h | 2 ++
source/lexbor/html/token.h | 2 ++
source/lexbor/html/tokenizer.c | 24 +++++++++++++++++++++++-
source/lexbor/html/tokenizer.h | 2 ++
source/lexbor/html/tokenizer/state.h | 2 ++
source/lexbor/html/tree.c | 11 +++++++++++
source/lexbor/html/tree/error.c | 5 +++--
source/lexbor/html/tree/error.h | 5 +++--
8 files changed, 48 insertions(+), 5 deletions(-)
diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h
index c37b790..8ac218b 100644
--- a/source/lexbor/dom/interfaces/node.h
+++ b/source/lexbor/dom/interfaces/node.h
@@ -58,6 +58,8 @@ struct lxb_dom_node {
lxb_dom_node_type_t type;
+ size_t line;
+
#ifdef LXB_DOM_NODE_USER_VARIABLES
LXB_DOM_NODE_USER_VARIABLES
#endif /* LXB_DOM_NODE_USER_VARIABLES */
diff --git a/source/lexbor/html/token.h b/source/lexbor/html/token.h
index 79accd0..0b7f4fd 100644
--- a/source/lexbor/html/token.h
+++ b/source/lexbor/html/token.h
@@ -33,6 +33,8 @@ enum lxb_html_token_type {
typedef struct {
const lxb_char_t *begin;
const lxb_char_t *end;
+ size_t line;
+ size_t column;
const lxb_char_t *text_start;
const lxb_char_t *text_end;
diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c
index 741bced..0bd9aec 100644
--- a/source/lexbor/html/tokenizer.c
+++ b/source/lexbor/html/tokenizer.c
@@ -91,6 +91,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
tkz->pos = tkz->start;
tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE;
+ /* current_line & current_column already initialized by calloc (zero-based) */
tkz->tree = NULL;
tkz->tags = NULL;
@@ -152,6 +153,8 @@ lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to,
tkz_to->start = tkz_from->start;
tkz_to->end = tkz_from->end;
tkz_to->pos = tkz_to->start;
+ tkz_to->current_line = tkz_from->current_line;
+ tkz_to->current_column = tkz_from->current_column;
return LXB_STATUS_OK;
}
@@ -312,7 +315,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
tkz->last = end;
while (data < end) {
- data = tkz->state(tkz, data, end);
+ size_t current_column = tkz->current_column; /* local copy, written back once the consumed bytes are counted */
+ const lxb_char_t *new_data = tkz->state(tkz, data, end);
+ while (data < new_data) {
+ /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
+ if (*data == '\n') {
+ tkz->current_line++;
+ current_column = 0;
+ } else {
+ /* Other characters can be mapped back to the Unicode codepoint offset because UTF-8 is a prefix code.
+ * Continuation bytes have the bit pattern 10XXXXXX, i.e. (byte & 0xC0) == 0x80; skip them so only the first byte of each code point is counted. Hex masks are used because binary literals are not standard before C23. */
+ if ((*data & 0xC0) == 0x80) {
+ /* Continuation byte, do nothing */
+ } else {
+ /* First byte for a codepoint */
+ current_column++;
+ }
+ }
+ data++;
+ }
+ tkz->current_column = current_column;
}
return tkz->status;
diff --git a/source/lexbor/html/tokenizer.h b/source/lexbor/html/tokenizer.h
index ba9602f..74bb55e 100644
--- a/source/lexbor/html/tokenizer.h
+++ b/source/lexbor/html/tokenizer.h
@@ -73,6 +73,8 @@ struct lxb_html_tokenizer {
const lxb_char_t *end;
const lxb_char_t *begin;
const lxb_char_t *last;
+ size_t current_line;
+ size_t current_column;
/* Entities */
const lexbor_sbst_entry_static_t *entity;
diff --git a/source/lexbor/html/tokenizer/state.h b/source/lexbor/html/tokenizer/state.h
index 0892846..77b86ac 100644
--- a/source/lexbor/html/tokenizer/state.h
+++ b/source/lexbor/html/tokenizer/state.h
@@ -90,6 +90,8 @@ extern "C" {
do { \
tkz->pos = tkz->start; \
tkz->token->begin = v_begin; \
+ tkz->token->line = tkz->current_line; \
+ tkz->token->column = tkz->current_column; \
} \
while (0)
diff --git a/source/lexbor/html/tree.c b/source/lexbor/html/tree.c
index 0f067e4..bdec6a5 100644
--- a/source/lexbor/html/tree.c
+++ b/source/lexbor/html/tree.c
@@ -434,6 +434,9 @@ lxb_html_tree_create_element_for_token(lxb_html_tree_t *tree,
return NULL;
}
+ node->line = token->line;
+ /* Only the line number is exposed in PHP DOM, so the token's column is not copied. */
+
lxb_status_t status;
lxb_dom_element_t *element = lxb_dom_interface_element(node);
@@ -770,6 +773,11 @@ lxb_html_tree_insert_character_for_data(lxb_html_tree_t *tree,
lxb_dom_interface_text(text)->char_data.data = *str;
+ if (tree->tkz_ref) {
+ text->line = tree->tkz_ref->token->line;
+ /* Only the line number is exposed in PHP DOM, so the token's column is not copied. */
+ }
+
if (ret_node != NULL) {
*ret_node = text;
}
@@ -809,6 +817,9 @@ lxb_html_tree_insert_comment(lxb_html_tree_t *tree,
return NULL;
}
+ node->line = token->line;
+ /* Only the line number is exposed in PHP DOM, so the token's column is not copied. */
+
tree->status = lxb_html_token_make_text(token, &comment->char_data.data,
tree->document->dom_document.text);
if (tree->status != LXB_STATUS_OK) {
diff --git a/source/lexbor/html/tree/error.c b/source/lexbor/html/tree/error.c
index e6e43f4..88ad8c4 100644
--- a/source/lexbor/html/tree/error.c
+++ b/source/lexbor/html/tree/error.c
@@ -21,8 +21,9 @@ lxb_html_tree_error_add(lexbor_array_obj_t *parse_errors,
}
entry->id = id;
- entry->begin = token->begin;
- entry->end = token->end;
+ entry->line = token->line;
+ entry->column = token->column;
+ entry->length = token->end - token->begin;
return entry;
}
diff --git a/source/lexbor/html/tree/error.h b/source/lexbor/html/tree/error.h
index 2fd06cb..ed1859f 100644
--- a/source/lexbor/html/tree/error.h
+++ b/source/lexbor/html/tree/error.h
@@ -97,8 +97,9 @@ lxb_html_tree_error_id_t;
typedef struct {
lxb_html_tree_error_id_t id;
- const lxb_char_t *begin;
- const lxb_char_t *end;
+ size_t line;
+ size_t column;
+ size_t length;
}
lxb_html_tree_error_t;
--
2.44.0
|