From 7c7c35d8ea9f65f081564b3ad1bfe9f0db33dd69 Mon Sep 17 00:00:00 2001
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Date: Sat, 26 Aug 2023 15:08:59 +0200
Subject: [PATCH 1/6] Expose line and column information for use in PHP

---
 source/lexbor/dom/interfaces/node.h  |  2 ++
 source/lexbor/html/token.h           |  2 ++
 source/lexbor/html/tokenizer.c       | 24 +++++++++++++++++++++++-
 source/lexbor/html/tokenizer.h       |  2 ++
 source/lexbor/html/tokenizer/state.h |  2 ++
 source/lexbor/html/tree.c            | 11 +++++++++++
 source/lexbor/html/tree/error.c      |  5 +++--
 source/lexbor/html/tree/error.h      |  5 +++--
 8 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h
index c37b790..8ac218b 100644
--- a/source/lexbor/dom/interfaces/node.h
+++ b/source/lexbor/dom/interfaces/node.h
@@ -58,6 +58,8 @@ struct lxb_dom_node {
 
     lxb_dom_node_type_t    type;
 
+    size_t                 line;
+
 #ifdef LXB_DOM_NODE_USER_VARIABLES
     LXB_DOM_NODE_USER_VARIABLES
 #endif /* LXB_DOM_NODE_USER_VARIABLES */
diff --git a/source/lexbor/html/token.h b/source/lexbor/html/token.h
index 79accd0..0b7f4fd 100644
--- a/source/lexbor/html/token.h
+++ b/source/lexbor/html/token.h
@@ -33,6 +33,8 @@ enum lxb_html_token_type {
 typedef struct {
     const lxb_char_t      *begin;
     const lxb_char_t      *end;
+    size_t                line;
+    size_t                column;
 
     const lxb_char_t      *text_start;
     const lxb_char_t      *text_end;
diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c
index 741bced..0bd9aec 100644
--- a/source/lexbor/html/tokenizer.c
+++ b/source/lexbor/html/tokenizer.c
@@ -91,6 +91,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
 
     tkz->pos = tkz->start;
     tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE;
+    /* current_line & current_column already initialized by calloc (zero-based) */
 
     tkz->tree = NULL;
     tkz->tags = NULL;
@@ -152,6 +153,8 @@ lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to,
     tkz_to->start = tkz_from->start;
     tkz_to->end = tkz_from->end;
     tkz_to->pos = tkz_to->start;
+    tkz_to->current_line = tkz_from->current_line;
+    tkz_to->current_column = tkz_from->current_column;
 
     return LXB_STATUS_OK;
 }
@@ -312,7 +315,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
     tkz->last = end;
 
     while (data < end) {
-        data = tkz->state(tkz, data, end);
+        size_t current_column = tkz->current_column;
+        const lxb_char_t *new_data = tkz->state(tkz, data, end);
+        while (data < new_data) {
+            /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
+            if (*data == '\n') {
+                tkz->current_line++;
+                current_column = 0;
+            } else {
+                /* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code.
+                 * Continuation bytes match 10XXXXXX in binary, so we skip those to only count the first byte of an encoded code point. */
+                if ((*data & 0xC0) == 0x80) {
+                    /* Continuation byte, do nothing */
+                } else {
+                    /* First byte for a codepoint */
+                    current_column++;
+                }
+            }
+            data++;
+        }
+        tkz->current_column = current_column;
     }
 
     return tkz->status;
diff --git a/source/lexbor/html/tokenizer.h b/source/lexbor/html/tokenizer.h
index ba9602f..74bb55e 100644
--- a/source/lexbor/html/tokenizer.h
+++ b/source/lexbor/html/tokenizer.h
@@ -73,6 +73,8 @@ struct lxb_html_tokenizer {
     const lxb_char_t                 *end;
     const lxb_char_t                 *begin;
     const lxb_char_t                 *last;
+    size_t                           current_line;
+    size_t                           current_column;
 
     /* Entities */
     const lexbor_sbst_entry_static_t *entity;
diff --git a/source/lexbor/html/tokenizer/state.h b/source/lexbor/html/tokenizer/state.h
index 0892846..77b86ac 100644
--- a/source/lexbor/html/tokenizer/state.h
+++ b/source/lexbor/html/tokenizer/state.h
@@ -90,6 +90,8 @@ extern "C" {
     do {                                                                       \
         tkz->pos = tkz->start;                                                 \
         tkz->token->begin = v_begin;                                           \
+        tkz->token->line = tkz->current_line;                                  \
+        tkz->token->column = tkz->current_column;                              \
     }                                                                          \
     while (0)
 
diff --git a/source/lexbor/html/tree.c b/source/lexbor/html/tree.c
index 0f067e4..bdec6a5 100644
--- a/source/lexbor/html/tree.c
+++ b/source/lexbor/html/tree.c
@@ -434,6 +434,9 @@ lxb_html_tree_create_element_for_token(lxb_html_tree_t *tree,
         return NULL;
     }
 
+    node->line = token->line;
+    /* We only expose line number in PHP DOM */
+
     lxb_status_t status;
     lxb_dom_element_t *element = lxb_dom_interface_element(node);
 
@@ -770,6 +773,11 @@ lxb_html_tree_insert_character_for_data(lxb_html_tree_t *tree,
 
     lxb_dom_interface_text(text)->char_data.data = *str;
 
+    if (tree->tkz_ref) {
+        text->line = tree->tkz_ref->token->line;
+        /* We only expose line number in PHP DOM */
+    }
+
     if (ret_node != NULL) {
         *ret_node = text;
     }
@@ -809,6 +817,9 @@ lxb_html_tree_insert_comment(lxb_html_tree_t *tree,
         return NULL;
     }
 
+    node->line = token->line;
+    /* We only expose line number in PHP DOM */
+
     tree->status = lxb_html_token_make_text(token, &comment->char_data.data,
                                             tree->document->dom_document.text);
     if (tree->status != LXB_STATUS_OK) {
diff --git a/source/lexbor/html/tree/error.c b/source/lexbor/html/tree/error.c
index e6e43f4..88ad8c4 100644
--- a/source/lexbor/html/tree/error.c
+++ b/source/lexbor/html/tree/error.c
@@ -21,8 +21,9 @@ lxb_html_tree_error_add(lexbor_array_obj_t *parse_errors,
     }
 
     entry->id = id;
-    entry->begin = token->begin;
-    entry->end = token->end;
+    entry->line = token->line;
+    entry->column = token->column;
+    entry->length = token->end - token->begin;
 
     return entry;
 }
diff --git a/source/lexbor/html/tree/error.h b/source/lexbor/html/tree/error.h
index 2fd06cb..ed1859f 100644
--- a/source/lexbor/html/tree/error.h
+++ b/source/lexbor/html/tree/error.h
@@ -97,8 +97,9 @@ lxb_html_tree_error_id_t;
 
 typedef struct {
     lxb_html_tree_error_id_t id;
-    const lxb_char_t         *begin;
-    const lxb_char_t         *end;
+    size_t                   line;
+    size_t                   column;
+    size_t                   length;
 }
 lxb_html_tree_error_t;
 
-- 
2.44.0