From b20d746fa7cbb74716171bc49d836af99927e41e Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Sun, 11 Oct 2020 14:15:37 -0400
Subject: [PATCH 2/2] use new htmlParseLookupCommentEnd to find comment ends

Note that the caret in error messages generated during comment parsing
may have moved by one byte.

See guidance provided on incorrectly-closed comments here:

https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment
---
 HTMLparser.c | 46 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 37 insertions(+), 9 deletions(-)

diff --git a/HTMLparser.c b/HTMLparser.c
index 4d43479..000dc3d 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -5331,6 +5331,39 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
     return (-1);
 }
 
+/**
+ * htmlParseLookupCommentEnd:
+ * @ctxt: an HTML parser context
+ *
+ * Try to find a comment end tag in the input stream
+ * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
+ * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
+ * This function has a side effect of (possibly) incrementing ctxt->checkIndex
+ * to avoid rescanning sequences of bytes, it DOES change the state of the
+ * parser, do not use liberally.
+ * This wraps to htmlParseLookupSequence()
+ *
+ * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
+ */
+static int
+htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
+{
+    int mark = 0;
+    int cur = CUR_PTR - BASE_PTR;
+
+    while (mark >= 0) {
+	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 1, 1);
+	if ((mark < 0) ||
+	    (NXT(mark+2) == '>') ||
+	    ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
+	    return mark;
+	}
+	ctxt->checkIndex = cur + mark + 1;
+    }
+    return mark;
+}
+
+
 /**
  * htmlParseTryOrFinish:
  * @ctxt:  an HTML parser context
@@ -5507,8 +5540,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		cur = in->cur[0];
 	        if ((cur == '<') && (next == '!') &&
 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-		    if ((!terminate) &&
-		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
+		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
 			goto done;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
@@ -5567,8 +5599,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		next = in->cur[1];
 		if ((cur == '<') && (next == '!') &&
 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-		    if ((!terminate) &&
-		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
+		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
 			goto done;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
@@ -5614,8 +5645,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		next = in->cur[1];
 	        if ((cur == '<') && (next == '!') &&
 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-		    if ((!terminate) &&
-		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
+		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
 			goto done;
 #ifdef DEBUG_PUSH
 		    xmlGenericError(xmlGenericErrorContext,
@@ -5871,9 +5901,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 			htmlParseDocTypeDecl(ctxt);
 		    } else if ((cur == '<') && (next == '!') &&
 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
-			if ((!terminate) &&
-			    (htmlParseLookupSequence(
-				ctxt, '-', '-', '>', 1, 1) < 0))
+			if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
 			    goto done;
 #ifdef DEBUG_PUSH
 			xmlGenericError(xmlGenericErrorContext,
-- 
2.25.1

