File: htmlparse.cpp

package info (click to toggle)
recoll 1.43.12-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 16,952 kB
sloc: cpp: 104,785; python: 9,933; xml: 7,314; ansic: 6,447; sh: 1,252; perl: 166; makefile: 72
file content (434 lines) | stat: -rw-r--r-- 13,955 bytes
/* This file was copied/updated from xapian-omega-1.0.1 to 1.2.6 and modified */

/* htmlparse.cc: simple HTML parser for omega indexer
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001 Ananova Ltd
 * Copyright 2002,2006,2007,2008,2009,2010,2011,2012,2015,2016,2018,2020 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */

#include "htmlparse.h"

#include <algorithm>

#include <cctype>
#include <cstring>
#include <cstdio>
#include <iostream>

using std::find;
using std::find_if;
using std::string;
using std::map;

// Compile-time length of a string literal, excluding the terminating NUL.
// Appending "" to S means the macro only compiles when S is a string literal.
#define CONST_STRLEN(S) (sizeof(S"") - 1)

// HTML5 legacy compatibility doctype.
#define HTML5_LEGACY_COMPAT "about:legacy-compat"
#define HTML5_LEGACY_COMPAT_LEN CONST_STRLEN(HTML5_LEGACY_COMPAT)

// Definition of the static member of HtmlParser.  It is not referenced by
// the code visible in this file; presumably it maps entity names to code
// points - TODO confirm against the header.
map<string, unsigned int> HtmlParser::named_ents;

// These have different defs in xapian but are helpful to minimize the diffs.
// Each wrapper converts its argument to unsigned char before calling the
// <cctype> function, so a negative plain char is never passed through.
#define C_tolower(c) tolower(static_cast<unsigned char>(c))
#define C_isalnum(c) isalnum(static_cast<unsigned char>(c))
#define C_isspace(c) isspace(static_cast<unsigned char>(c))
#define C_isalpha(c) isalpha(static_cast<unsigned char>(c))

// Predicate for find_if(): true when c is anything other than whitespace.
inline static bool
C_isnotspace(char c)
{
    return C_isspace(c) == 0;
}

// Lower-case every character of str in place, using C_tolower().
static inline void
lowercase_string(string &str)
{
    for (char &ch : str) {
        ch = C_tolower(ch);
    }
}

// Predicate for find_if(): true for the first character that cannot be part
// of a tag name.  Tag names are made of alphanumerics, '.', '-' and ':'
// (the colon being there for XML namespaces).
static inline bool
p_nottag(char c)
{
    if (C_isalnum(c)) return false;
    const bool name_punctuation = (c == '.' || c == '-' || c == ':');
    return !name_punctuation;
}

// Predicate for find_if(): true for whitespace or '>', i.e. the characters
// that end an unquoted attribute value.
static inline bool
p_whitespacegt(char c)
{
    if (c == '>') return true;
    return C_isspace(c) != 0;
}

// Predicate for find_if(): true for whitespace, '=' or '>', i.e. the
// characters that end an attribute name.
static inline bool
p_whitespaceeqgt(char c)
{
    if (c == '=' || c == '>') return true;
    return C_isspace(c) != 0;
}

/**
 * Look up an attribute of the tag currently being processed.
 *
 * @param param  Attribute name to search for in `parameters`.
 * @param value  Receives the attribute value when found; left untouched
 *               otherwise.
 * @return true if the attribute is present, false if not.
 */
bool
HtmlParser::get_parameter(const string & param, string & value) const
{
    const auto found = parameters.find(param);
    if (found != parameters.end()) {
        value = found->second;
        return true;
    }
    return false;
}


/**
 * Entity decoding hook; this implementation deliberately does nothing.
 *
 * parse_html() calls it on each run of text before handing the text to
 * process_text(), so with this implementation the text is passed on
 * unchanged.
 */
void
HtmlParser::decode_entities(string &)
{
    // Not used for recoll.
    // Almost the same code exists in myhtmlparse, except that there the
    // entity table directly holds the UTF-8 strings instead of the
    // Unicode positions (one less conversion).
}

void
HtmlParser::parse_html(const string &body)
{
    string::const_iterator begin_after_bom = body.begin();

    in_script = false;

    parameters.clear();
    string::const_iterator start = begin_after_bom;

    while (true) {
        // Skip through until we find an HTML tag, a comment, or the end of
        // document.  Ignore isolated occurrences of '<' which don't start
        // a tag or comment.    
        string::const_iterator p = start;
        while (true) {
            p = find(p, body.end(), '<');
            if (p == body.end()) break;
            unsigned char ch = *(p + 1);

            // Tag, closing tag, or comment (or SGML declaration).
            if ((!in_script && C_isalpha(ch)) || ch == '/' || ch == '!') break;

            if (ch == '?') {
                // PHP code or XML declaration.
                // XML declaration is only valid at the start of the first line.
                if (p != begin_after_bom || body.size() < 20) break;

                // XML declaration looks something like this:
                // <?xml version="1.0" encoding="UTF-8"?>
                if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
                if (strchr(" \t\r\n", p[5]) == NULL) break;

                string::const_iterator decl_end = find(p + 6, body.end(), '?');
                if (decl_end == body.end()) break;

                // Default charset for XML is UTF-8.
                charset = "utf-8";

                string decl(p + 6, decl_end);
                size_t enc = decl.find("encoding");
                if (enc == string::npos) break;

                enc = decl.find_first_not_of(" \t\r\n", enc + 8);
                if (enc == string::npos || enc == decl.size()) break;

                if (decl[enc] != '=') break;
        
                enc = decl.find_first_not_of(" \t\r\n", enc + 1);
                if (enc == string::npos || enc == decl.size()) break;

                if (decl[enc] != '"' && decl[enc] != '\'') break;

                char quote = decl[enc++];
                size_t enc_end = decl.find(quote, enc);

                if (enc != string::npos)
                    charset.assign(decl, enc, enc_end - enc);

                break;
            }
            ++p;
        }

        // Process text up to start of tag.
        if (p > start) {
            string text(body, start - body.begin(), p - start);
            decode_entities(text);
            process_text(text);
        }

        if (p == body.end()) break;

        start = p + 1;
   
        if (start == body.end()) break;

	if (*start == '!') {
	    if (++start == body.end()) break;

	    // Comment, SGML declaration, or HTML5 DTD.
	    char first_ch = *start;
	    if (++start == body.end()) break;
	    if (first_ch == '-' && *start == '-') {
		++start;
		string::const_iterator close = find(start, body.end(), '>');
		// An unterminated comment swallows rest of document
		// (like Netscape, but unlike MSIE IIRC)
		if (close == body.end()) break;

		p = close;
		// look for -->
		while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
		    p = find(p + 1, body.end(), '>');

		if (p != body.end()) {
		    // Check for htdig's "ignore this bit" comments.
		    if (p - start == CONST_STRLEN("htdig_noindex") + 2 &&
			memcmp(&*start, "htdig_noindex",
			       CONST_STRLEN("htdig_noindex")) == 0) {
			auto i = body.find("<!--/htdig_noindex-->",
					   p + 1 - body.begin());
			if (i == string::npos) break;
			start = body.begin() + i +
			    CONST_STRLEN("<!--/htdig_noindex-->");
			continue;
		    }
		    // Check for udmcomment (similar to htdig's)
		    if (p - start == CONST_STRLEN("UdmComment") + 2 &&
			memcmp(&*start, "UdmComment",
			       CONST_STRLEN("UdmComment")) == 0) {
			auto i = body.find("<!--/UdmComment-->",
					   p + 1 - body.begin());
			if (i == string::npos) break;
			start = body.begin() + i +
			    CONST_STRLEN("<!--/UdmComment-->");
			continue;
		    }
		    // If we found --> skip to there.
		    start = p;
		} else {
		    // Otherwise skip to the first > we found (as Netscape does).
		    start = close;
		}
	    } else if (first_ch == '[' &&
		       body.size() - (start - body.begin()) > 6 &&
		       body.compare(start - body.begin(), 6, "CDATA[", 6) == 0) {
		start += 6;
		string::size_type b = start - body.begin();
		string::size_type i;
		i = body.find("]]>", b);
		string text(body, b, i - b);
		process_text(text);
		if (i == string::npos) break;
		start = body.begin() + i + 2;
	    } else if (C_tolower(first_ch) == 'd' &&
		       body.end() - start > 6 &&
		       C_tolower(start[0]) == 'o' &&
		       C_tolower(start[1]) == 'c' &&
		       C_tolower(start[2]) == 't' &&
		       C_tolower(start[3]) == 'y' &&
		       C_tolower(start[4]) == 'p' &&
		       C_tolower(start[5]) == 'e' &&
		       C_isspace(start[6])) {
		// DOCTYPE declaration.
		start += 7;
		while (start != body.end() && C_isspace(*start)) {
		    ++start;
		}
		if (start == body.end()) break;
		if (body.end() - start >= 5 &&
		    C_tolower(start[0]) == 'h' &&
		    C_tolower(start[1]) == 't' &&
		    C_tolower(start[2]) == 'm' &&
		    C_tolower(start[3]) == 'l' &&
		    (start[4] == '>' || C_isspace(start[4]))) {
		    start += 4;

		    // HTML doctype.
		    while (start != body.end() && C_isspace(*start)) {
			++start;
		    }
		    if (start == body.end()) break;

		    if (*start == '>') {
			// <!DOCTYPE html>
			// Default charset for HTML5 is UTF-8.
			charset = "utf-8";
		    }
		} else if (body.end() - start >= 29 &&
			   C_tolower(start[0]) == 's' &&
			   C_tolower(start[1]) == 'y' &&
			   C_tolower(start[2]) == 's' &&
			   C_tolower(start[3]) == 't' &&
			   C_tolower(start[4]) == 'e' &&
			   C_tolower(start[5]) == 'm' &&
			   C_isspace(start[6])) {
		    start += 7;
		    while (start != body.end() && C_isspace(*start)) {
			++start;
		    }
		    size_t left = body.end() - start;
		    if (left >= HTML5_LEGACY_COMPAT_LEN + 3 &&
			(*start == '\'' || *start == '"') &&
			start[HTML5_LEGACY_COMPAT_LEN + 1] == *start &&
			body.compare(start - body.begin() + 1,
				     HTML5_LEGACY_COMPAT_LEN,
				     HTML5_LEGACY_COMPAT,
				     HTML5_LEGACY_COMPAT_LEN) == 0) {
			// HTML5 legacy compatibility doctype:
			// <!DOCTYPE html SYSTEM "about:legacy-compat">
			start += HTML5_LEGACY_COMPAT_LEN + 2;
			// Default charset for HTML5 is UTF-8.
			charset = "utf-8";
		    }
		}
		start = find(start - 1, body.end(), '>');
		if (start == body.end()) break;
	    } else {
		// Some other SGML declaration - ignore it.
		start = find(start - 1, body.end(), '>');
		if (start == body.end()) break;
	    }
	    ++start;
	} else if (*start == '?') {
            if (++start == body.end()) break;
            // PHP - swallow until ?> or EOF
            start = find(start + 1, body.end(), '>');

            // look for ?>
            while (start != body.end() && *(start - 1) != '?')
                start = find(start + 1, body.end(), '>');

	    if (start == body.end()) {
		// The closing ?> at the end of a file is optional so ignore
		// the rest of the document if there isn't one:
		// https://www.php.net/basic-syntax.instruction-separation
	    } else {
		// PHP ignores an immediately trailing newline after the
		// closing tag:
		// https://www.php.net/basic-syntax.instruction-separation
		// Testing shows \n, \r and \r\n are skipped.
		++start;
		if (*start == '\r') ++start;
		if (*start == '\n') ++start;
	    }
        } else {
            // opening or closing tag
            int closing = 0;

            if (*start == '/') {
                closing = 1;
                start = find_if(start + 1, body.end(), C_isnotspace);
            }
          
            p = start;
            start = find_if(start, body.end(), p_nottag);
            string tag(body, p - body.begin(), start - p);
            // convert tagname to lowercase
            lowercase_string(tag);

            if (closing) {
                if (!closing_tag(tag))
                    return;
                if (in_script && tag == "script") in_script = false;

                /* ignore any bogus parameters on closing tags */
                p = find(start, body.end(), '>');
                if (p == body.end()) break;
                start = p + 1;
            } else {
                bool empty_element = false;
                // FIXME: parse parameters lazily.
                while (start < body.end() && *start != '>') {
                    string name, value;

                    p = find_if(start, body.end(), p_whitespaceeqgt);

                    size_t name_len = p - start;
                    if (name_len == 1) {
                        if (*start == '/' && p < body.end() && *p == '>') {
                            // E.g. <tag foo="bar" />
                            start = p;
                            empty_element = true;
                            break;
                        }
                    }

                    name.assign(body, start - body.begin(), name_len);

                    p = find_if(p, body.end(), C_isnotspace);

                    start = p;
                    if (start != body.end() && *start == '=') {
                        start = find_if(start + 1, body.end(), C_isnotspace);

                        p = body.end();

                        int quote = *start;
                        if (quote == '"' || quote == '\'') {
                            ++start;
                            p = find(start, body.end(), quote);
                        }

			if (p != body.end()) {
			    // quoted
			    value.assign(body, start - body.begin(), p - start);
			    ++p;
			} else {
			    // unquoted or no closing quote
			    p = find_if(start, body.end(), p_whitespacegt);
			    value.assign(body, start - body.begin(), p - start);
			}
                        start = find_if(p, body.end(), C_isnotspace);

                        if (!name.empty()) {
                            // convert parameter name to lowercase
                            lowercase_string(name);
                            // in case of multiple entries, use the first
                            // (as Netscape does)
                            parameters.insert(make_pair(name, value));
                        }
                    }
                }
#if 0
                cout << "<" << tag;
                map<string, string>::const_iterator x;
                for (x = parameters.begin(); x != parameters.end(); ++x) {
                    cout << " " << x->first << "=\"" << x->second << "\"";
                }
                cout << ">\n";
#endif
                if (!opening_tag(tag))
                    return;
                parameters.clear();

                if (empty_element) {
                    if (!closing_tag(tag))
                        return;
                }

                // In <script> tags we ignore opening tags to avoid problems
                // with "a<b".
                if (tag == "script") in_script = true;

                if (start != body.end() && *start == '>') ++start;
            }
        }
    }
}