/* * DSI utilities * * Copyright (C) 2005-2023 Sebastiano Vigna * * This program and the accompanying materials are made available under the * terms of the GNU Lesser General Public License v2.1 or later, * which is available at * http://www.gnu.org/licenses/old-licenses/lgpl-2.1-standalone.html, * or the Apache Software License 2.0, which is available at * https://www.apache.org/licenses/LICENSE-2.0. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. * * SPDX-License-Identifier: LGPL-2.1-or-later OR Apache-2.0 */ package it.unimi.dsi.parser; import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap; import it.unimi.dsi.fastutil.objects.Reference2ObjectMap; import it.unimi.dsi.fastutil.objects.ReferenceArraySet; import it.unimi.dsi.fastutil.objects.ReferenceSet; import it.unimi.dsi.fastutil.objects.ReferenceSets; import it.unimi.dsi.lang.MutableString; import it.unimi.dsi.parser.callback.Callback; import it.unimi.dsi.util.TextPattern; /** * A fast, lightweight, on-demand (X)HTML parser. * *

* The bullet parser has been written with two specific goals in mind: web crawling and targeted * data extraction from massive web data sets. To be usable in such environments, a parser must obey * a number of restrictions: *

it should avoid excessive object creation (which, for instance, forbids a significant usage * of Java strings); *
it should tolerate invalid syntax and recover reasonably; in fact, it should never throw * exceptions; *
it should perform actual parsing only on a settable feature subset: there is no reason to * parse the attributes of a P element while searching for links; *
it should parse HTML as a regular language, and leave context-free properties (e.g., * stack maintenance and repair) to suitably designed callbacks. *

* *

* Thus, in fact the bullet parser is not a parser. It is a bunch of spaghetti code that analyses a * stream of characters pretending that it is an (X)HTML document. It has a very defensive attitude * against the stream character it is parsing, but at the same time it is forgiving with all typical * (X)HTML mistakes. * *

* The bullet parser is officially StringFree™. MutableStrings * are used for internal processing, and Java strings are used only to return attribute values. All * internal maps are {@linkplain it.unimi.dsi.fastutil.objects.Reference2ObjectMap reference-based * maps} from fastutil, which helps to * accelerate further the parsing process. * *

HTML data

* *

* The bullet parser uses attributes and methods of {@link it.unimi.dsi.parser.HTMLFactory}, * {@link it.unimi.dsi.parser.Element}, {@link it.unimi.dsi.parser.Attribute} and * {@link it.unimi.dsi.parser.Entity}. Thus, for instance, whenever an element is to be passed * around it is one of the shared objects contained in {@link it.unimi.dsi.parser.Element} (e.g., * {@link it.unimi.dsi.parser.Element#BODY}). * *

Callbacks

* *

* The result of the parsing process is the invocation of a callback. The * {@linkplain it.unimi.dsi.parser.callback.Callback callback interface} of the bullet parser * remembers closely SAX2, but it has some additional methods targeted at (X)HTML, such as * {@link it.unimi.dsi.parser.callback.Callback#cdata(it.unimi.dsi.parser.Element,char[],int,int)}, * which returns characters found in a CDATA section (e.g., a stylesheet). * *

* Each callback must configure the parser, by requesting to perform the analysis and the callbacks * it requires. A callback that wants to extract and tokenise text, for instance, will certainly * require {@link #parseText(boolean) parseText(true)}, but not {@link #parseTags(boolean) * parseTags(true)}. On the other hand, a callback wishing to extract links will require to * {@linkplain #parseAttribute(Attribute) parse selectively} certain attribute types. * *

* A more precise description follows. * *

Writing callbacks

* *

* The first important issue is what has to be required to the parser. A newly created parser does * not invoke any callback. It is up to every callback to add features so that it can do its job. * Remember that since many callbacks can be * {@linkplain it.unimi.dsi.parser.callback.ComposedCallbackBuilder composed}, you must always * add features, never remove them, and moreover your callbacks must be ready to * be invoked with features they did not request (e.g., attribute types added by another callback). * *

* The following parse features may be configured; most of them are just boolean features, a.k.a. * flags: unless otherwise specified, by default all flags are set to false (e.g., by the default * the parser will not parse tags): *

tags ({@link #parseTags(boolean)} method): whether tags should be parsed; *
attributes ({@link #parseAttributes(boolean)} and {@link #parseAttribute(Attribute) * methods)}: whether attributes should be parsed (of course, setting this flag is useless if you * are not parsing tags); note that setting this flag will just activate the attribute parsing * feature, but you must also {@linkplain #parseAttribute(Attribute) register} every attribute whose * value you want to obtain. *
text ({@link #parseText(boolean)}method): whether text should be parsed; if this * flag is set, the parser will call the * {@link it.unimi.dsi.parser.callback.Callback#characters(char[], int, int, boolean)} method for * every text chunk found. *
CDATA sections ({@link #parseCDATA(boolean)}method): whether CDATA sections * (stylesheets & scripts) should be parsed; if this flag is set, the parser will call the * {@link it.unimi.dsi.parser.callback.Callback#cdata(Element,char[],int,int)} method for every * CDATA section found. *

* *

Invoking the parser

* *

* After {@linkplain #setCallback(Callback) setting the parser callback}, you just call * {@link #parse(char[], int, int)}. * * @deprecated This class is obsolete and kept around for backward compatibility only. */ @Deprecated public class BulletParser { private static final boolean DEBUG = false; /** Scanning text.. */ protected static final int STATE_TEXT = 0; /** Scanning attribute name/value pairs. */ protected static final int STATE_BEFORE_START_TAG_NAME = 1; /** Scanning a closing tag. */ protected static final int STATE_BEFORE_END_TAG_NAME = 2; /** Scanning attribute name/value pairs. */ protected static final int STATE_IN_START_TAG = 3; /** Scanning a closing tag. */ protected static final int STATE_IN_END_TAG = 4; /** The maximum Unicode value accepted for a numeric entity. */ protected static final int MAX_ENTITY_VALUE = 65535; /** The base for non-decimal entity. */ protected static final int HEXADECIMAL = 16; /** The maximum number of digits of a hexadecimal numeric entity. */ protected static final int MAX_HEX_ENTITY_LENGTH = 8; /** The maximum number of digits of a decimal numeric entity. */ protected static final int MAX_DEC_ENTITY_LENGTH = 9; /** Closing tag for a script element. */ protected static final TextPattern SCRIPT_CLOSE_TAG_PATTERN = new TextPattern("", TextPattern.CASE_INSENSITIVE); /** Closing tag for a style element. */ protected static final TextPattern STYLE_CLOSE_TAG_PATTERN = new TextPattern("", TextPattern.CASE_INSENSITIVE); /** An array containing the non-space whitespace. */ protected static final char[] NONSPACE_WHITESPACE = { '\n', '\r', '\t' }; /** An array, parallel to {@link #NONSPACE_WHITESPACE}, containing spaces. */ protected static final char[] SPACE = { ' ', ' ', ' ' }; /** Closed comment. It should be "-->", but mistakes are common. */ protected static final TextPattern CLOSED_COMMENT = new TextPattern("->"); /** Closed ASP or similar tag. */ protected static final TextPattern CLOSED_PERCENT = new TextPattern("%>"); /** Closed processing instruction. */ protected static final TextPattern CLOSED_PIC = new TextPattern("?>"); /** Closed section (conditional, etc.). */ protected static final TextPattern CLOSED_SECTION = new TextPattern("]>"); /** Closed section (conditional, CDATA, etc.). */ protected static final TextPattern CLOSED_CDATA = new TextPattern("]]>"); /** TODO: what is this?. */ //protected static final TextPattern CLOSED_BOH = new TextPattern("!>"); /** The parsing factory used by this parser. */ public final ParsingFactory factory; /** The callback of this parser. */ protected Callback callback; /** A map from attributes to attribute values. */ protected Reference2ObjectMap attrMap; /** Whether we should invoke the text handler. */ protected boolean parseText; /** Whether we should invoke the CDATA section handler. */ protected boolean parseCDATA; /** Whether we should parse tags. */ protected boolean parseTags; /** Whether we should parse attributes. */ protected boolean parseAttributes; /** * The subset of attributes whose values will be actually parsed (if, of * course, {@link #parseAttributes}is true). */ protected ReferenceArraySet parsedAttrs = new ReferenceArraySet<>(); /** * An externally visible, immutable subset of attributes whose values will * be actually parsed. */ public ReferenceSet parsedAttributes = ReferenceSets.unmodifiable(parsedAttrs); /** The character represented by the last scanned entity. */ protected char lastEntity; /** Creates a new bullet parser. */ public BulletParser(final ParsingFactory factory) { this.factory = factory; } /** Creates a new bullet parser using the default factory {@link HTMLFactory#INSTANCE}. */ public BulletParser() { this(HTMLFactory.INSTANCE); } /** * Returns whether this parser will invoke the text handler. * * @return whether this parser will invoke the text handler. * @see #parseText(boolean) */ public boolean parseText() { return parseText; } /** * Sets the text handler flag. * * @param parseText * the new value. * @return this parser. */ public BulletParser parseText(final boolean parseText) { this.parseText = parseText; return this; } /** * Returns whether this parser will invoke the CDATA-section handler. * * @return whether this parser will invoke the CDATA-section handler. * @see #parseCDATA(boolean) */ public boolean parseCDATA() { return parseCDATA; } /** * Sets the CDATA-section handler flag. * * @param parseCDATA * the new value. * @return this parser. */ public BulletParser parseCDATA(final boolean parseCDATA) { this.parseCDATA = parseCDATA; return this; } /** * Returns whether this parser will parse tags and invoke element handlers. * * @return whether this parser will parse tags and invoke element handlers. * @see #parseTags(boolean) */ public boolean parseTags() { return parseTags; } /** * Sets whether this parser will parse tags and invoke element handlers. * * @param parseTags * the new value. * @return this parser. */ public BulletParser parseTags(final boolean parseTags) { this.parseTags = parseTags; return this; } /** * Returns whether this parser will parse attributes. * * @return whether this parser will parse attributes. * @see #parseAttributes(boolean) */ public boolean parseAttributes() { return parseAttributes; } /** * Sets the attribute parsing flag. * * @param parseAttributes * the new value for the flag. * @return this parser. */ public BulletParser parseAttributes(final boolean parseAttributes) { this.parseAttributes = parseAttributes; return this; } /** * Adds the given attribute to the set of attributes to be parsed. * * @param attribute * an attribute that should be parsed. * @throws IllegalStateException * if {@link #parseAttributes(boolean) parseAttributes(true)} * has not been invoked on this parser. * @return this parser. */ public BulletParser parseAttribute(final Attribute attribute) { parsedAttrs.add(attribute); return this; } /** Sets the callback for this parser, resetting at the same time all parsing flags. * * @param callback the new callback. * @return this parser. */ public BulletParser setCallback(final Callback callback) { this.callback = callback; parseCDATA = parseText = parseAttributes = parseTags = false; parsedAttrs.clear(); callback.configure(this); return this; } /** Returns the character corresponding to a given entity name. * * @param name the name of an entity. * @return the character corresponding to the entity, or an ASCII NUL if no entity with that name was found. */ protected char entity2Char(final MutableString name) { final Entity e = factory.getEntity(name); return e == null ? (char)0 : e.character; } /** Searches for the end of an entity. * *

This method will search for the end of an entity starting at the given offset (the offset * must correspond to the ampersand). * *

Real-world HTML pages often contain hundreds of misplaced ampersands, due to the * unfortunate idea of using the ampersand as query separator (please use the comma * in new code!). All such ampersand should be specified as &. * If named entities are delimited using a transition * from alphabetical to non-alphabetical characters, we can easily get false positives. If the parameter * loose is false, named entities can be delimited only by whitespace or by a comma. * * @param a a character array containing the entity. * @param offset the offset at which the entity starts (the offset must point at the ampersand). * @param length an upper bound to the maximum returned position. * @param loose if true, named entities can be terminated by any non-alphabetical character * (instead of whitespace or comma). * @param entity a support mutable string used to query {@link ParsingFactory#getEntity(MutableString)}. * @return the position of the last character of the entity, or -1 if no entity was found. */ protected int scanEntity(final char[] a, final int offset, final int length, final boolean loose, final MutableString entity) { int i, c = 0; String tmpEntity; if (length < 2) return -1; if (a[offset + 1] == '#') { if (length > 2 && a[offset + 2] == 'x') { for(i = 3; i < length && i < MAX_HEX_ENTITY_LENGTH && Character.digit(a[i + offset], HEXADECIMAL) != -1; i++); tmpEntity = new String(a, offset + 3, i - 3); if (i != 3) c = Integer.parseInt(tmpEntity, HEXADECIMAL); } else { for(i = 2; i < length && i < MAX_DEC_ENTITY_LENGTH && Character.isDigit(a[i + offset]); i++); tmpEntity = new String(a, offset + 2, i - 2); if (i != 2) c = Integer.parseInt(tmpEntity); } if (c > 0 && c < MAX_ENTITY_VALUE) { lastEntity = (char)c; if (i < length && a[i + offset] == ';') i++; return i + offset; } } else if (Character.isLetter(a[offset + 1])) { for(i = 2; i < length && Character.isLetterOrDigit(a[offset + i]); i++); if (i != 1 && (loose || (i < length && (Character.isWhitespace(a[offset + i]) || a[offset + i] == ';'))) && (lastEntity = entity2Char(entity.length(0).append(a, offset + 1, i - 1))) != 0) { if (i < length && a[i + offset] == ';') i++; return i + offset; } } return -1; } /** * Replaces entities with the corresponding characters. * *

This method will modify the mutable string s so that all legal occurrences * of entities are replaced by the corresponding character. * * @param s a mutable string whose entities will be replaced by the corresponding characters. * @param entity a support mutable string used by {@link #scanEntity(char[], int, int, boolean, MutableString)}. * @param loose a parameter that will be passed to {@link #scanEntity(char[], int, int, boolean, MutableString)}. */ protected void replaceEntities(final MutableString s, final MutableString entity, final boolean loose) { final char[] a = s.array(); int length = s.length(); /* We examine the string *backwards*, so that i is always a valid index. */ int i = length, j; while(i-- > 0) if (a[i] == '&' && (j = scanEntity(a, i, length - i, loose, entity)) != -1) length = s.replace(i, j, lastEntity).length(); } /** Handles markup. * * @param text the text. * @param pos the first character in the markup after <!. * @param end the end of text. * @return the position of the first character after the markup. */ protected int handleMarkup(final char[] text, int pos, final int end) { // A markup instruction (doctype, comment, etc.). switch(text[++pos]) { case 'D': case 'd': // DOCTYPE while(pos < end && text[pos++] != '>'); break; case '-': // comment if ((pos = CLOSED_COMMENT.search(text, pos, end)) == -1) pos = end; else pos += CLOSED_COMMENT.length(); break; default: if (pos < end - 6 && text[pos] == '[' && text[pos + 1] == 'C' && text[pos + 2] == 'D' && text[pos + 3] == 'A' && text[pos + 4] == 'T' && text[pos + 5] == 'A' && text[pos + 6] == '[') { // CDATA section final int last = CLOSED_CDATA.search(text, pos, end); if (parseCDATA) callback.cdata(null, text, pos + 7, (last == -1 ? end : last) - pos - 7); pos = last == -1 ? end : last + CLOSED_CDATA.length(); } // Generic markup else while(pos < end && text[pos++] != '>'); break; } return pos; } /** Handles processing instruction, ASP tags etc. * * @param text the text. * @param pos the first character in the markup after <%. * @param end the end of text. * @return the position of the first character after the processing instruction. */ protected int handleProcessingInstruction(final char[] text, int pos, final int end) { switch(text[++pos]) { case '%': if ((pos = CLOSED_PERCENT.search(text, pos, end)) == -1) pos = end; else pos += CLOSED_PERCENT.length(); break; case '?': if ((pos = CLOSED_PIC.search(text, pos, end)) == -1) pos = end; else pos += CLOSED_PIC.length(); break; case '[': if ((pos = CLOSED_SECTION.search(text, pos, end)) == -1) pos = end; else pos += CLOSED_SECTION.length(); break; default: // Generic markup while(pos < end && text[pos++] != '>'); break; } return pos; } /** * Analyze the text document to extract information. * * @param text a char array of text to be parsed. */ public void parse(final char[] text) { parse(text, 0, text.length); } /** * Analyze the text document to extract information. * * @param text a char array of text to be parsed. * @param offset the offset in the array from which the parsing will begin. * @param length the number of characters to be parsed. */ public void parse(final char[] text, final int offset, final int length) { final MutableString tagElemTypeName = new MutableString(); final MutableString attrName = new MutableString(); final MutableString attrValue = new MutableString(); final MutableString entity = new MutableString(); final MutableString characters = new MutableString(); /* During the analysis of attribute we need a separator for values */ char delim; /* The current character */ char currChar; /* The state of the switch */ int state; /* Others integer values used in the parsing process */ int start, k; /* This boolean is set true if we have words to handle */ boolean flowBroken = false, parseCurrAttr; /* The current element. */ Element currentElement; /* The current attribute object */ Attribute currAttr = null; attrMap = new Reference2ObjectArrayMap<>(16); callback.startDocument(); tagElemTypeName.length(0); attrName.length(0); attrValue.length(0); entity.length(0); state = STATE_TEXT; currentElement = null; final int end = offset + length; int pos = offset; /* This is the main loop. */ while (pos < end) switch(state) { case STATE_TEXT: currChar = text[pos]; if (currChar == '&') { // We handle both the case of an entity, and that of a stray '&'. if ((k = scanEntity(text, pos, end - pos, true, entity)) == -1) { currChar = '&'; pos++; } else { currChar = lastEntity; pos = k; if (DEBUG) System.err.println("Entity at: " + pos + " end of entity: " + k + " entity: " + entity + " char: " + currChar); } if (parseText) characters.append(currChar); continue; } // No tags can happen later than end - 2. if (currChar != '<' || pos >= end - 2) { if (parseText) characters.append(currChar); pos++; continue; } switch(text[++pos]) { case '!': pos = handleMarkup(text, pos, end); break; case '%': case '?': pos = handleProcessingInstruction(text, pos, end); break; default: // Actually a tag. Note that we allow for and that we skip false positives // due to sloppy HTML writing (e.g., "<-- hello! -->"). if (Character.isLetter(text[pos])) state = STATE_BEFORE_START_TAG_NAME; else if (text[pos] == '/' && (Character.isLetter(text[pos + 1]) || text[pos + 1] == '>')) { state = STATE_BEFORE_END_TAG_NAME; pos++; } else { // Not really a tag. if (parseText) characters.append('<'); continue; } break; } if (parseText && characters.length() != 0) { callback.characters(characters.array(), 0, characters.length(), flowBroken); characters.length(0); } flowBroken = false; break; case STATE_BEFORE_START_TAG_NAME: case STATE_BEFORE_END_TAG_NAME: // Let's get the name. tagElemTypeName.length(0); for(start = pos; pos < end && (Character.isLetterOrDigit(text[pos]) || text[pos] == ':' || text[pos] == '_' ||text[pos] == '-' || text[pos] == '.'); pos++); tagElemTypeName.append(text, start, pos - start); tagElemTypeName.toLowerCase(); currentElement = factory.getElement(tagElemTypeName); if (DEBUG) System.err.println((state == STATE_BEFORE_START_TAG_NAME ? "Opening" : "Closing") + " tag for " + tagElemTypeName + " (element: " + currentElement+ ")"); if (currentElement != null && currentElement.breaksFlow) flowBroken = true; while(pos < end && Character.isWhitespace(text[pos])) pos++; state = state == STATE_BEFORE_START_TAG_NAME ? STATE_IN_START_TAG : STATE_IN_END_TAG; break; case STATE_IN_START_TAG: currChar = text[pos]; if (currChar != '>' && (currChar != '/' || pos == end - 1 || text[pos + 1] != '>')) { // We got attributes. if (Character.isLetter(currChar)) { parseCurrAttr = false; attrName.length(0); for(start = pos; pos < end && (Character.isLetter(text[pos]) || text[pos] == '-'); pos++); if (currentElement != null && parseAttributes) { attrName.append(text, start, pos - start); attrName.toLowerCase(); if (DEBUG) System.err.println("Got attribute named \"" + attrName + "\""); currAttr = factory.getAttribute(attrName); parseCurrAttr = parsedAttrs.contains(currAttr); } // Skip whitespace while (pos < end && Character.isWhitespace(text[pos])) pos++; if (pos == end) break; if (text[pos] != '=') { // We found an attribute without explicit value. // TODO: can we avoid another string? if (parseCurrAttr) attrMap.put(currAttr, new MutableString(currAttr.name)); break; } pos++; while (pos < end && Character.isWhitespace(text[pos])) pos++; if (pos == end) break; attrValue.length(0); if (pos < end && ((delim = text[pos]) == '"' || (delim = text[pos]) == '\'')) { // An attribute value with delimiters. for(start = ++pos; pos < end && text[pos] != delim; pos++); if (parseCurrAttr) attrValue.append(text, start, pos - start).replace(NONSPACE_WHITESPACE, SPACE); if (pos < end) pos++; } else { // An attribute value without delimiters. Due to very common errors, we // gather characters up to the first occurrence of whitespace or '>'. for(start = pos; pos < end && !Character.isWhitespace(text[pos]) && text[pos] != '>'; pos++); if (parseCurrAttr) attrValue.append(text, start, pos - start); } if (parseCurrAttr) { replaceEntities(attrValue, entity, false); attrMap.put(currAttr, attrValue.copy()); if (DEBUG) System.err.println("Attribute value: \"" + attrValue + "\""); } // Skip whitespace while (pos < end && Character.isWhitespace(text[pos])) pos++; } else { // It's a mess. Our only reasonable chance is to try to resync on the first // whitespace, or alternatively to get to the end of the tag. do pos++; while (pos < end && text[pos] != '>' && ! Character.isWhitespace(text[pos])); // Skip whitespace while (pos < end && Character.isWhitespace(text[pos])) pos++; continue; } } else { if (parseTags && ! callback.startElement(currentElement, attrMap)) break; if (attrMap != null) attrMap.clear(); if (currentElement == Element.SCRIPT || currentElement == Element.STYLE) { final TextPattern pattern = currentElement == Element.SCRIPT ? SCRIPT_CLOSE_TAG_PATTERN : STYLE_CLOSE_TAG_PATTERN; start = pos + 1; pos = pattern.search(text, start, end); if (pos == -1) pos = end; if (parseText) callback.cdata(currentElement, text, start, pos - start); if (pos < end) { if (parseTags) callback.endElement(currentElement); pos += pattern.length(); } } else pos += currChar == '/' ? 2 : 1; state = STATE_TEXT; } break; case STATE_IN_END_TAG: while (pos < end && text[pos] != '>') pos++; if (parseTags && currentElement != null && ! callback.endElement(currentElement)) break; state = STATE_TEXT; pos++; break; default: } // We do what we can to invoke tag handlers in case of a truncated text. if (state == STATE_IN_START_TAG && parseTags && currentElement != null) callback.startElement(currentElement, attrMap); if (state == STATE_IN_END_TAG && parseTags && currentElement != null) callback.endElement(currentElement); if (state == STATE_TEXT && parseText && characters.length() > 0) callback.characters(characters.array(), 0, characters.length(), flowBroken); callback.endDocument(); } }