/* * XmlParser.java * Copyright (C) 1999,2000,2001 The Free Software Foundation * * This file is part of GNU JAXP, a library. * * GNU JAXP is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * GNU JAXP is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * As a special exception, if you link this library with other files to * produce an executable, this library does not by itself cause the * resulting executable to be covered by the GNU General Public License. * This exception does not however invalidate any other reasons why the * executable file might be covered by the GNU General Public License. */ // // Copyright (c) 1997, 1998 by Microstar Software Ltd. // From Microstar's README (the entire original license): // // Separate statements also said it's in the public domain. // All modifications are distributed under the license // above (GPL with library exception). // // AElfred is free for both commercial and non-commercial use and // redistribution, provided that Microstar's copyright and disclaimer are // retained intact. You are free to modify AElfred for your own use and // to redistribute AElfred with your modifications, provided that the // modifications are clearly documented. // // This program is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // merchantability or fitness for a particular purpose. Please use it AT // YOUR OWN RISK. 
// package gnu.xml.aelfred2; import java.io.BufferedInputStream; import java.io.CharConversionException; import java.io.EOFException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.IOException; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLConnection; // maintaining 1.1 compatibility for now ... // Iterator and Hashmap ought to be faster import java.util.Enumeration; import java.util.Hashtable; import java.util.Stack; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * Parse XML documents and return parse events through call-backs. * Use the SAXDriver class as your entry point, as all * internal parser interfaces are subject to change. * * @author Written by David Megginson <dmeggins@microstar.com> * (version 1.2a with bugfixes) * @author Updated by David Brownell <dbrownell@users.sourceforge.net> * @see SAXDriver */ final class XmlParser { // avoid slow per-character readCh() private final static boolean USE_CHEATS = true; ////////////////////////////////////////////////////////////////////// // Constructors. //////////////////////////////////////////////////////////////////////// /** * Construct a new parser with no associated handler. * @see #setHandler * @see #parse */ // package private XmlParser () { } /** * Set the handler that will receive parsing events. * @param handler The handler to receive callback events. * @see #parse */ // package private void setHandler (SAXDriver handler) { this.handler = handler; } /** * Parse an XML document from the character stream, byte stream, or URI * that you provide (in that order of preference). Any URI that you * supply will become the base URI for resolving relative URI, and may * be used to acquire a reader or byte stream. * *

Only one thread at a time may use this parser; since it is * private to this package, post-parse cleanup is done by the caller, * which MUST NOT REUSE the parser (just null it). * * @param systemId Absolute URI of the document; should never be null, * but may be so iff a reader or a stream is provided. * @param publicId The public identifier of the document, or null. * @param reader A character stream; must be null if stream isn't. * @param stream A byte input stream; must be null if reader isn't. * @param encoding The suggested encoding, or null if unknown. * @exception java.lang.Exception Basically SAXException or IOException */ // package private void doParse ( String systemId, String publicId, Reader reader, InputStream stream, String encoding ) throws Exception { if (handler == null) throw new IllegalStateException ("no callback handler"); initializeVariables (); // predeclare the built-in entities here (replacement texts) // we don't need to intern(), since we're guaranteed literals // are always (globally) interned. setInternalEntity ("amp", "&"); setInternalEntity ("lt", "<"); setInternalEntity ("gt", ">"); setInternalEntity ("apos", "'"); setInternalEntity ("quot", """); try { // pushURL first to ensure locator is correct in startDocument // ... it might report an IO or encoding exception. 
handler.startDocument (); pushURL (false, "[document]", // default baseURI: null new String [] { publicId, systemId, null}, reader, stream, encoding, false); parseDocument (); } catch (EOFException e){ //empty input error("empty document, with no root element."); }finally { if (reader != null) try { reader.close (); } catch (IOException e) { /* ignore */ } if (stream != null) try { stream.close (); } catch (IOException e) { /* ignore */ } if (is != null) try { is.close (); } catch (IOException e) { /* ignore */ } if (reader != null) try { reader.close (); } catch (IOException e) { /* ignore */ } scratch = null; } } //////////////////////////////////////////////////////////////////////// // Constants. //////////////////////////////////////////////////////////////////////// // // Constants for element content type. // /** * Constant: an element has not been declared. * @see #getElementContentType */ public final static int CONTENT_UNDECLARED = 0; /** * Constant: the element has a content model of ANY. * @see #getElementContentType */ public final static int CONTENT_ANY = 1; /** * Constant: the element has declared content of EMPTY. * @see #getElementContentType */ public final static int CONTENT_EMPTY = 2; /** * Constant: the element has mixed content. * @see #getElementContentType */ public final static int CONTENT_MIXED = 3; /** * Constant: the element has element content. * @see #getElementContentType */ public final static int CONTENT_ELEMENTS = 4; // // Constants for the entity type. // /** * Constant: the entity has not been declared. * @see #getEntityType */ public final static int ENTITY_UNDECLARED = 0; /** * Constant: the entity is internal. * @see #getEntityType */ public final static int ENTITY_INTERNAL = 1; /** * Constant: the entity is external, non-parsable data. * @see #getEntityType */ public final static int ENTITY_NDATA = 2; /** * Constant: the entity is external XML data. 
* @see #getEntityType
     */
    public final static int ENTITY_TEXT = 3;


    //
    // Attribute type constants are interned literal strings.
    //

    //
    // Constants for supported encodings.  "external" is just a flag.
    //
    // setupDecoding() maps "UTF-16BE" to ENCODING_UCS_2_12 and "UTF-16LE"
    // to ENCODING_UCS_2_21; the digits presumably spell out the byte
    // order (TODO confirm for the four-byte variants).
    //
    private final static int ENCODING_EXTERNAL = 0;
    private final static int ENCODING_UTF_8 = 1;
    private final static int ENCODING_ISO_8859_1 = 2;
    private final static int ENCODING_UCS_2_12 = 3;
    private final static int ENCODING_UCS_2_21 = 4;
    private final static int ENCODING_UCS_4_1234 = 5;
    private final static int ENCODING_UCS_4_4321 = 6;
    private final static int ENCODING_UCS_4_2143 = 7;
    private final static int ENCODING_UCS_4_3412 = 8;
    private final static int ENCODING_ASCII = 9;


    //
    // Constants for attribute default value.
    //

    /**
     * Constant: the attribute is not declared.
     * @see #getAttributeDefaultValueType
     */
    public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;

    /**
     * Constant: the attribute has a literal default value specified.
     * @see #getAttributeDefaultValueType
     * @see #getAttributeDefaultValue
     */
    public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;

    /**
     * Constant: the attribute was declared #IMPLIED.
     * @see #getAttributeDefaultValueType
     */
    public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;

    /**
     * Constant: the attribute was declared #REQUIRED.
     * @see #getAttributeDefaultValueType
     */
    public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;

    /**
     * Constant: the attribute was declared #FIXED.
     * @see #getAttributeDefaultValueType
     * @see #getAttributeDefaultValue
     */
    public final static int ATTRIBUTE_DEFAULT_FIXED = 34;


    //
    // Constants for input.
    //
    // INPUT_READER is what setupDecoding() selects once it has wrapped
    // the byte stream in an InputStreamReader.
    //
    private final static int INPUT_NONE = 0;
    private final static int INPUT_INTERNAL = 1;
    private final static int INPUT_STREAM = 3;
    private final static int INPUT_READER = 5;


    //
    // Flags for reading literals.
// // expand general entity refs (attribute values in dtd and content) private final static int LIT_ENTITY_REF = 2; // normalize this value (space chars) (attributes, public ids) private final static int LIT_NORMALIZE = 4; // literal is an attribute value private final static int LIT_ATTRIBUTE = 8; // don't expand parameter entities private final static int LIT_DISABLE_PE = 16; // don't expand [or parse] character refs private final static int LIT_DISABLE_CREF = 32; // don't parse general entity refs private final static int LIT_DISABLE_EREF = 64; // literal is a public ID value private final static int LIT_PUBID = 256; // // Flags affecting PE handling in DTDs (if expandPE is true). // PEs expand with space padding, except inside literals. // private final static int CONTEXT_NORMAL = 0; private final static int CONTEXT_LITERAL = 1; ////////////////////////////////////////////////////////////////////// // Error reporting. ////////////////////////////////////////////////////////////////////// /** * Report an error. * @param message The error message. * @param textFound The text that caused the error (or null). * @see SAXDriver#error * @see #line */ private void error (String message, String textFound, String textExpected) throws SAXException { if (textFound != null) { message = message + " (found \"" + textFound + "\")"; } if (textExpected != null) { message = message + " (expected \"" + textExpected + "\")"; } handler.fatal (message); // "can't happen" throw new SAXException (message); } /** * Report a serious error. * @param message The error message. * @param textFound The text that caused the error (or null). */ private void error (String message, char textFound, String textExpected) throws SAXException { error (message, new Character (textFound).toString (), textExpected); } /** Report typical case fatal errors. 
*/ private void error (String message) throws SAXException { handler.fatal (message); } ////////////////////////////////////////////////////////////////////// // Major syntactic productions. ////////////////////////////////////////////////////////////////////// /** * Parse an XML document. *

     * [1] document ::= prolog element Misc*
     *

This is the top-level parsing function for a single XML * document. As a minimum, a well-formed document must have * a document element, and a valid document must have a prolog * (one with doctype) as well. */ private void parseDocument () throws Exception { try { // added by MHK boolean sawDTD = parseProlog (); require ('<'); parseElement (!sawDTD); } catch (EOFException ee) { // added by MHK error("premature end of file", "[EOF]", null); } try { parseMisc (); //skip all white, PIs, and comments char c = readCh (); //if this doesn't throw an exception... error ("unexpected characters after document end", c, null); } catch (EOFException e) { return; } } static final char startDelimComment [] = { '<', '!', '-', '-' }; static final char endDelimComment [] = { '-', '-' }; /** * Skip a comment. *

     * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
     *

(The <!-- has already been read.) */ private void parseComment () throws Exception { char c; boolean saved = expandPE; expandPE = false; parseUntil (endDelimComment); require ('>'); expandPE = saved; handler.comment (dataBuffer, 0, dataBufferPos); dataBufferPos = 0; } static final char startDelimPI [] = { '<', '?' }; static final char endDelimPI [] = { '?', '>' }; /** * Parse a processing instruction and do a call-back. *

     * [16] PI ::= '<?' PITarget
     *		(S (Char* - (Char* '?>' Char*)))?
     *		'?>'
     * [17] PITarget ::= Name - ( ('X'|'x') ('M'|'m') ('L'|'l') )
     *

(The <? has already been read.) */ private void parsePI () throws SAXException, IOException { String name; boolean saved = expandPE; expandPE = false; name = readNmtoken (true); //NE08 if (name.indexOf(':') >= 0) error ("Illegal character(':') in processing instruction name ", name, null); if ("xml".equalsIgnoreCase (name)) error ("Illegal processing instruction target", name, null); if (!tryRead (endDelimPI)) { requireWhitespace (); parseUntil (endDelimPI); } expandPE = saved; handler.processingInstruction (name, dataBufferToString ()); } static final char endDelimCDATA [] = { ']', ']', '>' }; private boolean isDirtyCurrentElement; /** * Parse a CDATA section. *

     * [18] CDSect ::= CDStart CData CDEnd
     * [19] CDStart ::= '<![CDATA['
     * [20] CData ::= (Char* - (Char* ']]>' Char*))
     * [21] CDEnd ::= ']]>'
     *

(The '<![CDATA[' has already been read.)
     *
     * Everything up to the closing "]]>" is collected and then flushed
     * from the data buffer.
     */
    private void parseCDSect ()
    throws Exception
    {
        // collect the raw section text ...
        parseUntil (endDelimCDATA);
        // ... and flush it
        dataBufferFlush ();
    }


    /**
     * Parse the prolog of an XML document.
     *

     * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
     *

We do not look for the XML declaration here, because it was * handled by pushURL (). * @see pushURL * @return true if a DTD was read. */ private boolean parseProlog () throws Exception { parseMisc (); if (tryRead (" * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' * [24] VersionInfo ::= S 'version' Eq * ("'" VersionNum "'" | '"' VersionNum '"' ) * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* * [32] SDDecl ::= S 'standalone' Eq * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) * [80] EncodingDecl ::= S 'encoding' Eq * ( "'" EncName "'" | "'" EncName "'" ) * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* * *

(The <?xml and whitespace have already been read.) * @return the encoding in the declaration, uppercased; or null * @see #parseTextDecl * @see #setupDecoding */ private String parseXMLDecl (boolean ignoreEncoding) throws SAXException, IOException { String version; String encodingName = null; String standalone = null; int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; // Read the version. require ("version"); parseEq (); checkLegalVersion (version = readLiteral (flags)); if (!version.equals ("1.0")){ if(version.equals ("1.1")){ handler.warn ("expected XML version 1.0, not: " + version); xmlVersion = XML_11; }else { error("illegal XML version", version, "1.0 or 1.1"); } } else xmlVersion = XML_10; // Try reading an encoding declaration. boolean white = tryWhitespace (); if (tryRead ("encoding")) { if (!white) error ("whitespace required before 'encoding='"); parseEq (); encodingName = readLiteral (flags); if (!ignoreEncoding) setupDecoding (encodingName); } // Try reading a standalone declaration if (encodingName != null) white = tryWhitespace (); if (tryRead ("standalone")) { if (!white) error ("whitespace required before 'standalone='"); parseEq (); standalone = readLiteral (flags); if ("yes".equals (standalone)) docIsStandalone = true; else if (!"no".equals (standalone)) error ("standalone flag must be 'yes' or 'no'"); } skipWhitespace (); require ("?>"); return encodingName; } /** * Parse a text declaration. *

     * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
     * [80] EncodingDecl ::= S 'encoding' Eq
     *		( '"' EncName '"' | "'" EncName "'" )
     * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
     *

(The <?xml' and whitespace have already been read.) * @return the encoding in the declaration, uppercased; or null * @see #parseXMLDecl * @see #setupDecoding */ private String parseTextDecl (boolean ignoreEncoding) throws SAXException, IOException { String encodingName = null; int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; // Read an optional version. if (tryRead ("version")) { String version; parseEq (); checkLegalVersion (version = readLiteral (flags)); if (version.equals ("1.1")){ if (xmlVersion == XML_10){ error ("external subset has later version number.", "1.0", version); } handler.warn ("expected XML version 1.0, not: " + version); xmlVersion = XML_11; }else if(!version.equals ("1.0")) { error("illegal XML version", version, "1.0 or 1.1"); } requireWhitespace (); } // Read the encoding. require ("encoding"); parseEq (); encodingName = readLiteral (flags); if (!ignoreEncoding) setupDecoding (encodingName); skipWhitespace (); require ("?>"); return encodingName; } /** * Sets up internal state so that we can decode an entity using the * specified encoding. This is used when we start to read an entity * and we have been given knowledge of its encoding before we start to * read any data (e.g. from a SAX input source or from a MIME type). * *

It is also used after autodetection, at which point only very * limited adjustments to the encoding may be used (switching between * related builtin decoders). * * @param encodingName The name of the encoding specified by the user. * @exception IOException if the encoding isn't supported either * internally to this parser, or by the hosting JVM. * @see #parseXMLDecl * @see #parseTextDecl */ private void setupDecoding (String encodingName) throws SAXException, IOException { encodingName = encodingName.toUpperCase (); // ENCODING_EXTERNAL indicates an encoding that wasn't // autodetected ... we can use builtin decoders, or // ones from the JVM (InputStreamReader). // Otherwise we can only tweak what was autodetected, and // only for single byte (ASCII derived) builtin encodings. // ASCII-derived encodings if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) { if (encodingName.equals ("ISO-8859-1") || encodingName.equals ("8859_1") || encodingName.equals ("ISO8859_1") ) { encoding = ENCODING_ISO_8859_1; return; } else if (encodingName.equals ("US-ASCII") || encodingName.equals ("ASCII")) { encoding = ENCODING_ASCII; return; } else if (encodingName.equals ("UTF-8") || encodingName.equals ("UTF8")) { encoding = ENCODING_UTF_8; return; } else if (encoding != ENCODING_EXTERNAL) { // used to start with a new reader ... throw new UnsupportedEncodingException (encodingName); } // else fallthrough ... 
// it's ASCII-ish and something other than a builtin } // Unicode and such if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) { if (!(encodingName.equals ("ISO-10646-UCS-2") || encodingName.equals ("UTF-16") || encodingName.equals ("UTF-16BE") || encodingName.equals ("UTF-16LE"))) error ("unsupported Unicode encoding", encodingName, "UTF-16"); return; } // four byte encodings if (encoding == ENCODING_UCS_4_1234 || encoding == ENCODING_UCS_4_4321 || encoding == ENCODING_UCS_4_2143 || encoding == ENCODING_UCS_4_3412) { // Strictly: "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists if (!encodingName.equals ("ISO-10646-UCS-4")) error ("unsupported 32-bit encoding", encodingName, "ISO-10646-UCS-4"); return; } // assert encoding == ENCODING_EXTERNAL // if (encoding != ENCODING_EXTERNAL) // throw new RuntimeException ("encoding = " + encoding); if (encodingName.equals ("UTF-16BE")) { encoding = ENCODING_UCS_2_12; return; } if (encodingName.equals ("UTF-16LE")) { encoding = ENCODING_UCS_2_21; return; } // We couldn't use the builtin decoders at all. But we can try to // create a reader, since we haven't messed up buffering. Tweak // the encoding name if necessary. if (encodingName.equals ("UTF-16") || encodingName.equals ("ISO-10646-UCS-2")) encodingName = "Unicode"; // Ignoring all the EBCDIC aliases here reader = new InputStreamReader (is, encodingName); sourceType = INPUT_READER; } /** * Parse miscellaneous markup outside the document element and DOCTYPE * declaration. *

     * [27] Misc ::= Comment | PI | S
     *

* Keeps consuming whitespace, processing instructions and comments
     * until the next input is none of those.
     */
    private void parseMisc ()
    throws Exception
    {
        for (;;) {
            skipWhitespace ();
            if (tryRead (startDelimPI)) {
                parsePI ();
                continue;
            }
            if (tryRead (startDelimComment)) {
                parseComment ();
                continue;
            }
            // anything else belongs to the caller
            return;
        }
    }


    /**
     * Parse a document type declaration.
     *

     * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
     *		('[' (markupdecl | PEReference | S)* ']' S?)? '>'
     *

(The <!DOCTYPE has already been read.)
     *
     * The internal subset, when present, is parsed before the external
     * one.  When the declaration names no system identifier, the handler
     * is asked whether it wants to supply an external subset anyway.
     * In either case the external subset is read with a ">" string
     * pushed underneath it, which marks where the subset ends.
     */
    private void parseDoctypedecl ()
    throws Exception
    {
        String rootName, ids[];

        // Read the document type name.
        requireWhitespace ();
        rootName = readNmtoken (true);

        // Read the External subset's IDs
        skipWhitespace ();
        ids = readExternalIds (false, true);

        // report (a) declaration of name, (b) lexical info (ids)
        handler.doctypeDecl (rootName, ids [0], ids [1]);

        // Internal subset is parsed first, if present
        skipWhitespace ();
        if (tryRead ('[')) {

            // loop until the subset ends
            while (true) {
                // PE expansion and reporting are on only while skipping
                // the whitespace between declarations
                doReport = expandPE = true;
                skipWhitespace ();
                doReport = expandPE = false;
                if (tryRead (']')) {
                    break;              // end of subset
                } else {
                    // WFC, PEs in internal subset (only between decls)
                    peIsError = expandPE = true;
                    parseMarkupdecl ();
                    peIsError = expandPE = false;
                }
            }
        }
        skipWhitespace ();
        require ('>');

        // Read the external subset, if any
        InputSource subset;

        // ids [1] is the system identifier; without one, give the
        // handler the chance to provide a subset of its own
        if (ids [1] == null)
            subset = handler.getExternalSubset (rootName,
                        handler.getSystemId ());
        else
            subset = null;
        if (ids [1] != null || subset != null) {
            // sentinel: reading '>' again means the subset is exhausted
            pushString (null, ">");

            // NOTE:  [dtd] is so we say what SAX2 expects,
            // though it's misleading (subset, not entire dtd)
            if (ids [1] != null)
                pushURL (true, "[dtd]", ids, null, null, null, true);
            else {
                handler.warn ("modifying document by adding external subset");
                pushURL (true, "[dtd]",
                    new String [] { subset.getPublicId (),
                            subset.getSystemId (), null },
                    subset.getCharacterStream (),
                    subset.getByteStream (),
                    subset.getEncoding (),
                    false);
            }

            // Loop until we end up back at '>'
            while (true) {
                doReport = expandPE = true;
                skipWhitespace ();
                doReport = expandPE = false;
                if (tryRead ('>')) {
                    break;
                } else {
                    expandPE = true;
                    parseMarkupdecl ();
                    expandPE = false;
                }
            }

            // the ">" string isn't popped yet
            if (inputStack.size () != 1)
                error ("external subset has unmatched '>'");
        }

        // done dtd
        handler.endDoctype ();
        expandPE = false;
        doReport = true;
    }


    /**
     * Parse a markup declaration in the internal or external DTD subset.
     *

     * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
     *		| NotationDecl | PI | Comment
     * [30] extSubsetDecl ::= (markupdecl | conditionalSect
     *		| PEReference | S) *
     *

Reading toplevel PE references is handled as a lexical issue * by the caller, as is whitespace. */ private void parseMarkupdecl () throws Exception { char saved [] = null; boolean savedPE = expandPE; // prevent "<%foo;" and ensures saved entity is right require ('<'); unread ('<'); expandPE = false; if (tryRead (" 0) parseConditionalSect (saved); else error ("conditional sections illegal in internal subset"); } else { error ("expected markup declaration"); } // VC: Proper Decl/PE Nesting if (readBuffer != saved) handler.verror ("Illegal Declaration/PE nesting"); } /** * Parse an element, with its tags. *

     * [39] element ::= EmptyElementTag | STag content ETag
     * [40] STag ::= '<' Name (S Attribute)* S? '>'
     * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>'
     *

(The '<' has already been read.) *

NOTE: this method actually chains onto parseContent (), if necessary, * and parseContent () will take care of calling parseETag (). */ private void parseElement (boolean maybeGetSubset) throws Exception { String gi; char c; int oldElementContent = currentElementContent; String oldElement = currentElement; Object element []; // This is the (global) counter for the // array of specified attributes. tagAttributePos = 0; // Read the element type name. gi = readNmtoken (true); // If we saw no DTD, and this is the document root element, // let the application modify the input stream by providing one. if (maybeGetSubset) { InputSource subset = handler.getExternalSubset (gi, handler.getSystemId ()); if (subset != null) { String publicId = subset.getPublicId (); String systemId = subset.getSystemId (); handler.warn ("modifying document by adding DTD"); handler.doctypeDecl (gi, publicId, systemId); pushString (null, ">"); // NOTE: [dtd] is so we say what SAX2 expects, // though it's misleading (subset, not entire dtd) pushURL (true, "[dtd]", new String [] { publicId, systemId, null }, subset.getCharacterStream (), subset.getByteStream (), subset.getEncoding (), false); // Loop until we end up back at '>' while (true) { doReport = expandPE = true; skipWhitespace (); doReport = expandPE = false; if (tryRead ('>')) { break; } else { expandPE = true; parseMarkupdecl (); expandPE = false; } } // the ">" string isn't popped yet if (inputStack.size () != 1) error ("external subset has unmatched '>'"); handler.endDoctype (); } } // Determine the current content type. currentElement = gi; element = (Object []) elementInfo.get (gi); currentElementContent = getContentType (element, CONTENT_ANY); // Read the attributes, if any. // After this loop, "c" is the closing delimiter. 
boolean white = tryWhitespace (); c = readCh (); while (c != '/' && c != '>') { unread (c); if (!white) error ("need whitespace between attributes"); parseAttribute (gi); white = tryWhitespace (); c = readCh (); } // Supply any defaulted attributes. Enumeration atts = declaredAttributes (element); if (atts != null) { String aname; loop: while (atts.hasMoreElements ()) { aname = (String) atts.nextElement (); // See if it was specified. for (int i = 0; i < tagAttributePos; i++) { if (tagAttributes [i] == aname) { continue loop; } } // ... or has a default String value = getAttributeDefaultValue (gi, aname); if (value == null) continue; handler.attribute (aname, value, false); } } // Figure out if this is a start tag // or an empty element, and dispatch an // event accordingly. switch (c) { case '>': handler.startElement (gi); parseContent (); break; case '/': require ('>'); handler.startElement (gi); handler.endElement (gi); break; } // Restore the previous state. currentElement = oldElement; currentElementContent = oldElementContent; } /** * Parse an attribute assignment. *

     * [41] Attribute ::= Name Eq AttValue
     *

* @param name The name of the attribute's element. * @see SAXDriver#attribute */ private void parseAttribute (String name) throws Exception { String aname; String type; String value; int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF; // Read the attribute name. aname = readNmtoken (true); type = getAttributeType (name, aname); // Parse '=' parseEq (); // Read the value, normalizing whitespace // unless it is CDATA. if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { if (type == "CDATA" || type == null) { value = readLiteral (flags); } else { value = readLiteral (flags | LIT_NORMALIZE); } } else { if (type.equals("CDATA") || type == null) { value = readLiteral (flags); } else { value = readLiteral (flags | LIT_NORMALIZE); } } // WFC: no duplicate attributes for (int i = 0; i < tagAttributePos; i++) if (aname.equals (tagAttributes [i])) error ("duplicate attribute", aname, null); // Inform the handler about the // attribute. handler.attribute (aname, value, true); dataBufferPos = 0; // Note that the attribute has been // specified. if (tagAttributePos == tagAttributes.length) { String newAttrib[] = new String [tagAttributes.length * 2]; System.arraycopy (tagAttributes, 0, newAttrib, 0, tagAttributePos); tagAttributes = newAttrib; } tagAttributes [tagAttributePos++] = aname; } /** * Parse an equals sign surrounded by optional whitespace. *

     * [25] Eq ::= S? '=' S?
     *

* Whitespace on either side of the '=' is skipped; the '=' itself is
     * mandatory.
     */
    private void parseEq ()
    throws SAXException, IOException
    {
        skipWhitespace ();      // optional leading S
        require ('=');
        skipWhitespace ();      // optional trailing S
    }


    /**
     * Parse an end tag.
     *

     * [42] ETag ::= '</' Name S? '>'
     *

NOTE: parseContent () chains to here, we already read the * "</". */ private void parseETag () throws Exception { require (currentElement); skipWhitespace (); require ('>'); handler.endElement (currentElement); // not re-reporting any SAXException re bogus end tags, // even though that diagnostic might be clearer ... } /** * Parse the content of an element. *

     * [43] content ::= (element | CharData | Reference
     *		| CDSect | PI | Comment)*
     * [67] Reference ::= EntityRef | CharRef
     *

NOTE: consumes ETtag. */ private void parseContent () throws Exception { char c; while (true) { // consume characters (or ignorable whitspace) until delimiter parseCharData (); // Handle delimiters c = readCh (); switch (c) { case '&': // Found "&" c = readCh (); if (c == '#') { parseCharRef (); } else { unread (c); parseEntityRef (true); } isDirtyCurrentElement = true; break; case '<': // Found "<" dataBufferFlush (); c = readCh (); switch (c) { case '!': // Found " * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' * *

NOTE: the '<!ELEMENT' has already been read. */ private void parseElementDecl () throws Exception { String name; requireWhitespace (); // Read the element type name. name = readNmtoken (true); requireWhitespace (); // Read the content model. parseContentspec (name); skipWhitespace (); require ('>'); } /** * Content specification. *

     * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
     *

*/ private void parseContentspec (String name) throws Exception { // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ... if (tryRead ("EMPTY")) { setElement (name, CONTENT_EMPTY, null, null); if (!skippedPE) handler.getDeclHandler ().elementDecl (name, "EMPTY"); return; } else if (tryRead ("ANY")) { setElement (name, CONTENT_ANY, null, null); if (!skippedPE) handler.getDeclHandler ().elementDecl (name, "ANY"); return; } else { String model; char saved []; require ('('); saved = readBuffer; dataBufferAppend ('('); skipWhitespace (); if (tryRead ("#PCDATA")) { dataBufferAppend ("#PCDATA"); parseMixed (saved); model = dataBufferToString (); setElement (name, CONTENT_MIXED, model, null); } else { parseElements (saved); model = dataBufferToString (); setElement (name, CONTENT_ELEMENTS, model, null); } if (!skippedPE) handler.getDeclHandler ().elementDecl (name, model); } } /** * Parse an element-content model. *

     * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
     * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
     * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
     *

* *

NOTE: the opening '(' and S have already been read. * * @param saved Buffer for entity that should have the terminal ')' */ private void parseElements (char saved []) throws Exception { char c; char sep; // Parse the first content particle skipWhitespace (); parseCp (); // Check for end or for a separator. skipWhitespace (); c = readCh (); switch (c) { case ')': // VC: Proper Group/PE Nesting if (readBuffer != saved) handler.verror ("Illegal Group/PE nesting"); dataBufferAppend (')'); c = readCh (); switch (c) { case '*': case '+': case '?': dataBufferAppend (c); break; default: unread (c); } return; case ',': // Register the separator. case '|': sep = c; dataBufferAppend (c); break; default: error ("bad separator in content model", c, null); return; } // Parse the rest of the content model. while (true) { skipWhitespace (); parseCp (); skipWhitespace (); c = readCh (); if (c == ')') { // VC: Proper Group/PE Nesting if (readBuffer != saved) handler.verror ("Illegal Group/PE nesting"); dataBufferAppend (')'); break; } else if (c != sep) { error ("bad separator in content model", c, null); return; } else { dataBufferAppend (c); } } // Check for the occurrence indicator. c = readCh (); switch (c) { case '?': case '*': case '+': dataBufferAppend (c); return; default: unread (c); return; } } /** * Parse a content particle. *

     * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
     *

*/
    private void parseCp ()
    throws Exception
    {
        // Nested group: parseElements handles everything up to and
        // including the matching ')' and its occurrence indicator.
        if (tryRead ('(')) {
            dataBufferAppend ('(');
            parseElements (readBuffer);
            return;
        }

        // Plain name, optionally followed by an occurrence indicator.
        dataBufferAppend (readNmtoken (true));
        final char next = readCh ();
        if (next == '?' || next == '*' || next == '+') {
            dataBufferAppend (next);
        } else {
            unread (next);
        }
    }

    /**
     * Parse mixed content.
     *

     * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
     *	      | '(' S? ('#PCDATA') S? ')'
     *

* * @param saved Buffer for entity that should have the terminal ')' */ private void parseMixed (char saved []) throws Exception { // Check for PCDATA alone. skipWhitespace (); if (tryRead (')')) { // VC: Proper Group/PE Nesting if (readBuffer != saved) handler.verror ("Illegal Group/PE nesting"); dataBufferAppend (")*"); tryRead ('*'); return; } // Parse mixed content. skipWhitespace (); while (!tryRead (")")) { require ('|'); dataBufferAppend ('|'); skipWhitespace (); dataBufferAppend (readNmtoken (true)); skipWhitespace (); } // VC: Proper Group/PE Nesting if (readBuffer != saved) handler.verror ("Illegal Group/PE nesting"); require ('*'); dataBufferAppend (")*"); } /** * Parse an attribute list declaration. *

     * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
     *

NOTE: the '<!ATTLIST' has already been read.
     */
    private void parseAttlistDecl ()
    throws Exception
    {
        requireWhitespace ();
        final String elementName = readNmtoken (true);

        // Every attribute definition must be preceded by whitespace;
        // sawSpace records whether the whitespace just before the
        // current position was actually present.
        for (boolean sawSpace = tryWhitespace ();
                !tryRead ('>');
                sawSpace = tryWhitespace ()) {
            if (!sawSpace) {
                error ("whitespace required before attribute definition");
            }
            parseAttDef (elementName);
        }
    }

    /**
     * Parse a single attribute definition.
     *

     * [53] AttDef ::= S Name S AttType S DefaultDecl
     *

*/ private void parseAttDef (String elementName) throws Exception { String name; String type; String enumer = null; // Read the attribute name. name = readNmtoken (true); // Read the attribute type. requireWhitespace (); type = readAttType (); // Get the string of enumerated values if necessary. if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { if ("ENUMERATION" == type || "NOTATION" == type) enumer = dataBufferToString (); } else { if ("ENUMERATION".equals(type) || "NOTATION".equals(type)) enumer = dataBufferToString (); } // Read the default value. requireWhitespace (); parseDefault (elementName, name, type, enumer); } /** * Parse the attribute type. *

   * [54] AttType ::= StringType | TokenizedType | EnumeratedType
   * [55] StringType ::= 'CDATA'
   * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
   *		| 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
   * [57] EnumeratedType ::= NotationType | Enumeration
   *

*/ private String readAttType () throws Exception { if (tryRead ('(')) { parseEnumeration (false); return "ENUMERATION"; } else { String typeString = readNmtoken (true); if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { if ("NOTATION" == typeString) { parseNotationType (); return typeString; } else if ("CDATA" == typeString || "ID" == typeString || "IDREF" == typeString || "IDREFS" == typeString || "ENTITY" == typeString || "ENTITIES" == typeString || "NMTOKEN" == typeString || "NMTOKENS" == typeString) return typeString; } else { if ("NOTATION".equals(typeString)) { parseNotationType (); return typeString; } else if ("CDATA".equals(typeString) || "ID".equals(typeString) || "IDREF".equals(typeString) || "IDREFS".equals(typeString) || "ENTITY".equals(typeString) || "ENTITIES".equals(typeString) || "NMTOKEN".equals(typeString) || "NMTOKENS".equals(typeString)) return typeString; } error ("illegal attribute type", typeString, null); return null; } } /** * Parse an enumeration. *

     * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
     *

NOTE: the '(' has already been read. */ private void parseEnumeration (boolean isNames) throws Exception { dataBufferAppend ('('); // Read the first token. skipWhitespace (); dataBufferAppend (readNmtoken (isNames)); // Read the remaining tokens. skipWhitespace (); while (!tryRead (')')) { require ('|'); dataBufferAppend ('|'); skipWhitespace (); dataBufferAppend (readNmtoken (isNames)); skipWhitespace (); } dataBufferAppend (')'); } /** * Parse a notation type for an attribute. *

     * [58] NotationType ::= 'NOTATION' S '(' S? Name
     *		(S? '|' S? Name)* S? ')'
     *

NOTE: the 'NOTATION' has already been read
     */
    private void parseNotationType ()
    throws Exception
    {
        requireWhitespace ();
        require ('(');

        // Same syntax as an enumeration, but every token is read as a name.
        parseEnumeration (true);
    }

    /**
     * Parse the default value for an attribute.
     *

     * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
     *		| (('#FIXED' S)? AttValue)
     *

*/ private void parseDefault ( String elementName, String name, String type, String enumer ) throws Exception { int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; String value = null; int flags = LIT_ATTRIBUTE; boolean saved = expandPE; String defaultType = null; // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace // chars to spaces (doesn't matter when that's done if it doesn't // interfere with char refs expanding to whitespace). if (!skippedPE) { flags |= LIT_ENTITY_REF; if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { if ("CDATA" != type) flags |= LIT_NORMALIZE; } else { if (!"CDATA".equals(type)) flags |= LIT_NORMALIZE; } } expandPE = false; if (tryRead ('#')) { if (tryRead ("FIXED")) { defaultType = "#FIXED"; valueType = ATTRIBUTE_DEFAULT_FIXED; requireWhitespace (); value = readLiteral (flags); } else if (tryRead ("REQUIRED")) { defaultType = "#REQUIRED"; valueType = ATTRIBUTE_DEFAULT_REQUIRED; } else if (tryRead ("IMPLIED")) { defaultType = "#IMPLIED"; valueType = ATTRIBUTE_DEFAULT_IMPLIED; } else { error ("illegal keyword for attribute default value"); } } else value = readLiteral (flags); expandPE = saved; setAttribute (elementName, name, type, enumer, value, valueType); if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { if ("ENUMERATION" == type) type = enumer; else if ("NOTATION" == type) type = "NOTATION " + enumer; } else { if ("ENUMERATION".equals(type)) type = enumer; else if ("NOTATION".equals(type)) type = "NOTATION " + enumer; } if (!skippedPE) handler.getDeclHandler () .attributeDecl (elementName, name, type, defaultType, value); } /** * Parse a conditional section. *

     * [61] conditionalSect ::= includeSect | ignoreSect
     * [62] includeSect ::= '<![' S? 'INCLUDE' S? '['
     *		extSubsetDecl ']]>'
     * [63] ignoreSect ::= '<![' S? 'IGNORE' S? '['
     *		ignoreSectContents* ']]>'
     * [64] ignoreSectContents ::= Ignore
     *		('<![' ignoreSectContents* ']]>' Ignore )*
     * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* )
     *

NOTE: the '>![' has already been read. */ private void parseConditionalSect (char saved []) throws Exception { skipWhitespace (); if (tryRead ("INCLUDE")) { skipWhitespace (); require ('['); // VC: Proper Conditional Section/PE Nesting if (readBuffer != saved) handler.verror ("Illegal Conditional Section/PE nesting"); skipWhitespace (); while (!tryRead ("]]>")) { parseMarkupdecl (); skipWhitespace (); } } else if (tryRead ("IGNORE")) { skipWhitespace (); require ('['); // VC: Proper Conditional Section/PE Nesting if (readBuffer != saved) handler.verror ("Illegal Conditional Section/PE nesting"); int nesting = 1; char c; expandPE = false; for (int nest = 1; nest > 0;) { c = readCh (); switch (c) { case '<': if (tryRead ("![")) { nest++; } case ']': if (tryRead ("]>")) { nest--; } } } expandPE = true; } else { error ("conditional section must begin with INCLUDE or IGNORE"); } } private void parseCharRef () throws SAXException, IOException { parseCharRef (true /* do flushDataBuffer by default */); } /** * Try to read a character reference without consuming data from buffer. *

   * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
   *

NOTE: the '&#' has already been read. */ private void tryReadCharRef () throws SAXException, IOException { int value = 0; char c; if (tryRead ('x')) { loop1: while (true) { c = readCh (); int n; switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': n = c - '0'; break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': n = (c - 'a') + 10; break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': n = (c - 'A') + 10; break; case ';': break loop1; default: error ("illegal character in character reference", c, null); break loop1; } value *= 16; value += n; } } else { loop2: while (true) { c = readCh (); switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': value *= 10; value += c - '0'; break; case ';': break loop2; default: error ("illegal character in character reference", c, null); break loop2; } } } // check for character refs being legal XML if ((value < 0x0020 && ! (value == '\n' || value == '\t' || value == '\r')) || (value >= 0xD800 && value <= 0xDFFF) || value == 0xFFFE || value == 0xFFFF || value > 0x0010ffff) error ("illegal XML character reference U+" + Integer.toHexString (value)); // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: if (value > 0x0010ffff) { // too big for surrogate error ("character reference " + value + " is too large for UTF-16", new Integer (value).toString (), null); } } /** * Read and interpret a character reference. *

     * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
     *

NOTE: the '&#' has already been read. */ private void parseCharRef (boolean doFlush) throws SAXException, IOException { int value = 0; char c; if (tryRead ('x')) { loop1: while (true) { c = readCh (); int n; switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': n = c - '0'; break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': n = (c - 'a') + 10; break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': n = (c - 'A') + 10; break; case ';': break loop1; default: error ("illegal character in character reference", c, null); break loop1; } value *= 16; value += n; } } else { loop2: while (true) { c = readCh (); switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': value *= 10; value += c - '0'; break; case ';': break loop2; default: error ("illegal character in character reference", c, null); break loop2; } } } // check for character refs being legal XML if ((value < 0x0020 && ! (value == '\n' || value == '\t' || value == '\r')) || (value >= 0xD800 && value <= 0xDFFF) || value == 0xFFFE || value == 0xFFFF || value > 0x0010ffff) error ("illegal XML character reference U+" + Integer.toHexString (value)); // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: if (value <= 0x0000ffff) { // no surrogates needed dataBufferAppend ((char) value); } else if (value <= 0x0010ffff) { value -= 0x10000; // > 16 bits, surrogate needed dataBufferAppend ((char) (0xd800 | (value >> 10))); dataBufferAppend ((char) (0xdc00 | (value & 0x0003ff))); } else { // too big for surrogate error ("character reference " + value + " is too large for UTF-16", new Integer (value).toString (), null); } if (doFlush) dataBufferFlush (); } /** * Parse and expand an entity reference. *

     * [68] EntityRef ::= '&' Name ';'
     *

NOTE: the '&' has already been read. * @param externalAllowed External entities are allowed here. */ private void parseEntityRef (boolean externalAllowed) throws SAXException, IOException { String name; name = readNmtoken (true); require (';'); switch (getEntityType (name)) { case ENTITY_UNDECLARED: // NOTE: XML REC describes amazingly convoluted handling for // this case. Nothing as meaningful as being a WFness error // unless the processor might _legitimately_ not have seen a // declaration ... which is what this implements. String message; message = "reference to undeclared general entity " + name; if (skippedPE && !docIsStandalone) { handler.verror (message); // we don't know this entity, and it might be external... if (externalAllowed) handler.skippedEntity (name); } else error (message); break; case ENTITY_INTERNAL: pushString (name, getEntityValue (name)); //workaround for possible input pop before marking //the buffer reading position char t = readCh (); unread (t); int bufferPosMark = readBufferPos; int end = readBufferPos + getEntityValue (name).length(); for(int k = readBufferPos; k < end; k++){ t = readCh (); if (t == '&'){ t = readCh (); if (t == '#'){ //try to match a character ref tryReadCharRef (); //everything has been read if (readBufferPos >= end) break; k = readBufferPos; continue; } else if (Character.isLetter(t)){ //looks like an entity ref unread (t); readNmtoken (true); require (';'); //everything has been read if (readBufferPos >= end) break; k = readBufferPos; continue; } error(" malformed entity reference"); } } readBufferPos = bufferPosMark; break; case ENTITY_TEXT: if (externalAllowed) { pushURL (false, name, getEntityIds (name), null, null, null, true); } else { error ("reference to external entity in attribute value.", name, null); } break; case ENTITY_NDATA: if (externalAllowed) { error ("unparsed entity reference in content", name, null); } else { error ("reference to external entity in attribute value.", name, null); } break; 
default: throw new RuntimeException (); } } /** * Parse and expand a parameter entity reference. *

     * [69] PEReference ::= '%' Name ';'
     *

NOTE: the '%' has already been read. */ private void parsePEReference () throws SAXException, IOException { String name; name = "%" + readNmtoken (true); require (';'); switch (getEntityType (name)) { case ENTITY_UNDECLARED: // VC: Entity Declared handler.verror ("reference to undeclared parameter entity " + name); // we should disable handling of all subsequent declarations // unless this is a standalone document (info discarded) break; case ENTITY_INTERNAL: if (inLiteral) pushString (name, getEntityValue (name)); else pushString (name, ' ' + getEntityValue (name) + ' '); break; case ENTITY_TEXT: if (!inLiteral) pushString (null, " "); pushURL (true, name, getEntityIds (name), null, null, null, true); if (!inLiteral) pushString (null, " "); break; } } /** * Parse an entity declaration. *

     * [70] EntityDecl ::= GEDecl | PEDecl
     * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
     * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
     * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
     * [74] PEDef ::= EntityValue | ExternalID
     * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
     *		   | 'PUBLIC' S PubidLiteral S SystemLiteral
     * [76] NDataDecl ::= S 'NDATA' S Name
     *

NOTE: the '<!ENTITY' has already been read. */ private void parseEntityDecl () throws Exception { boolean peFlag = false; int flags = 0; // Check for a parameter entity. expandPE = false; requireWhitespace (); if (tryRead ('%')) { peFlag = true; requireWhitespace (); } expandPE = true; // Read the entity name, and prepend // '%' if necessary. String name = readNmtoken (true); //NE08 if (name.indexOf(':') >= 0) error ("Illegal character(':') in entity name ", name, null); if (peFlag) { name = "%" + name; } // Read the entity value. requireWhitespace (); char c = readCh (); unread (c); if (c == '"' || c == '\'') { // Internal entity ... replacement text has expanded refs // to characters and PEs, but not to general entities String value = readLiteral (flags); setInternalEntity (name, value); } else { // Read the external IDs String ids [] = readExternalIds (false, false); // Check for NDATA declaration. boolean white = tryWhitespace (); if (!peFlag && tryRead ("NDATA")) { if (!white) error ("whitespace required before NDATA"); requireWhitespace (); String notationName = readNmtoken (true); if (!skippedPE) { setExternalEntity (name, ENTITY_NDATA, ids, notationName); handler.unparsedEntityDecl (name, ids, notationName); } } else if (!skippedPE) { setExternalEntity (name, ENTITY_TEXT, ids, null); handler.getDeclHandler () .externalEntityDecl (name, ids [0], handler.resolveURIs () // FIXME: ASSUMES not skipped // "false" forces error on bad URI ? handler.absolutize (ids [2], ids [1], false) : ids [1]); } } // Finish the declaration. skipWhitespace (); require ('>'); } /** * Parse a notation declaration. *

     * [82] NotationDecl ::= '<!NOTATION' S Name S
     *		(ExternalID | PublicID) S? '>'
     * [83] PublicID ::= 'PUBLIC' S PubidLiteral
     *

NOTE: the '<!NOTATION' has already been read. */ private void parseNotationDecl () throws Exception { String nname, ids[]; requireWhitespace (); nname = readNmtoken (true); //NE08 if (nname.indexOf(':') >= 0) error ("Illegal character(':') in notation name ", nname, null); requireWhitespace (); // Read the external identifiers. ids = readExternalIds (true, false); // Register the notation. setNotation (nname, ids); skipWhitespace (); require ('>'); } /** * Parse character data. *

     * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
     *

*/ private void parseCharData () throws Exception { char c; int state = 0; boolean pureWhite = false; // assert (dataBufferPos == 0); // are we expecting pure whitespace? it might be dirty... if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement) pureWhite = true; // always report right out of readBuffer // to minimize (pointless) buffer copies while (true) { int lineAugment = 0; int columnAugment = 0; int i; loop: for (i = readBufferPos; i < readBufferLength; i++) { switch (c = readBuffer [i]) { case '\n': lineAugment++; columnAugment = 0; // pureWhite unmodified break; case '\r': // should not happen!! case '\t': case ' ': // pureWhite unmodified columnAugment++; break; case '&': case '<': columnAugment++; // pureWhite unmodified // CLEAN end of text sequence state = 1; break loop; case ']': // that's not a whitespace char, and // can not terminate pure whitespace either pureWhite = false; if ((i + 2) < readBufferLength) { if (readBuffer [i + 1] == ']' && readBuffer [i + 2] == '>') { // ERROR end of text sequence state = 2; break loop; } } else { // FIXME missing two end-of-buffer cases } columnAugment++; break; default: if ((c < 0x0020 || c > 0xFFFD) || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) && xmlVersion == XML_11)) error ("illegal XML character U+" + Integer.toHexString (c)); // that's not a whitespace char pureWhite = false; columnAugment++; } } // report text thus far if (lineAugment > 0) { line += lineAugment; column = columnAugment; } else { column += columnAugment; } // report characters/whitspace int length = i - readBufferPos; if (length != 0) { if (pureWhite) handler.ignorableWhitespace (readBuffer, readBufferPos, length); else handler.charData (readBuffer, readBufferPos, length); readBufferPos = i; } if (state != 0) break; // fill next buffer from this entity, or // pop stack and continue with previous entity unread (readCh ()); } if (!pureWhite) isDirtyCurrentElement = true; // finish, maybe with error if (state != 1) // 
finish, no error error ("character data may not contain ']]>'"); } ////////////////////////////////////////////////////////////////////// // High-level reading and scanning methods. ////////////////////////////////////////////////////////////////////// /** * Require whitespace characters. */ private void requireWhitespace () throws SAXException, IOException { char c = readCh (); if (isWhitespace (c)) { skipWhitespace (); } else { error ("whitespace required", c, null); } } /** * Skip whitespace characters. *

     * [3] S ::= (#x20 | #x9 | #xd | #xa)+
     *

*/ private void skipWhitespace () throws SAXException, IOException { // Start with a little cheat. Most of // the time, the white space will fall // within the current read buffer; if // not, then fall through. if (USE_CHEATS) { int lineAugment = 0; int columnAugment = 0; loop: for (int i = readBufferPos; i < readBufferLength; i++) { switch (readBuffer [i]) { case ' ': case '\t': case '\r': columnAugment++; break; case '\n': lineAugment++; columnAugment = 0; break; case '%': if (expandPE) break loop; // else fall through... default: readBufferPos = i; if (lineAugment > 0) { line += lineAugment; column = columnAugment; } else { column += columnAugment; } return; } } } // OK, do it the slow way. char c = readCh (); while (isWhitespace (c)) { c = readCh (); } unread (c); } /** * Read a name or (when parsing an enumeration) name token. *

     * [5] Name ::= (Letter | '_' | ':') (NameChar)*
     * [7] Nmtoken ::= (NameChar)+
     *

*/ private String readNmtoken (boolean isName) throws SAXException, IOException { char c; if (USE_CHEATS) { loop: for (int i = readBufferPos; i < readBufferLength; i++) { c = readBuffer [i]; switch (c) { case '%': if (expandPE) break loop; // else fall through... // What may legitimately come AFTER a name/nmtoken? case '<': case '>': case '&': case ',': case '|': case '*': case '+': case '?': case ')': case '=': case '\'': case '"': case '[': case ' ': case '\t': case '\r': case '\n': case ';': case '/': int start = readBufferPos; if (i == start) error ("name expected", readBuffer [i], null); readBufferPos = i; return intern (readBuffer, start, i - start); default: // FIXME ... per IBM's OASIS test submission, these: // ? U+06dd // Combining U+309B //these switches are kind of ugly but at least we won't //have to go over the whole lits for each char if (isName && i == readBufferPos){ char c2 = (char) (c & 0x00f0); switch (c & 0xff00){ //starting with 01 case 0x0100: switch (c2){ case 0x0030: if (c == 0x0132 || c == 0x0133 || c == 0x013f) error ("Not a name start character, U+" + Integer.toHexString (c)); break; case 0x0040: if (c == 0x0140 || c == 0x0149) error ("Not a name start character, U+" + Integer.toHexString (c)); break; case 0x00c0: if (c == 0x01c4 || c == 0x01cc) error ("Not a name start character, U+" + Integer.toHexString (c)); break; case 0x00f0: if (c == 0x01f1 || c == 0x01f3) error ("Not a name start character, U+" + Integer.toHexString (c)); break; case 0x00b0: if (c == 0x01f1 || c == 0x01f3) error ("Not a name start character, U+" + Integer.toHexString (c)); break; default: if (c == 0x017f) error ("Not a name start character, U+" + Integer.toHexString (c)); } break; //starting with 11 case 0x1100: switch (c2){ case 0x0000: if (c == 0x1104 || c == 0x1108 || c == 0x110a || c == 0x110d) error ("Not a name start character, U+" + Integer.toHexString (c)); break; case 0x0030: if (c == 0x113b || c == 0x113f) error ("Not a name start character, U+" + 
Integer.toHexString (c)); break; case 0x0040: if (c == 0x1141 || c == 0x114d || c == 0x114f ) error ("Not a name start character, U+" + Integer.toHexString (c)); break; case 0x0050: if (c == 0x1151 || c == 0x1156) error ("Not a name start character, U+" + Integer.toHexString (c)); break; case 0x0060: if (c == 0x1162 || c == 0x1164 || c == 0x1166 || c == 0x116b || c == 0x116f) error ("Not a name start character, U+" + Integer.toHexString (c)); break; case 0x00b0: if (c == 0x11b6 || c == 0x11b9 || c == 0x11bb || c == 0x116f) error ("Not a name start character, U+" + Integer.toHexString (c)); break; default: if (c == 0x1174 || c == 0x119f || c == 0x11ac || c == 0x11c3 || c == 0x11f1) error ("Not a name start character, U+" + Integer.toHexString (c)); } break; default: if (c == 0x0e46 || c == 0x1011 || c == 0x212f || c == 0x0587 || c == 0x0230 ) error ("Not a name start character, U+" + Integer.toHexString (c)); } } // punt on exact tests from Appendix A; approximate // them using the Unicode ID start/part rules if (i == readBufferPos && isName) { if (!Character.isUnicodeIdentifierStart (c) && c != ':' && c != '_') error ("Not a name start character, U+" + Integer.toHexString (c)); } else if (!Character.isUnicodeIdentifierPart (c) && c != '-' && c != ':' && c != '_' && c != '.' && !isExtender (c)) error ("Not a name character, U+" + Integer.toHexString (c)); } } } nameBufferPos = 0; // Read the first character. 
loop: while (true) { c = readCh (); switch (c) { case '%': case '<': case '>': case '&': case ',': case '|': case '*': case '+': case '?': case ')': case '=': case '\'': case '"': case '[': case ' ': case '\t': case '\n': case '\r': case ';': case '/': unread (c); if (nameBufferPos == 0) { error ("name expected"); } // punt on exact tests from Appendix A, but approximate them if (isName && !Character.isUnicodeIdentifierStart ( nameBuffer [0]) && ":_".indexOf (nameBuffer [0]) == -1) error ("Not a name start character, U+" + Integer.toHexString (nameBuffer [0])); String s = intern (nameBuffer, 0, nameBufferPos); nameBufferPos = 0; return s; default: // punt on exact tests from Appendix A, but approximate them if ((nameBufferPos != 0 || !isName) && !Character.isUnicodeIdentifierPart (c) && ":-_.".indexOf (c) == -1 && !isExtender (c)) error ("Not a name character, U+" + Integer.toHexString (c)); if (nameBufferPos >= nameBuffer.length) nameBuffer = (char[]) extendArray (nameBuffer, nameBuffer.length, nameBufferPos); nameBuffer [nameBufferPos++] = c; } } } private static boolean isExtender (char c) { // [88] Extender ::= ... return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005 || (c >= 0x3031 && c <= 0x3035) || (c >= 0x309d && c <= 0x309e) || (c >= 0x30fc && c <= 0x30fe); } /** * Read a literal. With matching single or double quotes as * delimiters (and not embedded!) this is used to parse: *

     *	[9] EntityValue ::= ... ([^%&] | PEReference | Reference)* ...
     *	[10] AttValue ::= ... ([^<&] | Reference)* ...
     *	[11] SystemLiteral ::= ... (URLchar - "'")* ...
     *	[12] PubidLiteral ::= ... (PubidChar - "'")* ...
     *

* as well as the quoted strings in XML and text declarations * (for version, encoding, and standalone) which have their * own constraints. */ private String readLiteral (int flags) throws SAXException, IOException { char delim, c; int startLine = line; boolean saved = expandPE; boolean savedReport = doReport; // Find the first delimiter. delim = readCh (); if (delim != '"' && delim != '\'') { error ("expected '\"' or \"'\"", delim, null); return null; } inLiteral = true; if ((flags & LIT_DISABLE_PE) != 0) expandPE = false; doReport = false; // Each level of input source has its own buffer; remember // ours, so we won't read the ending delimiter from any // other input source, regardless of entity processing. char ourBuf [] = readBuffer; // Read the literal. try { c = readCh (); boolean ampRead = false; loop: while (! (c == delim && readBuffer == ourBuf)) { switch (c) { // attributes and public ids are normalized // in almost the same ways case '\n': case '\r': if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0) c = ' '; break; case '\t': if ((flags & LIT_ATTRIBUTE) != 0) c = ' '; break; case '&': c = readCh (); // Char refs are expanded immediately, except for // all the cases where it's deferred. if (c == '#') { if ((flags & LIT_DISABLE_CREF) != 0) { dataBufferAppend ('&'); break; } parseCharRef (false /* Do not do flushDataBuffer */); // exotic WFness risk: this is an entity literal, // dataBuffer [dataBufferPos - 1] == '&', and // following chars are a _partial_ entity/char ref // It looks like an entity ref ... } else { unread (c); // Expand it? if ((flags & LIT_ENTITY_REF) > 0) { parseEntityRef (false); if (String.valueOf (readBuffer).equals("&")) ampRead = true; //Is it just data? } else if ((flags & LIT_DISABLE_EREF) != 0) { dataBufferAppend ('&'); // OK, it will be an entity ref -- expanded later. 
} else { String name = readNmtoken (true); require (';'); dataBufferAppend ('&'); dataBufferAppend (name); dataBufferAppend (';'); } } c = readCh (); continue loop; case '<': // and why? Perhaps so "&foo;" expands the same // inside and outside an attribute? if ((flags & LIT_ATTRIBUTE) != 0) error ("attribute values may not contain '<'"); break; // We don't worry about case '%' and PE refs, readCh does. default: break; } dataBufferAppend (c); c = readCh (); } } catch (EOFException e) { error ("end of input while looking for delimiter (started on line " + startLine + ')', null, new Character (delim).toString ()); } inLiteral = false; expandPE = saved; doReport = savedReport; // Normalise whitespace if necessary. if ((flags & LIT_NORMALIZE) > 0) { dataBufferNormalize (); } // Return the value. return dataBufferToString (); } /** * Try reading external identifiers. * A system identifier is not required for notations. * @param inNotation Are we parsing a notation decl? * @param isSubset Parsing external subset decl (may be omitted)? * @return A three-member String array containing the identifiers, * or nulls. Order: public, system, baseURI. 
*/ private String[] readExternalIds (boolean inNotation, boolean isSubset) throws Exception { char c; String ids[] = new String [3]; int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; if (tryRead ("PUBLIC")) { requireWhitespace (); ids [0] = readLiteral (LIT_NORMALIZE | LIT_PUBID | flags); if (inNotation) { skipWhitespace (); c = readCh (); unread (c); if (c == '"' || c == '\'') { ids [1] = readLiteral (flags); } } else { requireWhitespace (); ids [1] = readLiteral (flags); } for (int i = 0; i < ids [0].length (); i++) { c = ids [0].charAt (i); if (c >= 'a' && c <= 'z') continue; if (c >= 'A' && c <= 'Z') continue; if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf (c) != -1) continue; error ("illegal PUBLIC id character U+" + Integer.toHexString (c)); } } else if (tryRead ("SYSTEM")) { requireWhitespace (); ids [1] = readLiteral (flags); } else if (!isSubset) error ("missing SYSTEM or PUBLIC keyword"); if (ids [1] != null) { if (ids [1].indexOf ('#') != -1) handler.verror ("SYSTEM id has a URI fragment: " + ids [1]); ids [2] = handler.getSystemId (); if (ids [2] == null) handler.warn ("No base URI; hope URI is absolute: " + ids [1]); } return ids; } /** * Test if a character is whitespace. *

     * [3] S ::= (#x20 | #x9 | #xd | #xa)+
     *

* @param c The character to test. * @return true if the character is whitespace. */ private final boolean isWhitespace (char c) { if (c > 0x20) return false; if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d) return true; return false; // illegal ... } ////////////////////////////////////////////////////////////////////// // Utility routines. ////////////////////////////////////////////////////////////////////// /** * Add a character to the data buffer. */ private void dataBufferAppend (char c) { // Expand buffer if necessary. if (dataBufferPos >= dataBuffer.length) dataBuffer = (char[]) extendArray (dataBuffer, dataBuffer.length, dataBufferPos); dataBuffer [dataBufferPos++] = c; } /** * Add a string to the data buffer. */ private void dataBufferAppend (String s) { dataBufferAppend (s.toCharArray (), 0, s.length ()); } /** * Append (part of) a character array to the data buffer. */ private void dataBufferAppend (char ch[], int start, int length) { dataBuffer = (char[]) extendArray (dataBuffer, dataBuffer.length, dataBufferPos + length); System.arraycopy (ch, start, dataBuffer, dataBufferPos, length); dataBufferPos += length; } /** * Normalise space characters in the data buffer. */ private void dataBufferNormalize () { int i = 0; int j = 0; int end = dataBufferPos; // Skip spaces at the start. while (j < end && dataBuffer [j] == ' ') { j++; } // Skip whitespace at the end. while (end > j && dataBuffer [end - 1] == ' ') { end --; } // Start copying to the left. while (j < end) { char c = dataBuffer [j++]; // Normalise all other spaces to // a single space. if (c == ' ') { while (j < end && dataBuffer [j++] == ' ') continue; dataBuffer [i++] = ' '; dataBuffer [i++] = dataBuffer [j - 1]; } else { dataBuffer [i++] = c; } } // The new length is <= the old one. dataBufferPos = i; } /** * Convert the data buffer to a string. 
*/ private String dataBufferToString () { String s = new String (dataBuffer, 0, dataBufferPos); dataBufferPos = 0; return s; } /** * Flush the contents of the data buffer to the handler, as * appropriate, and reset the buffer for new input. */ private void dataBufferFlush () throws SAXException { if (currentElementContent == CONTENT_ELEMENTS && dataBufferPos > 0 && !inCDATA ) { // We can't just trust the buffer to be whitespace, there // are (error) cases when it isn't for (int i = 0; i < dataBufferPos; i++) { if (!isWhitespace (dataBuffer [i])) { handler.charData (dataBuffer, 0, dataBufferPos); dataBufferPos = 0; } } if (dataBufferPos > 0) { handler.ignorableWhitespace (dataBuffer, 0, dataBufferPos); dataBufferPos = 0; } } else if (dataBufferPos > 0) { handler.charData (dataBuffer, 0, dataBufferPos); dataBufferPos = 0; } } /** * Require a string to appear, or throw an exception. *

Precondition: Entity expansion is not required. *

Precondition: data buffer has no characters that * will get sent to the application. */ private void require (String delim) throws SAXException, IOException { int length = delim.length (); char ch []; if (length < dataBuffer.length) { ch = dataBuffer; delim.getChars (0, length, ch, 0); } else ch = delim.toCharArray (); if (USE_CHEATS && length <= (readBufferLength - readBufferPos)) { int offset = readBufferPos; for (int i = 0; i < length; i++, offset++) if (ch [i] != readBuffer [offset]) error ("required string", null, delim); readBufferPos = offset; } else { for (int i = 0; i < length; i++) require (ch [i]); } } /** * Require a character to appear, or throw an exception. */ private void require (char delim) throws SAXException, IOException { char c = readCh (); if (c != delim) { error ("required character", c, new Character (delim).toString ()); } } /** * Create an interned string from a character array. * Ælfred uses this method to create an interned version * of all names and name tokens, so that it can test equality * with == instead of String.equals (). * *

This is much more efficient than constructing a non-interned * string first, and then interning it. * * @param ch an array of characters for building the string. * @param start the starting position in the array. * @param length the number of characters to place in the string. * @return an interned string. * @see #intern (String) * @see java.lang.String#intern */ public String intern (char ch[], int start, int length) { int index = 0; int hash = 0; Object bucket []; // Generate a hash code. This is a widely used string hash, // often attributed to Brian Kernighan. for (int i = start; i < start + length; i++) hash = 31 * hash + ch [i]; hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH; // Get the bucket -- consists of {array,String} pairs if ((bucket = symbolTable [hash]) == null) { // first string in this bucket bucket = new Object [8]; // Search for a matching tuple, and // return the string if we find one. } else { while (index < bucket.length) { char chFound [] = (char []) bucket [index]; // Stop when we hit an empty entry. if (chFound == null) break; // If they're the same length, check for a match. if (chFound.length == length) { for (int i = 0; i < chFound.length; i++) { // continue search on failure if (ch [start + i] != chFound [i]) { break; } else if (i == length - 1) { // That's it, we have a match! return (String) bucket [index + 1]; } } } index += 2; } // Not found -- we'll have to add it. // Do we have to grow the bucket? bucket = (Object []) extendArray (bucket, bucket.length, index); } symbolTable [hash] = bucket; // OK, add it to the end of the bucket -- "local" interning. // Intern "globally" to let applications share interning benefits. // That is, "!=" and "==" work on our strings, not just equals(). String s = new String (ch, start, length).intern (); bucket [index] = s.toCharArray (); bucket [index + 1] = s; return s; } /** * Ensure the capacity of an array, allocating a new one if * necessary. Usually extends only for name hash collisions. 
*/ private Object extendArray (Object array, int currentSize, int requiredSize) { if (requiredSize < currentSize) { return array; } else { Object newArray = null; int newSize = currentSize * 2; if (newSize <= requiredSize) newSize = requiredSize + 1; if (array instanceof char[]) newArray = new char [newSize]; else if (array instanceof Object[]) newArray = new Object [newSize]; else throw new RuntimeException (); System.arraycopy (array, 0, newArray, 0, currentSize); return newArray; } } ////////////////////////////////////////////////////////////////////// // XML query routines. ////////////////////////////////////////////////////////////////////// boolean isStandalone () { return docIsStandalone; } // // Elements // private int getContentType (Object element [], int defaultType) { int retval; if (element == null) return defaultType; retval = ((Integer) element [0]).intValue (); if (retval == CONTENT_UNDECLARED) retval = defaultType; return retval; } /** * Look up the content type of an element. * @param name The element type name. * @return An integer constant representing the content type. * @see #CONTENT_UNDECLARED * @see #CONTENT_ANY * @see #CONTENT_EMPTY * @see #CONTENT_MIXED * @see #CONTENT_ELEMENTS */ public int getElementContentType (String name) { Object element [] = (Object []) elementInfo.get (name); return getContentType (element, CONTENT_UNDECLARED); } /** * Register an element. * Array format: * [0] element type name * [1] content model (mixed, elements only) * [2] attribute hash table */ private void setElement ( String name, int contentType, String contentModel, Hashtable attributes ) throws SAXException { if (skippedPE) return; Object element [] = (Object []) elementInfo.get (name); // first or for this type? if (element == null) { element = new Object [3]; element [0] = new Integer (contentType); element [1] = contentModel; element [2] = attributes; elementInfo.put (name, element); return; } // declaration? 
if (contentType != CONTENT_UNDECLARED) { // ... following an associated if (((Integer) element [0]).intValue () == CONTENT_UNDECLARED) { element [0] = new Integer (contentType); element [1] = contentModel; } else // VC: Unique Element Type Declaration handler.verror ("multiple declarations for element type: " + name); } // first , before ? else if (attributes != null) element [2] = attributes; } /** * Look up the attribute hash table for an element. * The hash table is the second item in the element array. */ private Hashtable getElementAttributes (String name) { Object element[] = (Object[]) elementInfo.get (name); if (element == null) return null; else return (Hashtable) element [2]; } // // Attributes // /** * Get the declared attributes for an element type. * @param elname The name of the element type. * @return An Enumeration of all the attributes declared for * a specific element type. The results will be valid only * after the DTD (if any) has been parsed. * @see #getAttributeType * @see #getAttributeEnumeration * @see #getAttributeDefaultValueType * @see #getAttributeDefaultValue * @see #getAttributeExpandedValue */ private Enumeration declaredAttributes (Object element []) { Hashtable attlist; if (element == null) return null; if ((attlist = (Hashtable) element [2]) == null) return null; return attlist.keys (); } /** * Get the declared attributes for an element type. * @param elname The name of the element type. * @return An Enumeration of all the attributes declared for * a specific element type. The results will be valid only * after the DTD (if any) has been parsed. * @see #getAttributeType * @see #getAttributeEnumeration * @see #getAttributeDefaultValueType * @see #getAttributeDefaultValue * @see #getAttributeExpandedValue */ public Enumeration declaredAttributes (String elname) { return declaredAttributes ((Object []) elementInfo.get (elname)); } /** * Retrieve the declared type of an attribute. * @param name The name of the associated element. 
* @param aname The name of the attribute. * @return An interend string denoting the type, or null * indicating an undeclared attribute. */ public String getAttributeType (String name, String aname) { Object attribute[] = getAttribute (name, aname); if (attribute == null) { return null; } else { return (String) attribute [0]; } } /** * Retrieve the allowed values for an enumerated attribute type. * @param name The name of the associated element. * @param aname The name of the attribute. * @return A string containing the token list. */ public String getAttributeEnumeration (String name, String aname) { Object attribute[] = getAttribute (name, aname); if (attribute == null) { return null; } else { // assert: attribute [0] is "ENUMERATION" or "NOTATION" return (String) attribute [3]; } } /** * Retrieve the default value of a declared attribute. * @param name The name of the associated element. * @param aname The name of the attribute. * @return The default value, or null if the attribute was * #IMPLIED or simply undeclared and unspecified. * @see #getAttributeExpandedValue */ public String getAttributeDefaultValue (String name, String aname) { Object attribute[] = getAttribute (name, aname); if (attribute == null) { return null; } else { return (String) attribute [1]; } } /* // FIXME: Leaving this in, until W3C finally resolves the confusion // between parts of the XML 2nd REC about when entity declararations // are guaranteed to be known. Current code matches what section 5.1 // (conformance) describes, but some readings of the self-contradicting // text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that // attribute expansion/normalization must be deferred in some cases // (just TRY to identify them!). * Retrieve the expanded value of a declared attribute. *

General entities (and char refs) will be expanded (once). * @param name The name of the associated element. * @param aname The name of the attribute. * @return The expanded default value, or null if the attribute was * #IMPLIED or simply undeclared * @see #getAttributeDefaultValue public String getAttributeExpandedValue (String name, String aname) throws Exception { Object attribute[] = getAttribute (name, aname); if (attribute == null) { return null; } else if (attribute [4] == null && attribute [1] != null) { // we MUST use the same buf for both quotes else the literal // can't be properly terminated char buf [] = new char [1]; int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE; String type = getAttributeType (name, aname); if (type != "CDATA" && type != null) flags |= LIT_NORMALIZE; buf [0] = '"'; pushCharArray (null, buf, 0, 1); pushString (null, (String) attribute [1]); pushCharArray (null, buf, 0, 1); attribute [4] = readLiteral (flags); } return (String) attribute [4]; } */ /** * Retrieve the default value mode of a declared attribute. * @see #ATTRIBUTE_DEFAULT_SPECIFIED * @see #ATTRIBUTE_DEFAULT_IMPLIED * @see #ATTRIBUTE_DEFAULT_REQUIRED * @see #ATTRIBUTE_DEFAULT_FIXED */ public int getAttributeDefaultValueType (String name, String aname) { Object attribute[] = getAttribute (name, aname); if (attribute == null) { return ATTRIBUTE_DEFAULT_UNDECLARED; } else { return ((Integer) attribute [2]).intValue (); } } /** * Register an attribute declaration for later retrieval. * Format: * - String type * - String default value * - int value type * - enumeration * - processed default value */ private void setAttribute (String elName, String name, String type, String enumeration, String value, int valueType) throws Exception { Hashtable attlist; if (skippedPE) return; // Create a new hashtable if necessary. attlist = getElementAttributes (elName); if (attlist == null) attlist = new Hashtable (); // ignore multiple attribute declarations! 
if (attlist.get (name) != null) { // warn ... return; } else { Object attribute [] = new Object [5]; attribute [0] = type; attribute [1] = value; attribute [2] = new Integer (valueType); attribute [3] = enumeration; attribute [4] = null; attlist.put (name, attribute); // save; but don't overwrite any existing setElement (elName, CONTENT_UNDECLARED, null, attlist); } } /** * Retrieve the array representing an attribute declaration. */ private Object[] getAttribute (String elName, String name) { Hashtable attlist; attlist = getElementAttributes (elName); if (attlist == null) return null; return (Object[]) attlist.get (name); } // // Entities // /** * Find the type of an entity. * @returns An integer constant representing the entity type. * @see #ENTITY_UNDECLARED * @see #ENTITY_INTERNAL * @see #ENTITY_NDATA * @see #ENTITY_TEXT */ public int getEntityType (String ename) { Object entity[] = (Object[]) entityInfo.get (ename); if (entity == null) { return ENTITY_UNDECLARED; } else { return ((Integer) entity [0]).intValue (); } } /** * Return an external entity's identifier array. * @param ename The name of the external entity. * @return Three element array containing (in order) the entity's * public identifier, system identifier, and base URI. Null if * the entity was not declared as an external entity. * @see #getEntityType */ public String [] getEntityIds (String ename) { Object entity[] = (Object[]) entityInfo.get (ename); if (entity == null) { return null; } else { return (String []) entity [1]; } } /** * Return an internal entity's replacement text. * @param ename The name of the internal entity. * @return The entity's replacement text, or null if * the entity was not declared as an internal entity. * @see #getEntityType */ public String getEntityValue (String ename) { Object entity[] = (Object[]) entityInfo.get (ename); if (entity == null) { return null; } else { return (String) entity [3]; } } /** * Register an entity declaration for later retrieval. 
*/ private void setInternalEntity (String eName, String value) throws SAXException { if (skippedPE) return; if (entityInfo.get (eName) == null) { Object entity[] = new Object [5]; entity [0] = new Integer (ENTITY_INTERNAL); // FIXME: shrink!! [2] useless entity [3] = value; entityInfo.put (eName, entity); } if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { if ("lt" == eName || "gt" == eName || "quot" == eName || "apos" == eName || "amp" == eName) return; } else { if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName) || "apos".equals(eName) || "amp".equals(eName)) return; } handler.getDeclHandler () .internalEntityDecl (eName, value); } /** * Register an external entity declaration for later retrieval. */ private void setExternalEntity (String eName, int eClass, String ids [], String nName) { if (entityInfo.get (eName) == null) { Object entity[] = new Object [5]; entity [0] = new Integer (eClass); entity [1] = ids; // FIXME: shrink!! [2] no longer used, [4] irrelevant given [0] entity [4] = nName; entityInfo.put (eName, entity); } } // // Notations. // /** * Report a notation declaration, checking for duplicates. */ private void setNotation (String nname, String ids []) throws SAXException { if (skippedPE) return; handler.notationDecl (nname, ids); if (notationInfo.get (nname) == null) notationInfo.put (nname, nname); else // VC: Unique Notation Name handler.verror ("Duplicate notation name decl: " + nname); } // // Location. // /** * Return the current line number. */ public int getLineNumber () { return line; } /** * Return the current column number. */ public int getColumnNumber () { return column; } ////////////////////////////////////////////////////////////////////// // High-level I/O. ////////////////////////////////////////////////////////////////////// /** * Read a single character from the readBuffer. *

The readDataChunk () method maintains the buffer. *

If we hit the end of an entity, try to pop the stack and * keep going. *

(This approach doesn't really enforce XML's rules about * entity boundaries, but this is not currently a validating * parser). *

This routine also attempts to keep track of the current * position in external entities, but it's not entirely accurate. * @return The next available input character. * @see #unread (char) * @see #readDataChunk * @see #readBuffer * @see #line * @return The next character from the current input source. */ private char readCh () throws SAXException, IOException { // As long as there's nothing in the // read buffer, try reading more data // (for an external entity) or popping // the entity stack (for either). while (readBufferPos >= readBufferLength) { switch (sourceType) { case INPUT_READER: case INPUT_STREAM: readDataChunk (); while (readBufferLength < 1) { popInput (); if (readBufferLength < 1) { readDataChunk (); } } break; default: popInput (); break; } } char c = readBuffer [readBufferPos++]; if (c == '\n') { line++; column = 0; } else { if (c == '<') { /* the most common return to parseContent () ... NOP */ } else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD) || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) && xmlVersion == XML_11)) error ("illegal XML character U+" + Integer.toHexString (c)); // If we're in the DTD and in a context where PEs get expanded, // do so ... 1/14/2000 errata identify those contexts. There // are also spots in the internal subset where PE refs are fatal // errors, hence yet another flag. else if (c == '%' && expandPE) { if (peIsError) error ("PE reference within decl in internal subset."); parsePEReference (); return readCh (); } column++; } return c; } /** * Push a single character back onto the current input stream. *

This method usually pushes the character back onto * the readBuffer. *

I don't think that this would ever be called with * readBufferPos = 0, because the methods always reads a character * before unreading it, but just in case, I've added a boundary * condition. * @param c The character to push back. * @see #readCh * @see #unread (char[]) * @see #readBuffer */ private void unread (char c) throws SAXException { // Normal condition. if (c == '\n') { line--; column = -1; } if (readBufferPos > 0) { readBuffer [--readBufferPos] = c; } else { pushString (null, new Character (c).toString ()); } } /** * Push a char array back onto the current input stream. *

NOTE: you must never push back characters that you * haven't actually read: use pushString () instead. * @see #readCh * @see #unread (char) * @see #readBuffer * @see #pushString */ private void unread (char ch[], int length) throws SAXException { for (int i = 0; i < length; i++) { if (ch [i] == '\n') { line--; column = -1; } } if (length < readBufferPos) { readBufferPos -= length; } else { pushCharArray (null, ch, 0, length); } } /** * Push, or skip, a new external input source. * The source will be some kind of parsed entity, such as a PE * (including the external DTD subset) or content for the body. * * @param url The java.net.URL object for the entity. * @see SAXDriver#resolveEntity * @see #pushString * @see #sourceType * @see #pushInput * @see #detectEncoding * @see #sourceType * @see #readBuffer */ private void pushURL ( boolean isPE, String ename, String ids [], // public, system, baseURI Reader reader, InputStream stream, String encoding, boolean doResolve ) throws SAXException, IOException { boolean ignoreEncoding; String systemId; InputSource source; if (!isPE) dataBufferFlush (); scratch.setPublicId (ids [0]); scratch.setSystemId (ids [1]); // See if we should skip or substitute the entity. // If we're not skipping, resolving reports startEntity() // and updates the (handler's) stack of URIs. if (doResolve) { // assert (stream == null && reader == null && encoding == null) source = handler.resolveEntity (isPE, ename, scratch, ids [2]); if (source == null) { handler.warn ("skipping entity: " + ename); handler.skippedEntity (ename); if (isPE) skippedPE = true; return; } // we might be using alternate IDs/encoding systemId = source.getSystemId (); // The following warning and setting systemId was deleted bcause // the application has the option of not setting systemId // provided that it has set the characte/byte stream. 
/* if (systemId == null) { handler.warn ("missing system ID, using " + ids [1]); systemId = ids [1]; } */ } else { // "[document]", or "[dtd]" via getExternalSubset() scratch.setCharacterStream (reader); scratch.setByteStream (stream); scratch.setEncoding (encoding); source = scratch; systemId = ids [1]; if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) { handler.startExternalEntity (ename, systemId, "[document]" == ename); } else { handler.startExternalEntity (ename, systemId, "[document]".equals(ename)); } } // we may have been given I/O streams directly if (source.getCharacterStream () != null) { if (source.getByteStream () != null) error ("InputSource has two streams!"); reader = source.getCharacterStream (); } else if (source.getByteStream () != null) { encoding = source.getEncoding (); if (encoding == null) stream = source.getByteStream (); else try { reader = new InputStreamReader ( source.getByteStream (), encoding); } catch (IOException e) { stream = source.getByteStream (); } } else if (systemId == null) error ("InputSource has no URI!"); scratch.setCharacterStream (null); scratch.setByteStream (null); scratch.setEncoding (null); // Push the existing status. pushInput (ename); // Create a new read buffer. // (Note the four-character margin) readBuffer = new char [READ_BUFFER_MAX + 4]; readBufferPos = 0; readBufferLength = 0; readBufferOverflow = -1; is = null; line = 1; column = 0; currentByteCount = 0; // If there's an explicit character stream, just // ignore encoding declarations. if (reader != null) { sourceType = INPUT_READER; this.reader = reader; tryEncodingDecl (true); return; } // Else we handle the conversion, and need to ensure // it's done right. sourceType = INPUT_STREAM; if (stream != null) { is = stream; } else { // We have to open our own stream to the URL. 
URL url = new URL (systemId); externalEntity = url.openConnection (); externalEntity.connect (); is = externalEntity.getInputStream (); } // If we get to here, there must be // an InputStream available. if (!is.markSupported ()) { is = new BufferedInputStream (is); } // Get any external encoding label. if (encoding == null && externalEntity != null) { // External labels can be untrustworthy; filesystems in // particular often have the wrong default for content // that wasn't locally originated. Those we autodetect. if (!"file".equals (externalEntity.getURL ().getProtocol ())) { int temp; // application/xml;charset=something;otherAttr=... // ... with many variants on 'something' encoding = externalEntity.getContentType (); // MHK code (fix for Saxon 5.5.1/007): // protect against encoding==null if (encoding==null) { temp = -1; } else { temp = encoding.indexOf ("charset"); } // RFC 2376 sez MIME text defaults to ASCII, but since the // JDK will create a MIME type out of thin air, we always // autodetect when there's no explicit charset attribute. if (temp < 0) encoding = null; // autodetect else { // only this one attribute if ((temp = encoding.indexOf (';')) > 0) encoding = encoding.substring (0, temp); if ((temp = encoding.indexOf ('=', temp + 7)) > 0) { encoding = encoding.substring (temp + 1); // attributes can have comment fields (RFC 822) if ((temp = encoding.indexOf ('(')) > 0) encoding = encoding.substring (0, temp); // ... and values may be quoted if ((temp = encoding.indexOf ('"')) > 0) encoding = encoding.substring (temp + 1, encoding.indexOf ('"', temp + 2)); encoding.trim (); } else { handler.warn ("ignoring illegal MIME attribute: " + encoding); encoding = null; } } } } // if we got an external encoding label, use it ... if (encoding != null) { this.encoding = ENCODING_EXTERNAL; setupDecoding (encoding); ignoreEncoding = true; // ... else autodetect from first bytes. 
} else { detectEncoding (); ignoreEncoding = false; } // Read any XML or text declaration. // If we autodetected, it may tell us the "real" encoding. try { tryEncodingDecl (ignoreEncoding); } catch (UnsupportedEncodingException x) { encoding = x.getMessage (); // if we don't handle the declared encoding, // try letting a JVM InputStreamReader do it try { if (sourceType != INPUT_STREAM) throw x; is.reset (); readBufferPos = 0; readBufferLength = 0; readBufferOverflow = -1; line = 1; currentByteCount = column = 0; sourceType = INPUT_READER; this.reader = new InputStreamReader (is, encoding); is = null; tryEncodingDecl (true); } catch (IOException e) { error ("unsupported text encoding", encoding, null); } } } /** * Check for an encoding declaration. This is the second part of the * XML encoding autodetection algorithm, relying on detectEncoding to * get to the point that this part can read any encoding declaration * in the document (using only US-ASCII characters). * *

Because this part starts to fill parser buffers with this data, * it's tricky to setup a reader so that Java's built-in decoders can be * used for the character encodings that aren't built in to this parser * (such as EUC-JP, KOI8-R, Big5, etc). * * @return any encoding in the declaration, uppercased; or null * @see detectEncoding */ private String tryEncodingDecl (boolean ignoreEncoding) throws SAXException, IOException { // Read the XML/text declaration. if (tryRead (" 0) { return parseTextDecl (ignoreEncoding); } else { return parseXMLDecl (ignoreEncoding); } } else { // or similar unread ('l'); unread ('m'); unread ('x'); unread ('?'); unread ('<'); } } return null; } /** * Attempt to detect the encoding of an entity. *

The trick here (as suggested in the XML standard) is that * any entity not in UTF-8, or in UCS-2 with a byte-order mark, * must begin with an XML declaration or an encoding * declaration; we simply have to look for "<?xml" in various * encodings. *

This method has no way to distinguish among 8-bit encodings. * Instead, it sets up for UTF-8, then (possibly) revises its assumption * later in setupDecoding (). Any ASCII-derived 8-bit encoding * should work, but most will be rejected later by setupDecoding (). * @see #tryEncoding (byte[], byte, byte, byte, byte) * @see #tryEncoding (byte[], byte, byte) * @see #setupDecoding */ private void detectEncoding () throws SAXException, IOException { byte signature[] = new byte [4]; // Read the first four bytes for // autodetection. is.mark (4); is.read (signature); is.reset (); // // FIRST: four byte encodings (who uses these?) // if (tryEncoding (signature, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x3c)) { // UCS-4 must begin with "Utility routine for detectEncoding (). *

Always looks for some part of "Looks for a UCS-2 byte-order mark. *

Utility routine for detectEncoding (). * @param sig The first four bytes read. * @param b1 The first byte of the signature * @param b2 The second byte of the signature * @see #detectEncoding */ private static boolean tryEncoding (byte sig[], byte b1, byte b2) { return ((sig [0] == b1) && (sig [1] == b2)); } /** * This method pushes a string back onto input. *

It is useful either as the expansion of an internal entity, * or for backtracking during the parse. *

Call pushCharArray () to do the actual work.
     * @param ename The name of the entity being expanded, or null.
     * @param s The string to push back onto input.
     * @see #pushCharArray
     */
    private void pushString (String ename, String s)
    throws SAXException
    {
        // The whole string becomes the new input.
        pushCharArray (ename, s.toCharArray (), 0, s.length ());
    }


    /**
     * Push a new internal input source.
     *

This method is useful for expanding an internal entity, * or for unreading a string of characters. It creates a new * readBuffer containing the characters in the array, instead * of characters converted from an input byte stream. * @param ch The char array to push. * @see #pushString * @see #pushURL * @see #readBuffer * @see #sourceType * @see #pushInput */ private void pushCharArray (String ename, char ch[], int start, int length) throws SAXException { // Push the existing status pushInput (ename); if (ename != null && doReport) { dataBufferFlush (); handler.startInternalEntity (ename); } sourceType = INPUT_INTERNAL; readBuffer = ch; readBufferPos = start; readBufferLength = length; readBufferOverflow = -1; } /** * Save the current input source onto the stack. *

This method saves all of the global variables associated with * the current input source, so that they can be restored when a new * input source has finished. It also tests for entity recursion. *

The method saves the following global variables onto a stack * using a fixed-length array: *

sourceType *
externalEntity *
readBuffer *
readBufferPos *
readBufferLength *
line *
encoding *

* @param ename The name of the entity (if any) causing the new input. * @see #popInput * @see #sourceType * @see #externalEntity * @see #readBuffer * @see #readBufferPos * @see #readBufferLength * @see #line * @see #encoding */ private void pushInput (String ename) throws SAXException { // Check for entity recursion. if (ename != null) { Enumeration entities = entityStack.elements (); while (entities.hasMoreElements ()) { String e = (String) entities.nextElement (); if (e != null && e == ename) { error ("recursive reference to entity", ename, null); } } } entityStack.push (ename); // Don't bother if there is no current input. if (sourceType == INPUT_NONE) { return; } // Set up a snapshot of the current // input source. Object input[] = new Object [12]; input [0] = new Integer (sourceType); input [1] = externalEntity; input [2] = readBuffer; input [3] = new Integer (readBufferPos); input [4] = new Integer (readBufferLength); input [5] = new Integer (line); input [6] = new Integer (encoding); input [7] = new Integer (readBufferOverflow); input [8] = is; input [9] = new Integer (currentByteCount); input [10] = new Integer (column); input [11] = reader; // Push it onto the stack. inputStack.push (input); } /** * Restore a previous input source. *

This method restores all of the global variables associated with * the current input source. * @exception java.io.EOFException * If there are no more entries on the input stack. * @see #pushInput * @see #sourceType * @see #externalEntity * @see #readBuffer * @see #readBufferPos * @see #readBufferLength * @see #line * @see #encoding */ private void popInput () throws SAXException, IOException { String ename = (String) entityStack.pop (); if (ename != null && doReport) dataBufferFlush (); switch (sourceType) { case INPUT_STREAM: handler.endExternalEntity (ename); is.close (); break; case INPUT_READER: handler.endExternalEntity (ename); reader.close (); break; case INPUT_INTERNAL: if (ename != null && doReport) handler.endInternalEntity (ename); break; } // Throw an EOFException if there // is nothing else to pop. if (inputStack.isEmpty ()) { throw new EOFException ("no more input"); } Object input [] = (Object[]) inputStack.pop (); sourceType = ((Integer) input [0]).intValue (); externalEntity = (URLConnection) input [1]; readBuffer = (char[]) input [2]; readBufferPos = ((Integer) input [3]).intValue (); readBufferLength = ((Integer) input [4]).intValue (); line = ((Integer) input [5]).intValue (); encoding = ((Integer) input [6]).intValue (); readBufferOverflow = ((Integer) input [7]).intValue (); is = (InputStream) input [8]; currentByteCount = ((Integer) input [9]).intValue (); column = ((Integer) input [10]).intValue (); reader = (Reader) input [11]; } /** * Return true if we can read the expected character. *

Note that the character will be removed from the input stream * on success, but will be put back on failure. Do not attempt to * read the character again if the method succeeds. * @param delim The character that should appear next. For a * insensitive match, you must supply this in upper-case. * @return true if the character was successfully read, or false if * it was not. * @see #tryRead (String) */ private boolean tryRead (char delim) throws SAXException, IOException { char c; // Read the character c = readCh (); // Test for a match, and push the character // back if the match fails. if (c == delim) { return true; } else { unread (c); return false; } } /** * Return true if we can read the expected string. *

This is simply a convenience method. *

Note that the string will be removed from the input stream * on success, but will be put back on failure. Do not attempt to * read the string again if the method succeeds. *

This method will push back a character rather than an * array whenever possible (probably the majority of cases). * @param delim The string that should appear next. * @return true if the string was successfully read, or false if * it was not. * @see #tryRead (char) */ private boolean tryRead (String delim) throws SAXException, IOException { return tryRead (delim.toCharArray ()); } private boolean tryRead (char ch []) throws SAXException, IOException { char c; // Compare the input, character- // by character. for (int i = 0; i < ch.length; i++) { c = readCh (); if (c != ch [i]) { unread (c); if (i != 0) { unread (ch, i); } return false; } } return true; } /** * Return true if we can read some whitespace. *

This is simply a convenience method. *

This method will push back a character rather than an * array whenever possible (probably the majority of cases). * @return true if whitespace was found. */ private boolean tryWhitespace () throws SAXException, IOException { char c; c = readCh (); if (isWhitespace (c)) { skipWhitespace (); return true; } else { unread (c); return false; } } /** * Read all data until we find the specified string. * This is useful for scanning CDATA sections and PIs. *

This is inefficient right now, since it calls tryRead () * for every character. * @param delim The string delimiter * @see #tryRead (String, boolean) * @see #readCh */ private void parseUntil (String delim) throws SAXException, IOException { parseUntil (delim.toCharArray ()); } private void parseUntil (char delim []) throws SAXException, IOException { char c; int startLine = line; try { while (!tryRead (delim)) { c = readCh (); dataBufferAppend (c); } } catch (EOFException e) { error ("end of input while looking for delimiter " + "(started on line " + startLine + ')', null, new String (delim)); } } ////////////////////////////////////////////////////////////////////// // Low-level I/O. ////////////////////////////////////////////////////////////////////// /** * Prefetch US-ASCII XML/text decl from input stream into read buffer. * Doesn't buffer more than absolutely needed, so that when an encoding * decl says we need to create an InputStreamReader, we can discard our * buffer and reset(). Caller knows the first chars of the decl exist * in the input stream. */ private void prefetchASCIIEncodingDecl () throws SAXException, IOException { int ch; readBufferPos = readBufferLength = 0; is.mark (readBuffer.length); while (true) { ch = is.read (); readBuffer [readBufferLength++] = (char) ch; switch (ch) { case (int) '>': return; case -1: error ("file ends before end of XML or encoding declaration.", null, "?>"); } if (readBuffer.length == readBufferLength) error ("unfinished XML or encoding declaration"); } } /** * Read a chunk of data from an external input source. *

This is simply a front-end that fills the rawReadBuffer
     * with bytes, then calls the appropriate encoding handler.
     *
     * <p>When the current source is a character reader, characters are read
     * straight into readBuffer and no byte decoding takes place.  Otherwise
     * up to READ_BUFFER_MAX bytes are read into rawReadBuffer and handed to
     * the copy routine selected by the current encoding.  In both cases a
     * carriage return held over from the previous chunk is placed at the
     * front of readBuffer first.
     * @see #encoding
     * @see #rawReadBuffer
     * @see #readBuffer
     * @see #filterCR
     * @see #copyUtf8ReadBuffer
     * @see #copyIso8859_1ReadBuffer
     * @see #copyUcs2ReadBuffer
     * @see #copyUcs4ReadBuffer
     */
    private void readDataChunk ()
    throws SAXException, IOException
    {
        int count;

        // See if we have any overflow (filterCR sets for CR at end)
        if (readBufferOverflow > -1) {
            // Re-insert the held-back character at the front of the buffer
            // and make new data start right after it.
            readBuffer [0] = (char) readBufferOverflow;
            readBufferOverflow = -1;
            readBufferPos = 1;
            sawCR = true;
        } else {
            readBufferPos = 0;
            sawCR = false;
        }

        // input from a character stream.
        if (sourceType == INPUT_READER) {
            count = reader.read (readBuffer,
                            readBufferPos, READ_BUFFER_MAX - readBufferPos);
            // A negative count means end of stream: only the carried-over
            // character (if any) is in the buffer.
            if (count < 0)
                readBufferLength = readBufferPos;
            else
                readBufferLength = readBufferPos + count;
            // On this path filterCR runs whenever the buffer is non-empty,
            // regardless of sawCR.
            // NOTE(review): readBufferPos is left at 1 here when a CR was
            // carried over, so filterCR and later reads start after
            // readBuffer[0]; the byte-stream path below resets it to 0
            // instead.  Confirm a lone carried-over CR is not dropped.
            if (readBufferLength > 0)
                filterCR (count >= 0);
            sawCR = false;
            return;
        }

        // Read as many bytes as possible into the raw buffer.
        count = is.read (rawReadBuffer, 0, READ_BUFFER_MAX);

        // Dispatch to an encoding-specific reader method to populate
        // the readBuffer.  In most parser speed profiles, these routines
        // show up at the top of the CPU usage chart.
        if (count > 0) {
            switch (encoding) {
              // one byte builtins
              case ENCODING_ASCII:
                // mask 0x0080: any byte with the high bit set is rejected
                copyIso8859_1ReadBuffer (count, (char) 0x0080);
                break;
              case ENCODING_UTF_8:
                copyUtf8ReadBuffer (count);
                break;
              case ENCODING_ISO_8859_1:
                // mask 0: every byte value is accepted
                copyIso8859_1ReadBuffer (count, (char) 0);
                break;

              // two byte builtins
              case ENCODING_UCS_2_12:
                copyUcs2ReadBuffer (count, 8, 0);
                break;
              case ENCODING_UCS_2_21:
                copyUcs2ReadBuffer (count, 0, 8);
                break;

              // four byte builtins
              case ENCODING_UCS_4_1234:
                copyUcs4ReadBuffer (count, 24, 16, 8, 0);
                break;
              case ENCODING_UCS_4_4321:
                copyUcs4ReadBuffer (count, 0, 8, 16, 24);
                break;
              case ENCODING_UCS_4_2143:
                copyUcs4ReadBuffer (count, 16, 24, 0, 8);
                break;
              case ENCODING_UCS_4_3412:
                copyUcs4ReadBuffer (count, 8, 0, 24, 16);
                break;
            }
        } else
            // Nothing was read: the buffer holds at most the carried-over char.
            readBufferLength = readBufferPos;

        // Rewind so the carried-over character (if any) is included in
        // the filtering below.
        readBufferPos = 0;

        // Filter out all carriage returns if we've seen any
        // (including any saved from a previous read)
        if (sawCR) {
            filterCR (count >= 0);
            sawCR = false;

            // must actively report EOF, lest some CRs get lost.
            if (readBufferLength == 0 && count >= 0)
                readDataChunk ();
        }

        // Only bytes actually read from the stream are counted.
        if (count > 0)
            currentByteCount += count;
    }

    /**
     * Filter carriage returns in the read buffer.
     * CRLF becomes LF; CR becomes LF.
     *
     * <p>The buffer is compacted in place from readBufferPos up to
     * readBufferLength, and readBufferLength is updated to the compacted
     * length.  A CR in the last position cannot be classified yet if more
     * data may follow, so it is removed from the buffer and stored in
     * readBufferOverflow instead.
     * @param moreData true iff more data might come from the same source
     * @see #readDataChunk
     * @see #readBuffer
     * @see #readBufferOverflow
     */
    private void filterCR (boolean moreData)
    {
        // i is the write index, j the read index; j runs ahead of i once a
        // CRLF pair has been collapsed.
        int i, j;

        readBufferOverflow = -1;

loop:
        for (i = j = readBufferPos; j < readBufferLength; i++, j++) {
            switch (readBuffer [j]) {
            case '\r':
                if (j == readBufferLength - 1) {
                    if (moreData) {
                        // Hold the CR back for the next chunk.  The decrement
                        // is superseded by the assignment after the loop.
                        readBufferOverflow = '\r';
                        readBufferLength--;
                    } else      // CR at end of buffer
                        readBuffer [i++] = '\n';
                    break loop;
                } else if (readBuffer [j + 1] == '\n') {
                    // CRLF: skip the CR so the pair yields a single LF.
                    j++;
                }
                readBuffer [i] = '\n';
                break;

            case '\n':
            default:
                readBuffer [i] = readBuffer [j];
                break;
            }
        }
        readBufferLength = i;
    }

    /**
     * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
     *

When readDataChunk () calls this method, the raw bytes are in
     * rawReadBuffer, and the final characters will appear in
     * readBuffer.
     *
     * <p>Note that as of Unicode 3.1, good practice became a requirement,
     * so that each Unicode character has exactly one UTF-8 representation.
     *
     * <p>Decoded characters are written starting at readBufferPos, and
     * readBufferLength is set to the index after the last one written.
     * Four-byte sequences are stored as a surrogate pair.  sawCR is set
     * when a carriage return is stored.
     * @param count The number of bytes to convert.
     * @see #readDataChunk
     * @see #rawReadBuffer
     * @see #readBuffer
     * @see #getNextUtf8Byte
     */
    private void copyUtf8ReadBuffer (int count)
    throws SAXException, IOException
    {
        int     i = 0;                  // read index into rawReadBuffer
        int     j = readBufferPos;      // write index into readBuffer
        int     b1;                     // lead byte (sign-extended)
        char    c = 0;

        /*
        // check once, so the runtime won't (if it's smart enough)
        if (count < 0 || count > rawReadBuffer.length)
            throw new ArrayIndexOutOfBoundsException (Integer.toString (count));
        */

        while (i < count) {
            b1 = rawReadBuffer [i++];

            // Determine whether we are dealing
            // with a one-, two-, three-, or four-
            // byte sequence.
            // (a negative byte has its high bit set, i.e. it is not US-ASCII)
            if (b1 < 0) {
                if ((b1 & 0xe0) == 0xc0) {
                    // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
                    c = (char) (((b1 & 0x1f) << 6)
                                | getNextUtf8Byte (i++, count));
                    // reject overlong encodings of values below 0x80
                    if (c < 0x0080)
                        encodingError ("Illegal two byte UTF-8 sequence",
                                c, 0);
                    //Sec 2.11
                    // [1] the two-character sequence #xD #xA
                    // [2] the two-character sequence #xD #x85
                    if ((c == 0x0085 || c == 0x000a) && sawCR)
                        continue;

                    // Sec 2.11
                    // [3] the single character #x85
                    // NOTE(review): unlike the #x2028 branch below, this
                    // stores '\r' without setting sawCR and then falls
                    // through to store c (0x85) as well -- confirm intended.
                    if(c == 0x0085 && xmlVersion == XML_11)
                        readBuffer[j++] = '\r';
                } else if ((b1 & 0xf0) == 0xe0) {
                    // 3-byte sequence:
                    // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
                    // most CJKV characters
                    c = (char) (((b1 & 0x0f) << 12) |
                                   (getNextUtf8Byte (i++, count) << 6) |
                                   getNextUtf8Byte (i++, count));
                    //sec 2.11
                    //[4] the single character #x2028
                    if(c == 0x2028 && xmlVersion == XML_11){
                        // stored as CR so that the CR filtering handles it
                        readBuffer[j++] = '\r';
                        sawCR = true;
                        continue;
                    }
                    // reject overlong encodings and encoded surrogates
                    if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff))
                        encodingError ("Illegal three byte UTF-8 sequence",
                                c, 0);
                } else if ((b1 & 0xf8) == 0xf0) {
                    // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
                    //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
                    // (uuuuu = wwww + 1)
                    // "Surrogate Pairs" ... from the "Astral Planes"
                    // Unicode 3.1 assigned the first characters there
                    int iso646 = b1 & 07;
                    iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
                    iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
                    iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);

                    if (iso646 <= 0xffff) {
                        // overlong: the value fits in fewer bytes
                        encodingError ("Illegal four byte UTF-8 sequence",
                                iso646, 0);
                    } else {
                        if (iso646 > 0x0010ffff)
                            encodingError (
                                "UTF-8 value out of range for Unicode",
                                iso646, 0);
                        // split into high and low surrogate
                        iso646 -= 0x010000;
                        readBuffer [j++] = (char) (0xd800 | (iso646 >> 10));
                        readBuffer [j++] = (char) (0xdc00 | (iso646 & 0x03ff));
                        continue;
                    }
                } else {
                    // The five and six byte encodings aren't supported;
                    // they exceed the Unicode (and XML) range.
                    // (a lead byte of the form 10xxxxxx also ends up here)
                    encodingError (
                            "unsupported five or six byte UTF-8 sequence",
                            0xff & b1, i);
                    // NOTREACHED
                    c = 0;
                }
            } else {
                // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
                // (US-ASCII character, "common" case, one branch to here)
                c = (char) b1;
            }
            readBuffer [j++] = c;
            if (c == '\r')
                sawCR = true;
        }
        // How many characters have we read?
        readBufferLength = j;
    }

    /**
     * Return the next byte value in a UTF-8 sequence.
     * If the sequence runs past the bytes held in rawReadBuffer, the
     * byte is read directly from the current input stream.  A missing
     * byte (end of stream) or a byte that is not of the form 10xxxxxx
     * is reported through encodingError ().
     * @param pos The current position in the rawReadBuffer.
     * @param count The number of bytes in the rawReadBuffer
     * @return The significant six bits of a non-initial byte in
     *    a UTF-8 sequence.
     * @exception SAXException as raised when encodingError () reports an
     *    incomplete or malformed sequence.
     */
    private int getNextUtf8Byte (int pos, int count)
    throws SAXException, IOException
    {
        int val;

        // Take a character from the buffer
        // or from the actual input stream.
        if (pos < count) {
            // NOTE(review): the byte is sign-extended here, so the value
            // passed to encodingError below may be negative -- confirm
            // whether it should be masked with 0xff for the message.
            val = rawReadBuffer [pos];
        } else {
            val = is.read ();
            if (val == -1) {
                encodingError ("unfinished multi-byte UTF-8 sequence at EOF",
                        -1, pos);
            }
        }

        // Check for the correct bits at the start.
        if ((val & 0xc0) != 0x80) {
            encodingError ("bad continuation of multi-byte UTF-8 sequence",
                    val, pos + 1);
        }

        // Return the significant bits.
        return (val & 0x3f);
    }

    /**
     * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
     * UTF-16 characters.
     *
     *

When readDataChunk () calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. * * @param count The number of bytes to convert. * @param mask For ASCII conversion, 0x7f; else, 0xff. * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer */ private void copyIso8859_1ReadBuffer (int count, char mask) throws IOException { int i, j; for (i = 0, j = readBufferPos; i < count; i++, j++) { char c = (char) (rawReadBuffer [i] & 0xff); if ((c & mask) != 0) throw new CharConversionException ("non-ASCII character U+" + Integer.toHexString (c)); if (c == 0x0085 && xmlVersion == XML_11) c = '\r'; readBuffer [j] = c; if (c == '\r') { sawCR = true; } } readBufferLength = j; } /** * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters * (as used in Java string manipulation). * *

When readDataChunk () calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. * @param count The number of bytes to convert. * @param shift1 The number of bits to shift byte 1. * @param shift2 The number of bits to shift byte 2 * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer */ private void copyUcs2ReadBuffer (int count, int shift1, int shift2) throws SAXException { int j = readBufferPos; if (count > 0 && (count % 2) != 0) { encodingError ("odd number of bytes in UCS-2 encoding", -1, count); } // The loops are faster with less internal brancing; hence two if (shift1 == 0) { // "UTF-16-LE" for (int i = 0; i < count; i += 2) { char c = (char) (rawReadBuffer [i + 1] << 8); c |= 0xff & rawReadBuffer [i]; readBuffer [j++] = c; if (c == '\r') sawCR = true; } } else { // "UTF-16-BE" for (int i = 0; i < count; i += 2) { char c = (char) (rawReadBuffer [i] << 8); c |= 0xff & rawReadBuffer [i + 1]; readBuffer [j++] = c; if (c == '\r') sawCR = true; } } readBufferLength = j; } /** * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters. * *

When readDataChunk () calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. *

Java has Unicode chars, and this routine uses surrogate pairs * for ISO-10646 values between 0x00010000 and 0x000fffff. An * exception is thrown if the ISO-10646 character has no Unicode * representation. * * @param count The number of bytes to convert. * @param shift1 The number of bits to shift byte 1. * @param shift2 The number of bits to shift byte 2 * @param shift3 The number of bits to shift byte 2 * @param shift4 The number of bits to shift byte 2 * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer */ private void copyUcs4ReadBuffer (int count, int shift1, int shift2, int shift3, int shift4) throws SAXException { int j = readBufferPos; if (count > 0 && (count % 4) != 0) { encodingError ( "number of bytes in UCS-4 encoding not divisible by 4", -1, count); } for (int i = 0; i < count; i += 4) { int value = (((rawReadBuffer [i] & 0xff) << shift1) | ((rawReadBuffer [i + 1] & 0xff) << shift2) | ((rawReadBuffer [i + 2] & 0xff) << shift3) | ((rawReadBuffer [i + 3] & 0xff) << shift4)); if (value < 0x0000ffff) { readBuffer [j++] = (char) value; if (value == (int) '\r') { sawCR = true; } } else if (value < 0x0010ffff) { value -= 0x010000; readBuffer [j++] = (char) (0xd8 | ((value >> 10) & 0x03ff)); readBuffer [j++] = (char) (0xdc | (value & 0x03ff)); } else { encodingError ("UCS-4 value out of range for Unicode", value, i); } } readBufferLength = j; } /** * Report a character encoding error. */ private void encodingError (String message, int value, int offset) throws SAXException { if (value != -1) message = message + " (character code: 0x" + Integer.toHexString (value) + ')'; error (message); } ////////////////////////////////////////////////////////////////////// // Local Variables. ////////////////////////////////////////////////////////////////////// /** * Re-initialize the variables for each parse. 
*/
    private void initializeVariables ()
    {
        // First line
        line = 1;
        column = 0;

        // Set up the buffers for data and names
        dataBufferPos = 0;
        dataBuffer = new char [DATA_BUFFER_INITIAL];
        nameBufferPos = 0;
        nameBuffer = new char [NAME_BUFFER_INITIAL];

        // Set up the DTD hash tables
        elementInfo = new Hashtable ();
        entityInfo = new Hashtable ();
        notationInfo = new Hashtable ();
        skippedPE = false;

        // Set up the variables for the current
        // element context.
        currentElement = null;
        currentElementContent = CONTENT_UNDECLARED;

        // Set up the input variables
        sourceType = INPUT_NONE;
        inputStack = new Stack ();
        entityStack = new Stack ();
        externalEntity = null;
        tagAttributePos = 0;
        tagAttributes = new String [100];
        rawReadBuffer = new byte [READ_BUFFER_MAX];
        // -1 means no character is being carried over between data chunks
        readBufferOverflow = -1;

        scratch = new InputSource ();

        inLiteral = false;
        expandPE = false;
        peIsError = false;

        doReport = false;

        inCDATA = false;

        symbolTable = new Object [SYMBOL_TABLE_LENGTH][];
    }


    //
    // The current XML handler interface.
    //
    private SAXDriver   handler;

    //
    // I/O information.
    //
    private Reader      reader;         // current reader
    private InputStream is;             // current input stream
    private int         line;           // current line number
    private int         column;         // current column number
    private int         sourceType;     // type of input source
    private Stack       inputStack;     // stack of input sources
    private URLConnection externalEntity; // current external entity
    private int         encoding;       // current character encoding
    private int         currentByteCount; // bytes read from current source
    private InputSource scratch;        // temporary

    //
    // Buffers for decoded but unparsed character input.
    //
    private char        readBuffer [];
    private int         readBufferPos;
    private int         readBufferLength;
    private int         readBufferOverflow;  // overflow from last data chunk.


    //
    // Buffer for undecoded raw byte input.
    //
    private final static int READ_BUFFER_MAX = 16384;
    private byte        rawReadBuffer [];


    //
    // Buffer for attribute values, char refs, DTD stuff.
    //
    // NOTE(review): unlike READ_BUFFER_MAX this is not declared final --
    // confirm whether it is meant to be a constant.
    private static int DATA_BUFFER_INITIAL = 4096;
    private char        dataBuffer [];
    private int         dataBufferPos;

    //
    // Buffer for parsed names.
    //
    // NOTE(review): not declared final either -- see DATA_BUFFER_INITIAL.
    private static int NAME_BUFFER_INITIAL = 1024;
    private char        nameBuffer [];
    private int         nameBufferPos;

    //
    // Save any standalone flag
    //
    private boolean     docIsStandalone;

    //
    // Hashtables for DTD information on elements, entities, and notations.
    // Populated until we start ignoring decls (because of skipping a PE)
    //
    private Hashtable   elementInfo;
    private Hashtable   entityInfo;
    private Hashtable   notationInfo;
    private boolean     skippedPE;


    //
    // Element type currently in force.
    //
    private String      currentElement;
    private int         currentElementContent;

    //
    // Stack of entity names, to detect recursion.
    //
    private Stack       entityStack;

    //
    // PE expansion is enabled in most chunks of the DTD, not all.
    // When it's enabled, literals are treated differently.
    //
    private boolean     inLiteral;
    private boolean     expandPE;
    private boolean     peIsError;

    //
    // can't report entity expansion inside two constructs:
    // - attribute expansions (internal entities only)
    // - markup declarations (parameter entities only)
    //
    private boolean     doReport;

    //
    // Symbol table, for caching interned names.
    //
    // These show up wherever XML names or nmtokens are used:  naming elements,
    // attributes, PIs, notations, entities, and enumerated attribute values.
    //
    // NOTE:  This hashtable doesn't grow.  The default size is intended to be
    // rather large for most documents.  Example:  one snapshot of the DocBook
    // XML 4.1 DTD used only about 350 such names.  As a rule, only pathological
    // documents (ones that don't reuse names) should ever see much collision.
    //
    // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing.
    // "2039" keeps the hash table size at about two memory pages on typical
    // 32 bit hardware.
    //
    private final static int SYMBOL_TABLE_LENGTH = 2039;

    private Object      symbolTable [][];

    //
    // Hash table of attributes found in current start tag.
    //
    private String      tagAttributes [];
    private int         tagAttributePos;

    //
    // Utility flag: have we noticed a CR while reading the last
    // data chunk?  If so, we will have to go back and normalise
    // CR or CR/LF line ends.
    //
    private boolean     sawCR;

    //
    // Utility flag: are we in CDATA?  If so, whitespace isn't ignorable.
    //
    private boolean     inCDATA;

    //
    // Xml version.
    //
    private static final int XML_10 = 0;
    private static final int XML_11 = 1;
    private int         xmlVersion = XML_10;
}