package com.jclark.xml.tok;

/**
 * An <code>Encoding</code> object corresponds to a possible
 * encoding (a mapping from characters to sequences of bytes).
 * It provides operations on byte arrays
 * that represent all or part of a parsed XML entity in that encoding.
 * <p>
 * The set of ASCII characters excluding <code>$@\^`{}~</code>
 * have a special status; these are called <i>XML significant</i>
 * characters.
 * <p>
 * This class imposes certain restrictions on an encoding:
 * <ul>
 * <li>the encoding must be stateless;
 * <li>a single byte must not encode more than one character;
 * <li>all XML significant characters must be encoded by the same number
 * of bytes, and no character may be encoded by fewer bytes.
 * </ul>
 * <p>
 * Several methods operate on byte subarrays. The subarray is specified
 * by a byte array <code>buf</code> and two integers,
 * <code>off</code> and <code>end</code>; <code>off</code>
 * gives the index in <code>buf</code> of the first byte of the subarray
 * and <code>end</code> gives the
 * index in <code>buf</code> of the byte immediately after the last byte.
 * <p>
 * Use the <code>getInitialEncoding</code> method to get an
 * <code>Encoding</code> object to use to start parsing an entity.
 * <p>
 * The main operations provided by <code>Encoding</code> are
 * <code>tokenizeProlog</code>, <code>tokenizeContent</code> and
 * <code>tokenizeCdataSection</code>;
 * these are used to divide up an XML entity into tokens.
 * <code>tokenizeProlog</code> is used for the prolog of an XML document
 * as well as for the external subset and parameter entities (except
 * when referenced in an <code>EntityValue</code>);
 * it can also be used for parsing the <code>Misc</code>* that follows
 * the document element.
 * <code>tokenizeContent</code> is used for the document element and for
 * parsed general entities that are referenced in <code>content</code>
 * except for CDATA sections.
 * <code>tokenizeCdataSection</code> is used for CDATA sections, following
 * the <code>&lt;![CDATA[</code> up to and including the <code>]]&gt;</code>.
 * <p>
 * <code>tokenizeAttributeValue</code> and <code>tokenizeEntityValue</code>
 * are used to further divide up tokens returned by <code>tokenizeProlog</code>
 * and <code>tokenizeContent</code>; they are also used to divide up entities
 * referenced in attribute values or entity values.
 * @version $Revision: 1.15 $ $Date: 1998/12/28 08:05:18 $
 */

public abstract class Encoding {
  /**
   * Represents one or more characters of data.
   */
  public static final int TOK_DATA_CHARS = 0;

  /**
   * Represents a newline (CR, LF or CR followed by LF) in data.
   */
  public static final int TOK_DATA_NEWLINE = TOK_DATA_CHARS + 1;

  /**
   * Represents a complete start-tag <code>&lt;name&gt;</code>,
   * that doesn't have any attribute specifications.
   */
  public static final int TOK_START_TAG_NO_ATTS = TOK_DATA_NEWLINE + 1;

  /**
   * Represents a complete start-tag <code>&lt;name att="val"&gt;</code>,
   * that contains one or more attribute specifications.
   */
  public static final int TOK_START_TAG_WITH_ATTS = TOK_START_TAG_NO_ATTS + 1;

  /**
   * Represents an empty element tag <code>&lt;name/&gt;</code>,
   * that doesn't have any attribute specifications.
   */
  public static final int TOK_EMPTY_ELEMENT_NO_ATTS = TOK_START_TAG_WITH_ATTS + 1;

  /**
   * Represents an empty element tag <code>&lt;name att="val"/&gt;</code>,
   * that contains one or more attribute specifications.
   */
  public static final int TOK_EMPTY_ELEMENT_WITH_ATTS = TOK_EMPTY_ELEMENT_NO_ATTS + 1;

  /**
   * Represents a complete end-tag <code>&lt;/name&gt;</code>.
   */
  public static final int TOK_END_TAG = TOK_EMPTY_ELEMENT_WITH_ATTS + 1;

  /**
   * Represents the start of a CDATA section <code>&lt;![CDATA[</code>.
   */
  public static final int TOK_CDATA_SECT_OPEN = TOK_END_TAG + 1;

  /**
   * Represents the end of a CDATA section <code>]]></code>.
   */
  public static final int TOK_CDATA_SECT_CLOSE = TOK_CDATA_SECT_OPEN + 1;

  /**
   * Represents a general entity reference.
   */
  public static final int TOK_ENTITY_REF = TOK_CDATA_SECT_CLOSE + 1;

  /**
   * Represents a general entity reference to a one of the 5 predefined
   * entities <code>amp</code>, <code>lt</code>, <code>gt</code>,
   * <code>quot</code>, <code>apos</code>.
   */
  public static final int TOK_MAGIC_ENTITY_REF = TOK_ENTITY_REF + 1;

  /**
   * Represents a numeric character reference (decimal or hexadecimal),
   * when the referenced character is less than or equal to 0xFFFF
   * and so is represented by a single char.
   */
  public static final int TOK_CHAR_REF = TOK_MAGIC_ENTITY_REF + 1;

  /**
   * Represents a numeric character reference (decimal or hexadecimal),
   * when the referenced character is greater than 0xFFFF and so is
   * represented by a pair of chars.
   */
  public static final int TOK_CHAR_PAIR_REF = TOK_CHAR_REF + 1;

  /**
   * Represents a processing instruction.
   */
  public static final int TOK_PI = TOK_CHAR_PAIR_REF + 1;

  /**
   * Represents an XML declaration or text declaration (a processing
   * instruction whose target is <code>xml</code>).
   */
  public static final int TOK_XML_DECL = TOK_PI + 1;

  /**
   * Represents a comment <code>&lt;!-- comment --&gt;</code>.
   * This can occur both in the prolog and in content.
   */
  public static final int TOK_COMMENT = TOK_XML_DECL + 1;

  /**
   * Represents a white space character in an attribute value,
   * excluding white space characters that are part of line boundaries.
   */
  public static final int TOK_ATTRIBUTE_VALUE_S = TOK_COMMENT + 1;

  /**
   * Represents a parameter entity reference in the prolog.
   */
  public static final int TOK_PARAM_ENTITY_REF = TOK_ATTRIBUTE_VALUE_S + 1;

  /**
   * Represents whitespace in the prolog.
   * The token contains one or more whitespace characters.
   */
  public static final int TOK_PROLOG_S = TOK_PARAM_ENTITY_REF + 1;

  /**
   * Represents <code>&lt;!NAME</code> in the prolog.
   */
  public static final int TOK_DECL_OPEN = TOK_PROLOG_S + 1;

  /**
   * Represents <code>&gt;</code> in the prolog.
   */
  public static final int TOK_DECL_CLOSE = TOK_DECL_OPEN + 1;

  /**
   * Represents a name in the prolog.
   */
  public static final int TOK_NAME = TOK_DECL_CLOSE + 1;

  /**
   * Represents a name token in the prolog that is not a name.
   */
  public static final int TOK_NMTOKEN = TOK_NAME + 1;

  /**
   * Represents <code>#NAME</code> in the prolog.
   */
  public static final int TOK_POUND_NAME = TOK_NMTOKEN + 1;

  /**
   * Represents <code>|</code> in the prolog.
   */
  public static final int TOK_OR = TOK_POUND_NAME + 1;

  /**
   * Represents a <code>%</code> in the prolog that does not start
   * a parameter entity reference.
   * This can occur in an entity declaration.
   */
  public static final int TOK_PERCENT = TOK_OR + 1;

  /**
   * Represents a <code>(</code> in the prolog.
   */
  public static final int TOK_OPEN_PAREN = TOK_PERCENT + 1;

  /**
   * Represents a <code>)</code> in the prolog that is not
   * followed immediately by any of
   *  <code>*</code>, <code>+</code> or <code>?</code>.
   */
  public static final int TOK_CLOSE_PAREN = TOK_OPEN_PAREN + 1;

  /**
   * Represents <code>[</code> in the prolog.
   */
  public static final int TOK_OPEN_BRACKET = TOK_CLOSE_PAREN + 1;

  /**
   * Represents <code>]</code> in the prolog.
   */
  public static final int TOK_CLOSE_BRACKET = TOK_OPEN_BRACKET + 1;

  /**
   * Represents a literal (EntityValue, AttValue, SystemLiteral or
   * PubidLiteral).
   */
  public static final int TOK_LITERAL = TOK_CLOSE_BRACKET + 1;

  /**
   * Represents a name followed immediately by <code>?</code>.
   */
  public static final int TOK_NAME_QUESTION = TOK_LITERAL + 1;

  /**
   * Represents a name followed immediately by <code>*</code>.
   */
  public static final int TOK_NAME_ASTERISK = TOK_NAME_QUESTION + 1;

  /**
   * Represents a name followed immediately by <code>+</code>.
   */
  public static final int TOK_NAME_PLUS = TOK_NAME_ASTERISK + 1;

  /**
   * Represents <code>&lt;![</code> in the prolog.
   */
  public static final int TOK_COND_SECT_OPEN = TOK_NAME_PLUS + 1;

  /**
   * Represents <code>]]&gt;</code> in the prolog.
   */
  public static final int TOK_COND_SECT_CLOSE = TOK_COND_SECT_OPEN + 1;

  /**
   * Represents <code>)?</code> in the prolog.
   */
  public static final int TOK_CLOSE_PAREN_QUESTION = TOK_COND_SECT_CLOSE + 1;

  /**
   * Represents <code>)*</code> in the prolog.
   */
  public static final int TOK_CLOSE_PAREN_ASTERISK = TOK_CLOSE_PAREN_QUESTION + 1;

  /**
   * Represents <code>)+</code> in the prolog.
   */
  public static final int TOK_CLOSE_PAREN_PLUS = TOK_CLOSE_PAREN_ASTERISK + 1;

  /**
   * Represents <code>,</code> in the prolog.
   */
  public static final int TOK_COMMA = TOK_CLOSE_PAREN_PLUS + 1;

  /**
   * Convert bytes to characters.
   * The bytes on <code>sourceBuf</code> between <code>sourceStart</code>
   * and <code>sourceEnd</code> are converted to characters and stored
   * in <code>targetBuf</code> starting at <code>targetStart</code>.
   * <code>(targetBuf.length - targetStart) * getMinBytesPerChar()</code>
   * must be at greater than or equal to
   * <code>sourceEnd - sourceStart</code>.
   * If <code>getFixedBytesPerChar</code> returns a value greater than 0,
   * then the return value will be equal to
   * <code>(sourceEnd - sourceStart)/getFixedBytesPerChar()</code>.
   * @return the number of characters stored into <code>targetBuf</code>
   * @see #getFixedBytesPerChar
   */
  public abstract int convert(byte[] sourceBuf, int sourceStart, int sourceEnd,
			      char[] targetBuf, int targetStart);

  /**
   * Returns the number of bytes required to represent each <code>char</code>,
   * or zero if different <code>char</code>s are represented by different
   * numbers of bytes.  The value returned will 0, 1, 2, or 4.
   */
  public abstract int getFixedBytesPerChar();

  private static Encoding utf8Encoding;
  private static Encoding utf16LittleEndianEncoding;
  private static Encoding utf16BigEndianEncoding;
  private static Encoding internalEncoding;
  private static Encoding iso8859_1Encoding;
  private static Encoding asciiEncoding;

  private static final byte UTF8_ENCODING = 0;
  private static final byte UTF16_LITTLE_ENDIAN_ENCODING = 1;
  private static final byte UTF16_BIG_ENDIAN_ENCODING = 2;
  private static final byte INTERNAL_ENCODING = 3;
  private static final byte ISO8859_1_ENCODING = 4;
  private static final byte ASCII_ENCODING = 5;

  private static synchronized Encoding getEncoding(byte enc) {
    switch (enc) {
    case UTF8_ENCODING:
      if (utf8Encoding == null)
	utf8Encoding = new UTF8Encoding();
      return utf8Encoding;
    case UTF16_LITTLE_ENDIAN_ENCODING:
      if (utf16LittleEndianEncoding == null)
	utf16LittleEndianEncoding = new UTF16LittleEndianEncoding();
      return utf16LittleEndianEncoding;
    case UTF16_BIG_ENDIAN_ENCODING:
      if (utf16BigEndianEncoding == null)
	utf16BigEndianEncoding = new UTF16BigEndianEncoding();
      return utf16BigEndianEncoding;
    case INTERNAL_ENCODING:
      if (internalEncoding == null)
	internalEncoding = new InternalEncoding();
      return internalEncoding;
    case ISO8859_1_ENCODING:
      if (iso8859_1Encoding == null)
	iso8859_1Encoding = new ISO8859_1Encoding();
      return iso8859_1Encoding;
    case ASCII_ENCODING:
      if (asciiEncoding == null)
	asciiEncoding = new ASCIIEncoding();
      return asciiEncoding;
    }
    return null;
  }

  Encoding getUTF16Encoding() {
    return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
  }

  // Bytes with type < 0 may not be data in content.
  // The negation of the lead byte type gives the total number of bytes.
  static final int BT_LEAD2 = -2;
  static final int BT_LEAD3 = BT_LEAD2 - 1;
  static final int BT_LEAD4 = BT_LEAD3 - 1;
  static final int BT_NONXML = BT_LEAD4 - 1;
  static final int BT_MALFORM = BT_NONXML - 1;
  static final int BT_LT = BT_MALFORM - 1;
  static final int BT_AMP = BT_LT - 1;
  static final int BT_RSQB = BT_AMP - 1;
  static final int BT_CR = BT_RSQB - 1;
  static final int BT_LF = BT_CR - 1;
  // Bytes with type >= 0 are treated as data in content.
  static final int BT_GT = 0;
  static final int BT_QUOT = BT_GT + 1;
  static final int BT_APOS = BT_QUOT + 1;
  static final int BT_EQUALS = BT_APOS + 1;
  static final int BT_QUEST = BT_EQUALS + 1;
  static final int BT_EXCL = BT_QUEST + 1;
  static final int BT_SOL = BT_EXCL + 1;
  static final int BT_SEMI = BT_SOL + 1;
  static final int BT_NUM = BT_SEMI + 1;
  static final int BT_LSQB = BT_NUM + 1;
  static final int BT_S = BT_LSQB + 1;
  static final int BT_NMSTRT = BT_S + 1;
  static final int BT_NAME = BT_NMSTRT + 1;
  static final int BT_MINUS = BT_NAME + 1;
  static final int BT_OTHER = BT_MINUS + 1;
  static final int BT_PERCNT = BT_OTHER + 1;
  static final int BT_LPAR = BT_PERCNT + 1;
  static final int BT_RPAR = BT_LPAR + 1;
  static final int BT_AST = BT_RPAR + 1;
  static final int BT_PLUS = BT_AST + 1;
  static final int BT_COMMA = BT_PLUS + 1;
  static final int BT_VERBAR = BT_COMMA + 1;

  final static byte[] asciiTypeTable = {
    /* 0x00 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
    /* 0x04 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
    /* 0x08 */ BT_NONXML, BT_S, BT_LF, BT_NONXML,
    /* 0x0C */ BT_NONXML, BT_CR, BT_NONXML, BT_NONXML,
    /* 0x10 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
    /* 0x14 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
    /* 0x18 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
    /* 0x1C */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
    /* 0x20 */ BT_S, BT_EXCL, BT_QUOT, BT_NUM,
    /* 0x24 */ BT_OTHER, BT_PERCNT, BT_AMP, BT_APOS,
    /* 0x28 */ BT_LPAR, BT_RPAR, BT_AST, BT_PLUS,
    /* 0x2C */ BT_COMMA, BT_MINUS, BT_NAME, BT_SOL,
    /* 0x30 */ BT_NAME, BT_NAME, BT_NAME, BT_NAME,
    /* 0x34 */ BT_NAME, BT_NAME, BT_NAME, BT_NAME,
    /* 0x38 */ BT_NAME, BT_NAME, BT_NMSTRT, BT_SEMI,
    /* 0x3C */ BT_LT, BT_EQUALS, BT_GT, BT_QUEST,
    /* 0x40 */ BT_OTHER, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x44 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x48 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x4C */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x50 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x54 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x58 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_LSQB,
    /* 0x5C */ BT_OTHER, BT_RSQB, BT_OTHER, BT_NMSTRT,
    /* 0x60 */ BT_OTHER, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x64 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x68 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x6C */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x70 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x74 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x78 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_OTHER,
    /* 0x7C */ BT_VERBAR, BT_OTHER, BT_OTHER, BT_OTHER
  };

  // The minimum number of bytes per character.
  private /* final */ int minBPC;

  Encoding(int minBPC) {
    this.minBPC = minBPC;
  }

  // There are guaranteed to be minBPC available bytes starting at off.
  abstract int byteType(byte[] buf, int off);

  abstract int byteToAscii(byte[] buf, int off);

  // This must only be called when c is an (XML significant) ASCII character.
  abstract boolean charMatches(byte[] buf, int off, char c);

  // Called only when byteType(buf, off) == BT_LEAD2
  int byteType2(byte[] buf, int off) {
    return BT_OTHER;
  }
  // Called only when byteType(buf, off) == BT_LEAD3
  int byteType3(byte[] buf, int off) {
    return BT_OTHER;
  }
  // Called only when byteType(buf, off) == BT_LEAD4
  int byteType4(byte[] buf, int off) {
    return BT_OTHER;
  }

  void check2(byte[] buf, int off) throws InvalidTokenException { }
  void check3(byte[] buf, int off) throws InvalidTokenException { }
  void check4(byte[] buf, int off) throws InvalidTokenException { }

  /**
   * Moves a position forward.
   * On entry, <code>pos</code> gives the position of the byte at index
   * <code>off</code> in <code>buf</code>.
   * On exit, it <code>pos</code> will give the position of the byte at index
   * <code>end</code>, which must be greater than or equal to <code>off</code>.
   * The bytes between <code>off</code> and <code>end</code> must encode
   * one or more complete characters.
   * A carriage return followed by a line feed will be treated as a single
   * line delimiter provided that they are given to <code>movePosition</code>
   * together.
   */
  public abstract void movePosition(byte[] buf, int off, int end, Position pos);

  // end encoding specific part

  private final
  void checkCharMatches(byte[] buf, int off, char c) throws InvalidTokenException {
    if (!charMatches(buf, off, c))
      throw new InvalidTokenException(off);
  }

  /* off points to character following "<!-" */

  private final
  int scanComment(byte[] buf, int off, int end, Token token)
       throws InvalidTokenException, PartialTokenException {
    if (off != end) {
      checkCharMatches(buf, off, '-');
      off += minBPC;
      while (off != end) {
	switch (byteType(buf, off)) {
	case BT_LEAD2:
	  if (end - off < 2)
	    throw new PartialCharException(off);
	  check2(buf, off);
	  off += 2;
	  break;
	case BT_LEAD3:
	  if (end - off < 3)
	    throw new PartialCharException(off);
	  check3(buf, off);
	  off += 3;
	  break;
	case BT_LEAD4:
	  if (end - off < 4)
	    throw new PartialCharException(off);
	  check4(buf, off);
	  off += 4;
	  break;
	case BT_NONXML:
	case BT_MALFORM:
	  throw new InvalidTokenException(off);
	case BT_MINUS:
	  if ((off += minBPC) == end)
	    throw new PartialTokenException();
	  if (charMatches(buf, off, '-')) {
	    if ((off += minBPC) == end)
	      throw new PartialTokenException();
	    checkCharMatches(buf, off, '>');
	    token.tokenEnd = off + minBPC;
	    return TOK_COMMENT;
	  }
	  break;
	default:
	  off += minBPC;
	  break;
	}
      }
    }
    throw new PartialTokenException();
  }

  /* off points to character following "<!" */

  private final
  int scanDecl(byte[] buf, int off, int end, Token token)
       throws InvalidTokenException, PartialTokenException {
    if (off == end)
      throw new PartialTokenException();
    switch (byteType(buf, off)) {
    case BT_MINUS:
      return scanComment(buf, off + minBPC, end, token);
    case BT_LSQB:
      token.tokenEnd = off + minBPC;
      return TOK_COND_SECT_OPEN;
    case BT_NMSTRT:
      off += minBPC;
      break;
    default:
      throw new InvalidTokenException(off);
    }
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_PERCNT:
	if (off + minBPC == end)
	  throw new PartialTokenException();
	/* don't allow <!ENTITY% foo "whatever"> */
	switch (byteType(buf, off + minBPC)) {
	case BT_S:
	case BT_CR:
	case BT_LF:
	case BT_PERCNT:
	  throw new InvalidTokenException(off);
	}
	/* fall through */
      case BT_S:
      case BT_CR:
      case BT_LF:
	token.tokenEnd = off;
	return TOK_DECL_OPEN;
      case BT_NMSTRT:
	off += minBPC;
	break;
      default:
	throw new InvalidTokenException(off);
      }
    }
    throw new PartialTokenException();
  }

  private final
  boolean targetIsXml(byte[] buf, int off, int end) throws InvalidTokenException {
    boolean upper = false;
    if (end - off != minBPC*3)
      return false;
    switch (byteToAscii(buf, off)) {
    case 'x':
      break;
    case 'X':
      upper = true;
      break;
    default:
      return false;
    }
    off += minBPC;
    switch (byteToAscii(buf, off)) {
    case 'm':
      break;
    case 'M':
      upper = true;
      break;
    default:
      return false;
    }
    off += minBPC;
    switch (byteToAscii(buf, off)) {
    case 'l':
      break;
    case 'L':
      upper = true;
      break;
    default:
      return false;
    }
    if (upper)
      throw new InvalidTokenException(off, InvalidTokenException.XML_TARGET);
    return true;
  }

  /* off points to character following "<?" */

  private final
  int scanPi(byte[] buf, int off, int end, Token token)
       throws PartialTokenException, InvalidTokenException {
    int target = off;
    if (off == end)
      throw new PartialTokenException();
    switch (byteType(buf, off)) {
    case BT_NMSTRT:
      off += minBPC;
      break;
    case BT_LEAD2:
      if (end - off < 2)
	throw new PartialCharException(off);
      if (byteType2(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
	throw new PartialCharException(off);
      if (byteType3(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
	throw new PartialCharException(off);
      if (byteType4(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 4;
      break;
    default:
      throw new InvalidTokenException(off);
    }
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_NMSTRT:
      case BT_NAME:
      case BT_MINUS:
	off += minBPC;
	break;
      case BT_LEAD2:
	if (end - off < 2)
	  throw new PartialCharException(off);
	if (!isNameChar2(buf, off))
	  throw new InvalidTokenException(off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  throw new PartialCharException(off);
	if (!isNameChar3(buf, off))
	  throw new InvalidTokenException(off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  throw new PartialCharException(off);
	if (!isNameChar4(buf, off))
	  throw new InvalidTokenException(off);
	off += 4;
	break;
      case BT_S:
      case BT_CR:
      case BT_LF:
	boolean isXml = targetIsXml(buf, target, off);
	token.nameEnd = off;
	off += minBPC;
	while (off != end) {
	  switch (byteType(buf, off)) {
	  case BT_LEAD2:
	    if (end - off < 2)
	      throw new PartialCharException(off);
	    check2(buf, off);
	    off += 2;
	    break;
	  case BT_LEAD3:
	    if (end - off < 3)
	      throw new PartialCharException(off);
	    check3(buf, off);
	    off += 3;
	    break;
	  case BT_LEAD4:
	    if (end - off < 4)
	      throw new PartialCharException(off);
	    check4(buf, off);
	    off += 4;
	    break;
	  case BT_NONXML:
	  case BT_MALFORM:
	    throw new InvalidTokenException(off);
	  case BT_QUEST:
	    off += minBPC;
	    if (off == end)
	      throw new PartialTokenException();
	    if (charMatches(buf, off, '>')) {
	      token.tokenEnd = off + minBPC;
	      if (isXml)
		return TOK_XML_DECL;
	      else
		return TOK_PI;
	    }
	    break;
	  default:
	    off += minBPC;
	    break;
	  }
	}
	throw new PartialTokenException();
      case BT_QUEST:
	token.nameEnd = off;
	off += minBPC;
	if (off == end)
	  throw new PartialTokenException();
	checkCharMatches(buf, off, '>');
	token.tokenEnd = off + minBPC;
	return (targetIsXml(buf, target, token.nameEnd)
		? TOK_XML_DECL
		: TOK_PI);
      default:
	throw new InvalidTokenException(off);
      }
    }
    throw new PartialTokenException();
  }

  /* off points to character following "<![" */

  private static final String CDATA = "CDATA[";

  private final
  int scanCdataSection(byte[] buf, int off, int end, Token token) 
       throws PartialTokenException, InvalidTokenException {
    /* "CDATA[".length() == 6 */
    if (end - off < 6 * minBPC)
      throw new PartialTokenException();
    for (int i = 0; i < CDATA.length(); i++, off += minBPC)
      checkCharMatches(buf, off, CDATA.charAt(i));
    token.tokenEnd = off;
    return TOK_CDATA_SECT_OPEN;
  }

  /**
   * Scans the first token of a byte subarrary that starts with the
   * content of a CDATA section.
   * Returns one of the following integers according to the type of token
   * that the subarray starts with:
   * <ul>
   * <li><code>TOK_DATA_CHARS</code>
   * <li><code>TOK_DATA_NEWLINE</code>
   * <li><code>TOK_CDATA_SECT_CLOSE</code>
   * </ul>
   * <p>
   * Information about the token is stored in <code>token</code>.
   * <p>
   * After <code>TOK_CDATA_SECT_CLOSE</code> is returned, the application
   * should use <code>tokenizeContent</code>.
   *
   * @exception EmptyTokenException if the subarray is empty
   * @exception PartialTokenException if the subarray contains only part of
   * a legal token
   * @exception InvalidTokenException if the subarrary does not start
   * with a legal token or part of one
   * @exception ExtensibleTokenException if the subarray encodes just a carriage
   * return ('\r')
   *
   * @see #TOK_DATA_CHARS
   * @see #TOK_DATA_NEWLINE
   * @see #TOK_CDATA_SECT_CLOSE
   * @see Token
   * @see EmptyTokenException
   * @see PartialTokenException
   * @see InvalidTokenException
   * @see ExtensibleTokenException
   * @see #tokenizeContent
   */
  public final int tokenizeCdataSection(byte[] buf, int off, int end, Token token) throws EmptyTokenException, PartialTokenException, InvalidTokenException, ExtensibleTokenException {
    if (minBPC > 1)
      end = adjustEnd(off, end);
    if (off == end)
      throw new EmptyTokenException();
    switch (byteType(buf, off)) {
    case BT_RSQB:
      off += minBPC;
      if (off == end)
	throw new PartialTokenException();
      if (!charMatches(buf, off, ']'))
	break;
      off += minBPC;
      if (off == end)
	throw new PartialTokenException();
      if (!charMatches(buf, off, '>')) {
	off -= minBPC;
	break;
      }
      token.tokenEnd = off + minBPC;
      return TOK_CDATA_SECT_CLOSE;
    case BT_CR:
      off += minBPC;
      if (off == end)
	throw new ExtensibleTokenException(TOK_DATA_NEWLINE);
      if (byteType(buf, off) == BT_LF)
	off += minBPC;
      token.tokenEnd = off;
      return TOK_DATA_NEWLINE;
    case BT_LF:
      token.tokenEnd = off + minBPC;
      return TOK_DATA_NEWLINE;
    case BT_NONXML:
    case BT_MALFORM:
      throw new InvalidTokenException(off);
    case BT_LEAD2:
      if (end - off < 2)
	throw new PartialCharException(off);
      check2(buf, off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
	throw new PartialCharException(off);
      check3(buf, off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
	throw new PartialCharException(off);
      check4(buf, off);
      off += 4;
      break;
    default:
      off += minBPC;
      break;
    }
    token.tokenEnd = extendCdata(buf, off, end);
    return TOK_DATA_CHARS;
  }

  int extendCdata(final byte[] buf, int off, final int end) throws InvalidTokenException {
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_LEAD2:
	if (end - off < 2)
	  return off;
	check2(buf, off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  return off;
	check3(buf, off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  return off;
	check4(buf, off);
	off += 4;
	break;
      case BT_RSQB:
      case BT_NONXML:
      case BT_MALFORM:
      case BT_CR:
      case BT_LF:
	return off;
      default:
	off += minBPC;
	break;
      }
    }
    return off;
  }


  /* off points to character following "</" */

  private final
  int scanEndTag(byte[] buf, int off, int end, Token token)
       throws PartialTokenException, InvalidTokenException {
    if (off == end)
      throw new PartialTokenException();
    switch (byteType(buf, off)) {
    case BT_NMSTRT:
      off += minBPC;
      break;
    case BT_LEAD2:
      if (end - off < 2)
	throw new PartialCharException(off);
      if (byteType2(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
	throw new PartialCharException(off);
      if (byteType3(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
	throw new PartialCharException(off);
      if (byteType4(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 4;
      break;
    default:
      throw new InvalidTokenException(off);
    }
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_NMSTRT:
      case BT_NAME:
      case BT_MINUS:
	off += minBPC;
	break;
      case BT_LEAD2:
	if (end - off < 2)
	  throw new PartialCharException(off);
	if (!isNameChar2(buf, off))
	  throw new InvalidTokenException(off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  throw new PartialCharException(off);
	if (!isNameChar3(buf, off))
	  throw new InvalidTokenException(off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  throw new PartialCharException(off);
	if (!isNameChar4(buf, off))
	  throw new InvalidTokenException(off);
	off += 4;
	break;
      case BT_S:
      case BT_CR:
      case BT_LF:
	token.nameEnd = off;
	for (off += minBPC; off != end; off += minBPC) {
	  switch (byteType(buf, off)) {
	  case BT_S:
	  case BT_CR:
	  case BT_LF:
	    break;
	  case BT_GT:
	    token.tokenEnd = off + minBPC;
	    return TOK_END_TAG;
	  default:
	    throw new InvalidTokenException(off);
	  }
	}
	throw new PartialTokenException();
      case BT_GT:
	token.nameEnd = off;
	token.tokenEnd = off + minBPC;
	return TOK_END_TAG;
      default:
	throw new InvalidTokenException(off);
      }
    }
    throw new PartialTokenException();
  }

  /* off points to character following "&#X" */

  private final
  int scanHexCharRef(byte[] buf, int off, int end, Token token)
       throws PartialTokenException, InvalidTokenException {
    if (off != end) {
      int c = byteToAscii(buf, off);
      int num;
      switch (c) {
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
	num = c - '0';
	break;
      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
	num = c - ('A' - 10);
	break;
      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 
	num = c - ('a' - 10);
	break;
      default:
	throw new InvalidTokenException(off);
      }
      for (off += minBPC; off != end; off += minBPC) {
	c = byteToAscii(buf, off);
	switch (c) {
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	  num = (num << 4) + c - '0';
	  break;
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
	  num = (num << 4) + c - ('A' - 10);
	  break;
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 
	  num = (num << 4) + c - ('a' - 10);
	  break;
	case ';':
	  token.tokenEnd = off + minBPC;
	  return setRefChar(num, token);
	default:
	  throw new InvalidTokenException(off);
	}
	if (num >= 0x110000)
	  throw new InvalidTokenException(off);
      }
    }
    throw new PartialTokenException();
  }

  /* off points to character following "&#" */
  
  private final
  int scanCharRef(byte[] buf, int off, int end, Token token)
       throws PartialTokenException, InvalidTokenException {
    if (off != end) {
      int c = byteToAscii(buf, off);
      switch (c) {
      case 'x':
	return scanHexCharRef(buf, off + minBPC, end, token);
      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':
	break;
      default:
	throw new InvalidTokenException(off);
      }
      int num = c - '0';
      for (off += minBPC; off != end; off += minBPC) {
	c = byteToAscii(buf, off);
	switch (c) {
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
	  num = num * 10 + (c - '0');
	  if (num < 0x110000)
	    break;
	  /* fall through */
	default:
	  throw new InvalidTokenException(off);
	case ';':
	  token.tokenEnd = off + minBPC;
	  return setRefChar(num, token);
	}
      }
    }
    throw new PartialTokenException();
  }

  /* num is known to be < 0x110000; return the token code */
  private final int setRefChar(int num, Token token) 
     throws InvalidTokenException {
    if (num < 0x10000) {
      switch (charTypeTable[num >> 8][num & 0xFF]) {
      case BT_NONXML:
      case BT_LEAD4:
      case BT_MALFORM:
	throw new InvalidTokenException(token.tokenEnd - minBPC);
      }
      token.refChar1 = (char)num;
      return TOK_CHAR_REF;
    }
    else {
      num -= 0x10000;
      token.refChar1 = (char)((num >> 10) + 0xD800);
      token.refChar2 = (char)((num & ((1 << 10) - 1)) + 0xDC00);
      return TOK_CHAR_PAIR_REF;
    }
  }

  private final
  boolean isMagicEntityRef(byte[] buf, int off, int end, Token token) {
    switch (byteToAscii(buf, off)) {
    case 'a':
      if (end - off < minBPC*4)
	break;
      switch (byteToAscii(buf, off + minBPC)) {
      case 'm':
	if (charMatches(buf, off + minBPC*2, 'p')
	    && charMatches(buf, off + minBPC*3, ';')) {
	  token.tokenEnd = off + minBPC*4;
	  token.refChar1 = '&';
	  return true;
	}
	break;
      case 'p':
	if (end - off >= minBPC*5
	    && charMatches(buf, off + minBPC*2, 'o')
	    && charMatches(buf, off + minBPC*3, 's')
	    && charMatches(buf, off + minBPC*4, ';')) {
	  token.tokenEnd = off + minBPC*5;
	  token.refChar1 = '\'';
	  return true;
	}
	break;
      }
      break;
    case 'l':
      if (end - off >= minBPC*3
	  && charMatches(buf, off + minBPC, 't')
	  && charMatches(buf, off + minBPC*2, ';')) {
	token.tokenEnd = off + minBPC*3;
	token.refChar1 = '<';
	return true;
      }
      break;
    case 'g':
      if (end - off >= minBPC*3
	  && charMatches(buf, off + minBPC, 't')
	  && charMatches(buf, off + minBPC*2, ';')) {
	token.tokenEnd = off + minBPC*3;
	token.refChar1 = '>';
	return true;
      }
      break;
    case 'q':
      if (end - off >= minBPC*5
	  && charMatches(buf, off + minBPC, 'u')
	  && charMatches(buf, off + minBPC*2, 'o')
	  && charMatches(buf, off + minBPC*3, 't')
	  && charMatches(buf, off + minBPC*4, ';')) {
	token.tokenEnd = off + minBPC*5;
	token.refChar1 = '"';
	return true;
      }
      break;
    }
    return false;
  }
  /* off points to character following "&" */

  private final
  int scanRef(byte[] buf, int off, int end, Token token)
       throws PartialTokenException, InvalidTokenException {
    if (off == end)
      throw new PartialTokenException();
    if (isMagicEntityRef(buf, off, end, token))
      return TOK_MAGIC_ENTITY_REF;
    switch (byteType(buf, off)) {
    case BT_NMSTRT:
      off += minBPC;
      break;
    case BT_LEAD2:
      if (end - off < 2)
	throw new PartialCharException(off);
      if (byteType2(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
	throw new PartialCharException(off);
      if (byteType3(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
	throw new PartialCharException(off);
      if (byteType4(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 4;
      break;
    case BT_NUM:
      return scanCharRef(buf, off + minBPC, end, token);
    default:
      throw new InvalidTokenException(off);
    }
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_NMSTRT:
      case BT_NAME:
      case BT_MINUS:
	off += minBPC;
	break;
      case BT_LEAD2:
	if (end - off < 2)
	  throw new PartialCharException(off);
	if (!isNameChar2(buf, off))
	  throw new InvalidTokenException(off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  throw new PartialCharException(off);
	if (!isNameChar3(buf, off))
	  throw new InvalidTokenException(off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  throw new PartialCharException(off);
	if (!isNameChar4(buf, off))
	  throw new InvalidTokenException(off);
	off += 4;
	break;
      case BT_SEMI:
	token.nameEnd = off;
	token.tokenEnd = off + minBPC;
	return TOK_ENTITY_REF;
      default:
	throw new InvalidTokenException(off);
      }
    }
    throw new PartialTokenException();
  }

  /* off points to character following first character of attribute name */

  private final
  int scanAtts(int nameStart, byte[] buf, int off, int end, ContentToken token)
       throws PartialTokenException, InvalidTokenException {
    int nameEnd = -1;
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_NMSTRT:
      case BT_NAME:
      case BT_MINUS:
	off += minBPC;
	break;
      case BT_LEAD2:
	if (end - off < 2)
	  throw new PartialCharException(off);
	if (!isNameChar2(buf, off))
	  throw new InvalidTokenException(off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  throw new PartialCharException(off);
	if (!isNameChar3(buf, off))
	  throw new InvalidTokenException(off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  throw new PartialCharException(off);
	if (!isNameChar4(buf, off))
	  throw new InvalidTokenException(off);
	off += 4;
	break;
      case BT_S:
      case BT_CR:
      case BT_LF:
	nameEnd = off;
      loop:
	for (;;) {
	  off += minBPC;
	  if (off == end)
	    throw new PartialTokenException();
	  switch (byteType(buf, off)) {
	  case BT_EQUALS:
	    break loop;
	  case BT_S:
	  case BT_LF:
	  case BT_CR:
	    break;
	  default:
	    throw new InvalidTokenException(off);
	  }
	}
	/* fall through */
      case BT_EQUALS:
	{
	  if (nameEnd < 0)
	    nameEnd = off;
	  int open;
	  for (;;) {
	  
	    off += minBPC;
	    if (off == end)
	      throw new PartialTokenException();
	    open = byteType(buf, off);
	    if (open == BT_QUOT || open == BT_APOS)
	      break;
	    switch (open) {
	    case BT_S:
	    case BT_LF:
	    case BT_CR:
	      break;
	    default:
	      throw new InvalidTokenException(off);
	    }
	  }
	  off += minBPC;
	  int valueStart = off;
	  boolean normalized = true;
	  /* in attribute value */
	  for (;;) {
	    int t;
	    if (off == end)
	      throw new PartialTokenException();
	    t = byteType(buf, off);
	    if (t == open)
	      break;
	    switch (t) {
	    case BT_NONXML:
	    case BT_MALFORM:
	      throw new InvalidTokenException(off);
	    case BT_LEAD2:
	      if (end - off < 2)
		throw new PartialCharException(off);
	      check2(buf, off);
	      off += 2;
	      break;
	    case BT_LEAD3:
	      if (end - off < 3)
		throw new PartialCharException(off);
	      check3(buf, off);
	      off += 3;
	      break;
	    case BT_LEAD4:
	      if (end - off < 4)
		throw new PartialCharException(off);
	      check4(buf, off);
	      off += 4;
	      break;
	    case BT_AMP:
	      {
		normalized = false;
		int saveNameEnd = token.nameEnd;
		scanRef(buf, off + minBPC, end, token);
		token.nameEnd = saveNameEnd;
		off = token.tokenEnd;
		break;
	      }
	    case BT_S:
	      if (normalized
		  && (off == valueStart
		      || byteToAscii(buf, off) != ' '
		      || (off + minBPC != end
			  && (byteToAscii(buf, off + minBPC) == ' '
			      || byteType(buf, off + minBPC) == open))))
		normalized = false;
	      off += minBPC;
	      break;
	    case BT_LT:
	      throw new InvalidTokenException(off);
	    case BT_LF:
	    case BT_CR:
	      normalized = false;
	      /* fall through */
	    default:
	      off += minBPC;
	      break;
	    }
	  }
	  token.appendAttribute(nameStart, nameEnd, valueStart, off,
				normalized);
	  off += minBPC;
	  if (off == end)
	    throw new PartialTokenException();
	  int t = byteType(buf, off);
	  switch (t) {
	  case BT_S:
	  case BT_CR:
	  case BT_LF:
	    off += minBPC;
	    if (off == end)
	      throw new PartialTokenException();
	    t = byteType(buf, off);
	    break;
	  case BT_GT:
	  case BT_SOL:
	    break;
	  default:
	    throw new InvalidTokenException(off);
	  }
	  /* off points to closing quote */
	skipToName:
	  for (;;) {
	    switch (t) {
	    case BT_NMSTRT:
	      nameStart = off;
	      off += minBPC;
	      break skipToName;
	    case BT_LEAD2:
	      if (end - off < 2)
		throw new PartialCharException(off);
	      if (byteType2(buf, off) != BT_NMSTRT)
		throw new InvalidTokenException(off);
	      nameStart = off;
	      off += 2;
	      break skipToName;
	    case BT_LEAD3:
	      if (end - off < 3)
		throw new PartialCharException(off);
	      if (byteType3(buf, off) != BT_NMSTRT)
		throw new InvalidTokenException(off);
	      nameStart = off;
	      off += 3;
	      break skipToName;
	    case BT_LEAD4:
	      if (end - off < 4)
		throw new PartialCharException(off);
	      if (byteType4(buf, off) != BT_NMSTRT)
		throw new InvalidTokenException(off);
	      nameStart = off;
	      off += 4;
	      break skipToName;
	    case BT_S:
	    case BT_CR:
	    case BT_LF:
	      break;
	    case BT_GT:
	      token.checkAttributeUniqueness(buf);
	      token.tokenEnd = off + minBPC;
	      return TOK_START_TAG_WITH_ATTS;
	    case BT_SOL:
	      off += minBPC;
	      if (off == end)
		throw new PartialTokenException();
	      checkCharMatches(buf, off, '>');
	      token.checkAttributeUniqueness(buf);
	      token.tokenEnd = off + minBPC;
	      return TOK_EMPTY_ELEMENT_WITH_ATTS;
	    default:
	      throw new InvalidTokenException(off);
	    }
	    off += minBPC;
	    if (off == end)
	      throw new PartialTokenException();
	    t = byteType(buf, off);
	  }
	  nameEnd = -1;
	  break;
	}
      default:
	throw new InvalidTokenException(off);
      }
    }
    throw new PartialTokenException();
  }

  /* off points to character following "<" */

  private final
  int scanLt(byte[] buf, int off, int end, ContentToken token)
       throws PartialTokenException, InvalidTokenException {
    if (off == end)
      throw new PartialTokenException();
    switch (byteType(buf, off)) {
    case BT_NMSTRT:
      off += minBPC;
      break;
    case BT_LEAD2:
      if (end - off < 2)
	throw new PartialCharException(off);
      if (byteType2(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
	throw new PartialCharException(off);
      if (byteType3(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
	throw new PartialCharException(off);
      if (byteType4(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 4;
      break;
    case BT_EXCL:
      if ((off += minBPC) == end)
	throw new PartialTokenException();
      switch (byteType(buf, off)) {
      case BT_MINUS:
	return scanComment(buf, off + minBPC, end, token);
      case BT_LSQB:
	return scanCdataSection(buf, off + minBPC, end, token);
      }
      throw new InvalidTokenException(off);
    case BT_QUEST:
      return scanPi(buf, off + minBPC, end, token);
    case BT_SOL:
      return scanEndTag(buf, off + minBPC, end, token);
    default:
      throw new InvalidTokenException(off);
    }
    /* we have a start-tag */
    token.nameEnd = -1;
    token.clearAttributes();
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_NMSTRT:
      case BT_NAME:
      case BT_MINUS:
	off += minBPC;
	break;
      case BT_LEAD2:
	if (end - off < 2)
	  throw new PartialCharException(off);
	if (!isNameChar2(buf, off))
	  throw new InvalidTokenException(off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  throw new PartialCharException(off);
	if (!isNameChar3(buf, off))
	  throw new InvalidTokenException(off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  throw new PartialCharException(off);
	if (!isNameChar4(buf, off))
	  throw new InvalidTokenException(off);
	off += 4;
	break;
      case BT_S:
      case BT_CR:
      case BT_LF:
	token.nameEnd = off;
	off += minBPC;
      loop:
	for (;;) {
	  if (off == end)
	    throw new PartialTokenException();
	  switch (byteType(buf, off)) {
	  case BT_NMSTRT:
	    return scanAtts(off, buf, off + minBPC, end, token);
	  case BT_LEAD2:
	    if (end - off < 2)
	      throw new PartialCharException(off);
	    if (byteType2(buf, off) != BT_NMSTRT)
	      throw new InvalidTokenException(off);
	    return scanAtts(off, buf, off + 2, end, token);
	  case BT_LEAD3:
	    if (end - off < 3)
	      throw new PartialCharException(off);
	    if (byteType3(buf, off) != BT_NMSTRT)
	      throw new InvalidTokenException(off);
	    return scanAtts(off, buf, off + 3, end, token);
	  case BT_LEAD4:
	    if (end - off < 4)
	      throw new PartialCharException(off);
	    if (byteType4(buf, off) != BT_NMSTRT)
	      throw new InvalidTokenException(off);
	    return scanAtts(off, buf, off + 4, end, token);
	  case BT_GT:
	  case BT_SOL:
	    break loop;
	  case BT_S:
	  case BT_CR:
	  case BT_LF:
	    off += minBPC;
	    break;
	  default:
	    throw new InvalidTokenException(off);
	  }
	}
	break;
      case BT_GT:
	if (token.nameEnd < 0)
	  token.nameEnd = off;
	token.tokenEnd = off + minBPC;
	return TOK_START_TAG_NO_ATTS;
      case BT_SOL:
	if (token.nameEnd < 0)
	  token.nameEnd = off;
	off += minBPC;
	if (off == end)
	  throw new PartialTokenException();
	checkCharMatches(buf, off, '>');
	token.tokenEnd = off + minBPC;
	return TOK_EMPTY_ELEMENT_NO_ATTS;
      default:
	throw new InvalidTokenException(off);
      }
    }
    throw new PartialTokenException();
  }

  // Ensure that we always scan a multiple of minBPC bytes.

  private final int adjustEnd(int off, int end) throws PartialCharException {
    int n = end - off;
    if ((n & (minBPC - 1)) != 0) {
      n &= ~(minBPC - 1);
      if (n == 0)
	throw new PartialCharException(off);
      return off + n;
    }
    else
      return end;
  }

  /**
   * Scans the first token of a byte subarrary that contains content.
   * Returns one of the following integers according to the type of token
   * that the subarray starts with:
   * <ul>
   * <li><code>TOK_START_TAG_NO_ATTS</code>
   * <li><code>TOK_START_TAG_WITH_ATTS</code>
   * <li><code>TOK_EMPTY_ELEMENT_NO_ATTS</code>
   * <li><code>TOK_EMPTY_ELEMENT_WITH_ATTS</code>
   * <li><code>TOK_END_TAG</code>
   * <li><code>TOK_DATA_CHARS</code>
   * <li><code>TOK_DATA_NEWLINE</code>
   * <li><code>TOK_CDATA_SECT_OPEN</code>
   * <li><code>TOK_ENTITY_REF</code>
   * <li><code>TOK_MAGIC_ENTITY_REF</code>
   * <li><code>TOK_CHAR_REF</code>
   * <li><code>TOK_CHAR_PAIR_REF</code>
   * <li><code>TOK_PI</code>
   * <li><code>TOK_XML_DECL</code>
   * <li><code>TOK_COMMENT</code>
   * </ul>
   * <p>
   * Information about the token is stored in <code>token</code>.
   * <p>
   * When <code>TOK_CDATA_SECT_OPEN</code> is returned,
   * <code>tokenizeCdataSection</code> should be called until
   * it returns <code>TOK_CDATA_SECT</code>.
   *
   * @exception EmptyTokenException if the subarray is empty
   * @exception PartialTokenException if the subarray contains only part of
   * a legal token
   * @exception InvalidTokenException if the subarrary does not start
   * with a legal token or part of one
   * @exception ExtensibleTokenException if the subarray encodes just a carriage
   * return ('\r')
   *
   * @see #TOK_START_TAG_NO_ATTS
   * @see #TOK_START_TAG_WITH_ATTS
   * @see #TOK_EMPTY_ELEMENT_NO_ATTS
   * @see #TOK_EMPTY_ELEMENT_WITH_ATTS
   * @see #TOK_END_TAG
   * @see #TOK_DATA_CHARS
   * @see #TOK_DATA_NEWLINE
   * @see #TOK_CDATA_SECT_OPEN
   * @see #TOK_ENTITY_REF
   * @see #TOK_MAGIC_ENTITY_REF
   * @see #TOK_CHAR_REF
   * @see #TOK_CHAR_PAIR_REF
   * @see #TOK_PI
   * @see #TOK_XML_DECL
   * @see #TOK_COMMENT
   * @see ContentToken
   * @see EmptyTokenException
   * @see PartialTokenException
   * @see InvalidTokenException
   * @see ExtensibleTokenException
   * @see #tokenizeCdataSection
   */
  public final int tokenizeContent(byte[] buf, int off, int end, ContentToken token)
       throws PartialTokenException, InvalidTokenException, EmptyTokenException, ExtensibleTokenException {

    if (minBPC > 1)
      end = adjustEnd(off, end);
    if (off == end)
      throw new EmptyTokenException();
    switch (byteType(buf, off)) {
    case BT_LT:
      return scanLt(buf, off + minBPC, end, token);
    case BT_AMP:
      return scanRef(buf, off + minBPC, end, token);
    case BT_CR:
      off += minBPC;
      if (off == end)
	throw new ExtensibleTokenException(TOK_DATA_NEWLINE);
      if (byteType(buf, off) == BT_LF)
	off += minBPC;
      token.tokenEnd = off;
      return TOK_DATA_NEWLINE;
    case BT_LF:
      token.tokenEnd = off + minBPC;
      return TOK_DATA_NEWLINE;
    case BT_RSQB:
      off += minBPC;
      if (off == end)
	throw new ExtensibleTokenException(TOK_DATA_CHARS);
      if (!charMatches(buf, off, ']'))
	break;
      off += minBPC;
      if (off == end)
	throw new ExtensibleTokenException(TOK_DATA_CHARS);
      if (!charMatches(buf, off, '>')) {
	off -= minBPC;
	break;
      }
      throw new InvalidTokenException(off);
    case BT_NONXML:
    case BT_MALFORM:
      throw new InvalidTokenException(off);
    case BT_LEAD2:
      if (end - off < 2)
	throw new PartialCharException(off);
      check2(buf, off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
	throw new PartialCharException(off);
      check3(buf, off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
	throw new PartialCharException(off);
      check4(buf, off);
      off += 4;
      break;
    default:
      off += minBPC;
      break;
    }
    token.tokenEnd = extendData(buf, off, end);
    return TOK_DATA_CHARS;
  }

  int extendData(final byte[] buf, int off, final int end) throws InvalidTokenException {
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_LEAD2:
	if (end - off < 2)
	  return off;
	check2(buf, off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  return off;
	check3(buf, off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  return off;
	check4(buf, off);
	off += 4;
	break;
      case BT_RSQB:
      case BT_AMP:
      case BT_LT:
      case BT_NONXML:
      case BT_MALFORM:
      case BT_CR:
      case BT_LF:
	return off;
      default:
	off += minBPC;
	break;
      }
    }
    return off;
  }

  /* off points to character following "%" */

  private final
  int scanPercent(byte[] buf, int off, int end, Token token)
       throws PartialTokenException, InvalidTokenException {
    if (off == end)
      throw new PartialTokenException();
    switch (byteType(buf, off)) {
    case BT_NMSTRT:
      off += minBPC;
      break;
    case BT_LEAD2:
      if (end - off < 2)
	throw new PartialCharException(off);
      if (byteType2(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
	throw new PartialCharException(off);
      if (byteType3(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
	throw new PartialCharException(off);
      if (byteType4(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 4;
      break;
    case BT_S:
    case BT_LF:
    case BT_CR:
    case BT_PERCNT:
      token.tokenEnd = off;
      return TOK_PERCENT;
    default:
      throw new InvalidTokenException(off);
    }
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_NMSTRT:
      case BT_NAME:
      case BT_MINUS:
	off += minBPC;
	break;
      case BT_LEAD2:
	if (end - off < 2)
	  throw new PartialCharException(off);
	if (!isNameChar2(buf, off))
	  throw new InvalidTokenException(off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  throw new PartialCharException(off);
	if (!isNameChar3(buf, off))
	  throw new InvalidTokenException(off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  throw new PartialCharException(off);
	if (!isNameChar4(buf, off))
	  throw new InvalidTokenException(off);
	off += 4;
	break;
      case BT_SEMI:
	token.nameEnd = off;
	token.tokenEnd = off + minBPC;
	return TOK_PARAM_ENTITY_REF;
      default:
	throw new InvalidTokenException(off);
      }
    }
    throw new PartialTokenException();
  }


  private final
  int scanPoundName(byte[] buf, int off, int end, Token token) 
       throws PartialTokenException, InvalidTokenException, ExtensibleTokenException {
    if (off == end)
      throw new PartialTokenException();
    switch (byteType(buf, off)) {
    case BT_NMSTRT:
      off += minBPC;
      break;
    case BT_LEAD2:
      if (end - off < 2)
	throw new PartialCharException(off);
      if (byteType2(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
	throw new PartialCharException(off);
      if (byteType3(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
	throw new PartialCharException(off);
      if (byteType4(buf, off) != BT_NMSTRT)
	throw new InvalidTokenException(off);
      off += 4;
      break;
    default:
      throw new InvalidTokenException(off);
    }
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_NMSTRT:
      case BT_NAME:
      case BT_MINUS:
	off += minBPC;
	break;
      case BT_LEAD2:
	if (end - off < 2)
	  throw new PartialCharException(off);
	if (!isNameChar2(buf, off))
	  throw new InvalidTokenException(off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  throw new PartialCharException(off);
	if (!isNameChar3(buf, off))
	  throw new InvalidTokenException(off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  throw new PartialCharException(off);
	if (!isNameChar4(buf, off))
	  throw new InvalidTokenException(off);
	off += 4;
	break;
      case BT_CR:
      case BT_LF:
      case BT_S:
      case BT_RPAR:
      case BT_GT:
      case BT_PERCNT:
      case BT_VERBAR:
	token.tokenEnd = off;
	return TOK_POUND_NAME;
      default:
	throw new InvalidTokenException(off);
      }
    }
    throw new ExtensibleTokenException(TOK_POUND_NAME);
  }

  private final
  int scanLit(int open, byte[] buf, int off, int end, Token token)
       throws PartialTokenException, InvalidTokenException, ExtensibleTokenException {
    while (off != end) {
      int t = byteType(buf, off);
      switch (t) {
      case BT_LEAD2:
	if (end - off < 2)
	  throw new PartialTokenException();
	check2(buf, off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  throw new PartialTokenException();
	check3(buf, off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  throw new PartialTokenException();
	check4(buf, off);
	off += 4;
	break;
      case BT_NONXML:
      case BT_MALFORM:
	throw new InvalidTokenException(off);
      case BT_QUOT:
      case BT_APOS:
	off += minBPC;
	if (t != open)
	  break;
	if (off == end)
	  throw new ExtensibleTokenException(TOK_LITERAL);
	switch (byteType(buf, off)) {
	case BT_S:
	case BT_CR:
	case BT_LF:
	case BT_GT:
	case BT_PERCNT:
	case BT_LSQB:
	  token.tokenEnd = off;
	  return TOK_LITERAL;
	default:
	  throw new InvalidTokenException(off);
	}
      default:
	off += minBPC;
	break;
      }
    }
    throw new PartialTokenException();
  }

  /**
   * Returns an encoding object to be used to start parsing an external entity.
   * The encoding is chosen based on the initial 4 bytes of the entity.
   * 
   * @param buf the byte array containing the initial bytes of the entity
   * @param off the index in <code>buf</code> of the first byte of the entity 
   * @param end the index in <code>buf</code> following the last available
   * byte of the entity; <code>end - off</code> must be greater than or equal
   * to 4 unless the entity has fewer that 4 bytes, in which case it must
   * be equal to the length of the entity
   * @param token receives information about the presence of a byte order
   * mark; if the entity starts with a byte order mark
   * then <code>token.getTokenEnd()</code>
   * will return <code>off + 2</code>, otherwise it will return
   * <code>off</code>
   *
   * @see TextDecl
   * @see XmlDecl
   * @see #TOK_XML_DECL
   * @see #getEncoding
   * @see #getInternalEncoding
   */
  public static final
  Encoding getInitialEncoding(byte[] buf, int off, int end, Token token) {
    token.tokenEnd = off;
    switch (end - off) {
    case 0:
      break;
    case 1:
      if (buf[off] < 0)
	return null;
      break;
    default:
      int b0 = buf[off] & 0xFF;
      int b1 = buf[off + 1] & 0xFF;
      switch ((b0 << 8) | b1) {
      case 0xFEFF:
	token.tokenEnd = off + 2;
	/* fall through */
      case '<': /* not legal; but not a fatal error */
	return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
      case 0xFFFE:
	token.tokenEnd = off + 2;
	/* fall through */
      case '<' << 8:  /* not legal; but not a fatal error */
	return getEncoding(UTF16_LITTLE_ENDIAN_ENCODING);
      }
    }
    return getEncoding(UTF8_ENCODING);
  }

  /**
   * Returns an <code>Encoding</code> corresponding to
   * the specified IANA character set name.
   * Returns this <code>Encoding</code> if the name is null.
   * Returns null if the specified encoding is not supported.
   * Note that there are two distinct <code>Encoding</code> objects
   * associated with the name <code>UTF-16</code>, one for
   * each possible byte order; if this <code>Encoding</code>
   * is UTF-16 with little-endian byte ordering, then
   * <code>getEncoding("UTF-16")</code> will return this,
   * otherwise it will return an <code>Encoding</code> for
   * UTF-16 with big-endian byte ordering.
   * @param name a string specifying the IANA name of the encoding; this is
   * case insensitive
   */
  public final
  Encoding getEncoding(String name) {
    if (name == null)
      return this;
    if (name.equalsIgnoreCase("UTF-8"))
      return getEncoding(UTF8_ENCODING);
    if (name.equalsIgnoreCase("UTF-16"))
      return getUTF16Encoding();
    if (name.equalsIgnoreCase("ISO-8859-1"))
      return getEncoding(ISO8859_1_ENCODING);
    if (name.equalsIgnoreCase("US-ASCII"))
      return getEncoding(ASCII_ENCODING);
    return null;
  }

  /**
   * Returns an <code>Encoding</code> for entities encoded with
   * a single-byte encoding (an encoding in which each byte represents
   * exactly one character).
   * @param map a string specifying the character represented by each byte;
   * the string must have a length of 256; <code>map.charAt(b)</code>
   * specifies the character encoded by byte <code>b</code>; bytes that do
   * not represent any character should be mapped to <code>\uFFFD</code>
   */
  public final
  Encoding getSingleByteEncoding(String map) {
    return new SingleByteEncoding(map);
  }

  /**
   * Returns an <code>Encoding</code> object for use with internal entities.
   * This is a UTF-16 big endian encoding, except that newlines
   * are assumed to have been normalized into line feed,
   * so carriage return is treated like a space.
   */
  public final static
  Encoding getInternalEncoding() {
    return getEncoding(INTERNAL_ENCODING);
  }

  /**
   * Scans the first token of a byte subarray that contains part of a
   * prolog.
   * Returns one of the following integers according to the type of token
   * that the subarray starts with:
   * <ul>
   * <li><code>TOK_PI</code>
   * <li><code>TOK_XML_DECL</code>
   * <li><code>TOK_COMMENT</code>
   * <li><code>TOK_PARAM_ENTITY_REF</code>
   * <li><code>TOK_PROLOG_S</code>
   * <li><code>TOK_DECL_OPEN</code>
   * <li><code>TOK_DECL_CLOSE</code>
   * <li><code>TOK_NAME</code>
   * <li><code>TOK_NMTOKEN</code>
   * <li><code>TOK_POUND_NAME</code>
   * <li><code>TOK_OR</code>
   * <li><code>TOK_PERCENT</code>
   * <li><code>TOK_OPEN_PAREN</code>
   * <li><code>TOK_CLOSE_PAREN</code>
   * <li><code>TOK_OPEN_BRACKET</code>
   * <li><code>TOK_CLOSE_BRACKET</code>
   * <li><code>TOK_LITERAL</code>
   * <li><code>TOK_NAME_QUESTION</code>
   * <li><code>TOK_NAME_ASTERISK</code>
   * <li><code>TOK_NAME_PLUS</code>
   * <li><code>TOK_COND_SECT_OPEN</code>
   * <li><code>TOK_COND_SECT_CLOSE</code>
   * <li><code>TOK_CLOSE_PAREN_QUESTION</code>
   * <li><code>TOK_CLOSE_PAREN_ASTERISK</code>
   * <li><code>TOK_CLOSE_PAREN_PLUS</code>
   * <li><code>TOK_COMMA</code>
   * </ul>
   * @exception EmptyTokenException if the subarray is empty
   * @exception PartialTokenException if the subarray contains only part of
   * a legal token
   * @exception InvalidTokenException if the subarrary does not start
   * with a legal token or part of one
   * @exception EndOfPrologException if the subarray starts with the document
   * element; <code>tokenizeContent</code> should be used on the remainder
   * of the entity
   * @exception ExtensibleTokenException if the subarray is a legal token
   * but subsequent bytes in the same entity could be part of the token
   * @see #TOK_PI
   * @see #TOK_XML_DECL
   * @see #TOK_COMMENT
   * @see #TOK_PARAM_ENTITY_REF
   * @see #TOK_PROLOG_S
   * @see #TOK_DECL_OPEN
   * @see #TOK_DECL_CLOSE
   * @see #TOK_NAME
   * @see #TOK_NMTOKEN
   * @see #TOK_POUND_NAME
   * @see #TOK_OR
   * @see #TOK_PERCENT
   * @see #TOK_OPEN_PAREN
   * @see #TOK_CLOSE_PAREN
   * @see #TOK_OPEN_BRACKET
   * @see #TOK_CLOSE_BRACKET
   * @see #TOK_LITERAL
   * @see #TOK_NAME_QUESTION
   * @see #TOK_NAME_ASTERISK
   * @see #TOK_NAME_PLUS
   * @see #TOK_COND_SECT_OPEN
   * @see #TOK_COND_SECT_CLOSE
   * @see #TOK_CLOSE_PAREN_QUESTION
   * @see #TOK_CLOSE_PAREN_ASTERISK
   * @see #TOK_CLOSE_PAREN_PLUS
   * @see #TOK_COMMA
   * @see ContentToken
   * @see EmptyTokenException
   * @see PartialTokenException
   * @see InvalidTokenException
   * @see ExtensibleTokenException
   * @see EndOfPrologException
   */

  public final
  int tokenizeProlog(byte[] buf, int off, int end, Token token) 
       throws PartialTokenException,
              InvalidTokenException,
              EmptyTokenException,
              ExtensibleTokenException,
              EndOfPrologException {
    int tok;
    if (minBPC > 1)
      end = adjustEnd(off, end);
    if (off == end)
      throw new EmptyTokenException();
    switch (byteType(buf, off)) {
    case BT_QUOT:
      return scanLit(BT_QUOT, buf, off + minBPC, end, token);
    case BT_APOS:
      return scanLit(BT_APOS, buf, off + minBPC, end, token);
    case BT_LT:
      {
	off += minBPC;
	if (off == end)
	  throw new PartialTokenException();
	switch (byteType(buf, off)) {
	case BT_EXCL:
	  return scanDecl(buf, off + minBPC, end, token);
	case BT_QUEST:
	  return scanPi(buf, off + minBPC, end, token);
	case BT_NMSTRT:
	case BT_LEAD2:
	case BT_LEAD3:
	case BT_LEAD4:
	  token.tokenEnd = off - minBPC;
	  throw new EndOfPrologException();
	}
	throw new InvalidTokenException(off);
      }
    case BT_CR:
      if (off + minBPC == end)
	throw new ExtensibleTokenException(TOK_PROLOG_S);
      /* fall through */
    case BT_S:
    case BT_LF:
      for (;;) {
	off += minBPC;
	if (off == end)
	  break;
	switch (byteType(buf, off)) {
	case BT_S:
	case BT_LF:
	  break;
	case BT_CR:
	  /* don't split CR/LF pair */
	  if (off + minBPC != end)
	    break;
	  /* fall through */
	default:
	  token.tokenEnd = off;
	  return TOK_PROLOG_S;
	}
      }
      token.tokenEnd = off;
      return TOK_PROLOG_S;
    case BT_PERCNT:
      return scanPercent(buf, off + minBPC, end, token);
    case BT_COMMA:
      token.tokenEnd = off + minBPC;
      return TOK_COMMA;
    case BT_LSQB:
      token.tokenEnd = off + minBPC;
      return TOK_OPEN_BRACKET;
    case BT_RSQB:
      off += minBPC;
      if (off == end)
	throw new ExtensibleTokenException(TOK_CLOSE_BRACKET);
      if (charMatches(buf, off, ']')) {
	if (off + minBPC == end)
	  throw new PartialTokenException();
	if (charMatches(buf, off + minBPC, '>')) {
	  token.tokenEnd = off + 2*minBPC;
	  return TOK_COND_SECT_CLOSE;
	}
      }
      token.tokenEnd = off;
      return TOK_CLOSE_BRACKET;
    case BT_LPAR:
      token.tokenEnd = off + minBPC;
      return TOK_OPEN_PAREN;
    case BT_RPAR:
      off += minBPC;
      if (off == end)
	throw new ExtensibleTokenException(TOK_CLOSE_PAREN);
      switch (byteType(buf, off)) {
      case BT_AST:
	token.tokenEnd = off + minBPC;
	return TOK_CLOSE_PAREN_ASTERISK;
      case BT_QUEST:
	token.tokenEnd = off + minBPC;
	return TOK_CLOSE_PAREN_QUESTION;
      case BT_PLUS:
	token.tokenEnd = off + minBPC;
	return TOK_CLOSE_PAREN_PLUS;
      case BT_CR:
      case BT_LF:
      case BT_S:
      case BT_GT:
      case BT_COMMA:
      case BT_VERBAR:
      case BT_RPAR:
	token.tokenEnd = off;
	return TOK_CLOSE_PAREN;
      }
      throw new InvalidTokenException(off);
    case BT_VERBAR:
      token.tokenEnd = off + minBPC;
      return TOK_OR;
    case BT_GT:
      token.tokenEnd = off + minBPC;
      return TOK_DECL_CLOSE;
    case BT_NUM:
      return scanPoundName(buf, off + minBPC, end, token);
    case BT_LEAD2:
      if (end - off < 2)
	throw new PartialCharException(off);
      switch (byteType2(buf, off)) {
      case BT_NMSTRT:
	off += 2;
	tok = TOK_NAME;
	break;
      case BT_NAME:
	off += 2;
	tok = TOK_NMTOKEN;
	break;
      default:
	throw new InvalidTokenException(off);
      }
      break;
    case BT_LEAD3:
      if (end - off < 3)
	throw new PartialCharException(off);
      switch (byteType3(buf, off)) {
      case BT_NMSTRT:
	off += 3;
	tok = TOK_NAME;
	break;
      case BT_NAME:
	off += 3;
	tok = TOK_NMTOKEN;
	break;
      default:
	throw new InvalidTokenException(off);
      }
      break;
    case BT_LEAD4:
      if (end - off < 4)
	throw new PartialCharException(off);
      switch (byteType4(buf, off)) {
      case BT_NMSTRT:
	off += 4;
	tok = TOK_NAME;
	break;
      case BT_NAME:
	off += 4;
	tok = TOK_NMTOKEN;
	break;
      default:
	throw new InvalidTokenException(off);
      }
      break;
    case BT_NMSTRT:
      tok = TOK_NAME;
      off += minBPC;
      break;
    case BT_NAME:
    case BT_MINUS:
      tok = TOK_NMTOKEN;
      off += minBPC;
      break;
    default:
      throw new InvalidTokenException(off);
    }
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_NMSTRT:
      case BT_NAME:
      case BT_MINUS:
	off += minBPC;
	break;
      case BT_LEAD2:
	if (end - off < 2)
	  throw new PartialCharException(off);
	if (!isNameChar2(buf, off))
	  throw new InvalidTokenException(off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  throw new PartialCharException(off);
	if (!isNameChar3(buf, off))
	  throw new InvalidTokenException(off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  throw new PartialCharException(off);
	if (!isNameChar4(buf, off))
	  throw new InvalidTokenException(off);
	off += 4;
	break;
      case BT_GT:
      case BT_RPAR:
      case BT_COMMA:
      case BT_VERBAR:
      case BT_LSQB:
      case BT_PERCNT:
      case BT_S:
      case BT_CR:
      case BT_LF:
	token.tokenEnd = off;
	return tok;
      case BT_PLUS:
	if (tok != TOK_NAME)
	  throw new InvalidTokenException(off);
	token.tokenEnd = off + minBPC;
	return TOK_NAME_PLUS;
      case BT_AST:
	if (tok != TOK_NAME)
	  throw new InvalidTokenException(off);
	token.tokenEnd = off + minBPC;
	return TOK_NAME_ASTERISK;
      case BT_QUEST:
	if (tok != TOK_NAME)
	  throw new InvalidTokenException(off);
	token.tokenEnd = off + minBPC;
	return TOK_NAME_QUESTION;
      default:
	throw new InvalidTokenException(off);
      }
    }
    throw new ExtensibleTokenException(tok);
  }

  /**
   * Scans the first token of a byte subarrary that contains part of
   * literal attribute value.  The opening and closing delimiters
   * are not included in the subarrary.
   * Returns one of the following integers according to the type of
   * token that the subarray starts with:
   * <ul>
   * <li><code>TOK_DATA_CHARS</code>
   * <li><code>TOK_DATA_NEWLINE</code>
   * <li><code>TOK_ATTRIBUTE_VALUE_S</code>
   * <li><code>TOK_MAGIC_ENTITY_REF</code>
   * <li><code>TOK_ENTITY_REF</code>
   * <li><code>TOK_CHAR_REF</code>
   * <li><code>TOK_CHAR_PAIR_REF</code>
   * </ul>
   * @exception EmptyTokenException if the subarray is empty
   * @exception PartialTokenException if the subarray contains only part of
   * a legal token
   * @exception InvalidTokenException if the subarrary does not start
   * with a legal token or part of one
   * @exception ExtensibleTokenException if the subarray encodes just a carriage
   * return ('\r')
   * @see #TOK_DATA_CHARS
   * @see #TOK_DATA_NEWLINE
   * @see #TOK_ATTRIBUTE_VALUE_S
   * @see #TOK_MAGIC_ENTITY_REF
   * @see #TOK_ENTITY_REF
   * @see #TOK_CHAR_REF
   * @see #TOK_CHAR_PAIR_REF
   * @see Token
   * @see EmptyTokenException
   * @see PartialTokenException
   * @see InvalidTokenException
   * @see ExtensibleTokenException
   */
  public final
  int tokenizeAttributeValue(byte[] buf, int off, int end, Token token)
       throws PartialTokenException, InvalidTokenException, 
              EmptyTokenException, ExtensibleTokenException {
    if (minBPC > 1)
      end = adjustEnd(off, end);
    if (off == end)
      throw new EmptyTokenException();
    int start = off;
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_LEAD2:
	if (end - off < 2)
	  throw new PartialCharException(off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  throw new PartialCharException(off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  throw new PartialCharException(off);
	off += 4;
	break;
      case BT_AMP:
	if (off == start)
	  return scanRef(buf, off + minBPC, end, token);
	token.tokenEnd = off;
	return TOK_DATA_CHARS;
      case BT_LT:
	/* this is for inside entity references */
	throw new InvalidTokenException(off);
      case BT_S:
	if (off == start) {
	  token.tokenEnd = off + minBPC;
	  return TOK_ATTRIBUTE_VALUE_S;
	}
	token.tokenEnd = off;
	return TOK_DATA_CHARS;
      case BT_LF:
	if (off == start) {
	  token.tokenEnd = off + minBPC;
	  return TOK_DATA_NEWLINE;
	}
	token.tokenEnd = off;
	return TOK_DATA_CHARS;
      case BT_CR:
	if (off == start) {
	  off += minBPC;
	  if (off == end)
	    throw new ExtensibleTokenException(TOK_DATA_NEWLINE);
	  if (byteType(buf, off) == BT_LF)
	    off += minBPC;
	  token.tokenEnd = off;
	  return TOK_DATA_NEWLINE;
	}
	token.tokenEnd = off;
	return TOK_DATA_CHARS;
      default:
	off += minBPC;
	break;
      }
    }
    token.tokenEnd = off;
    return TOK_DATA_CHARS;
  }

  /**
   * Scans the first token of a byte subarrary that contains part of
   * literal entity value.  The opening and closing delimiters
   * are not included in the subarrary.
   * Returns one of the following integers according to the type of
   * token that the subarray starts with:
   * <ul>
   * <li><code>TOK_DATA_CHARS</code>
   * <li><code>TOK_DATA_NEWLINE</code>
   * <li><code>TOK_PARAM_ENTITY_REF</code>
   * <li><code>TOK_MAGIC_ENTITY_REF</code>
   * <li><code>TOK_ENTITY_REF</code>
   * <li><code>TOK_CHAR_REF</code>
   * <li><code>TOK_CHAR_PAIR_REF</code>
   * </ul>
   * @exception EmptyTokenException if the subarray is empty
   * @exception PartialTokenException if the subarray contains only part of
   * a legal token
   * @exception InvalidTokenException if the subarrary does not start
   * with a legal token or part of one
   * @exception ExtensibleTokenException if the subarray encodes just a carriage
   * return ('\r')
   * @see #TOK_DATA_CHARS
   * @see #TOK_DATA_NEWLINE
   * @see #TOK_MAGIC_ENTITY_REF
   * @see #TOK_ENTITY_REF
   * @see #TOK_PARAM_ENTITY_REF
   * @see #TOK_CHAR_REF
   * @see #TOK_CHAR_PAIR_REF
   * @see Token
   * @see EmptyTokenException
   * @see PartialTokenException
   * @see InvalidTokenException
   * @see ExtensibleTokenException
   */
  public final
  int tokenizeEntityValue(byte[] buf, int off, int end, Token token)
       throws PartialTokenException, InvalidTokenException, 
              EmptyTokenException, ExtensibleTokenException {
    if (minBPC > 1)
      end = adjustEnd(off, end);
    if (off == end)
      throw new EmptyTokenException();
    int start = off;
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_LEAD2:
	if (end - off < 2)
	  throw new PartialCharException(off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  throw new PartialCharException(off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  throw new PartialCharException(off);
	off += 4;
	break;
      case BT_AMP:
	if (off == start)
	  return scanRef(buf, off + minBPC, end, token);
	token.tokenEnd = off;
	return TOK_DATA_CHARS;
      case BT_PERCNT:
	if (off == start)
	  return scanPercent(buf, off + minBPC, end, token);
	token.tokenEnd = off;
	return TOK_DATA_CHARS;
      case BT_LF:
	if (off == start) {
	  token.tokenEnd = off + minBPC;
	  return TOK_DATA_NEWLINE;
	}
	token.tokenEnd = off;
	return TOK_DATA_CHARS;
      case BT_CR:
	if (off == start) {
	  off += minBPC;
	  if (off == end)
	    throw new ExtensibleTokenException(TOK_DATA_NEWLINE);
	  if (byteType(buf, off) == BT_LF)
	    off += minBPC;
	  token.tokenEnd = off;
	  return TOK_DATA_NEWLINE;
	}
	token.tokenEnd = off;
	return TOK_DATA_CHARS;
      default:
	off += minBPC;
	break;
      }
    }
    token.tokenEnd = off;
    return TOK_DATA_CHARS;
  }

  /**
   * Skips over an ignored conditional section.
   * The subarray starts following the <code>&lt;![ IGNORE [</code>.
   *
   * @return the index of the character following the closing
   * <code>]]></code>
   *
   * @exception PartialTokenException if the subarray does not contain the
   * complete ignored conditional section
   * @exception InvalidTokenException if the ignored conditional section
   * contains illegal characters
   */
  public final
  int skipIgnoreSect(byte[] buf, int off, int end) throws PartialTokenException, InvalidTokenException {
    if (minBPC > 1)
      end = adjustEnd(off, end);
    int level = 0;
  loop:
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_LEAD2:
	if (end - off < 2)
	  throw new PartialCharException(off);
	check2(buf, off);
	off += 2;
	break;
      case BT_LEAD3:
	if (end - off < 3)
	  throw new PartialCharException(off);
	check3(buf, off);
	off += 3;
	break;
      case BT_LEAD4:
	if (end - off < 4)
	  throw new PartialCharException(off);
	check4(buf, off);
	off += 4;
	break;
      case BT_NONXML:
      case BT_MALFORM:
	throw new InvalidTokenException(off);
      case BT_LT:
	off += minBPC;
	if (off == end)
	  break loop;
	if (!charMatches(buf, off, '!'))
	  break;
	off += minBPC;
	if (off == end)
	  break loop;
	if (!charMatches(buf, off, '['))
	  break;
	level++;
	off += minBPC;
	break;
      case BT_RSQB:
	off += minBPC;
	if (off == end)
	  break loop;
	if (!charMatches(buf, off, ']'))
	  break;
	off += minBPC;
	if (off == end)
	  break loop;
	if (charMatches(buf, off, '>')) {
	  if (level == 0)
	    return off + minBPC;
	  level--;
	}
	else if (charMatches(buf, off, ']'))
	  break;
	off += minBPC;
	break;
      default:
	off += minBPC;
	break;
      }
    }
    throw new PartialTokenException();
  }

  /**
   * Checks that a literal contained in the specified byte subarray
   * is a legal public identifier and returns a string with
   * the normalized content of the public id.
   * The subarray includes the opening and closing quotes.
   * @exception InvalidTokenException if it is not a legal public identifier
   */
  public final
  String getPublicId(byte[] buf, int off, int end) throws InvalidTokenException {
    StringBuffer sbuf = new StringBuffer();
    off += minBPC;
    end -= minBPC;
    for (; off != end; off += minBPC) {
      char c = (char)byteToAscii(buf, off);
      switch (byteType(buf, off)) {
      case BT_MINUS:
      case BT_APOS:
      case BT_LPAR:
      case BT_RPAR:
      case BT_PLUS:
      case BT_COMMA:
      case BT_SOL:
      case BT_EQUALS:
      case BT_QUEST:
      case BT_SEMI:
      case BT_EXCL:
      case BT_AST:
      case BT_PERCNT:
      case BT_NUM:
	sbuf.append(c);
	break;
      case BT_S:
	if (charMatches(buf, off, '\t'))
	  throw new InvalidTokenException(off);
	/* fall through */
      case BT_CR:
      case BT_LF:
	if (sbuf.length() > 0 && sbuf.charAt(sbuf.length() - 1) != ' ')
	  sbuf.append(' ');
	break;
      case BT_NAME:
      case BT_NMSTRT:
	if ((c & ~0x7f) == 0) {
	  sbuf.append(c);
	  break;
	}
	// fall through
      default:
	switch (c) {
	case '$':
	case '@':
	  break;
	default:
	  throw new InvalidTokenException(off);
	}
	break;
      }
    }
    if (sbuf.length() > 0 && sbuf.charAt(sbuf.length() - 1) == ' ')
      sbuf.setLength(sbuf.length() - 1);
    return sbuf.toString();
  }

  /**
   * Returns true if the specified byte subarray is equal to the string.
   * The string must contain only XML significant characters.
   */
  public final
  boolean matchesXMLString(byte[] buf, int off, int end, String str) {
    int len = str.length();
    if (len*minBPC != end - off)
      return false;
    for (int i = 0; i < len; off += minBPC, i++) {
      if (!charMatches(buf, off, str.charAt(i)))
	return false;
    }
    return true;
  }

  /**
   * Skips over XML whitespace characters at the start of the specified
   * subarray.
   *
   * @return the index of the first non-whitespace character,
   * <code>end</code> if there is the subarray is all whitespace
   */
  public final
  int skipS(byte[] buf, int off, int end) {
  loop:
    while (off < end) {
      switch (byteType(buf, off)) {
      case BT_S:
      case BT_CR:
      case BT_LF:
	off += minBPC;
	break;
      default:
	break loop;
      }
    }
    return off;
  }

  private final boolean isNameChar2(byte[] buf, int off) {
    int bt = byteType2(buf, off);
    return bt == BT_NAME || bt == BT_NMSTRT;
  }
  private final boolean isNameChar3(byte[] buf, int off) {
    int bt = byteType3(buf, off);
    return bt == BT_NAME || bt == BT_NMSTRT;
  }
  private final boolean isNameChar4(byte[] buf, int off) {
    int bt = byteType4(buf, off);
    return bt == BT_NAME || bt == BT_NMSTRT;
  }

  private static final String nameStartSingles =
  "\u003a\u005f\u0386\u038c\u03da\u03dc\u03de\u03e0\u0559\u06d5\u093d\u09b2" +
  "\u0a5e\u0a8d\u0abd\u0ae0\u0b3d\u0b9c\u0cde\u0e30\u0e84\u0e8a\u0e8d\u0ea5" +
  "\u0ea7\u0eb0\u0ebd\u1100\u1109\u113c\u113e\u1140\u114c\u114e\u1150\u1159" +
  "\u1163\u1165\u1167\u1169\u1175\u119e\u11a8\u11ab\u11ba\u11eb\u11f0\u11f9" +
  "\u1f59\u1f5b\u1f5d\u1fbe\u2126\u212e\u3007";
  private static final String nameStartRanges =
  "\u0041\u005a\u0061\u007a\u00c0\u00d6\u00d8\u00f6\u00f8\u00ff\u0100\u0131" +
  "\u0134\u013e\u0141\u0148\u014a\u017e\u0180\u01c3\u01cd\u01f0\u01f4\u01f5" +
  "\u01fa\u0217\u0250\u02a8\u02bb\u02c1\u0388\u038a\u038e\u03a1\u03a3\u03ce" +
  "\u03d0\u03d6\u03e2\u03f3\u0401\u040c\u040e\u044f\u0451\u045c\u045e\u0481" +
  "\u0490\u04c4\u04c7\u04c8\u04cb\u04cc\u04d0\u04eb\u04ee\u04f5\u04f8\u04f9" +
  "\u0531\u0556\u0561\u0586\u05d0\u05ea\u05f0\u05f2\u0621\u063a\u0641\u064a" +
  "\u0671\u06b7\u06ba\u06be\u06c0\u06ce\u06d0\u06d3\u06e5\u06e6\u0905\u0939" +
  "\u0958\u0961\u0985\u098c\u098f\u0990\u0993\u09a8\u09aa\u09b0\u09b6\u09b9" +
  "\u09dc\u09dd\u09df\u09e1\u09f0\u09f1\u0a05\u0a0a\u0a0f\u0a10\u0a13\u0a28" +
  "\u0a2a\u0a30\u0a32\u0a33\u0a35\u0a36\u0a38\u0a39\u0a59\u0a5c\u0a72\u0a74" +
  "\u0a85\u0a8b\u0a8f\u0a91\u0a93\u0aa8\u0aaa\u0ab0\u0ab2\u0ab3\u0ab5\u0ab9" +
  "\u0b05\u0b0c\u0b0f\u0b10\u0b13\u0b28\u0b2a\u0b30\u0b32\u0b33\u0b36\u0b39" +
  "\u0b5c\u0b5d\u0b5f\u0b61\u0b85\u0b8a\u0b8e\u0b90\u0b92\u0b95\u0b99\u0b9a" +
  "\u0b9e\u0b9f\u0ba3\u0ba4\u0ba8\u0baa\u0bae\u0bb5\u0bb7\u0bb9\u0c05\u0c0c" +
  "\u0c0e\u0c10\u0c12\u0c28\u0c2a\u0c33\u0c35\u0c39\u0c60\u0c61\u0c85\u0c8c" +
  "\u0c8e\u0c90\u0c92\u0ca8\u0caa\u0cb3\u0cb5\u0cb9\u0ce0\u0ce1\u0d05\u0d0c" +
  "\u0d0e\u0d10\u0d12\u0d28\u0d2a\u0d39\u0d60\u0d61\u0e01\u0e2e\u0e32\u0e33" +
  "\u0e40\u0e45\u0e81\u0e82\u0e87\u0e88\u0e94\u0e97\u0e99\u0e9f\u0ea1\u0ea3" +
  "\u0eaa\u0eab\u0ead\u0eae\u0eb2\u0eb3\u0ec0\u0ec4\u0f40\u0f47\u0f49\u0f69" +
  "\u10a0\u10c5\u10d0\u10f6\u1102\u1103\u1105\u1107\u110b\u110c\u110e\u1112" +
  "\u1154\u1155\u115f\u1161\u116d\u116e\u1172\u1173\u11ae\u11af\u11b7\u11b8" +
  "\u11bc\u11c2\u1e00\u1e9b\u1ea0\u1ef9\u1f00\u1f15\u1f18\u1f1d\u1f20\u1f45" +
  "\u1f48\u1f4d\u1f50\u1f57\u1f5f\u1f7d\u1f80\u1fb4\u1fb6\u1fbc\u1fc2\u1fc4" +
  "\u1fc6\u1fcc\u1fd0\u1fd3\u1fd6\u1fdb\u1fe0\u1fec\u1ff2\u1ff4\u1ff6\u1ffc" +
  "\u212a\u212b\u2180\u2182\u3041\u3094\u30a1\u30fa\u3105\u312c\uac00\ud7a3" +
  "\u4e00\u9fa5\u3021\u3029";
  private static final String nameSingles =
  "\u002d\u002e\u05bf\u05c4\u0670\u093c\u094d\u09bc\u09be\u09bf\u09d7\u0a02" +
  "\u0a3c\u0a3e\u0a3f\u0abc\u0b3c\u0bd7\u0d57\u0e31\u0eb1\u0f35\u0f37\u0f39" +
  "\u0f3e\u0f3f\u0f97\u0fb9\u20e1\u3099\u309a\u00b7\u02d0\u02d1\u0387\u0640" +
  "\u0e46\u0ec6\u3005";
  private static final String nameRanges =
  "\u0300\u0345\u0360\u0361\u0483\u0486\u0591\u05a1\u05a3\u05b9\u05bb\u05bd" +
  "\u05c1\u05c2\u064b\u0652\u06d6\u06dc\u06dd\u06df\u06e0\u06e4\u06e7\u06e8" +
  "\u06ea\u06ed\u0901\u0903\u093e\u094c\u0951\u0954\u0962\u0963\u0981\u0983" +
  "\u09c0\u09c4\u09c7\u09c8\u09cb\u09cd\u09e2\u09e3\u0a40\u0a42\u0a47\u0a48" +
  "\u0a4b\u0a4d\u0a70\u0a71\u0a81\u0a83\u0abe\u0ac5\u0ac7\u0ac9\u0acb\u0acd" +
  "\u0b01\u0b03\u0b3e\u0b43\u0b47\u0b48\u0b4b\u0b4d\u0b56\u0b57\u0b82\u0b83" +
  "\u0bbe\u0bc2\u0bc6\u0bc8\u0bca\u0bcd\u0c01\u0c03\u0c3e\u0c44\u0c46\u0c48" +
  "\u0c4a\u0c4d\u0c55\u0c56\u0c82\u0c83\u0cbe\u0cc4\u0cc6\u0cc8\u0cca\u0ccd" +
  "\u0cd5\u0cd6\u0d02\u0d03\u0d3e\u0d43\u0d46\u0d48\u0d4a\u0d4d\u0e34\u0e3a" +
  "\u0e47\u0e4e\u0eb4\u0eb9\u0ebb\u0ebc\u0ec8\u0ecd\u0f18\u0f19\u0f71\u0f84" +
  "\u0f86\u0f8b\u0f90\u0f95\u0f99\u0fad\u0fb1\u0fb7\u20d0\u20dc\u302a\u302f" +
  "\u0030\u0039\u0660\u0669\u06f0\u06f9\u0966\u096f\u09e6\u09ef\u0a66\u0a6f" +
  "\u0ae6\u0aef\u0b66\u0b6f\u0be7\u0bef\u0c66\u0c6f\u0ce6\u0cef\u0d66\u0d6f" +
  "\u0e50\u0e59\u0ed0\u0ed9\u0f20\u0f29\u3031\u3035\u309d\u309e\u30fc\u30fe";

  /* final */ static byte[][] charTypeTable;
  private static void setCharType(char c, int type) {
    if (c < 0x80)
      return;
    int hi = c >> 8;
    if (charTypeTable[hi] == null) {
      charTypeTable[hi] = new byte[256];
      for (int i = 0; i < 256; i++)
	charTypeTable[hi][i] = BT_OTHER;
    }
    charTypeTable[hi][c & 0xFF] = (byte)type;
  }

  private static void setCharType(char min, char max, int type) {
    byte[] shared = null;
    do {
      if ((min & 0xFF) == 0) {
	for (; min + 0xFF <= max; min += 0x100) {
	  if (shared == null) {
	    shared = new byte[256];
	    for (int i = 0; i < 256; i++)
	      shared[i] = (byte)type;
	  }
	  charTypeTable[min >> 8] = shared;
	  if (min + 0xFF == max)
	    return;
	}
      }
      setCharType(min, type);
    } while (min++ != max);
  }

  static {
    charTypeTable = new byte[256][];
    for (int i = 0; i < nameSingles.length(); i++)
      setCharType(nameSingles.charAt(i), BT_NAME);
    for (int i = 0; i < nameRanges.length(); i += 2)
      setCharType(nameRanges.charAt(i), nameRanges.charAt(i + 1), BT_NAME);
    for (int i = 0; i < nameStartSingles.length(); i++)
      setCharType(nameStartSingles.charAt(i), BT_NMSTRT);
    for (int i = 0; i < nameStartRanges.length(); i += 2)
      setCharType(nameStartRanges.charAt(i), nameStartRanges.charAt(i + 1),
		  BT_NMSTRT);
    setCharType('\uD800', '\uDBFF', BT_LEAD4);
    setCharType('\uDC00', '\uDFFF', BT_MALFORM);
    setCharType('\uFFFE', '\uFFFF', BT_NONXML);
    byte[] other = new byte[256];
    for (int i = 0; i < 256; i++)
      other[i] = BT_OTHER;
    for (int i = 0; i < 256; i++)
      if (charTypeTable[i] == null)
	charTypeTable[i] = other;
    System.arraycopy(asciiTypeTable, 0, charTypeTable[0], 0, 128);
  }

  /**
   * Returns the minimum number of bytes required to represent a single
   * character in this encoding.  The value will be 1, 2 or 4.
   */
  public final int getMinBytesPerChar() {
    return minBPC;
  }
}
