/* * $Id: XmlReader.java,v 1.1 2004/08/19 05:30:22 aslom Exp $ * * The Apache Software License, Version 1.1 * * * Copyright (c) 2000 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Crimson" and "Apache Software Foundation" must * not be used to endorse or promote products derived from this * software without prior written permission. For written * permission, please contact apache@apache.org. * * 5. Products derived from this software may not be called "Apache", * nor may "Apache" appear in their name, without prior written * permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation and was * originally based on software copyright (c) 1999, Sun Microsystems, Inc., * http://www.sun.com. For more information on the Apache Software * Foundation, please see <http://www.apache.org/>. */ package com.bea.xml.stream.reader; import java.io.*; import java.util.Hashtable; /** * This handles several XML-related tasks that normal java.io Readers * don't support, including use of IETF standard encoding names and * automatic detection of most XML encodings. The former is needed * for interoperability; the latter is needed to conform with the XML * spec. This class also optimizes reading some common encodings by * providing low-overhead unsynchronized Reader support. * *

Note that the autodetection facility should be used only on * data streams which have an unknown character encoding. For example, * it should never be used on MIME text/xml entities. * *

Note that XML processors are only required to support UTF-8 and * UTF-16 character encodings. Autodetection permits the underlying Java * implementation to provide support for many other encodings, such as * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP. * * @author David Brownell * @version $Revision: 1.1 $ */ final public class XmlReader extends Reader { private static final int MAXPUSHBACK = 512; private Reader in; private String assignedEncoding; private boolean closed; // // This class always delegates I/O to a reader, which gets // its data from the very beginning of the XML text. It needs // to use a pushback stream since (a) autodetection can read // partial UTF-8 characters which need to be fully processed, // (b) the "Unicode" readers swallow characters that they think // are byte order marks, so tests fail if they don't see the // real byte order mark. // // It's got do this efficiently: character I/O is solidly on the // critical path. (So keep buffer length over 2 Kbytes to avoid // excess buffering. Many URL handlers stuff a BufferedInputStream // between here and the real data source, and larger buffers keep // that from slowing you down.) // /** * Constructs the reader from an input stream, autodetecting * the encoding to use according to the heuristic specified * in the XML 1.0 recommendation. * * @param in the input stream from which the reader is constructed * @exception IOException on error, such as unrecognized encoding */ public static Reader createReader (InputStream in) throws IOException { return new XmlReader (in); } /** * Creates a reader supporting the given encoding, mapping * from standard encoding names to ones that understood by * Java where necessary. * * @param in the input stream from which the reader is constructed * @param encoding the IETF standard name of the encoding to use; * if null, autodetection is used. 
* @exception IOException on error, including unrecognized encoding */ public static Reader createReader (InputStream in, String encoding) throws IOException { if (encoding == null) { return new XmlReader(in); } if ("UTF-8".equalsIgnoreCase (encoding) || "UTF8".equalsIgnoreCase (encoding)) { return new Utf8Reader (in); } if ("US-ASCII".equalsIgnoreCase (encoding) || "ASCII".equalsIgnoreCase (encoding)) { return new AsciiReader (in); } if ("ISO-8859-1".equalsIgnoreCase (encoding) // plus numerous aliases ... ) { return new Iso8859_1Reader (in); } // What we really want is an administerable resource mapping // encoding names/aliases to classnames. For example a property // file resource, "readers/mapping.props", holding and a set // of readers in that (sub)package... defaulting to this call // only if no better choice is available. // return new InputStreamReader (in, std2java (encoding)); } // JDK doesn't know all of the standard encoding names, and // in particular none of the EBCDIC ones IANA defines (and // which IBM encourages). static private final Hashtable charsets = new Hashtable (31); static { charsets.put ("UTF-16", "Unicode"); charsets.put ("ISO-10646-UCS-2", "Unicode"); // NOTE: no support for ISO-10646-UCS-4 yet. 
charsets.put ("EBCDIC-CP-US", "cp037"); charsets.put ("EBCDIC-CP-CA", "cp037"); charsets.put ("EBCDIC-CP-NL", "cp037"); charsets.put ("EBCDIC-CP-WT", "cp037"); charsets.put ("EBCDIC-CP-DK", "cp277"); charsets.put ("EBCDIC-CP-NO", "cp277"); charsets.put ("EBCDIC-CP-FI", "cp278"); charsets.put ("EBCDIC-CP-SE", "cp278"); charsets.put ("EBCDIC-CP-IT", "cp280"); charsets.put ("EBCDIC-CP-ES", "cp284"); charsets.put ("EBCDIC-CP-GB", "cp285"); charsets.put ("EBCDIC-CP-FR", "cp297"); charsets.put ("EBCDIC-CP-AR1", "cp420"); charsets.put ("EBCDIC-CP-HE", "cp424"); charsets.put ("EBCDIC-CP-BE", "cp500"); charsets.put ("EBCDIC-CP-CH", "cp500"); charsets.put ("EBCDIC-CP-ROECE", "cp870"); charsets.put ("EBCDIC-CP-YU", "cp870"); charsets.put ("EBCDIC-CP-IS", "cp871"); charsets.put ("EBCDIC-CP-AR2", "cp918"); // IANA also defines two that JDK 1.2 doesn't handle: // EBCDIC-CP-GR --> CP423 // EBCDIC-CP-TR --> CP905 } // returns an encoding name supported by JDK >= 1.1.6 // for some cases required by the XML spec private static String std2java (String encoding) { String temp = encoding.toUpperCase (); temp = (String) charsets.get (temp); return (temp != null) ? temp : encoding; } /** Returns the standard name of the encoding in use */ public String getEncoding () { return assignedEncoding; } private XmlReader (InputStream stream) throws IOException { super (stream); PushbackInputStream pb; byte buf []; int len; /*if (stream instanceof PushbackInputStream) pb = (PushbackInputStream) stream; else*/ /** * Commented out the above code to make sure it works when the * document is accessed using http. URL connection in the code uses * a PushbackInputStream with size 7 and when we try to push back * MAX which default value is set to 512 we get and exception. So * that's why we need to wrap the stream irrespective of what type * of stream we start off with. 
*/ pb = new PushbackInputStream (stream, MAXPUSHBACK); // // See if we can figure out the character encoding used // in this file by peeking at the first few bytes. // buf = new byte [4]; len = pb.read (buf); if (len > 0) pb.unread (buf, 0, len); if (len == 4) switch (buf [0] & 0x0ff) { case 0: // 00 3c 00 3f == illegal UTF-16 big-endian if (buf [1] == 0x3c && buf [2] == 0x00 && buf [3] == 0x3f) { setEncoding (pb, "UnicodeBig"); return; } // else it's probably UCS-4 break; case '<': // 0x3c: the most common cases! switch (buf [1] & 0x0ff) { // First character is '<'; could be XML without // an XML directive such as "", "