1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2004-2005, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: xmlparser.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2004jul21
* created by: Andy Heninger
*
* Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
* Not suitable for production use. Not supported.
* Not conformant. Not efficient.
* But very small.
*/
#ifndef __XMLPARSER_H__
#define __XMLPARSER_H__
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/regex.h"
#include "uvector.h"
#include "hash.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
/**
 * Tags identifying the kind of a node in a parsed XML tree.
 * The enumerator values are consecutive, starting at 0.
 */
enum UXMLNodeType {
    /** Text-content node; the node object is a UnicodeString. */
    UXML_NODE_TYPE_STRING = 0,
    /** Element node; the node object is a UXMLElement. */
    UXML_NODE_TYPE_ELEMENT = 1,
    /** Equal to the number of node types declared above. */
    UXML_NODE_TYPE_COUNT = 2
};
U_NAMESPACE_BEGIN
class UXMLParser;
/**
* This class represents an element node in a parsed XML tree.
*
* Client code cannot construct, copy or assign elements: every constructor
* and the assignment operator are private. UXMLParser is declared a friend,
* which gives it access to them.
*/
class U_TOOLUTIL_API UXMLElement : public UObject {
public:
/**
* Destructor.
*/
virtual ~UXMLElement();
/**
* Get the tag name of this element.
* @return A reference to the tag name string.
*/
const UnicodeString &getTagName() const;
/**
* Get the text contents of the element.
* Append the contents of all text child nodes.
* @param recurse If true, also recursively appends the contents of all
* text child nodes of element children.
* @return The text contents.
*/
UnicodeString getText(UBool recurse) const;
/**
* Get the number of attributes.
* @return The number of attributes of this element.
*/
int32_t countAttributes() const;
/**
* Get the i-th attribute.
* @param i Index of the attribute.
* @param name Output parameter, receives the attribute name.
* @param value Output parameter, receives the attribute value.
* @return A pointer to the attribute value (may be &value or a pointer to an
* internal string object), or nullptr if i is out of bounds.
*/
const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
/**
* Get the value of the attribute with the given name.
* @param name Attribute name to be looked up.
* @return A pointer to the attribute value, or nullptr if this element
* does not have this attribute.
*/
const UnicodeString *getAttribute(const UnicodeString &name) const;
/**
* Get the number of child nodes.
* @return The number of child nodes of this element.
*/
int32_t countChildren() const;
/**
* Get the i-th child node.
* @param i Index of the child node.
* @param type Output parameter, receives the child node type. The
* UXMLNodeType value tells which class the returned object
* has (UnicodeString for UXML_NODE_TYPE_STRING, UXMLElement
* for UXML_NODE_TYPE_ELEMENT).
* @return A pointer to the child node object, or nullptr if i is out of bounds.
*/
const UObject *getChild(int32_t i, UXMLNodeType &type) const;
/**
* Get the next child element node, skipping non-element child nodes.
* @param i Enumeration index (in/out); initialize to 0 before getting the
* first child element and pass the same variable to each
* subsequent call.
* @return A pointer to the next child element, or nullptr if there is none.
*/
const UXMLElement *nextChildElement(int32_t &i) const;
/**
* Get the immediate child element with the given name.
* If there are multiple child elements with this name, then return
* the first one.
* @param name Element name to be looked up.
* @return A pointer to the element node, or nullptr if this element
* does not have this immediate child element.
*/
const UXMLElement *getChildElement(const UnicodeString &name) const;
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*/
virtual UClassID getDynamicClassID() const override;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*/
static UClassID U_EXPORT2 getStaticClassID();
private:
// prevent default construction etc.
UXMLElement();
UXMLElement(const UXMLElement &other);
UXMLElement &operator=(const UXMLElement &other);
// NOTE(review): presumably the worker behind getText() that appends text
// to `text` -- confirm against the implementation.
void appendText(UnicodeString &text, UBool recurse) const;
// Grants UXMLParser access to the private members, including the
// constructor below.
friend class UXMLParser;
// Constructor taking a parser, a tag name and an ICU error code.
UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
// NOTE(review): presumably the parser passed to the constructor -- confirm.
const UXMLParser *fParser;
const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser)
UnicodeString fContent; // The text content of this node. All element content is
// concatenated even when there are intervening nested elements
// (which doesn't happen with most xml files we care about)
// Sections of content containing only white space are dropped,
// which gets rid of the bogus white space content from
// elements which are primarily containers for nested elements.
UVector fAttNames; // A vector containing the names of this element's attributes
// The names are UnicodeString objects, owned by the UXMLParser.
UVector fAttValues; // A vector containing the attribute values for
// this element's attributes. The order is the same
// as that of the attribute name vector.
UVector fChildren; // The child nodes of this element (a Vector)
UXMLElement *fParent; // A pointer to the parent element of this element.
};
/**
* A simple XML parser; it is neither efficient nor conformant and only useful for
* restricted types of XML documents.
*
* The parse methods parse whole documents and return the parse trees via their
* root elements.
*
* All constructors are private; the public way to obtain a parser is the
* static createParser() function.
*/
class U_TOOLUTIL_API UXMLParser : public UObject {
public:
/**
* Create an XML parser.
* @param errorCode ICU error code (in/out).
* @return A pointer to the new parser.
* NOTE(review): presumably nullptr on failure and owned by the
* caller -- confirm against the implementation.
*/
static UXMLParser *createParser(UErrorCode &errorCode);
/**
* Destructor.
*/
virtual ~UXMLParser();
/**
* Parse an XML document, create the entire document tree, and
* return a pointer to the root element of the parsed tree.
* The caller must delete the element.
* @param src The XML document text.
* @param errorCode ICU error code (in/out).
* @return A pointer to the root element of the parsed tree.
*/
UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
/**
* Parse an XML file, create the entire document tree, and
* return a pointer to the root element of the parsed tree.
* The caller must delete the element.
* @param filename Name of the XML file to parse.
* @param errorCode ICU error code (in/out).
* @return A pointer to the root element of the parsed tree.
*/
UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*/
virtual UClassID getDynamicClassID() const override;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*/
static UClassID U_EXPORT2 getStaticClassID();
private:
// prevent default construction etc.
UXMLParser();
UXMLParser(const UXMLParser &other);
UXMLParser &operator=(const UXMLParser &other);
// constructor
UXMLParser(UErrorCode &status);
// Internal helpers. Their definitions are not part of this header.
// NOTE(review): the per-function contracts are not visible here; judging
// by the names and by the fNames comment below, intern() presumably
// stores a name in fNames and returns the stored string -- confirm.
void parseMisc(UErrorCode &status);
UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status);
void error(const char *message, UErrorCode &status);
UnicodeString scanContent(UErrorCode &status);
void replaceCharRefs(UnicodeString &s, UErrorCode &status);
const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
public:
// public for UXMLElement only
const UnicodeString *findName(const UnicodeString &s) const;
private:
// There is one ICU regex matcher for each of the major XML syntax items
// that are recognized.
RegexMatcher mXMLDecl;
RegexMatcher mXMLComment;
RegexMatcher mXMLSP;
RegexMatcher mXMLDoctype;
RegexMatcher mXMLPI;
RegexMatcher mXMLElemStart;
RegexMatcher mXMLElemEnd;
RegexMatcher mXMLElemEmpty;
RegexMatcher mXMLCharData;
RegexMatcher mAttrValue;
RegexMatcher mAttrNormalizer;
RegexMatcher mNewLineNormalizer;
RegexMatcher mAmps;
Hashtable fNames; // interned element/attribute name strings
UStack fElementStack; // Stack holds the parent elements when nested
// elements are being parsed. All items on this
// stack are of type UXMLElement.
int32_t fPos; // String index of the current scan position in
// the xml source.
// NOTE(review): this comment used to say "(in fSrc)",
// but no fSrc member is declared in this class.
UnicodeString fOneLF; // NOTE(review): presumably a string holding a single
// line feed, judging by the name -- confirm.
};
U_NAMESPACE_END
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
#endif /* __XMLPARSER_H__ */
|