File: tinyxmlparser.cpp

package info (click to toggle)
criticalmass 0.97-1
links: PTS
area: main
in suites: woody
size: 5,376 kB
ctags: 2,652
sloc: cpp: 15,548; xml: 3,182; sh: 334; makefile: 170
file content (535 lines) | stat: -rwxr-xr-x 11,123 bytes
parent folder | download | duplicates (8)
/*
Copyright (c) 2000 Lee Thomason (www.grinninglizard.com)

This software is provided 'as-is', without any express or implied 
warranty. In no event will the authors be held liable for any 
damages arising from the use of this software.

Permission is granted to anyone to use this software for any 
purpose, including commercial applications, and to alter it and 
redistribute it freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must 
not claim that you wrote the original software. If you use this 
software in a product, an acknowledgment in the product documentation 
would be appreciated but is not required.

2. Altered source versions must be plainly marked as such, and 
must not be misrepresented as being the original software.

3. This notice may not be removed or altered from any source 
distribution.
*/


#include "tinyxml.h"
#include <ctype.h>
#include <string.h>

// Advance past leading white space.
//
// Returns a pointer to the first character that is not white space
// (possibly the terminating null). A null 'p' is returned unchanged.
const char* TiXmlBase::SkipWhiteSpace( const char* p )
{
	// The <ctype.h> classifiers require a value representable as
	// unsigned char; handing them a negative plain char is undefined,
	// so every byte is converted before it is classified.
	while ( p && *p && 
	        (    isspace( static_cast<unsigned char>( *p ) ) 
	          || *p == '\n' 
	          || *p == '\r' ) )
	{
		p++;
	}
	return p;
}

// Read an XML name that begins at 'p' and store it in '*name'.
//
// '*name' is always cleared first. On success the name is appended to
// '*name' and a pointer to the first character after the name is
// returned. If 'p' is null or does not start with a letter or an
// underscore, 0 is returned and '*name' is left empty.
const char* TiXmlBase::ReadName( const char* p, std::string* name )
{
	*name = "";
	const char* start = p;

	// Names start with letters or underscores.
	// After that, they can be letters, underscores, numbers,
	// hyphens, or colons. (Colons are valid only for namespaces,
	// but tinyxml can't tell namespaces from names.)
	//
	// Characters are converted to unsigned char before being handed to
	// the <ctype.h> classifiers; a negative plain char is undefined there.
	if ( p && ( isalpha( static_cast<unsigned char>( *p ) ) || *p == '_' ) )
	{
		p++;
		while ( *p && 
			    (    isalnum( static_cast<unsigned char>( *p ) ) 
			      || *p == '_'
			      || *p == '-'
			      || *p == ':' ) )
		{
			p++;
		}
		name->append( start, p - start );
		return p;
	}
	return 0;
}


const char* TiXmlDocument::Parse( const char* start )
{
	// Parse away, at the document level. Since a document
	// contains nothing but other tags, most of what happens
	// here is skipping white space.
	
	const char* p = start;

 	p = SkipWhiteSpace( p );
	if ( !p || !*p )
	{
		error = true;
		errorDesc = "Document empty.";
	}
	
	while ( p && *p )
	{	
		if ( *p != '<' )
		{
			error = true;
			errorDesc = "The '<' symbol that starts a tag was not found.";
			break;
		}
		else
		{
			TiXmlNode* node = IdentifyAndParse( &p );
			if ( node )
			{
				LinkEndChild( node );
			}				
		}
		p = SkipWhiteSpace( p );
	}
	return 0;	// Return null is fine for a document: once it is read, the parsing is over.
}


// Work out what kind of node starts at '*where', allocate it and let it
// parse itself.
//
// On entry '*where' must point at the '<' that opens the tag. On return
// '*where' holds whatever the new node's Parse() returned (0 if that
// parse failed), or 0 if no node could be allocated.
//
// Returns the newly allocated node. Note that the node is returned even
// when its Parse() failed; callers can tell by '*where' being 0.
TiXmlNode* TiXmlNode::IdentifyAndParse( const char** where )
{
	const char* p = *where;
	TiXmlNode* returnNode = 0;
	assert( *p == '<' );
	TiXmlDocument* doc = GetDocument();

	p = SkipWhiteSpace( p+1 );

	// What is this thing? 
	// - Declarations: <?xml (matched case-insensitively)
	// - Elements start with a letter or underscore, but xml is reserved.
	// - Comments: <!--
	// - Everything else is unknown to tinyxml.
	//
	// The && chains short-circuit on the first mismatch, so a lookahead
	// never reads past the terminating null. Characters are converted to
	// unsigned char before reaching <ctype.h>, which is undefined for
	// negative plain char values.
	if (    *(p+0) == '?'
		 && tolower( static_cast<unsigned char>( *(p+1) ) ) == 'x' 
		 && tolower( static_cast<unsigned char>( *(p+2) ) ) == 'm'
		 && tolower( static_cast<unsigned char>( *(p+3) ) ) == 'l' )
	{
		#ifdef DEBUG_PARSER
			printf( "XML parsing Declaration\n" );
		#endif
		returnNode = new TiXmlDeclaration();
	}
	else if ( isalpha( static_cast<unsigned char>( *p ) ) || *p == '_' )
	{
		#ifdef DEBUG_PARSER
			printf( "XML parsing Element\n" );
		#endif
		returnNode = new TiXmlElement( "" );
	}
	else if (    *(p+0) == '!'
			  && *(p+1) == '-'
			  && *(p+2) == '-' )
	{
		#ifdef DEBUG_PARSER
			printf( "XML parsing Comment\n" );
		#endif
		returnNode = new TiXmlComment();
	}
	else
	{
		#ifdef DEBUG_PARSER
			printf( "XML parsing Unknown\n" );
		#endif
		returnNode = new TiXmlUnknown();
	}

	if ( returnNode )
	{
		// Set the parent, so it can report errors
		returnNode->parent = this;
		p = returnNode->Parse( p );
	}
	else
	{
		// Only reachable when 'new' reports failure by returning null.
		if ( doc )
			doc->SetError( TIXML_ERROR_OUT_OF_MEMORY );
		p = 0;
	}
	*where = p;
	return returnNode;
}


// Parse an element: its name, its attributes and, unless it is an
// empty tag, its content and closing tag.
//
// 'p' points at the element name, i.e. just past the opening '<'
// (leading white space is tolerated). Returns a pointer just past the
// "/>" of an empty tag or just past the closing "</name>" tag. On any
// failure 0 is returned; where a document is available the error is
// recorded on it.
const char* TiXmlElement::Parse( const char* p )
{
	TiXmlDocument* document = GetDocument();
	p = SkipWhiteSpace( p );
	if ( !p || !*p )
	{
		if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT );
		return 0;
	}

	// Read the name.
	p = ReadName( p, &value );
	if ( !p )
	{
		if ( document )	document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME );
		return 0;
	}

	std::string endTag = "</";
	endTag += value;
	endTag += ">";

	// Check for and read attributes. Also look for an empty
	// tag or an end tag.
	while ( p && *p )
	{
		p = SkipWhiteSpace( p );
		if ( !p || !*p )
		{
			if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
			return 0;
		}
		if ( *p == '/' )
		{
			// Empty tag.
			if ( *(p+1) != '>' )
			{
				if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY );		
				return 0;
			}
			return p+2;
		}
		else if ( *p == '>' )
		{
			// Done with attributes (if there were any.)
			// Read the value -- which can include other
			// elements -- read the end tag, and return.
			p = ReadValue( p+1 );		// Note this is an Element method, and will set the error if one happens.
			if ( !p )
				return 0;

			// We should find the end tag now. strncmp() stops at the
			// terminating null, so truncated input is never read past
			// its end (copying endTag.size() bytes into a temporary
			// string would be).
			if ( strncmp( p, endTag.c_str(), endTag.size() ) == 0 )
			{
				return p+endTag.size();
			}
			else
			{
				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG );
				return 0;
			}
		}
		else
		{
			// Try to read an attribute:
			TiXmlAttribute attrib;
			attrib.SetDocument( document );
			p = attrib.Parse( p );

			// On failure p is 0, which ends the loop and falls
			// through to the final 'return 0'.
			if ( p )
			{
				SetAttribute( attrib.Name(), attrib.Value() );
			}
		}
	}
	return 0;
}


// Read the content of an element: text and child nodes, in any order.
//
// 'p' points just past the '>' of the opening tag. Non-blank text runs
// become TiXmlText children; anything else starting with '<' is handed
// to IdentifyAndParse() and linked as a child.
//
// Returns a pointer to the '<' of the first "</" encountered (the end
// tag is not consumed). Returns 0 on failure; if the input ends before
// an end tag is seen, TIXML_ERROR_READING_ELEMENT_VALUE is recorded on
// the document.
const char* TiXmlElement::ReadValue( const char* p )
{
	TiXmlDocument* document = GetDocument();

	// Read in text and elements in any order.
	p = SkipWhiteSpace( p );
	while ( p && *p )
	{
		const char* start = p;
		while ( *p && *p != '<' )
			p++;

		if ( !*p )
		{
			if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE );
			return 0;
		}
		if ( p != start )
		{
			// Take what we have, make a text element.
			TiXmlText* text = new TiXmlText();

			if ( !text )
			{
				if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY );
				return 0;
			}
			// The text node scans from 'start' itself; 'p' already
			// sits on the '<', so the return value is not needed.
			text->Parse( start );
			if ( !text->Blank() )
				LinkEndChild( text );
			else
				delete text;
		} 
		else 
		{
			// We hit a '<'
			// Have we hit a new element or an end tag?
			if ( *(p+1) == '/' )
			{
				return p;	// end tag
			}
			else
			{
				TiXmlNode* node = IdentifyAndParse( &p );
				if ( node )
				{
					LinkEndChild( node );
				}				
				else
				{
					return 0;
				}
			}
		}
	}

	// Falling out of the loop with a non-null 'p' means the input ended
	// before an end tag was found. Report that instead of failing
	// silently. A null 'p' comes from a failed child parse (or a null
	// argument) and is returned without recording anything further here.
	if ( p && document )
		document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE );
	return 0;
}


const char* TiXmlUnknown::Parse( const char* p )
{
	const char* end = strchr( p, '>' );
	if ( !end )
	{
		TiXmlDocument* document = GetDocument();
		if ( document )
			document->SetError( TIXML_ERROR_PARSING_UNKNOWN );
		return 0;
	}
	else
	{
		value = std::string( p, end-p );
// 		value.resize( end - p );
		return end + 1;			// return just past the '>'
	}
}


// Parse a comment.
//
// 'p' must point at the "!--" that follows the opening '<'. The text
// between "!--" and "-->" is appended to the value of this node with
// every run of white space collapsed to a single space. Returns a
// pointer just past the closing "-->", or 0 (recording
// TIXML_ERROR_PARSING_COMMENT on the document, if there is one) when
// the comment is not terminated.
const char* TiXmlComment::Parse( const char* p )
{
	assert( *p == '!' && *(p+1) == '-' && *(p+2) == '-' );

	// Find the end, copy the parts between to the value of
	// this object, and return.
	//
	// The search starts after the opener so that its own dashes cannot
	// be taken for part of the terminator (as in "<!-->").
	const char* start = p+3;
	const char* end = strstr( start, "-->" );
	if ( !end )
	{
		TiXmlDocument* document = GetDocument();
		if ( document )
			document->SetError( TIXML_ERROR_PARSING_COMMENT );
		return 0;
	}
	else
	{
		// Assemble the comment, removing the white space.
		bool whiteSpace = false;

		const char* q;
		for( q=start; q<end; q++ )
		{
			// isspace() is undefined for negative plain char values.
			if ( isspace( static_cast<unsigned char>( *q ) ) )
			{
				if ( !whiteSpace )
				{
					value += ' ';
					whiteSpace = true;
				}
			}
			else
			{
				value += *q;
				whiteSpace = false;
			}
		}				
		return end + 3;			// return just past the '>'
	}
}


// Parse one attribute: the name, the '=' and the value.
//
// 'p' points at the first character of the attribute name. The value
// may be enclosed in single or double quotes; an unquoted value is
// tolerated and runs up to the next white space, '/' or '>'.
//
// Returns a pointer just past the value (past the closing quote for a
// quoted value). On any failure -- including a quoted value that is
// never closed -- TIXML_ERROR_READING_ATTRIBUTES is recorded on the
// document (if one is set) and 0 is returned.
const char* TiXmlAttribute::Parse( const char* p )
{
	// Read the name, the '=' and the value.
	p = ReadName( p, &name );
	if ( !p )
	{
		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
		return 0;
	}
	p = SkipWhiteSpace( p );
	if ( !p || *p != '=' )
	{
		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
		return 0;
	}

	p = SkipWhiteSpace( p+1 );
	if ( !p || !*p )
	{
		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
		return 0;
	}
	
	const char* start = 0;
	const char* end = 0;
	const char* past = 0;

	if ( *p == '\'' || *p == '"' )
	{
		// Quoted value: it ends at the next occurrence of the same quote.
		const char quote = *p;
		start = p+1;
		end = strchr( start, quote );
		if ( !end )
		{
			// No closing quote. Doing pointer arithmetic on the null
			// result would be undefined, so fail cleanly instead.
			if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
			return 0;
		}
		past = end+1;
	}
	else
	{
		// All attribute values should be in single or double quotes.
		// But this is such a common error that the parser will try
		// its best, even without them.
		start = p;
		for ( end = start; *end; end++ )
		{
			// isspace() is undefined for negative plain char values.
			if (    isspace( static_cast<unsigned char>( *end ) ) 
			     || *end == '/' 
			     || *end == '>' )
				break;
		}
		past = end;
	}
	value = std::string( start, end-start );
	return past;
}


// Parse a run of text.
//
// Reads from 'p' up to the next '<' (or the end of the input) into the
// value of this node. Leading white space is dropped and every other
// run of white space, including one directly before the '<', is
// collapsed to a single space. Returns a pointer to the character that
// stopped the scan; a null 'p' yields an empty value and a null return.
const char* TiXmlText::Parse( const char* p )
{
	value = "";
	bool whitespace = false;

	// Remove leading white space:
	p = SkipWhiteSpace( p );
	while ( p && *p && *p != '<' )
	{
		// isspace() is undefined for negative plain char values, which
		// non-ASCII text produces where plain char is signed.
		if (    *p == '\r' 
		     || *p == '\n' 
		     || isspace( static_cast<unsigned char>( *p ) ) )
		{
			whitespace = true;
		}
		else
		{
			// If we've found whitespace, add it before the
			// new character. Any whitespace just becomes a space.
			if ( whitespace )
			{
				value += ' ';
				whitespace = false;
			}
			value += *p;
		}
		p++;
	}
	// Keep white space before the '<' 
	if ( whitespace )
		value += ' ';

	return p;
}


const char* TiXmlDeclaration::Parse( const char* p )
{
	// Find the beginning, find the end, and look for
	// the stuff in-between.
	const char* start = p+4;
	const char* end  = strstr( start, "?>" );

	// Be nice to the user:
	if ( !end )
	{
		end = strstr( start, ">" );
		end++;
	}
	else
	{
		end += 2;
	}

	if ( !end )
	{
		TiXmlDocument* document = GetDocument();
		if ( document )
			document->SetError( TIXML_ERROR_PARSING_DECLARATION );
		return 0;
	}
	else
	{
		const char* p;
		
		p = strstr( start, "version" );
		if ( p && p < end )
		{
			TiXmlAttribute attrib;
			attrib.Parse( p );		
			version = attrib.Value();
		}

		p = strstr( start, "encoding" );
		if ( p && p < end )
		{
			TiXmlAttribute attrib;
			attrib.Parse( p );		
			encoding = attrib.Value();
		}

		p = strstr( start, "standalone" );
		if ( p && p < end )
		{
			TiXmlAttribute attrib;
			attrib.Parse( p );		
			standalone = attrib.Value();
		}
	}
	return end;
}

// Returns true if the value of this text node is empty or consists of
// white space only.
bool TiXmlText::Blank()
{
	for ( std::string::size_type i=0; i<value.size(); i++ )
	{
		// isspace() is undefined for negative plain char values.
		if ( !isspace( static_cast<unsigned char>( value[i] ) ) )
			return false;
	}
	return true;
}