from xml.sax.handler import ContentHandler
from xml.sax.xmlreader import Locator
import sys
import xml.sax
import xml.sax.handler

class AimlParserError(Exception):
	"""Signals that an AIML document broke a rule enforced by AimlHandler."""

class AimlHandler(ContentHandler):
	"""SAX content handler that validates AIML and collects its categories.

	Each successfully parsed category is stored in ``self.categories``
	under a (pattern, that, topic) key of stripped strings; the value is
	the nested-list element tree built for the category's <template>.
	Parse errors are written to stderr and counted (see getNumErrors).

	"""
	# The legal states of the AIML parser.  The numeric order matters: the
	# error handlers treat any state >= _STATE_InsideCategory as "inside a
	# category" when deciding whether to skip the rest of that category.
	_STATE_OutsideAiml    = 0  # not inside an <aiml> element; everything is ignored
	_STATE_InsideAiml     = 1  # inside <aiml>, between categories
	_STATE_InsideCategory = 2  # after <category>, waiting for <pattern>
	_STATE_InsidePattern  = 3  # between <pattern> and </pattern>
	_STATE_AfterPattern   = 4  # after </pattern>; <that> or <template> may follow
	_STATE_InsideThat     = 5  # inside the category-level <that>
	_STATE_AfterThat      = 6  # after </that>; <template> must follow
	_STATE_InsideTemplate = 7  # between <template> and </template>
	_STATE_AfterTemplate  = 8  # after </template>; </category> must follow
	
	def __init__(self, encoding = "UTF-8"):
		self.categories = {}
		self._encoding = encoding
		self._state = self._STATE_OutsideAiml
		self._version = ""
		self._namespace = ""
		self._forwardCompatibleMode = False
		self._currentPattern = ""
		self._currentThat    = ""
		self._currentTopic   = ""
		self._insideTopic = False
		self._currentUnknown = "" # the name of the current unknown element

		# This is set to true when a parse error occurs in a category.
		self._skipCurrentCategory = False

		# Counts the number of parse errors in a particular AIML document.
		# query with getNumErrors().  If 0, the document is AIML-compliant.
		self._numParseErrors = 0

		# TODO: select the proper validInfo table based on the version number.
		self._validInfo = self._validationInfo101

		# This stack of bools is used when parsing <li> elements inside
		# <condition> elements, to keep track of whether or not an
		# attribute-less "default" <li> element has been found yet.  Only
		# one default <li> is allowed in each <condition> element.  We need
		# a stack in order to correctly handle nested <condition> tags.
		self._foundDefaultLiStack = []

		# This stack of strings indicates what the current whitespace-handling
		# behavior should be.  Each string in the stack is either "default" or
		# "preserve".  When a new AIML element is encountered, a new string is
		# pushed onto the stack, based on the value of the element's "xml:space"
		# attribute (if absent, the top of the stack is pushed again).  When
		# ending an element, pop an object off the stack.
		self._whitespaceBehaviorStack = ["default"]
		
		self._elemStack = []
		self._locator = Locator()
		self.setDocumentLocator(self._locator)

	def getNumErrors(self):
		"Return the number of errors found while parsing the current document."
		return self._numParseErrors

	def setEncoding(self, encoding):
		"""Set the text encoding to use when encoding strings read from XML.

		Defaults to 'UTF-8'.

		"""
		self._encoding = encoding

	def _location(self):
		"Return a string describing the current location in the source file."
		line = self._locator.getLineNumber()
		column = self._locator.getColumnNumber()
		return "(line %d, column %d)" % (line, column)

	def _pushWhitespaceBehavior(self, attr):
		"""Push a new string onto the whitespaceBehaviorStack.

		The string's value is taken from the "xml:space" attribute, if it exists
		and has a legal value ("default" or "preserve").  Otherwise, the previous
		stack element is duplicated.

		"""
		assert len(self._whitespaceBehaviorStack) > 0, "Whitespace behavior stack should never be empty!"
		try:
			if attr["xml:space"] == "default" or attr["xml:space"] == "preserve":
				self._whitespaceBehaviorStack.append(attr["xml:space"])
			else:
				raise AimlParserError, "Invalid value for xml:space attribute "+self._location()
		except KeyError:
			self._whitespaceBehaviorStack.append(self._whitespaceBehaviorStack[-1])

	def startElementNS(self, name, qname, attr):
		print "QNAME:", qname
		print "NAME:", name
		uri,elem = name
		if (elem == "bot"): print "name:", attr.getValueByQName("name"), "a'ite?"
		self.startElement(elem, attr)
		pass

	def startElement(self, name, attr):
		# Wrapper around _startElement, which catches errors in _startElement()
		# and keeps going.
		
		# If we're inside an unknown element, ignore everything until we're
		# out again.
		if self._currentUnknown != "":
			return
		# If we're skipping the current category, ignore everything until
		# it's finished.
		if self._skipCurrentCategory:
			return

		# process this start-element.
		try: self._startElement(name, attr)
		except AimlParserError, msg:
			# Print the error message
			sys.stderr.write("PARSE ERROR: %s\n" % msg)
			
			self._numParseErrors += 1 # increment error count
			# In case of a parse error, if we're inside a category, skip it.
			if self._state >= self._STATE_InsideCategory:
				self._skipCurrentCategory = True
			
	def _startElement(self, name, attr):
		if name == "aiml":
			# <aiml> tags are only legal in the OutsideAiml state
			if self._state != self._STATE_OutsideAiml:
				raise AimlParserError, "Unexpected <aiml> tag "+self._location()
			self._state = self._STATE_InsideAiml
			self._insideTopic = False
			self._currentTopic = u""
			try: self._version = attr["version"]
			except KeyError:
				# This SHOULD be a syntax error, but so many AIML sets out there are missing
				# "version" attributes that it just seems nicer to let it slide.
				#raise AimlParserError, "Missing 'version' attribute in <aiml> tag "+self._location()
				#print "WARNING: Missing 'version' attribute in <aiml> tag "+self._location()
				#print "         Defaulting to version 1.0"
				self._version = "1.0"
			self._forwardCompatibleMode = (self._version != "1.0.1")
			self._pushWhitespaceBehavior(attr)			
			# Not sure about this namespace business yet...
			#try:
			#	self._namespace = attr["xmlns"]
			#	if self._version == "1.0.1" and self._namespace != "http://alicebot.org/2001/AIML-1.0.1":
			#		raise AimlParserError, "Incorrect namespace for AIML v1.0.1 "+self._location()
			#except KeyError:
			#	if self._version != "1.0":
			#		raise AimlParserError, "Missing 'version' attribute(s) in <aiml> tag "+self._location()
		elif self._state == self._STATE_OutsideAiml:
			# If we're outside of an AIML element, we ignore all tags.
			return
		elif name == "topic":
			# <topic> tags are only legal in the InsideAiml state, and only
			# if we're not already inside a topic.
			if (self._state != self._STATE_InsideAiml) or self._insideTopic:
				raise AimlParserError, "Unexpected <topic> tag", self._location()
			try: self._currentTopic = unicode(attr['name'])
			except KeyError:
				raise AimlParserError, "Required \"name\" attribute missing in <topic> element "+self._location()
			self._insideTopic = True
		elif name == "category":
			# <category> tags are only legal in the InsideAiml state
			if self._state != self._STATE_InsideAiml:
				raise AimlParserError, "Unexpected <category> tag "+self._location()
			self._state = self._STATE_InsideCategory
			self._currentPattern = u""
			self._currentThat = u""
			# If we're not inside a topic, the topic is implicitly set to *
			if not self._insideTopic: self._currentTopic = u"*"
			self._elemStack = []
			self._pushWhitespaceBehavior(attr)
		elif name == "pattern":
			# <pattern> tags are only legal in the InsideCategory state
			if self._state != self._STATE_InsideCategory:
				raise AimlParserError, "Unexpected <pattern> tag "+self._location()
			self._state = self._STATE_InsidePattern
		elif name == "that" and self._state == self._STATE_AfterPattern:
			# <that> are legal either inside a <template> element, or
			# inside a <category> element, between the <pattern> and the
			# <template> elements.  This clause handles the latter case.
			self._state = self._STATE_InsideThat
		elif name == "template":
			# <template> tags are only legal in the AfterPattern and AfterThat
			# states
			if self._state not in [self._STATE_AfterPattern, self._STATE_AfterThat]:
				raise AimlParserError, "Unexpected <template> tag "+self._location()
			# if no <that> element was specified, it is implicitly set to *
			if self._state == self._STATE_AfterPattern:
				self._currentThat = u"*"
			self._state = self._STATE_InsideTemplate
			self._elemStack.append(['template',{}])
			self._pushWhitespaceBehavior(attr)
		elif self._state == self._STATE_InsidePattern:
			# Certain tags are allowed inside <pattern> elements.
			if name == "bot" and attr.has_key("name") and attr["name"] == u"name":
				# Insert a special character string that the PatternMgr will
				# replace with the bot's name.
				self._currentPattern += u" BOT_NAME "
			else:
				raise AimlParserError, ("Unexpected <%s> tag " % name)+self._location()
		elif self._state == self._STATE_InsideThat:
			# Certain tags are allowed inside <that> elements.
			if name == "bot" and attr.has_key("name") and attr["name"] == u"name":
				# Insert a special character string that the PatternMgr will
				# replace with the bot's name.
				self._currentThat += u" BOT_NAME "
			else:
				raise AimlParserError, ("Unexpected <%s> tag " % name)+self._location()
		elif self._state == self._STATE_InsideTemplate and self._validInfo.has_key(name):
			# Starting a new element inside the current pattern. First
			# we need to convert 'attr' into a native Python dictionary,
			# so it can later be marshaled.
			attrDict = {}
			for k,v in attr.items():
				#attrDict[k[1].encode(self._encoding)] = v.encode(self._encoding)
				attrDict[k.encode(self._encoding)] = unicode(v)
			self._validateElemStart(name, attrDict, self._version)
			# Push the current element onto the element stack.
			self._elemStack.append([name.encode(self._encoding),attrDict])
			self._pushWhitespaceBehavior(attr)
			# If this is a condition element, push a new entry onto the
			# foundDefaultLiStack
			if name == "condition":
				self._foundDefaultLiStack.append(False)
		else:
			# we're now inside an unknown element.
			if self._forwardCompatibleMode:
				# In Forward Compatibility Mode, we ignore the element and its
				# contents.
				self._currentUnknown = name
			else:
				# Otherwise, unknown elements are grounds for error!
				raise AimlParserError, ("Unexpected <%s> tag " % name)+self._location()

	def characters(self, ch):
		# Wrapper around _characters which catches errors in _characters()
		# and keeps going.
		if self._state == self._STATE_OutsideAiml:
			# If we're outside of an AIML element, we ignore all text
			return
		if self._currentUnknown != "":
			# If we're inside an unknown element, ignore all text
			return
		if self._skipCurrentCategory:
			# If we're skipping the current category, ignore all text.
			return
		try: self._characters(ch)
		except AimlParserError, msg:
			# Print the message
			sys.stderr.write("PARSE ERROR: %s\n" % msg)
			self._numParseErrors += 1 # increment error count
			# In case of a parse error, if we're inside a category, skip it.
			if self._state >= self._STATE_InsideCategory:
				self._skipCurrentCategory = True
			
	def _characters(self, ch):
		text = unicode(ch)
		if self._state == self._STATE_InsidePattern:
			# TODO: text inside patterns must be upper-case!
			self._currentPattern += text
		elif self._state == self._STATE_InsideThat:
			self._currentThat += text
		elif self._state == self._STATE_InsideTemplate:
			# First, see whether the element at the top of the element stack
			# is permitted to contain text.
			try:
				parent = self._elemStack[-1][0]
				parentAttr = self._elemStack[-1][1]
				required, optional, canBeParent = self._validInfo[parent]
				nonBlockStyleCondition = (parent == "condition" and not (parentAttr.has_key("name") and parentAttr.has_key("value")))
				if not canBeParent:
					raise AimlParserError, ("Unexpected text inside <%s> element "%parent)+self._location()
				elif parent == "random" or nonBlockStyleCondition:
					# <random> elements can only contain <li> subelements. However,
					# there's invariably some whitespace around the <li> that we need
					# to ignore. Same for non-block-style <condition> elements (i.e.
					# those which don't have both a "name" and a "value" attribute).
					if len(text.strip()) == 0:
						# ignore whitespace inside these elements.
						return
					else:
						# non-whitespace text inside these elements is a syntax error.
						raise AimlParserError, ("Unexpected text inside <%s> element "%parent)+self._location()
			except IndexError:
				# the element stack is empty. This should never happen.
				raise AimlParserError, "Element stack is empty while validating text "+self._location()
			
			# Add a new text element to the element at the top of the element
			# stack. If there's already a text element there, simply append the
			# new characters to its contents.
			try: textElemOnStack = (self._elemStack[-1][-1][0] == "text")
			except IndexError: textElemOnStack = False
			except KeyError: textElemOnStack = False
			if textElemOnStack:
				self._elemStack[-1][-1][2] += text
			else:
				self._elemStack[-1].append(["text", {"xml:space": self._whitespaceBehaviorStack[-1]}, text])
		else:
			# all other text is ignored
			pass

	def endElementNS(self, name, qname):
		uri, elem = name
		self.endElement(elem)
		
	def endElement(self, name):
		"""Wrapper around _endElement which catches errors in _characters()
		and keeps going.

		"""		
		if self._state == self._STATE_OutsideAiml:
			# If we're outside of an AIML element, ignore all tags
			return
		if self._currentUnknown != "":
			# see if we're at the end of an unknown element.  If so, we can
			# stop ignoring everything.
			if name == self._currentUnknown:
				self._currentUnknown = ""
			return
		if self._skipCurrentCategory:
			# If we're skipping the current category, see if it's ending. We
			# stop on ANY </category> tag, since we're not keeping track of
			# state in ignore-mode.
			if name == "category":
				self._skipCurrentCategory = False
				self._state = self._STATE_InsideAiml
			return
		try: self._endElement(name)
		except AimlParserError, msg:
			# Print the message
			sys.stderr.write("PARSE ERROR: %s\n" % msg)
			self._numParseErrors += 1 # increment error count
			# In case of a parse error, if we're inside a category, skip it.
			if self._state >= self._STATE_InsideCategory:
				self._skipCurrentCategory = True

	def _endElement(self, name):
		"""Verify that an AIML end element is valid in the current
		context.

		Raises an AimlParserError if an illegal end element is encountered.

		"""
		if name == "aiml":
			# </aiml> tags are only legal in the InsideAiml state
			if self._state != self._STATE_InsideAiml:
				raise AimlParserError, "Unexpected </aiml> tag "+self._location()
			self._state = self._STATE_OutsideAiml
			self._whitespaceBehaviorStack.pop()
		elif name == "topic":
			# </topic> tags are only legal in the InsideAiml state, and
			# only if _insideTopic is true.
			if self._state != self._STATE_InsideAiml or not self._insideTopic:
				raise AimlParserError, "Unexpected </topic> tag "+self._location()
			self._insideTopic = False
			self._currentTopic = u""
		elif name == "category":
			# </category> tags are only legal in the AfterTemplate state
			if self._state != self._STATE_AfterTemplate:
				raise AimlParserError, "Unexpected </category> tag "+self._location()
			self._state = self._STATE_InsideAiml
			# End the current category.  Store the current pattern/that/topic and
			# element in the categories dictionary.
			key = (self._currentPattern.strip(), self._currentThat.strip(),self._currentTopic.strip())
			self.categories[key] = self._elemStack[-1]
			self._whitespaceBehaviorStack.pop()
		elif name == "pattern":
			# </pattern> tags are only legal in the InsidePattern state
			if self._state != self._STATE_InsidePattern:
				raise AimlParserError, "Unexpected </pattern> tag "+self._location()
			self._state = self._STATE_AfterPattern
		elif name == "that" and self._state == self._STATE_InsideThat:
			# </that> tags are only allowed inside <template> elements or in
			# the InsideThat state.  This clause handles the latter case.
			self._state = self._STATE_AfterThat
		elif name == "template":
			# </template> tags are only allowed in the InsideTemplate state.
			if self._state != self._STATE_InsideTemplate:
				raise AimlParserError, "Unexpected </template> tag "+self._location()
			self._state = self._STATE_AfterTemplate
			self._whitespaceBehaviorStack.pop()
		elif self._state == self._STATE_InsidePattern:
			# Certain tags are allowed inside <pattern> elements.
			if name not in ["bot"]:
				raise AimlParserError, ("Unexpected </%s> tag " % name)+self._location()
		elif self._state == self._STATE_InsideThat:
			# Certain tags are allowed inside <that> elements.
			if name not in ["bot"]:
				raise AimlParserError, ("Unexpected </%s> tag " % name)+self._location()
		elif self._state == self._STATE_InsideTemplate:
			# End of an element inside the current template.  Append the
			# element at the top of the stack onto the one beneath it.
			elem = self._elemStack.pop()
			self._elemStack[-1].append(elem)
			self._whitespaceBehaviorStack.pop()
			# If the element was a condition, pop an item off the
			# foundDefaultLiStack as well.
			if elem[0] == "condition": self._foundDefaultLiStack.pop()
		else:
			# Unexpected closing tag
			raise AimlParserError, ("Unexpected </%s> tag " % name)+self._location()

	# Validation information for each AIML element that may appear inside
	# a <template>.  The keys are the names of the elements.  The values
	# are tuples of three items: the first is a list containing the names
	# of REQUIRED attributes, the second is a list of OPTIONAL attributes,
	# and the third is a boolean value indicating whether or not the
	# element can contain other elements and/or text (if False, the
	# element can only appear in an atomic context, such as <date/>).
	# Attributes in the "xml:" namespace are accepted on every element and
	# are therefore not listed here (see _validateElemStart).
	_validationInfo101 = {
		"bot":      	( ["name"], [], False ),
		"condition":    ( [], ["name", "value"], True ), # can only contain <li> elements
		"date":         ( [], [], False ),
		"formal":       ( [], [], True ),
		"gender":       ( [], [], True ),
		"get":          ( ["name"], [], False ),
		"gossip":		( [], [], True ),
		"id":           ( [], [], False ),
		"input":        ( [], ["index"], False ),
		"javascript":	( [], [], True ),
		"learn":        ( [], [], True ),
		"li":           ( [], ["name", "value"], True ),
		"lowercase":    ( [], [], True ),
		"person":       ( [], [], True ),
		"person2":      ( [], [], True ),
		"random":       ( [], [], True ), # can only contain <li> elements
		"sentence":     ( [], [], True ),
		"set":          ( ["name"], [], True),
		"size":         ( [], [], False ),
		"sr":           ( [], [], False ),
		"srai":         ( [], [], True ),
		"star":         ( [], ["index"], False ),
		"system":       ( [], [], True ),
		"template":		( [], [], True ), # needs to be in the list because it can be a parent.
		"that":         ( [], ["index"], False ),
		"thatstar":     ( [], ["index"], False ),
		"think":        ( [], [], True ),
		"topicstar":    ( [], ["index"], False ),
		"uppercase":    ( [], [], True ),
		"version":      ( [], [], False ),
	}

	def _validateElemStart(self, name, attr, version):
		"""Test the validity of an element starting inside a <template>
		element.

		This function raises an AimlParserError exception if it the tag is
		invalid.  Otherwise, no news is good news.

		"""		
		# Check the element's attributes.  Make sure that all required
		# attributes are present, and that any remaining attributes are
		# valid options.		
		required, optional, canBeParent = self._validInfo[name]
		for a in required:
			if a not in attr and not self._forwardCompatibleMode:
				raise AimlParserError, ("Required \"%s\" attribute missing in <%s> element " % (a,name))+self._location()
		for a in attr:
			if a in required: continue
			if a[0:4] == "xml:": continue # attributes in the "xml" namespace can appear anywhere
			if a not in optional and not self._forwardCompatibleMode:
				raise AimlParserError, ("Unexpected \"%s\" attribute in <%s> element " % (a,name))+self._location()

		# special-case: several tags contain an optional "index" attribute.
		# This attribute's value must be a positive integer.
		if name in ["star", "thatstar", "topicstar"]:
			for k,v in attr.items():
				if k == "index":
					temp = 0
					try: temp = int(v)
					except:
						raise AimlParserError, ("Bad type for \"%s\" attribute (expected integer, found \"%s\") " % (k,v))+self._location()
					if temp < 1:
						raise AimlParserError, ("\"%s\" attribute must have non-negative value " % (k))+self._location()

		# See whether the containing element is permitted to contain
		# subelements. If not, this element is invalid no matter what it is.
		try:
			parent = self._elemStack[-1][0]
			parentAttr = self._elemStack[-1][1]
		except IndexError:
			# If the stack is empty, no parent is present.  This should never
			# happen.
			raise AimlParserError, ("Element stack is empty while validating <%s> " % name)+self._location()
		required, optional, canBeParent = self._validInfo[parent]
		nonBlockStyleCondition = (parent == "condition" and not (parentAttr.has_key("name") and parentAttr.has_key("value")))
		if not canBeParent:
			raise AimlParserError, ("<%s> elements cannot have any contents "%parent)+self._location()
		# Special-case test if the parent element is <condition> (the
		# non-block-style variant) or <random>: these elements can only
		# contain <li> subelements.
		elif (parent == "random" or nonBlockStyleCondition) and name!="li":
			raise AimlParserError, ("<%s> elements can only contain <li> subelements "%parent)+self._location()
		# Special-case test for <li> elements, which can only be contained
		# by non-block-style <condition> and <random> elements, and whose
		# required attributes are dependent upon which attributes are
		# present in the <condition> parent.
		elif name=="li":
			if not (parent=="random" or nonBlockStyleCondition):
				raise AimlParserError, ("Unexpected <li> element contained by <%s> element "%parent)+self._location()
			if nonBlockStyleCondition:
				if parentAttr.has_key("name"):
					# Single-predicate condition.  Each <li> element except the
					# last must have a "value" attribute.
					if len(attr) == 0:
						# This could be the default <li> element for this <condition>,
						# unless we've already found one.
						if self._foundDefaultLiStack[-1]:
							raise AimlParserError, "Unexpected default <li> element inside <condition> "+self._location()
						else:
							self._foundDefaultLiStack[-1] = True
					elif len(attr) == 1 and attr.has_key("value"):
						pass # this is the valid case
					else:
						raise AimlParserError, "Invalid <li> inside single-predicate <condition> "+self._location()
				elif len(parentAttr) == 0:
					# Multi-predicate condition.  Each <li> element except the
					# last must have a "name" and a "value" attribute.
					if len(attr) == 0:
						# This could be the default <li> element for this <condition>,
						# unless we've already found one.
						if self._foundDefaultLiStack[-1]:
							raise AimlParserError, "Unexpected default <li> element inside <condition> "+self._location()
						else:
							self._foundDefaultLiStack[-1] = True
					elif len(attr) == 2 and attr.has_key("value") and attr.has_key("name"):
						pass # this is the valid case
					else:
						raise AimlParserError, "Invalid <li> inside multi-predicate <condition> "+self._location()
		# All is well!
		return True

def create_parser():
	"""Build a SAX parser whose content handler is a new AimlHandler.

	The handler is created with the "UTF-8" encoding.  Namespace
	processing is left at the parser's default setting.

	"""
	saxParser = xml.sax.make_parser()
	saxParser.setContentHandler(AimlHandler("UTF-8"))
	#saxParser.setFeature(xml.sax.handler.feature_namespaces, True)
	return saxParser
