""" Constants for writing tag tables

    The documentation in this file is obsoleted by the HTML docs
    in the Doc/ subdirectory of the package.

    (c) Copyright Marc-Andre Lemburg; All Rights Reserved.
    See the documentation for further information on copyrights,
    or contact the author (mal@lemburg.com).
"""
#########################################################################
# This file contains the definitions and constants used by the tagging
# engine:
#
# 1. Matching Tables
# 2. Commands & Constants
# 3. Matching Functions
# 4. Callable tagobjects
# 5. Calling the engine & Taglists
#

#########################################################################
# 1. Matching Tables:
#
# these are tuples of tuples, each entry having the following meaning:
#
# syntax: (tag, cmd, chars|table|fct [,jne] [,je=1])
#          tag = object used to mark this section, if it matches
#          cmd = command (see below)
#          chars = match one or more of these characters
#          table = table to use for matching characters
#          fct = function to call (see below)
#          jne = if the current character doesn't match, jump this
#                many table entries relative to the current entry
#          je = if we have a match make a relative jump of this length
#
# * a table matches a string iff the end of the table is reached
#   (that is: an index is requested that is beyond the end-of-table)
# * a table is not matched if a tag is not matched and no jne is given;
#   if it is matched then processing simply moves on to the next entry
# * marking is done by adding the matching slice in the string
#   together with the marking object to the tag list; if the object is
#   None, then it will not be appended to the taglist list
# * if the flag CallTag is set in cmd, then instead of appending
#   matches to the taglist, the tagobj will be called (see below) 
#
# TIP: if you are getting an error 'call of a non-function' while
#      writing a table definition, you probably have a missing ','
#      somewhere in the tuple !
#
# For examples see the tag*.py - files that came with this engine.
#

#########################################################################
# 2. Commands & Constants
# 
#

#
# some useful constants for writing matching tables
#

# Placeholder names and special values for use inside table entries.
# Note that To and Here are both bound to None.
To = None		# good for cmd=Jump
Here = None		# good for cmd=Fail and EOF
MatchOk = 20000		# somewhere out of bounds... (presumably used as a
                        # jump distance that lands beyond the end of the
                        # table, which counts as a match -- see section 1)
ToEOF = -1		# good for cmd=Move (-1 moves to the EOF)

ThisTable = 999		# to recursively match using the current table;
                        # can be passed as argument to Table and SubTable
                        # instead of a tuple

#
# commands and flags passed in cmd (see below)
#
# note: I might add some further commands to this list, if needed
#       (the numbers will then probably change, but not the
#        names)
#
# convention: a command "matches" if and only if it moves the
#       current position by at least one character; a command "reads"
#       characters if they match ok
#
# notations:
#
#  x    refers to the current position in the string
#  len  refers to the string length or what the function tag() is told to
#       believe it to be (i.e. the engine only looks at the slice text[x:len])
#  text refers to the text string
#  jne  is the optional relative jump distance in case the command
#       did not match, i.e. x before and after applying the command
#       are the same (if not given the current table is considered
#       not to match)
#  je   is the optional relative jump distance in case the command
#       did match (it defaults to +1)
#

# commands
#
# Fail and Jump share the code 0: the command never matches, so the
# entry makes the table fail when no jne is given and jumps by jne
# otherwise (see the rules in section 1).
Fail = 0           # this will always fail (position remains unchanged)
Jump = 0           # jump to jne (position remains unchanged)

# match & read chars
AllIn = 11         # all chars in match (at least one)
AllNotIn = 12      # all chars not in match (at least one)
Is = 13            # current char must be == match (matches one char)
IsIn = 14          # current char must be in match (matches one char)
IsNot = 15         # current char must be != match (matches one char)
IsNotIn = 15       # current char must not be in match (matches one char)
                   # NOTE(review): IsNot and IsNotIn share the value 15 --
                   # presumably an intentional alias; confirm against the
                   # engine before changing either value

AllInSet = 31      # presumably like AllIn, with match given as a set
                   # object -- TODO confirm against the engine docs
IsInSet = 32       # presumably like IsIn, with match given as a set
                   # object -- TODO confirm against the engine docs

# match & read for whole words
Word = 21          # the next chars must be those in match
WordStart = 22	   # all chars up to the first occ. of match (at least one)
WordEnd = 23	   # same as WordStart, except that the text pointer
                   # is moved behind the match
NoWord = WordStart # alias of WordStart: all chars up to the first occ.
                   # of match (at least one)


# match using search objects BMS or FS
sWordStart = 111   # all chars up to the first occ. of match (may be 0 chars)
sWordEnd = 112	   # same as WordStart, except that the text pointer
                   # is moved behind the match
sFindWord = 113    # find match and process the found slice only (ignoring
                   # the chars that lead up to the match); positions
                   # the text pointer right after the match like WordEnd

# functions & tables
Call = 201         # call match(text,x,len) as function (see section 3,
                   # "Matching Functions", below)
CallArg = 202      # match has to be a 2-tuple (fct,arg), then
                   # fct(text,x,len,arg) is called; the return value is taken
                   # as new x; it is considered matching if the new x is
                   # different than the x before the call -- like always
                   # (note: arg has to be *one* object, e.g. a tuple)
Table = 203        # match using table (given in match)
SubTable = 207     # match using sub table (given in match); the sub table
                   # uses the same taglist as the calling table
TableInList = 204  # same as Table, but match is a tuple (list,index)
                   # and the table list[index] is used as matching
                   # table

# specials
EOF = 1            # current position must be EOF, i.e. >= len(string)
Skip = 2           # skip match (must be an integer) chars; note: this cmd
                   # always matches ok, so jne doesn't have any meaning in
                   # this context
Move = 3	   # move the current text position to match (if negative,
                   # the text length + 1 (!) is added, thus -1 moves to the
                   # EOF, -2 to the last char and so on); note: this cmd
                   # always matches ok, so jne doesn't have any meaning in
                   # this context

# loops
Loop = 205         # loop-construct
                   #
                   # (tagobj,Loop,Count,jne,je) - sets/decrements the
                   # loop variable for current table according to the
                   # following rules:
                   # 1. the first time the engine passes this entry, it
                   #    sets the loop variable to Count and continues
                   #    without reading any character, but saving the
                   #    current position in text
                   # 2. the next time, it decrements the loop variable
                   #    and checks if it is < 0:
                   #    (a) if it is, then the tagobj is added to the
                   #        taglist with the slice (saved position, current
                   #        position) and processing continues at entry
                   #        current + jne
                   #    (b) else, processing continues at entry current + je
                   # Note: if you jump out of the loop while the loop
                   #       variable is still > 0, then you *must*
                   #       reset the loop mechanism with
                   #       (None,LoopControl,Reset)
                   # Note: you can skip the remaining loops by calling
                   #       (None,LoopControl,Break) and jumping back
                   #       to the Loop-entry; this sets the loop
                   #       variable to 0
                   # Note: tables cannot have nested loops within their
                   #       context; you can have nested loops in nested
                   #       tables though (there is one loop var per
                   #       tag()-call which takes place every time
                   #       a table match is done)
                   #
LoopControl = 206  # controls the loop variable (always succeeds, i.e.
                   #                             jne has no meaning);
                   # match may be one of:
Break = 0          # * sets the loop variable to 0, thereby allowing
                   #   to skip the remaining loops
Reset = -1         # * resets the loop mechanism (see note above)
                   #
                   # Break and Reset are arguments for LoopControl, not
                   # commands of their own; since Break is 0, the id2cmd
                   # table built by _module_init() reports it as
                   # 'Fail/Jump'.
                   #
                   # See tagLoop.py for some examples.

# flags (to be '+'ed with the above command code)
CallTag = 256      # call tagobj(taglist,text,l,r,subtags)
                   # upon successfully matching the slice [l:r] in text
                   # * taglist is the current list of tags found (may be
                   #   None)
                   # * subtags is a sub-list, passed when a subtable was used
                   #   to do the matching -- it is None otherwise!
#
# example entry with CallTag-flag set:
#
# (found_a_tag,CallTag+Table,tagtable)
#  -- if tagtable matches the current text position,
#     found_a_tag(taglist,text,l,r,newtaglist) is called and
#     the match is *not* appended to the taglist by the tagging
#     engine (the function would have to do this, in case it is needed)

AppendToTag = 512  # this appends the slice found to the tagobj, assuming
                   # that it is a Python list:
                   # does a tagobj.append((None,l,r,subtags)) call

AppendTagobj = 1024 # don't append (tagobj,l,r,subtags) to the taglist,
                   # but only tagobj itself; the information in l,r,subtags
                   # is lost, yet this can be used to write tag tables
                   # whose output can be used directly by tag.join()

#########################################################################
# 3. Matching Functions
#
# syntax:
#
# fct(s,x,len_s)
#          where s = string we are working on
#                x = current index in s where we want to match something
#                len_s = 'length' of s, this is how far the search may be
#                    conducted in s, not necessarily the true length of s
# 
# * the function has to return the index of the char right after
#   matched string, e.g.
#
#   'xyzabc' ---> 'xyz' matches ---> return x+3
#
# * if the string doesn't match simply return x; in other words:
#   the function has to return the matching slice's right index
# * you can use this to match e.g. 10 characters of a certain kind,
#   or any word out of a given list, etc.
# * note: you cannot give the function additional parameters from within
#   the matching table, so it has to know everything it needs to
#   know a priori; use dynamic programming !
#
# some examples (not needed, since all are implemented by commands)
#
#
#def matchword(x):
#    s = """
#def a(s,x,len_text):
#    y = x+%i
#    if s[x:y] == %s: return y
#    return x
#"""
#    exec s % (len(x),repr(x))
#    return a
#
#def rejectword(x):
#    s = """
#def a(s,x,len_text):
#    while x < len(s) and s[x:x+%i] != %s:
#	x = x + 1
#    return x
#"""
#    exec s % (len(x),repr(x))
#    return a
#
#def HTML_Comment(s,x,len_text):
#    while x < len_text and s[x:x+3] != '-->':
#	x = x + 1
#    return x
#
#

#########################################################################
# 4. Callable tagobjects
#
# a sample callable tagobj:
#
#
#def test(taglist,text,l,r,newtaglist):
#
#    print 'found',repr(text[l:r])[:40],(l,r)
#
#

#########################################################################
# 5. Calling the engine & Taglists
#
# The function
#      tag(text,table,start=0,len_text=len(text),taglistinit=[])
# found in mxTextTools:
#
# This function does all the matching according to the above rules.
# You give it a text string and a tag table and it will
# start processing the string starting from 'start' (which defaults to 0)
# and continue working until it reaches the 'EOF', i.e. len_text (which
# defaults to the text length). It thus tags the slice text[start:len_text].
#
# The function will create a list of found tags in the following
# format (which I call taglist):
#
#      (tagobj,l,r,subtaglist)
#
# where: tagobj = specified tag object taken from the table
#        [l:r] = slice that matched the tag in text
#        subtaglist = if matching was done using a subtable
#                     this is the taglist it produced; in all other
#                     cases this will be None
#
# * if you pass None as taglistinit, then no taglist will be created,
#   i.e. only CallTag commands will have any effect. (This saves
#   temporary memory for big files)
# * the function returns a tuple:
#      (success, taglist, nextindex)
#   where: success = 0/1
#          taglist = the produced list or None
#          nextindex = the index+1 of the last char that matched
#                    (in case of failure, this points to the beginning
#                     of the substring that caused the problem)
# 

### Module init.

def _module_init():
    """Build the module-level ``id2cmd`` reverse lookup table.

    Scans the module globals and maps the value of every plain integer
    constant to the name it is bound to, so that a numeric command code
    can be translated back into a readable name.

    Side effects:
        Creates (or replaces) the global dictionary ``id2cmd``.

    Notes:
        * The value 0 is always reported as 'Fail/Jump', regardless of
          which name bound to 0 is encountered.
        * Any other value shared by several names (e.g. IsNot and
          IsNotIn) ends up with whichever name is iterated last, so the
          result for such values depends on dictionary iteration order.
    """
    global id2cmd

    id2cmd = {}
    # Iterate over a snapshot so the loop never walks a live view of the
    # module namespace.
    for cmd, value in list(globals().items()):
        # Exact type test on purpose: only plain ints are command codes.
        # This is what the former types.IntType comparison did, but it
        # also works on Python 3, where types.IntType no longer exists.
        if type(value) is not int:
            continue
        if value == 0:
            id2cmd[0] = 'Fail/Jump'
        else:
            id2cmd[value] = cmd

_module_init()
