File: sparse.py

package info (click to toggle)
python-jtoolkit 0.7.8-2
links: PTS
area: main
in suites: etch, etch-m68k
size: 1,436 kB
ctags: 2,536
sloc: python: 15,143; makefile: 20
file content (227 lines) | stat: -rwxr-xr-x 8,364 bytes
parent folder | download | duplicates (3)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""simple parser / string tokenizer
rather than returning a list of token types etc, we simple return a list of tokens...
each tokenizing function takes a string as input and returns a list of tokens
"""

# Copyright 2002, 2003 St James Software
# 
# This file is part of jToolkit.
#
# jToolkit is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# jToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with jToolkit; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

import bisect

def stringeval(input):
  """takes away repeated quotes (escapes) and returns the string represented by the input"""
  stringchar = input[0]
  if input[-1] != stringchar or stringchar not in ("'",'"'):
    # scratch your head
    raise ValueError, "error parsing escaped string: %r" % input
  return input[1:-1].replace(stringchar+stringchar,stringchar)

def stringquote(input):
  """escapes quotes as neccessary and returns a string representing the input"""
  if "'" in input:
    if '"' in input:
      return '"' + input.replace('"', '""') + '"'
    else:
      return '"' + input + '"'
  else:
    return "'" + input + "'"

def findall(src, substr):
  pos = 0
  while True:
    pos = src.find(substr, pos)
    if pos == -1:
      break
    yield pos
    pos += 1

class ParserError(ValueError):
  """Intelligent parser error"""
  def __init__(self, parser, message, tokennum):
    """takes a message and the number of the token that caused the error"""
    tokenpos = parser.findtokenpos(tokennum)
    line, charpos = parser.getlinepos(tokenpos)
    ValueError.__init__(self, "%s at line %d, char %d (token %r)" % \
        (message, line, charpos, parser.tokens[tokennum]))
    self.parser = parser
    self.tokennum = tokennum

class SimpleParser:
  """this is a simple parser"""
  def __init__(self, defaulttokenlist=None, whitespacechars=" \t\r\n", includewhitespacetokens=0):
    if defaulttokenlist is None:
      self.defaulttokenlist = ['<=', '>=', '==', '!=', '+=', '-=', '*=', '/=', '<>']
      self.defaulttokenlist.extend('(),[]:=+-')
    else:
      self.defaulttokenlist = defaulttokenlist
    self.whitespacechars = whitespacechars
    self.includewhitespacetokens = includewhitespacetokens
    self.standardtokenizers = [self.stringtokenize, self.removewhitespace, self.separatetokens]
    self.quotechars = ('"', "'")
    self.endquotechars = {'"':'"',"'":"'"}
    self.unicodeprefix = None
    self.stringescaping = 1
    self.tokenposcache = {}
    self.lineposcache = {}

  def stringtokenize(self, input):
    """makes strings in input into tokens..."""
    tokens = []
    laststart = 0
    instring = 0
    endstringchar, escapechar = '', '\\'
    gotclose, gotescape = 0, 0
    for pos in range(len(input)):
      char = input[pos]
      if instring:
        if self.stringescaping and (gotescape or char == escapechar) and not gotclose:
          gotescape = not gotescape
        elif char == endstringchar:
          gotclose = not gotclose
        elif gotclose:
          tokens.append(input[laststart:pos])
          instring, laststart, endstringchar = 0, pos, ''
      if not instring:
        if char in self.quotechars:
          if self.unicodeprefix and input[pos-len(self.unicodeprefix):pos] == self.unicodeprefix:
            pos -= 1
          if pos > laststart: tokens.append(input[laststart:pos])
          instring, laststart, endstringchar, gotclose = 1, pos, self.endquotechars[char], 0
    if laststart < len(input): tokens.append(input[laststart:])
    return tokens

  def keeptogether(self, input):
    """checks whether a token should be kept together"""
    return self.isstringtoken(input)

  def isstringtoken(self, input):
    """checks whether a token is a string token"""
    return input[:1] in self.quotechars or input[:1] == self.unicodeprefix and input[1:][:1] in self.quotechars

  def separatetokens(self, input, tokenlist = None):
    """this separates out tokens in tokenlist from whitespace etc"""
    if self.keeptogether(input): return [input]
    if tokenlist is None:
      tokenlist = self.defaulttokenlist
    # loop through and put tokens into a list
    tokens = []
    pos = 0
    laststart = 0
    while pos < len(input):
      foundtoken = 0
      for token in tokenlist:
        if input[pos:pos+len(token)] == token:
          if laststart < pos: tokens.append(input[laststart:pos])
          tokens.append(token)
          pos += len(token)
          foundtoken, laststart = 1, pos
          break
      if not foundtoken: pos += 1
    if laststart < len(input): tokens.append(input[laststart:])
    return tokens

  def removewhitespace(self, input):
    """this removes whitespace but lets it separate things out into separate tokens"""
    if self.keeptogether(input): return [input]
    # loop through and put tokens into a list
    tokens = []
    pos = 0
    inwhitespace = 0
    laststart = 0
    for pos in range(len(input)):
      char = input[pos]
      if inwhitespace:
        if char not in self.whitespacechars:
          if laststart < pos and self.includewhitespacetokens: tokens.append(input[laststart:pos])
          inwhitespace, laststart = 0, pos
      else:
        if char in self.whitespacechars:
          if laststart < pos: tokens.append(input[laststart:pos])
          inwhitespace, laststart = 1, pos
    if laststart < len(input) and (not inwhitespace or self.includewhitespacetokens):
      tokens.append(input[laststart:])
    return tokens

  def applytokenizer(self, inputlist, tokenizer):
    """apply a tokenizer to a set of input, flattening the result"""
    tokenizedlists = [tokenizer(input) for input in inputlist]
    joined = []
    map(joined.extend, tokenizedlists)
    return joined

  def applytokenizers(self, inputlist, tokenizers):
    """apply a set of tokenizers to a set of input, flattening each time"""
    for tokenizer in tokenizers:
      inputlist = self.applytokenizer(inputlist, tokenizer)
    return inputlist

  def tokenize(self, source, tokenizers=None):
    """tokenize the input string with the standard tokenizers"""
    self.source = source
    if tokenizers is None:
      tokenizers = self.standardtokenizers
    self.tokens = self.applytokenizers([self.source], tokenizers)
    return self.tokens

  def refreshposcache(self):
    """refreshes the cache of token positions"""
    self.tokenposcache = {}
    self.lineposcache = list(findall(self.source, "\n"))
    self.findtokenpos(len(self.tokens)-1)

  def findtokenpos(self, tokennum):
    """finds the position of the given token in the input"""
    if tokennum in self.tokenposcache:
      return self.tokenposcache[tokennum]
    cachedtokennums = [cachedtokennum for cachedtokennum in self.tokenposcache if cachedtokennum <= tokennum]
    if cachedtokennums:
      starttokennum = max(cachedtokennums)
      currenttokenpos = self.tokenposcache[starttokennum]
    else:
      starttokennum = 0
      currenttokenpos = 0
    for currenttokennum in range(starttokennum, tokennum+1):
      currenttokenpos = self.source.find(self.tokens[currenttokennum], currenttokenpos)
      self.tokenposcache[currenttokennum] = currenttokenpos
    return currenttokenpos

  def getlinepos(self, tokenpos):
    """finds the line and character position of the given character"""
    if self.lineposcache:
      line = bisect.bisect_left(self.lineposcache, tokenpos)
      if line:
        linestart = self.lineposcache[line-1]
        charpos = tokenpos - linestart
      else:
        linestart = 0
        charpos = tokenpos + 1
      line += 1
    else:
      sourcecut = self.source[:tokenpos]
      line = sourcecut.count("\n")+1
      charpos = tokenpos - sourcecut.rfind("\n")
    return line, charpos

  def raiseerror(self, message, tokennum):
    """raises a ParserError"""
    raise ParserError(self, message, tokennum)