File: tokenizer.py

package info (click to toggle)
uncrustify 0.68.1%2Bdfsg1-2
links: PTS, VCS
area: main
in suites: buster
size: 18,000 kB
sloc: cpp: 56,567; ansic: 19,840; cs: 3,097; python: 2,717; objc: 1,650; java: 510; sh: 390; awk: 150; perl: 63; makefile: 7
file content (316 lines) | stat: -rwxr-xr-x 10,961 bytes
parent folder | download | duplicates (3)
#! /usr/bin/env python
# tokenizer.py
#
# Parses a C/C++/C#/D/Java/Pawn/whatever file into an array of
# (string, type) tuples
#

# punctuator lookup table
#
# A flattened trie that Tokenizer.parse_punctuator() walks one input
# character at a time.  Each row is
#
#    [ char, remaining, next, token ]
#
#   char      - the character this row matches
#   remaining - number of further rows in the same group of alternatives;
#               0 marks the last row, where the scan of the group stops
#   next      - index of the first row of the group to try against the
#               following input character; 0 means no longer punctuator
#               starts with this path
#   token     - the punctuator spelled by the path to this row, or None when
#               the path is only a prefix of a longer punctuator
#
# Rows 0-25 form the group for the first character.  The trailing comment on
# each row gives the row index and the character path that leads to it.
punc_table = [
   [ '!',  25,  26, '!'   ],   #   0: '!'
   [ '#',  24,  35, '#'   ],   #   1: '#'
   [ '$',  23,   0, '$'   ],   #   2: '$'
   [ '%',  22,  36, '%'   ],   #   3: '%'
   [ '&',  21,  41, '&'   ],   #   4: '&'
   [ '(',  20,   0, '('   ],   #   5: '('
   [ ')',  19,   0, ')'   ],   #   6: ')'
   [ '*',  18,  43, '*'   ],   #   7: '*'
   [ '+',  17,  44, '+'   ],   #   8: '+'
   [ ',',  16,   0, ','   ],   #   9: ','
   [ '-',  15,  46, '-'   ],   #  10: '-'
   [ '.',  14,  50, '.'   ],   #  11: '.'
   [ '/',  13,  53, '/'   ],   #  12: '/'
   [ ':',  12,  54, ':'   ],   #  13: ':'
   [ ';',  11,   0, ';'   ],   #  14: ';'
   [ '<',  10,  56, '<'   ],   #  15: '<'
   [ '=',   9,  63, '='   ],   #  16: '='
   [ '>',   8,  65, '>'   ],   #  17: '>'
   [ '?',   7,   0, '?'   ],   #  18: '?'
   [ '[',   6,  70, '['   ],   #  19: '['
   [ ']',   5,   0, ']'   ],   #  20: ']'
   [ '^',   4,  71, '^'   ],   #  21: '^'
   [ '{',   3,   0, '{'   ],   #  22: '{'
   [ '|',   2,  72, '|'   ],   #  23: '|'
   [ '}',   1,   0, '}'   ],   #  24: '}'
   [ '~',   0,  74, '~'   ],   #  25: '~'
   [ '<',   3,  30, '!<'  ],   #  26: '!<'
   [ '=',   2,  33, '!='  ],   #  27: '!='
   [ '>',   1,  34, '!>'  ],   #  28: '!>'
   [ '~',   0,   0, '!~'  ],   #  29: '!~'
   [ '=',   1,   0, '!<=' ],   #  30: '!<='
   [ '>',   0,  32, '!<>' ],   #  31: '!<>'
   [ '=',   0,   0, '!<>='],   #  32: '!<>='
   [ '=',   0,   0, '!==' ],   #  33: '!=='
   [ '=',   0,   0, '!>=' ],   #  34: '!>='
   [ '#',   0,   0, '##'  ],   #  35: '##'
   [ ':',   2,  39, '%:'  ],   #  36: '%:'
   [ '=',   1,   0, '%='  ],   #  37: '%='
   [ '>',   0,   0, '%>'  ],   #  38: '%>'
   [ '%',   0,  40, None  ],   #  39: '%:%'
   [ ':',   0,   0, '%:%:'],   #  40: '%:%:'
   [ '&',   1,   0, '&&'  ],   #  41: '&&'
   [ '=',   0,   0, '&='  ],   #  42: '&='
   [ '=',   0,   0, '*='  ],   #  43: '*='
   [ '+',   1,   0, '++'  ],   #  44: '++'
   [ '=',   0,   0, '+='  ],   #  45: '+='
   [ '-',   2,   0, '--'  ],   #  46: '--'
   [ '=',   1,   0, '-='  ],   #  47: '-='
   [ '>',   0,  49, '->'  ],   #  48: '->'
   [ '*',   0,   0, '->*' ],   #  49: '->*'
   [ '*',   1,   0, '.*'  ],   #  50: '.*'
   [ '.',   0,  52, '..'  ],   #  51: '..'
   [ '.',   0,   0, '...' ],   #  52: '...'
   [ '=',   0,   0, '/='  ],   #  53: '/='
   [ ':',   1,   0, '::'  ],   #  54: '::'
   [ '>',   0,   0, ':>'  ],   #  55: ':>'
   [ '%',   4,   0, '<%'  ],   #  56: '<%'
   [ ':',   3,   0, '<:'  ],   #  57: '<:'
   [ '<',   2,  61, '<<'  ],   #  58: '<<'
   [ '=',   1,   0, '<='  ],   #  59: '<='
   [ '>',   0,  62, '<>'  ],   #  60: '<>'
   [ '=',   0,   0, '<<=' ],   #  61: '<<='
   [ '=',   0,   0, '<>=' ],   #  62: '<>='
   [ '=',   0,  64, '=='  ],   #  63: '=='
   [ '=',   0,   0, '===' ],   #  64: '==='
   [ '=',   1,   0, '>='  ],   #  65: '>='
   [ '>',   0,  67, '>>'  ],   #  66: '>>'
   [ '=',   1,   0, '>>=' ],   #  67: '>>='
   [ '>',   0,  69, '>>>' ],   #  68: '>>>'
   [ '=',   0,   0, '>>>='],   #  69: '>>>='
   [ ']',   0,   0, '[]'  ],   #  70: '[]'
   [ '=',   0,   0, '^='  ],   #  71: '^='
   [ '=',   1,   0, '|='  ],   #  72: '|='
   [ '|',   0,   0, '||'  ],   #  73: '||'
   [ '=',   1,   0, '~='  ],   #  74: '~='
   [ '~',   0,   0, '~~'  ],   #  75: '~~'
]


#
# Token types:
#  0 = newline
#  1 = punctuator
#  2 = integer
#  3 = float
#  4 = string
#  5 = identifier
#
class Tokenizer:
    """Split C-like source text into a flat list of (string, type) tuples.

    Token types:
      0 = newline, 1 = punctuator, 2 = integer, 3 = float,
      4 = string,  5 = identifier

    Horizontal whitespace, comments and backslash-newline continuations are
    skipped and produce no token.  Every parse_* method looks at the text
    at ``self.text_idx``; when it recognizes its construct it advances
    ``text_idx`` past it (appending a token where applicable) and returns
    True, otherwise it leaves the state untouched and returns False.  None of
    the methods read past the end of the text.
    """

    # Character classes shared by the parse_* methods.  The identifier sets
    # are matched against the upper-cased input character.
    _IDENT_START = '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    _IDENT_CHARS = _IDENT_START + '1234567890'
    _DIGITS = '0123456789'
    _DEC_DIGITS = '_0123456789'
    _HEX_DIGITS = '_0123456789abcdefABCDEF'

    def __init__(self):
        self.tokens = []    # list of (string, type) tuples produced so far
        self.text = ''      # the text being tokenized
        self.text_idx = 0   # index of the next unconsumed character

    def _at(self, chars, offset=0):
        """Return True if the character at text_idx + offset exists and is in chars."""
        idx = self.text_idx + offset
        return idx < len(self.text) and self.text[idx] in chars

    def _skip(self, chars):
        """Advance text_idx past a (possibly empty) run of characters in chars."""
        while self._at(chars):
            self.text_idx += 1

    def tokenize_text(self, in_text):
        """Tokenize in_text, leaving the result in self.tokens.

        The input is echoed to stdout first.  If a character cannot be
        classified, "confused: <rest of text>" is printed and tokenizing
        stops, keeping the tokens found so far.  Any exception is reported
        with "bombed" and re-raised.
        """
        self.tokens = []
        self.text = in_text
        self.text_idx = 0

        print(in_text)
        try:
            while self.text_idx < len(self.text):
                if self.parse_whitespace():
                    continue
                elif self.text.startswith('\\\n', self.text_idx):
                    # line continuation: drop the backslash and the newline
                    self.text_idx += 2
                    continue
                elif self.parse_comment():
                    continue
                elif self.parse_number():
                    continue
                elif self.parse_string():
                    # Strings are tried before identifiers so that the L
                    # prefix of a wide literal (L"...") stays attached to
                    # the string instead of becoming an identifier.
                    continue
                elif self.parse_identifier():
                    continue
                elif self.parse_punctuator():
                    continue
                else:
                    print("confused: %s" % self.text[self.text_idx:])
                    break
        except Exception:
            print("bombed")
            raise

    def parse_whitespace(self):
        """Skip a run of spaces, tabs, CRs and LFs.

        Appends a single newline token (type 0) if the run contained at
        least one CR or LF.  Returns True if anything was skipped.
        """
        start_idx = self.text_idx
        hit_newline = False
        while self.text_idx < len(self.text):
            if self.text[self.text_idx] in '\n\r':
                hit_newline = True
            elif not self.text[self.text_idx] in ' \t':
                break
            self.text_idx += 1

        if hit_newline:
            self.tokens.append(('\n', 0))
        return start_idx != self.text_idx

    def parse_comment(self):
        """Skip a // line comment or a /* block comment */.

        A line comment ends before the newline, which is left for
        parse_whitespace().  An unterminated block comment runs to the end
        of the text.  No token is produced.
        """
        if not self._at('/') or not self._at('/*', 1):
            return False
        if self.text[self.text_idx + 1] == '/':
            while self.text_idx < len(self.text) and \
                    self.text[self.text_idx] not in '\n\r':
                self.text_idx += 1
        else:
            # Search after the two-character opener so that the '*' of
            # "/*" cannot pair with a following '/' (as in "/*/").
            end_idx = self.text.find('*/', self.text_idx + 2)
            if end_idx < 0:
                self.text_idx = len(self.text)
            else:
                self.text_idx = end_idx + 2
        return True

    def parse_identifier(self):
        """Consume an identifier (letters, digits, '_' and '@'; no leading digit).

        Appends a type 5 token and returns True on success.
        """
        if self.text_idx >= len(self.text) or \
                not self.text[self.text_idx].upper() in self._IDENT_START:
            return False
        start_idx = self.text_idx
        while self.text_idx < len(self.text) and \
                self.text[self.text_idx].upper() in self._IDENT_CHARS:
            self.text_idx += 1
        self.tokens.append((self.text[start_idx : self.text_idx], 5))
        return True

    def parse_string(self):
        """Consume a string or character literal, with an optional L prefix.

        The token (type 4) includes the prefix and both quotes.  A backslash
        escapes the following character.  An unterminated literal runs to
        the end of the text.
        """
        prefix_len = 0
        if self._at('L'):
            prefix_len = 1
        if not self._at('"\'', prefix_len):
            return False
        start_idx = self.text_idx
        quote = self.text[start_idx + prefix_len]
        self.text_idx += prefix_len + 1
        escaped = False
        while self.text_idx < len(self.text):
            ch = self.text[self.text_idx]
            self.text_idx += 1
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == quote:
                break

        self.tokens.append((self.text[start_idx : self.text_idx], 4))
        return True

    # Checks for punctuators
    # Returns whether a punctuator was consumed (True or False)
    def parse_punctuator(self):
        """Consume the longest punctuator in punc_table at the current position.

        Walks punc_table one input character at a time.  Only the characters
        of the longest complete punctuator are consumed: characters matched
        on the way to a longer one that did not materialize (such as the
        second '%' of "%:%" when no ':' follows) are left in the text.
        Appends a type 1 token and returns True on success.
        """
        tab_idx = 0
        scan_idx = self.text_idx
        saved_punc = None
        saved_end = scan_idx
        while scan_idx < len(self.text):
            pte = punc_table[tab_idx]
            if pte[0] == self.text[scan_idx]:
                scan_idx += 1
                if pte[3] is not None:
                    # remember the last complete punctuator and where it ends
                    saved_punc = pte[3]
                    saved_end = scan_idx
                tab_idx = pte[2]
                if tab_idx == 0:
                    # no longer punctuator starts with what was matched
                    break
            elif pte[1] == 0:
                # last alternative of this group and it did not match
                break
            else:
                tab_idx += 1
        if saved_punc is None:
            return False
        self.text_idx = saved_end
        self.tokens.append((saved_punc, 1))
        return True

    def parse_number(self):
        """Consume an integer or floating point literal.

        Handles 0x/0b/leading-zero prefixes, '_' digit separators, a decimal
        point, an exponent and type suffixes.  Appends a type 2 (integer) or
        type 3 (float) token and returns True on success.
        """
        # A number must start with a digit or a dot followed by a digit.
        # Only ASCII digits count: nothing below consumes any other digit,
        # so accepting one would yield an empty token without advancing.
        if not self._at(self._DIGITS) and \
                not (self._at('.') and self._at(self._DIGITS, 1)):
            return False
        start_idx = self.text_idx
        token_type = 2 # integer
        if self.text[start_idx] == '.':
            token_type = 3 # float
        fraction_digits = self._DEC_DIGITS

        # Check for Hex, Octal, or Binary
        # Note that only D and Pawn support binary, but who cares?
        #
        if self.text[start_idx] == '0':
            self.text_idx += 1
            if self._at('xX'):           # hex
                fraction_digits = self._HEX_DIGITS
                self.text_idx += 1
                self._skip(self._HEX_DIGITS)
            elif self._at('bB'):         # binary
                self.text_idx += 1
                self._skip('_01')
            elif self._at('01234567'):   # octal (but allow decimal)
                self._skip(self._DEC_DIGITS)
            # otherwise: either just 0 or 0.1 or 0UL, etc
        else:
            # Regular int or float (consumes nothing for a leading dot)
            self._skip(self._DEC_DIGITS)

        # Check if we stopped on a decimal point
        if self._at('.'):
            self.text_idx += 1
            token_type = 3 # float
            self._skip(fraction_digits)

        # Check exponent
        # Valid exponents per language (not that it matters):
        # C/C++/D/Java: eEpP
        # C#/Pawn:      eE
        if self._at('eEpP'):
            token_type = 3 # float
            self.text_idx += 1
            if self._at('+-'):
                self.text_idx += 1
            self._skip(self._DEC_DIGITS)

        # Check the suffixes
        # Valid suffixes per language (not that it matters):
        #        Integer       Float
        # C/C++: uUlL          lLfF
        # C#:    uUlL          fFdDMm
        # D:     uUL           ifFL
        # Java:  lL            fFdD
        # Pawn:  (none)        (none)
        #
        # Note that i, f, d, and m only appear in floats.
        # NOTE(review): the set below has 't'/'T' where the table above says
        # 'i' -- possibly a typo; left unchanged, confirm before altering.
        while 1:
            if self._at('tTfFdDmM'):
                token_type = 3 # float
            elif not self._at('lLuU'):
                break
            self.text_idx += 1

        self.tokens.append((self.text[start_idx : self.text_idx], token_type))
        return True

# Built-in demo.  There is no __main__ guard, so this runs whenever the module
# is executed or imported.
#
# The sample input covers integer and float literals (including a leading-dot
# float with a 'p' exponent), punctuators, identifiers, string literals (one
# containing an escaped quote), a // comment, a backslash-newline
# continuation and a /* */ comment.  Inside this non-raw literal each "\\" is
# a single backslash in the resulting text.
text = """
1.23+4-3*16%2 *sin(1.e-3 + .5p32) "hello" and "hello\\"there"
123 // some comment
a = b + c;
#define abc \\
        5
d = 5 /* hello */ + 3;
"""

# Tokenize the sample (tokenize_text also echoes it) and print the resulting
# list of (string, type) tuples.
t = Tokenizer()
t.tokenize_text(text)
print(t.tokens)