File: cpp.g

package info (click to toggle)
antlr 2.7.7%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 9,920 kB
  • sloc: java: 54,649; cs: 12,533; makefile: 8,963; cpp: 7,359; pascal: 5,273; sh: 4,337; python: 4,301; lisp: 1,969; xml: 220; lex: 192; ansic: 134
file content (308 lines) | stat: -rw-r--r-- 8,899 bytes parent folder | download | duplicates (11)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
/*
 * A C PreProcessor
 * 
 * Handles #define/#undef, #ifdef/#elsif/#else/#endif, and #include using only
 * an ANTLR lexer (actually a stack of them).  This could be easily integrated
 * with an existing lexer to do preprocessing and tokenizing all at once.
 * 
 * Author: Eric Mahurin - eric_mahurin at yahoo dot com
 * License: just give me credit
 * 
 * BUG: missing some of the simpler directives
 * BUG: doesn't follow the cpp spec perfectly - haven't made any effort at
 * this not well tested
 * 
 * Be aware that this is my first real attempt at both ANTLR and Java, so
 * things may not be done the best way.  I welcome suggestions and fixes.
 *
 * 041124 - cpp.g translated and adapted as Python example by MK.
 */

header {
import sys
import StringIO
}

header "__main__" {

import traceback

class cpp:

    def __init__(self, *args):
        try:
            // will need a stack of lexers for #include and macro calls
            self.mainLexer = Lexer(sys.stdin)
            Lexer.selector.select(self.mainLexer)
            for token in Lexer.selector:
                sys.stdout.write(token.getText())
        except Exception, e:
            sys.stderr.write("exception: " + str(e) + '\n')
            traceback.print_exc()

Lexer.selector = antlr.TokenStreamSelector()

cpp(sys.argv[1:])

}

options {
    language="Python";
}

class cppLexer extends Lexer;

options {
    testLiterals = false;
    k = 4;
}

tokens {
    ENDIF ;
}

{
    selector = antlr.TokenStreamSelector()     // must be assigned externally

    ifState = 1         // -1: no-else false, 0: false, 1: true
    ifStates = []       // holds nested if conditions
    defines = {}        // holds the defines
    defineArgs = {}     // holds the args for a macro call

    def uponEOF(self):
        if Lexer.selector.getCurrentStream() != Lexer:
            try:
                Lexer.selector.pop() // return to old lexer/stream
                Lexer.selector.retry()
            //except antlr.TokenStreamRetryException, tsre:
            //    raise tsre
            except IndexError:
                // return a real EOF if nothing in stack
                pass

}

DIRECTIVE {
    args = []
    condition = True
} : '#'
    ( "include" (WS)? includeFile:STRING
      {
        if Lexer.ifState == 1:
            name = includeFile.getText()
            name = name[1:-1]
            try:
                sublexer = Lexer(file(name))
        // want defines to be persistent
                sublexer.defines = Lexer.defines
                sublexer.setFilename(name)
                Lexer.selector.push(sublexer)
                Lexer.selector.retry()
            except IOError, e:
                sys.stderr.write("cannot find file " + name + '\n')
      }
    | "define" WS defineMacro:RAW_IDENTIFIER
      {
          // first element will hold the macro text
      }
      (
        ( '(' // get arguments if you find them (no spaces before left paren)
          (WS)? defineArg0:RAW_IDENTIFIER (WS)?
          { args += defineArg0.getText() }
          ( COMMA (WS)? defineArg1:RAW_IDENTIFIER (WS)?
            { args += defineArg1.getText() } )*
          ')'
        | ' '|'\t'|'\f'
        )
        ( options { greedy=true; } : ' '|'\t'|'\f' )*
        // store the text verbatim - tokenize when called
        defineText:MACRO_TEXT { args[0] = defineText.getText() }
      )? ('\n'|"\r\n"|'\r') { $newline }
      {
          if Lexer.ifState == 1:
              Lexer.defines[defineMacro.getText()] = args
              $skip
      }
    | "undef" WS undefMacro:RAW_IDENTIFIER
      {
          if Lexer.ifState == 1:
              del Lexer.defines[undefMacro.getText()]
              $skip
      }
    | ( "ifdef" | "ifndef" { condition=False } )
      WS ifMacro:RAW_IDENTIFIER
      {
        Lexer.ifStates.append(ifState)
        if Lexer.ifState == 1:
            if Lexer.defines.has_key(ifMacro.getText()) == condition:
                Lexer.ifState = 1
            else:
                Lexer.ifState = 0
        else:
            Lexer.ifState = -1
        if Lexer.ifState == 1:
            $skip
        else:
            // gobble up tokens until ENDIF (could be caused by else)
            while True:
                try:
                    if Lexer.selector.nextToken().getType() == ENDIF:
                        break
                except antlr.TokenStreamRetryException, r:
                    // just continue if someone tried retry
                    pass
            // retry in case we switched lexers
            Lexer.selector.retry()
      }
    | ( "else" // treat like elsif (true)
      | "elsif" WS elsifMacro:RAW_IDENTIFIER
        {
            condition = Lexer.defines.has_key(elsifMacro.getText())
        }
      )
      {
        if Lexer.ifState == 1:
            // previous if/elsif was taken - discard rest
            Lexer.ifState = -1;
            while True:
                try:
                    if Lexer.selector.nextToken().getType() == ENDIF:
                        break
                except antlr.TokenStreamRetryException, r:
                    // just continue if someone tried retry
                    pass
            // retry in case we switched lexers
            Lexer.selector.retry()
        elif Lexer.ifState == 0 and condition:
            // "elsif" (true) or "else"
            $setType(ENDIF)
            Lexer.ifState = 1
      }
    | "endif"
      {
        if Lexer.ifState == 1:
            condition = True
        else:
            condition = False
        try:
            // return to previous if state
            del Lexer.ifStates[-1]
            if condition:
                $skip
            else:
                // tell if/else/elsif to stop discarding tokens
                $setType(ENDIF)
        except IndexError, e:
            // endif with no if
            pass
      }
    );

IDENTIFIER options { testLiterals=true; } {
    define = []
    args = []
} :
    identifier:RAW_IDENTIFIER
    {
        // see if this is a macro argument
        define = Lexer.defineArgs.has_key(identifier.getText())
        if define:
            define = Lexer.defineArgs[identifier.getText()]
        elif _begin == 0 and not define:
            // see if this is a macro call
            define = Lexer.defines.has_key(identifier.getText())
            if define:
                define = Lexer.defines[identifier.getText()]
    }
    ( { define and len(define) }? ( WS | COMMENT )?
        // take in arguments if macro call requires them
        '('
        callArg0:EXPR { args += callArg0.getText() }
        ( COMMA callArg1:EXPR { args += callArg1.getText() } )*
        { len(args) == len(define)-1 }? // better have right amount
        ')'
    | { not (define and len(define)) }?
    )
    {
      if define:
          defineText = define[0]
          if _begin:
              // just substitute text if called from EXPR - no token created
              $setText(defineText)
          else:
              // create a new lexer to handle the macro text
              sublexer = Lexer(StringIO.StringIO(defineText))
              for i in range(len(args)):
                  // treat macro arguments similar to local defines
                  arg = []
                  arg.append(args[i])
                  sublexer.defineArgs[define[1+i]] = arg
              Lexer.selector.push(sublexer)
              // retry in new lexer
              Lexer.selector.retry()
    };

STRING
    : '"' ( '\\' . | ~('\\'|'"') )* '"' // double quoted string
    | '\'' ( '\\' . | ~('\\'|'\'') )* '\'' // single quoted string
    ;

protected
MACRO_TEXT :
    ( '\\'! NL { $newline } // escaped newline
    | ~('\n'|'\r')
    )*
    ;

protected
NL
options {
    generateAmbigWarnings=false; // single '\r' is ambig with '\r' '\n'
}
	: '\r'
	| '\n'
	| '\r' '\n'
    ;

WS :
    ( ' '
    | '\t'
    | '\f'
    | NL { $newline }
    ) { /* $skip */ }
    ;

COMMENT :
    ( "//" (~('\n'|'\r'))* NL { $newline } // single line comment
    | "/*" ( options{greedy=false;} : NL { $newline } | ~('\n'|'\r') )* "*/"
      // multi-line comment
    ) { /* $skip */ }
    ;

protected
RAW_IDENTIFIER :
    ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'_'|'0'..'9')*
    ;

NUMBER :
    ('0'..'9') ('0'..'9'|'a'..'z'|'A'..'Z'|'_')*
    // allow alpha suffixes on numbers (i.e. L:long)
    ;

// group symbols into categories to parse EXPR
LEFT  : '(' | '[' | '{' ;
RIGHT : ')' | ']' | '}' ;
COMMA : ',' ;
OPERATOR : '!' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '`' | '|' | '~' ;

protected
EXPR // allow just about anything without being ambiguous
    : (WS)? (NUMBER|IDENTIFIER)?
      (
        ( LEFT EXPR ( COMMA EXPR )* RIGHT
        | STRING
        | OPERATOR // quotes, COMMA, LEFT, and RIGHT not in here
        )
        EXPR
      )?
    ;