#! /usr/bin/env python
# encoding: utf-8
# Thomas Nagy, 2006-2008 (ita)

# C/C++ preprocessor for finding dependencies
# TODO: more varargs, pragma once

import re, sys, os, string, types
if __name__ == '__main__':
	sys.path = ['.', '..'] + sys.path
import Params
from Params import debug, error, warning
import traceback

class PreprocError(Exception):
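	"""raised when a preprocessor construct cannot be parsed or evaluated"""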
	pass

g_findall = 1
'search harder for project includes'

use_trigraphs = 0
'apply the trigraph rules first'

strict_quotes = 0
"Keep <> for system includes (do not search for those includes)"

g_optrans = {
'not':'!',
'and':'&&',
'bitand':'&',
'and_eq':'&=',
'or':'||',
'bitor':'|',
'or_eq':'|=',
'xor':'^',
'xor_eq':'^=',
'compl':'~',
}
"these ops are for c++, to reset, set an empty dict"

# ignore #warning and #error
re_lines = re.compile(\
	'^[ \t]*(#|%:)[ \t]*(ifdef|ifndef|if|else|elif|endif|include|import|define|undef|pragma)[ \t]*(.*)\r*$',
	re.IGNORECASE | re.MULTILINE)
re_mac = re.compile("^[a-zA-Z_]\w*")
re_fun = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*[(]')
re_pragma_once = re.compile('^\s*once\s*', re.IGNORECASE)
re_nl = re.compile('\\\\\r*\n', re.MULTILINE)
re_cpp = re.compile(\
	r"""(/\*[^*]*\*+([^/*][^*]*\*+)*/)|//[^\n]*|("(\\.|[^"\\])*"|'(\\.|[^'\\])*'|.[^/"'\\]*)""",
	re.MULTILINE)
trig_def = [('??'+a, b) for a, b in zip("=-/!'()<>", r'#~\|^[]{}')]
chr_esc = {'0':0, 'a':7, 'b':8, 't':9, 'n':10, 'v':11, 'f':12, 'r':13, '\\':92, "'":39}

NUM   = 'i'
OP    = 'O'
IDENT = 'T'
STR   = 's'
CHAR  = 'c'

tok_types = [NUM, STR, IDENT, OP]
exp_types = [
	r"""0[xX](?P<hex>[a-fA-F0-9]+)(?P<qual1>[uUlL]*)|L*?'(?P<char>(\\.|[^\\'])+)'|(?P<n1>\d+)[Ee](?P<exp0>[+-]*?\d+)(?P<float0>[fFlL]*)|(?P<n2>\d*\.\d+)([Ee](?P<exp1>[+-]*?\d+))?(?P<float1>[fFlL]*)|(?P<n4>\d+\.\d*)([Ee](?P<exp2>[+-]*?\d+))?(?P<float2>[fFlL]*)|(?P<oct>0*)(?P<n0>\d+)(?P<qual2>[uUlL]*)""",
	r'L?"([^"\\]|\\.)*"',
	r'[a-zA-Z_]\w*',
	r'%:%:|<<=|>>=|\.\.\.|<<|<%|<:|<=|>>|>=|\+\+|\+=|--|->|-=|\*=|/=|%:|%=|%>|==|&&|&=|\|\||\|=|\^=|:>|!=|##|[\(\)\{\}\[\]<>\?\|\^\*\+&=:!#;,%/\-\?\~\.]',
]
re_clexer = re.compile('|'.join(["(?P<%s>%s)" % (name, part) for name, part in zip(tok_types, exp_types)]), re.M)

accepted  = 'a'
ignored   = 'i'
undefined = 'u'
skipped   = 's'

def repl(m):
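	"""substitution callback for re_cpp: a block comment becomes a single space,
	a line comment disappears, anything else is kept verbatim"""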
	s = m.group(1)
	if s is not None: return ' '
	s = m.group(3)
	if s is None: return ''
	return s

def filter_comments(filename):
	"""return the preprocessor directives of a file as a list of (keyword, line) tuples"""
	f = open(filename, "r")
	code = f.read()
	f.close()
	if use_trigraphs:
		for (a, b) in trig_def: code = code.replace(a, b)
	code = re_nl.sub('', code)
	code = re_cpp.sub(repl, code)
	return [(m.group(2), m.group(3)) for m in re.finditer(re_lines, code)]

prec = {}
# op -> number, needed for such expressions:   #if 1 && 2 != 0
ops = ['* / %', '+ -', '<< >>', '< <= >= >', '== !=', '& | ^', '&& ||', ',']
for x in range(len(ops)):
	syms = ops[x]
	for u in syms.split():
		prec[u] = x

def reduce_nums(val_1, val_2, val_op):
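	"""apply the binary operator 'val_op' to two values and return a number;
	the operands are coerced first, as tokenize may leave them as strings"""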

	# make certain a and b are numeric
	try:    a = 0 + val_1
	except TypeError: a = int(val_1)
	try:    b = 0 + val_2
	except TypeError: b = int(val_2)

	d = val_op
	if d == '%':  c = a%b
	elif d=='+':  c = a+b
	elif d=='-':  c = a-b
	elif d=='*':  c = a*b
	elif d=='/':  c = a/b
	elif d=='^':  c = a^b
	elif d=='|':  c = a|b
	elif d=='||': c = int(a or b)
	elif d=='&':  c = a&b
	elif d=='&&': c = int(a and b)
	elif d=='==': c = int(a == b)
	elif d=='!=': c = int(a != b)
	elif d=='<=': c = int(a <= b)
	elif d=='<':  c = int(a < b)
	elif d=='>':  c = int(a > b)
	elif d=='>=': c = int(a >= b)
	elif d=='<<': c = a<<b
	elif d=='>>': c = a>>b
	else: c = 0
	return c

def get_expr(lst, defs, ban):
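	"""pull one expression element off the token list and return a triple
	(token type, value, remaining tokens); identifiers are expanded with 'defs',
	and 'ban' holds the macros already being expanded (recursion guard)"""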

	if not lst: return ([], [], [])

	(p, v) = lst[0]
	if p == NUM:
		return (p, v, lst[1:])

	elif p == STR:
		try:
			(p2, v2) = lst[1]
			if p2 == STR: return (p, v+v2, lst[2:])
		except IndexError: pass

		return (p, v, lst[1:])

	elif p == OP:
		if v in ['+', '-', '!', '~', '#']:
			(p2, v2, lst2) = get_expr(lst[1:], defs, ban)

			if v == '#':
				if p2 != IDENT: raise PreprocError, "ident expected %s" % str(lst)
				return get_expr([(STR, v2)]+lst2, defs, ban)

			if p2 != NUM: raise PreprocError, "num expected %s" % str(lst)

			if   v == '+': return (p2, v2, lst2)
			elif v == '-': return (p2, - int(v2), lst2)
			elif v == '!': return (p2, int(not int(v2)), lst2)
			elif v == '~': return (p2, ~ int(v2), lst2)

			return (p2, v2, lst2)

		elif v == '(':
			count_par = 0
			i = 0
			for _, v in lst:
				if v == ')':
					count_par -= 1
					if count_par == 0: break
				elif v == '(': count_par += 1
				i += 1
			else:
				raise PreprocError, "rparen expected %s" % str(lst)

			ret = process_tokens(lst[1:i], defs, ban)
			if len(ret) == 1:
				(p, v) = ret[0]
				return (p, v, lst[i+1:])
			else:
				#return (None, lst1, lst[i+1:])
				raise PreprocError, "cannot reduce %s" % str(lst)

	elif p == IDENT:
		if len(lst)>1:
			(p2, v2) = lst[1]
			if v2 == "##":
				# token pasting, reevaluate the identifier obtained
				(p3, v3) = lst[2]
				if p3 != IDENT and p3 != NUM and p3 != OP:
					raise PreprocError, "%s: ident expected after '##'" % str(lst)
				return get_expr([(p, v+v3)]+lst[3:], defs, ban)

		if v.lower() == 'defined':
			(p2, v2) = lst[1]
			off = 2
			if v2 == '(':
				(p3, v3) = lst[2]
				if p3 != IDENT: raise PreprocError, 'expected an identifier after a "defined("'
				(p2, v2) = lst[3]
				if v2 != ')': raise PreprocError, 'expected a ")" after a "defined(x"'
				off = 4
			elif p2 != IDENT:
				raise PreprocError, 'expected a "(" or an identifier after a defined'

			x = 0
			if v2 in defs: x = 1
			#return get_expr([(NUM, x)] + lst[off:], defs, ban)
			return (NUM, x, lst[off:])

		elif v not in defs or v in ban:
			if "waf_include" in ban: return (p, v, lst[1:])
			else: return (NUM, 0, lst[1:])

		# tokenize on demand
		if type(defs[v]) is types.StringType:
			v, k = extract_macro(defs[v])
			defs[v] = k
		macro_def = defs[v]

		if not macro_def[0]:
			# simple macro, substitute, and reevaluate
			lst = macro_def[1] + lst[1:]
			return get_expr(lst, defs, ban)
		else:
			# collect the arguments for the funcall
			params = []
			i = 1
			p2, v2 = lst[i]
			if p2 != OP or v2 != '(': raise PreprocError, "invalid function call '%s'" % v

			one_param = []
			count_paren = 0
			try:
				while 1:
					i += 1
					p2, v2 = lst[i]

					if p2 == OP and count_paren == 0:
						if v2 == '(':
							one_param.append((p2, v2))
							count_paren += 1
						elif v2 == ')':
							if one_param: params.append(one_param)
							lst = lst[i+1:]
							break
						elif v2 == ',':
							if not one_param: raise PreprocError, "empty param in funcall %s" % v
							params.append(one_param)
							one_param = []
						else:
							one_param.append((p2, v2))
					else:
						one_param.append((p2, v2))
						if   v2 == '(': count_paren += 1
						elif v2 == ')': count_paren -= 1

			except IndexError:
				raise PreprocError, 'invalid function call %s: missing ")"' % v

			# substitute the arguments within the define expression
			accu = []
			table = macro_def[0]
			for p2, v2 in macro_def[1]:
				if p2 == IDENT and v2 in table: accu += params[table[v2]]
				else:
					if v2 == '__VA_ARGS__':
						# first collect the tokens
						va_toks = []
						st = len(macro_def[0])
						pt = len(params)
						for x in params[pt-st+1:]:
							va_toks.extend(x)
							va_toks.append((OP, ','))
						if va_toks: va_toks.pop() # extra comma
						if len(accu)>1:
							(p3, v3) = accu[-1]
							(p4, v4) = accu[-2]
							if v3 == '##':
								# remove the token paste
								accu.pop()
								if v4 == ',' and pt < st:
									# remove the comma
									accu.pop()
						accu += va_toks
					else:
						accu.append((p2, v2))

			return get_expr(accu + lst, defs, ban+[v])

def process_tokens(lst, defs, ban):
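	"""reduce a token list to a single token, handling operator precedence, the
	ternary operator and the comma operator; returns [(NUM, v)] or [(STR, v)]"""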
	accu = []
	while lst:
		p, v, nlst = get_expr(lst, defs, ban)
		if p == NUM:
			if not nlst: return [(p, v)] # finished

			op1, ov1 = nlst[0]
			if op1 != OP:
				raise PreprocError, "op expected %s" % str(lst)

			if ov1 == '?':
				i = 0
				count_par = 0
				for _, k in nlst:
					if   k == ')': count_par -= 1
					elif k == '(': count_par += 1
					elif k == ':' and count_par == 0: break
					i += 1
				else: raise PreprocError, "ending ':' expected %s" % str(lst)

				if reduce_nums(v, 0, '+'): lst = nlst[1:i]
				else: lst = nlst[i+1:]
				continue

			elif ov1 == ',':
				lst = nlst[1:]
				continue

			p2, v2, nlst = get_expr(nlst[1:], defs, ban)
			if p2 != NUM: raise PreprocError, "num expected after op %s" % str(lst)
			if nlst:
				# op precedence
				op3, ov3 = nlst[0]
				if prec[ov3] < prec[ov1]:
					#print "ov3", ov3, ov1
					# as needed
					p4, v4, nlst2 = get_expr(nlst[1:], defs, ban)
					v5 = reduce_nums(v2, v4, ov3)
					lst = [(p, v), (op1, ov1), (NUM, v5)] + nlst2
					continue

			# no op precedence or empty list, reduce the first tokens
			lst = [(NUM, reduce_nums(v, v2, ov1))] + nlst
			continue

		elif p == STR:
			if nlst: raise PreprocError, "sequence must terminate with a string %s" % str(nlst)
			return [(p, v)]

		return [] # not reducible to a constant expression

def eval_macro(lst, adefs):
	"""evaluate a preprocessor expression and return True if it is non-zero"""
	ret = process_tokens(lst, adefs, [])
	if not ret: raise PreprocError, "missing tokens to evaluate %s" % str(lst)
	p, v = ret[0]
	return int(v) != 0

def try_exists(node, names):
	"""descend from 'node' through the given directory names, checking each
	level on disk; return the final node, or None if a component is missing"""
	lst = names[:]
	while lst:
		name = lst.pop(0)
		# not a build node, else it would have been found already
		path = os.path.join(node.abspath(), name)
		try: os.stat(path)
		except OSError:
			#traceback.print_exc()
			return None
		node = node.find_dir_lst([name])
	return node

class c_parser(object):
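	"""cheap C/C++ preprocessor: evaluates the directives of a file and records
	the includes found as dependencies (nodes, names or plain paths)"""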
	def __init__(self, nodepaths=None, strpaths=None, defines=None):
		#self.lines = txt.split('\n')
		self.lines = []

		if defines is None:
			self.defs  = {}
		else:
			self.defs  = dict(defines) # make a copy
		self.state = []

		self.env   = None # needed for the variant when searching for files

		# include paths
		if strpaths is None:
			self.strpaths = []
		else:
			self.strpaths = strpaths
		self.pathcontents = {}

		self.deps  = []
		self.deps_paths = []

		if nodepaths is None:
			self.m_nodepaths = []
		else:
			self.m_nodepaths = nodepaths
		self.m_nodes = []
		self.m_names = []

		# file currently being preprocessed
		self.curfile = ''
		self.ban_includes = []

		# dynamic cache
		try:
			self.parse_cache = Params.g_build.parse_cache
		except AttributeError:
			Params.g_build.parse_cache = {}
			self.parse_cache = Params.g_build.parse_cache

	def tryfind(self, filename):
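		"""look for an included file in the include paths, record it as a
		dependency and queue its preprocessor lines for parsing"""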
		self.curfile = filename
		global g_findall
		if self.m_nodepaths:
			found = 0
			for n in self.m_nodepaths:
				found = n.find_source(filename, create=0)
				if found:
					break
			# second pass for unreachable folders
			if not found and g_findall:
				lst = filename.split('/')
				if len(lst)>1:
					lst=lst[:-1] # take the folders only
					try: cache = Params.g_build.preproc_cache
					except AttributeError: Params.g_build.preproc_cache = cache = {}
					key = hash( (str(self.m_nodepaths), str(lst)) )
					if not cache.get(key, None):
						cache[key] = 1
						for n in self.m_nodepaths:
							node = try_exists(n, lst)
							if node:
								found = n.find_source(filename, create=0)
								if found: break
			if found:
				self.m_nodes.append(found)
				# the Qt-generated .moc files may not exist yet, do not scan them
				if filename[-4:] != '.moc': self.addlines(found.abspath(self.env))
			if not found:
				if not filename in self.m_names:
					self.m_names.append(filename)
		else:
			found = 0
			for p in self.strpaths:
				if p not in self.pathcontents:
					self.pathcontents[p] = os.listdir(p)
				if filename in self.pathcontents[p]:
					#print "file %s found in path %s" % (filename, p)
					np = os.path.join(p, filename)
					# screw Qt two times
					if filename[-4:] != '.moc': self.addlines(np)
					self.deps_paths.append(np)
					found = 1
			if not found:
				pass
				#error("could not find %s " % filename)

	def addlines(self, filepath):
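		"""prepend the filtered lines of 'filepath' to the lines to process,
		going through the per-build parse cache to avoid re-reading files"""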
		pc = self.parse_cache
		debug("reading file %r" % filepath, 'preproc')
		if filepath in pc:
			self.lines = pc[filepath] + self.lines
			return

		try:
			lines = filter_comments(filepath)
			pc[filepath] = lines # memorize the lines filtered
			self.lines = lines + self.lines
		except IOError:
			raise PreprocError, "could not read the file %s" % filepath
		except Exception:
			if Params.g_verbose > 0:
				warning("parsing %s failed" % filepath)
				traceback.print_exc()

	def start(self, node, env):
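		"""preprocess a source node: the defines from env['DEFLINES'] are seeded
		first, then the directive lines are consumed one by one"""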
		debug("scanning %s (in %s)" % (node.m_name, node.m_parent.m_name), 'preproc')

		self.env = env
		variant = node.variant(env)

		self.addlines(node.abspath(env))
		if env['DEFLINES']:
			self.lines = [('define', x) for x in env['DEFLINES']] + self.lines

		while self.lines:
			(token, line) = self.lines.pop(0)
			try:
				self.process_line(token, line)
			except Exception, ex:
				if Params.g_verbose:
					warning("line parsing failed (%s): %s" % (str(ex), line))
					traceback.print_exc()

	# debug only
	def start_local(self, filename):
		self.addlines(filename)
		#print self.lines
		while self.lines:
			(token, line) = self.lines.pop(0)
			try:
				self.process_line(token, line)
			except Exception, ex:
				if Params.g_verbose:
					warning("line parsing failed (%s): %s" % (str(ex), line))
					traceback.print_exc()
				raise

	def process_line(self, token, line):
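		"""process a single preprocessor directive: maintain the if/else state
		stack and collect the includes seen in live branches"""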
		ve = Params.g_verbose
		if ve: debug("line is %s - %s state is %s" % (token, line, self.state), 'preproc')
		state = self.state

		# push a new state when entering an 'if' block; 'endif' pops it
		if token in ['ifdef', 'ifndef', 'if']:
			state.append(undefined)
		elif token == 'endif':
			state.pop()

		# while in a dead 'if' branch, skip everything until the matching endif
		if token not in ('else', 'elif', 'endif'):
			if skipped in self.state or ignored in self.state:
				return

		if token == 'if':
			ret = eval_macro(tokenize(line), self.defs)
			if ret: state[-1] = accepted
			else: state[-1] = ignored
		elif token == 'ifdef':
			m = re_mac.search(line)
			if m and m.group(0) in self.defs: state[-1] = accepted
			else: state[-1] = ignored
		elif token == 'ifndef':
			m = re_mac.search(line)
			if m and m.group(0) in self.defs: state[-1] = ignored
			else: state[-1] = accepted
		elif token == 'include' or token == 'import':
			(kind, inc) = extract_include(line, self.defs)
			if inc in self.ban_includes: return
			if token == 'import': self.ban_includes.append(inc)
			if ve: debug("include found %s    (%s) " % (inc, kind), 'preproc')
			if kind == '"' or not strict_quotes:
				if not inc in self.deps:
					self.deps.append(inc)
				self.tryfind(inc)
		elif token == 'elif':
			if state[-1] == accepted:
				state[-1] = skipped
			elif state[-1] == ignored:
				if eval_macro(tokenize(line), self.defs):
					state[-1] = accepted
		elif token == 'else':
			if state[-1] == accepted: state[-1] = skipped
			elif state[-1] == ignored: state[-1] = accepted
		elif token == 'define':
			m = re_mac.search(line)
			if m:
				name = m.group(0)
				if ve: debug("define %s   %s" % (name, line), 'preproc')
				self.defs[name] = line
			else:
				raise PreprocError, "invalid define line %s" % line
		elif token == 'undef':
			m = re_mac.search(line)
			if m and m.group(0) in self.defs:
				del self.defs[m.group(0)]
		elif token == 'pragma':
			if re_pragma_once.search(line):
				self.ban_includes.append(self.curfile)

def extract_macro(txt):
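	"""parse the body of a '#define' line; return (name, [param table, body
	tokens]) for function-like macros, where the table maps parameter names to
	their positions, or (name, [[], body tokens]) for object-like macros"""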
	t = tokenize(txt)
	if re_fun.search(txt):
		p, name = t[0]

		p, v = t[1]
		if p != OP: raise PreprocError, "expected open parenthesis"

		i = 1
		pindex = 0
		params = {}
		wantident = 1

		while 1:
			i += 1
			p, v = t[i]

			if wantident:
				if p == IDENT:
					params[v] = pindex
					pindex += 1
				elif v == '...':
					pass
				else:
					raise PreprocError, "expected ident"
			else:
				if v == ',':
					pass
				elif v == ')':
					break
				elif v == '...':
					raise PreprocError, "not implemented"
			wantident = not wantident

		return (name, [params, t[i+1:]])
	else:
		(p, v) = t[0]
		return (v, [[], t[1:]])

re_include = re.compile('^\s*(<(?P<a>.*)>|"(?P<b>.*)")')
def extract_include(txt, defs):
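	"""parse the body of an '#include' line and return (delimiter, file name);
	computed includes are macro-expanded before extracting the name"""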
	m = re_include.search(txt)
	if m:
		if m.group('a'): return '<', m.group('a')
		if m.group('b'): return '"', m.group('b')

	# perform preprocessing and look at the result, it must match an include
	tokens = process_tokens(tokenize(txt), defs, ['waf_include'])
	p, v = tokens[0]
	if p != STR: raise PreprocError, "could not parse include %s" % txt
	return ('"', v)

def parse_char(txt):
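	r"""return the numeric value of a char literal body such as a, \n or \x41"""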
	if not txt: raise PreprocError, "attempted to parse a null char"
	if txt[0] != '\\':
		return ord(txt)
	c = txt[1]
	if c == 'x':
		return int(txt[2:], 16)
	elif c.isdigit():
		if c == '0' and len(txt) == 2: return 0
		for i in 3, 2, 1:
			if len(txt) > i and txt[1:1+i].isdigit():
				return int(txt[1:1+i], 8)
	else:
		try: return chr_esc[c]
		except KeyError: raise PreprocError, "could not parse char literal '%s'" % txt

def tokenize(s):
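	"""split a string into a list of (token type, value) tuples; for instance
	tokenize("1 + a") should give [(NUM, '1'), (OP, '+'), (IDENT, 'a')]
	(numeric values may stay strings, reduce_nums coerces them later)"""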
	ret = []
	for match in re_clexer.finditer(s):
		m = match.group
		for name in tok_types:
			v = m(name)
			if v:
				if name == IDENT:
					try: v = g_optrans[v]; name = OP
					except KeyError:
						# c++ specific
						if v.lower() == "true":
							v = 1
							name = NUM
						elif v.lower() == "false":
							v = 0
							name = NUM
				elif name == NUM:
					if m('oct'): v = int(v, 8)
					elif m('hex'): v = int(m('hex'), 16)
					elif m('n0'): v = m('n0')
					else:
						v = m('char')
						if v: v = parse_char(v)
						else: v = m('n2') or m('n4')
				elif name == OP:
					if v == '%:': v='#'
					elif v == '%:%:': v='##'

				ret.append((name, v))
				break
	return ret

# quick test #
if __name__ == "__main__":
	Params.g_verbose = 2
	Params.g_zones = ['preproc']
	class dum:
		def __init__(self):
			self.parse_cache = {}
	Params.g_build = dum()

	try: arg = sys.argv[1]
	except IndexError: arg = "file.c"

	paths = ['.']
	f = open(arg, "r"); txt = f.read(); f.close()

	m1   = [[], [(NUM, 1), (OP, '+'), (NUM, 2)]]
	fun1 = [[(IDENT, 'x'), (IDENT, 'y')], [(IDENT, 'x'), (OP, '##'), (IDENT, 'y')]]
	fun2 = [[(IDENT, 'x'), (IDENT, 'y')], [(IDENT, 'x'), (OP, '*'), (IDENT, 'y')]]

	def test(x):
		y = process_tokens(tokenize(x), {'m1':m1, 'fun1':fun1, 'fun2':fun2}, [])
		#print x, y

	test("0&&2<3")
	test("(5>1)*6")
	test("1+2+((3+4)+5)+6==(6*7)/2==1*-1*-1")
	test("1,2,3*9,9")
	test("1?77:88")
	test("0?77:88")
	test("1?1,(0?5:9):3,4")
	test("defined inex")
	test("defined(inex)")
	test("m1*3")
	test("7*m1*3")
	test("fun1(m,1)")
	test("fun2(2, fun1(m, 1))")
	#test("foo##.##h")

	gruik = c_parser(strpaths = paths)
	gruik.start_local(arg)
	print "we have found the following dependencies"
	print gruik.deps
	print gruik.deps_paths

	#print tokenize(txt)