File: parsing.py

package info (click to toggle)
qgis 2.18.28%2Bdfsg-2
links: PTS, VCS
area: main
in suites: buster
size: 1,007,948 kB
sloc: cpp: 671,774; python: 158,539; xml: 35,690; ansic: 8,346; sh: 1,766; perl: 1,669; sql: 999; yacc: 836; lex: 461; makefile: 292
file content (187 lines) | stat: -rw-r--r-- 6,057 bytes
# -*- coding: utf-8 -*-

"""
***************************************************************************
    parsing.py
    ---------------------
    Copyright            : (C) 2013 by CS Systemes d'information (CS SI)
    Email                : otb at c-s dot fr (CS SI)
    Contributors         : Julien Malik (CS SI)
                           Oscar Picas (CS SI)
***************************************************************************
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
***************************************************************************
"""
__author__ = 'Julien Malik, Oscar Picas'
__copyright__ = '(C) 2013, CS Systemes d\'information  (CS SI)'

from collections import namedtuple
import re


def merge_pairs(list, should_merge, merge):
    """
    Walks list from left to right and combines neighbouring elements.

    Whenever should_merge(a, b) is true for an element a and its successor
    b, the single value merge(a, b) is emitted and both elements are
    consumed; otherwise a is emitted unchanged and b becomes the next
    candidate.  Returns a new list; the input is not modified.
    """
    merged = []
    total = len(list)
    pos = 0
    while pos < total:
        if pos + 1 == total:
            # Last element has no successor left to pair with.
            merged.append(list[pos])
            break
        left, right = list[pos], list[pos + 1]
        if should_merge(left, right):
            merged.append(merge(left, right))
            pos += 2
        else:
            merged.append(left)
            pos += 1
    return merged

# Record types for the nodes of a parsed CMakeLists file.
QuotedString = namedtuple('QuotedString', ['contents', 'comments'])
# The underscore-prefixed types are instantiated through the Arg() and
# Command() factory functions defined in this module.
_Arg = namedtuple('Arg', ['contents', 'comments'])
_Command = namedtuple('Command', ['name', 'body', 'comment'])
# Field-less marker record; parse_file yields one for an empty line.
BlankLine = namedtuple('BlankLine', [])


class File(list):
    """
    List of the top-level items of a parsed file, with a labelled repr.
    """

    def __repr__(self):
        # Copy into a plain list so the inner part uses the builtin list repr.
        return 'File(%r)' % (list(self),)


class Comment(str):
    """
    Text of a comment from a CMakeLists file, stored as a string subclass.
    """

    def __repr__(self):
        # str() rather than the Python-2-only unicode() builtin: __repr__ is
        # expected to return a native string, unicode() does not exist on
        # Python 3, and on Python 2 it raises for non-ASCII byte strings.
        return 'Comment(' + str(self) + ')'


def Arg(contents, comments=None):
    """
    Builds an argument record from its contents and attached comments.

    A missing or empty comments value is replaced by a fresh empty list.
    """
    if not comments:
        comments = []
    return _Arg(contents=contents, comments=comments)


def Command(name, body, comment=None):
    """
    Builds a command record from its name, its body and an optional comment.
    """
    return _Command(name=name, body=body, comment=comment)


class CMakeParseError(Exception):
    """
    Raised by the parser for an unexpected token or a command that is
    still open when the input ends.
    """


def prettify(s):
    """
    Parses the CMakeLists text s and returns the parsed File converted
    to text with unicode().
    """
    tree = parse(s)
    return unicode(tree)


def parse(s):
    '''
    Parses the string s, given in CMakeLists format, and returns a File
    holding its top-level items.

    The text is tokenized, grouped into items tagged with their line
    numbers, comments are attached to commands, and finally the line
    numbers are dropped.
    '''
    # Materialize the items: the merging step needs len() and indexing.
    numbered = list(parse_file(tokenize(s)))
    numbered = attach_comments_to_commands(numbered)
    return File(item for _, item in numbered)


def parse_file(toks):
    '''
    Yields (line numbers, item) pairs for the top-level elements of a
    CMakeLists file, given the tokens produced for that file.

    toks has to be a single shared iterator rather than a list:
    parse_command pulls the tokens of a command from the very same
    object, so this loop resumes after the command's closing paren.
    '''
    previous_kind = 'newline'
    for lineno, (kind, text) in toks:
        if kind == 'word':
            # A word at top level starts a command; the helper consumes
            # the rest of the command from toks.
            yield parse_command(lineno, text, toks)
        elif kind == 'comment':
            yield ([lineno], Comment(text))
        elif kind == 'newline' and previous_kind == 'newline':
            # Two newlines in a row mean an empty line.
            yield ([lineno], BlankLine())
        previous_kind = kind


def attach_comments_to_commands(nodes):
    """
    Returns a copy of nodes in which every comment that shares a line with
    the command right before it has been folded into that command.
    """
    return merge_pairs(nodes,
                       should_merge=command_then_comment,
                       merge=attach_comment_to_command)


def command_then_comment(a, b):
    """
    Tells whether a is a command and b a comment on one of a's lines.

    a and b are (line numbers, item) pairs.  The result is False when the
    item types do not match, otherwise the set of shared line numbers,
    which is empty (falsy) when the two do not touch.
    """
    lines_a, node_a = a
    lines_b, node_b = b
    if not isinstance(node_a, _Command):
        return False
    if not isinstance(node_b, Comment):
        return False
    return set(lines_a).intersection(lines_b)


def attach_comment_to_command(lnums_command, lnums_comment):
    """
    Merges a (line numbers, command) pair with a (line numbers, comment)
    pair into one pair: the command's line numbers plus a new command
    carrying the comment and a copy of the original body.
    """
    lines, command = lnums_command
    _comment_lines, comment = lnums_comment
    annotated = Command(name=command.name,
                        body=command.body[:],
                        comment=comment)
    return lines, annotated


def parse_command(start_line_num, command_name, toks):
    """
    Parses the parenthesized argument list of one command.

    start_line_num is the line of the command name, command_name is that
    name (already taken from the stream by the caller) and toks is the
    token iterator positioned right after it.

    Returns a pair (line_nums, command); line_nums runs from the line of
    the command name to the line of the closing parenthesis.  A comment
    inside the parentheses is appended to the comments of the argument
    before it.  Comments found before the first argument are joined with
    newlines and stored in the command's comment field.

    Raises CMakeParseError if the next token is not a left parenthesis or
    if the tokens run out before the closing parenthesis, and ValueError
    on a nested left parenthesis.
    """
    body = []
    # Comments seen before any argument.  There is no Arg to hang them on
    # and the command record is an immutable namedtuple without a
    # "comments" list, so they are gathered here and applied on return.
    leading_comments = []
    expect('left paren', toks)
    for line_num, (typ, tok_contents) in toks:
        if typ == 'right paren':
            line_nums = range(start_line_num, line_num + 1)
            comment = None
            if leading_comments:
                comment = Comment('\n'.join(leading_comments))
            cmd = Command(name=command_name, body=body, comment=comment)
            return line_nums, cmd
        elif typ == 'left paren':
            raise ValueError('Unexpected left paren at line %s' % line_num)
        elif typ in ('word', 'string'):
            body.append(Arg(tok_contents, []))
        elif typ == 'comment':
            if body:
                body[-1].comments.append(tok_contents)
            else:
                leading_comments.append(tok_contents)
    # The loop only ends without returning when the tokens are exhausted.
    msg = 'File ended while processing command "%s" started at line %s' % (
        command_name, start_line_num)
    raise CMakeParseError(msg)


def expect(expected_type, toks):
    """
    Consumes the next token from the iterator toks and checks its type.

    Raises CMakeParseError if the token's type differs from expected_type
    or if there is no token left.
    """
    # The next() builtin works on Python 2.6+ and 3, unlike the .next()
    # method.  The default keeps StopIteration from escaping: inside the
    # parse_file generator it would silently end parsing on Python 2.
    tok = next(toks, None)
    if tok is None:
        msg = 'Expected a %s, but the file ended' % expected_type
        raise CMakeParseError(msg)
    line_num, (typ, tok_contents) = tok
    if typ != expected_type:
        msg = 'Expected a %s, but got "%s" at line %s' % (
            expected_type, tok_contents, line_num)
        raise CMakeParseError(msg)

# http://stackoverflow.com/questions/691148/pythonic-way-to-implement-a-tokenizer
# Rules are tried in order at each position; every callback returns a
# (token type, matched text) pair.
scanner = re.Scanner([
    (r'#.*', lambda scanner, token: ("comment", token)),
    (r'"[^"]*"', lambda scanner, token: ("string", token)),
    (r"\(", lambda scanner, token: ("left paren", token)),
    (r"\)", lambda scanner, token: ("right paren", token)),
    (r'[^ \t\r\n()#"]+', lambda scanner, token: ("word", token)),
    (r'\n', lambda scanner, token: ("newline", token)),
    # Skip other whitespace, but never a newline: a plain \s+ would also
    # swallow the newlines that follow trailing blanks or a '\r', losing
    # newline tokens and throwing off the line count.
    (r"[^\S\n]+", None),
])


def tokenize(s):
    """
    Yields pairs of the form (line_num, (token_type, token_contents))
    given a string containing the contents of a CMakeLists file.

    line_num is the line on which the token starts and token_contents is
    stripped of surrounding whitespace.  Raises ValueError, on the first
    iteration, if part of s cannot be split into tokens.
    """
    toks, remainder = scanner.scan(s)
    if remainder != '':
        # The scanner stops at the first character it cannot match, so the
        # failing line is found by counting newlines in the part consumed.
        consumed = len(s) - len(remainder)
        line_num = s.count('\n', 0, consumed) + 1
        msg = 'Unrecognized tokens at line %s: %s' % (line_num, remainder)
        raise ValueError(msg)
    line_num = 1
    for tok_type, tok_contents in toks:
        yield line_num, (tok_type, tok_contents.strip())
        # Count on the unstripped text: newline tokens and multi-line
        # strings both advance the line counter.
        line_num += tok_contents.count('\n')