File: mwrap_lexer.py

"""
mwrap_lexer.py — Line-oriented lexer for .mw files.

Copyright (c) 2007-2008  David Bindel
See the file COPYING for copying permissions

Converted to Python by Zydrunas Gimbutas (2026),
with assistance from Claude Code / Claude Opus 4.6 (Anthropic).
"""

import re
import sys
from enum import Enum, auto
from dataclasses import dataclass
from typing import Optional, TextIO, List


class TokenType(Enum):
    ID        = auto()
    NUMBER    = auto()
    STRING    = auto()
    NEW       = auto()
    FORTRAN   = auto()
    INPUT     = auto()
    OUTPUT    = auto()
    INOUT     = auto()
    CLASS     = auto()
    TYPEDEF   = auto()
    CPU       = auto()
    GPU       = auto()
    PUNCT     = auto()      # single characters: ( ) , ; * & [ ] . - > = :
    NON_C_LINE = auto()
    EOF       = auto()


KEYWORDS = {
    "new":      TokenType.NEW,
    "FORTRAN":  TokenType.FORTRAN,
    "input":    TokenType.INPUT,
    "output":   TokenType.OUTPUT,
    "inout":    TokenType.INOUT,
    "class":    TokenType.CLASS,
    "typedef":  TokenType.TYPEDEF,
    "cpu":      TokenType.CPU,
    "gpu":      TokenType.GPU,
}

# Regex for tokenising a '#' line body
_TOKEN_RE = re.compile(
    r"(//[^\n]*)"               # comment (rest of line) — skip
    r"|('[^'\n]*'?)"            # string literal (single-quoted)
    r"|((?:::)?[_a-zA-Z][_a-zA-Z0-9]*(?:::(?:[_a-zA-Z][_a-zA-Z0-9]*))*)"  # ID (may have ::)
    r"|([0-9]+)"                # number
    r"|([->()\[\],;*&=:.])"     # punctuation
    r"|[ \t\r]+"                # whitespace — skip
)
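
# Illustrative example (not in the original source): the '#' line body
#     double foo(input double* x);
# tokenizes as ID('double'), ID('foo'), PUNCT('('), INPUT('input'),
# ID('double'), PUNCT('*'), ID('x'), PUNCT(')'), PUNCT(';').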


@dataclass
class Token:
    type: TokenType
    value: str
    line: int


def _is_name_char(c):
    return c.isalnum() or c == '_'


def _fname_scan_line(text):
    """Extract function name from '@function ...' tail, return name.m."""
    paren = text.find('(')
    if paren < 0:
        paren = len(text)
    # Walk back from paren to find last alnum
    end = paren
    while end > 0 and not _is_name_char(text[end - 1]):
        end -= 1
    start = end
    while start > 0 and _is_name_char(text[start - 1]):
        start -= 1
    name = text[start:end]
    return name + ".m"


class Lexer:
    """Line-oriented lexer for .mw files.

    Usage:
        lex = Lexer(outfp, outcfp, mbatching_flag, listing_flag)
        for tok in lex.lex_file(filename):
            ...   # only tokens from '#' lines; NON_C_LINE for other lines
    """

    def __init__(self, outfp=None, outcfp=None,
                 mbatching_flag=False, listing_flag=False):
        self.outfp: Optional[TextIO] = outfp
        self.outcfp: Optional[TextIO] = outcfp
        self.mbatching_flag: bool = mbatching_flag
        self.listing_flag: bool = listing_flag
        self.linenum: int = 0
        self.current_ifname: str = ""

        # File include stack
        self._file_stack: List = []          # [(fp, linenum, ifname), ...]
        self._current_fp: Optional[TextIO] = None

    # ------------------------------------------------------------------
    # public interface
    # ------------------------------------------------------------------

    def lex_file(self, filename):
        """Yield Token objects from *filename*."""
        try:
            fp = open(filename, "r")
        except OSError:
            print(f"Error: Could not read '{filename}'",
                  file=sys.stderr)
            sys.exit(1)
        self._current_fp = fp
        self.current_ifname = filename
        self.linenum = 1
        try:
            yield from self._lex_stream()
        finally:
            self._current_fp.close()

    # ------------------------------------------------------------------
    # directive handlers
    # ------------------------------------------------------------------

    def _handle_block_c(self, line):
        """Process a line in block C mode. Returns False when block ends."""
        stripped = line.rstrip('\r\n')
        if re.match(r'^\$\][ \t\r]*$', stripped):
            self.linenum += 1
            return False
        if self.outcfp:
            self.outcfp.write(line)
        self.linenum += 1
        return True

    def _handle_comment(self):
        """Handle // comment line."""
        self.linenum += 1
        yield Token(TokenType.NON_C_LINE, "", self.linenum - 1)

    def _handle_function(self, stripped):
        """Handle @function directive."""
        tail = stripped[len("@function"):].rstrip('\r\n')
        fname = _fname_scan_line(tail)
        if self.mbatching_flag:
            if self.outfp:
                self.outfp.close()
            try:
                self.outfp = open(fname, "w")
            except OSError:
                print(f"Error: Could not write {fname}",
                      file=sys.stderr)
                sys.exit(1)
        if self.listing_flag:
            print(fname)
        if self.outfp:
            self.outfp.write(f"function{tail}\n")
        self.linenum += 1
        yield Token(TokenType.NON_C_LINE, "", self.linenum - 1)

    def _handle_include(self, stripped):
        """Handle @include directive. Pushes current file onto stack."""
        rest = stripped[len("@include"):].strip()
        if len(self._file_stack) >= 10:
            print("Error: Includes nested too deeply",
                  file=sys.stderr)
            sys.exit(1)
        self._file_stack.append(
            (self._current_fp, self.linenum, self.current_ifname))
        try:
            new_fp = open(rest, "r")
        except OSError:
            print(f"Error: Could not read '{rest}'",
                  file=sys.stderr)
            sys.exit(1)
        self.current_ifname = rest
        self.linenum = 1
        self._current_fp = new_fp

    def _handle_redirect(self, stripped):
        """Handle @ redirect directive."""
        rest = stripped[1:].strip()
        if self.mbatching_flag:
            if self.outfp:
                self.outfp.close()
                self.outfp = None
            if rest:
                try:
                    self.outfp = open(rest, "w")
                except OSError:
                    print(f"Error: Could not write {rest}",
                          file=sys.stderr)
                    sys.exit(1)
        if self.listing_flag and rest:
            print(rest)
        self.linenum += 1
        yield Token(TokenType.NON_C_LINE, "", self.linenum - 1)

    def _handle_dollar_line(self, stripped):
        """Handle $ single-line C pass-through."""
        rest = stripped[1:]
        if self.outcfp:
            self.outcfp.write(rest)
        self.linenum += 1
        yield Token(TokenType.NON_C_LINE, "", self.linenum - 1)

    def _handle_hash_line(self, stripped):
        """Handle # C declaration line — tokenise."""
        body = stripped[1:].rstrip('\r\n')
        yield from self._tokenize_c_line(body, self.linenum)
        self.linenum += 1

    # ------------------------------------------------------------------
    # internal line-by-line driver
    # ------------------------------------------------------------------

    def _lex_stream(self):
        """Process all lines from self._current_fp, yielding tokens."""
        in_block_c = False

        while True:
            raw_line = self._current_fp.readline()
            if raw_line == "":
                # End of current file
                if self._file_stack:
                    self._current_fp.close()
                    self._current_fp, self.linenum, self.current_ifname = self._file_stack.pop()
                    continue
                else:
                    return       # real EOF

            # Keep the raw line (trailing newline included); individual
            # handlers strip it as needed.
            line = raw_line

            # --- block C mode ($[ ... $]) ---
            if in_block_c:
                in_block_c = self._handle_block_c(line)
                continue

            # Determine line prefix — compute leading whitespace
            stripped = line.lstrip(' \t')
            leading_ws = line[:len(line) - len(stripped)]

            # In the original Flex lexer, leading [ \t] in INITIAL state is
            # always written to outfp regardless of what prefix follows.
            # We replicate this for all prefix types except pure text lines
            # (which include their own leading whitespace in the full line).
            if leading_ws and self.outfp and (
                    stripped.startswith("$") or
                    stripped.startswith("#") or
                    stripped.startswith("@") or
                    stripped.startswith("//")):
                self.outfp.write(leading_ws)

            # $[ block start
            if re.match(r'^\$\[[ \t\r]*\n?$', stripped):
                in_block_c = True
                self.linenum += 1
                continue

            if stripped.startswith("//"):
                yield from self._handle_comment()
                continue

            if stripped.startswith("@function"):
                yield from self._handle_function(stripped)
                continue

            if stripped.startswith("@include"):
                self._handle_include(stripped)
                continue

            if stripped.startswith("@"):
                yield from self._handle_redirect(stripped)
                continue

            if stripped.startswith("$") and not stripped.startswith("$["):
                yield from self._handle_dollar_line(stripped)
                continue

            if stripped.startswith("#"):
                yield from self._handle_hash_line(stripped)
                continue

            # Text line — copy to MATLAB output
            if self.outfp:
                self.outfp.write(line)
            self.linenum += 1
            yield Token(TokenType.NON_C_LINE, "", self.linenum - 1)

    # ------------------------------------------------------------------
    # tokenise a single '#' line body
    # ------------------------------------------------------------------

    def _tokenize_c_line(self, body, line):
        """Yield tokens for the body of a '#' line."""
        for m in _TOKEN_RE.finditer(body):
            comment, string, ident, number, punct = m.groups()
            if comment:
                break        # rest of line is a comment
            if string:
                yield Token(TokenType.STRING, string, line)
            elif ident:
                tt = KEYWORDS.get(ident, TokenType.ID)
                yield Token(tt, ident, line)
            elif number:
                yield Token(TokenType.NUMBER, number, line)
            elif punct:
                yield Token(TokenType.PUNCT, punct, line)
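

# ----------------------------------------------------------------------
# Minimal demo driver (a sketch, not part of the original mwrap tool):
# tokenize a .mw file named on the command line and print the tokens
# produced from its '#' lines, skipping NON_C_LINE markers.
# ----------------------------------------------------------------------
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} file.mw", file=sys.stderr)
        sys.exit(1)
    demo_lexer = Lexer()      # no MATLAB or C output files, tokens only
    for tok in demo_lexer.lex_file(sys.argv[1]):
        if tok.type is not TokenType.NON_C_LINE:
            print(f"{tok.line}: {tok.type.name} {tok.value!r}")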