File: advanced.re

package info (click to toggle)
re2c 4.4-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 51,512 kB
  • sloc: cpp: 34,160; ml: 8,494; sh: 5,311; makefile: 1,014; haskell: 611; python: 431; ansic: 234; javascript: 113
file content (261 lines) | stat: -rw-r--r-- 8,541 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
// re2zig $INPUT -o $OUTPUT -cf --recursive-functions -Wno-nondeterministic-tags

const std = @import("std");

// Use a small buffer to cover the case when a lexeme doesn't fit.
// In real world use a larger buffer.
const bufsize = 100;
const none = std.math.maxInt(usize);
const mtag_root = none - 1;

/*!conditions:re2c*/

const State = struct {
    allocator: std.mem.Allocator,
    file: *std.io.Reader,
    yyinput: [bufsize + 1]u8,
    yycursor: usize,
    yymarker: usize,
    yylimit: usize,
    token: usize,
    yycond: u32,
    yystate: i32,
    trie: *std.ArrayList(MtagElem),
    /*!stags:re2c format = "@@: usize,\n"; */
    /*!mtags:re2c format = "@@: usize,\n"; */
    l1: usize,
    l2: usize,
    f1: usize,
    f2: usize,
    p1: usize,
    p2: usize,
    p3: usize,
    p4: usize,
    yyaccept: u32,
};

const Status = enum {
    end,
    ready,
    waiting,
    bad_packet,
    big_packet,
};

// An m-tag tree is a way to store histories with an O(1) copy operation.
// Histories naturally form a tree, as they have common start and fork at some
// point. The tree is stored as an array of pairs (tag value, link to parent).
// An m-tag is represented with a single link in the tree (array index).
const MtagElem = struct {
    elem: usize, // tag value
    pred: usize, // index of the predecessor node or root
};

// Append a single value to an m-tag history.
fn add_mtag(st: *State, mtag: usize, value: usize) !usize {
    try st.trie.append(st.allocator, MtagElem{.elem = value, .pred = mtag});
    return st.trie.items.len - 1;
}

// Recursively unwind tag histories and collect version components.
fn unwind(st: *State, x: usize, y: usize) !std.ArrayList([]const u8) {
    // Reached the root of the m-tag tree, stop recursion.
    if (x == mtag_root and y == mtag_root) {
        return try std.ArrayList([]const u8).initCapacity(st.allocator, 0);
    }

    // Unwind history further.
    var ss = try unwind(st, st.trie.items[x].pred, st.trie.items[y].pred);

    // Get tag values. Tag histories must have equal length.
    std.debug.assert(x != mtag_root and y != mtag_root);
    const ex = st.trie.items[x].elem;
    const ey = st.trie.items[y].elem;

    if (ex != none and ey != none) {
        // Both tags are valid string indices, extract component.
        const s = try std.mem.Allocator.dupe(st.allocator, u8, st.yyinput[ex..ey]);
        try ss.append(st.allocator, s);
    } else {
        // Both tags are none (this corresponds to zero repetitions).
        std.debug.assert(ex == none and ey == none);
    }

    return ss;
}

fn s2n(str: []const u8) u32 { // convert a pre-parsed string to a number
    var n: u32 = 0;
    for (str) |c| { n = n * 10 + (c - 48); }
    return n;
}

fn fill(st: *State) Status {
    const used = st.yylimit - st.token;
    const free = bufsize - used;

    // Error: lexeme too long. In real life can reallocate a larger buffer.
    if (free < 1) return Status.big_packet;

    // Shift buffer contents (discard everything up to the current token).
    std.mem.copyBackwards(u8, st.yyinput[0..used], st.yyinput[st.token..st.yylimit]);
    st.yycursor -= st.token;
    st.yymarker = @subWithOverflow(st.yymarker, st.token)[0];
    st.yylimit -= st.token;
    // Tag variables need to be shifted like other input positions. The check
    // for NONE is only needed if some tags are nested inside of alternative or
    // repetition, so that they can have NONE value.
    /*!stags:re2c format = "if (st.@@ != none) st.@@ = @subWithOverflow(st.@@, st.token)[0];\n"; */
    st.token = 0;

    // Fill free space at the end of buffer with new data from file.
    st.yylimit += st.file.readSliceShort(st.yyinput[st.yylimit..bufsize]) catch 0;
    st.yyinput[st.yylimit] = 0; // append sentinel symbol

    return Status.ready;
}

/*!re2c
    re2c:api = record;
    re2c:eof = 0;
    re2c:tags = 1;
    re2c:tags:negative = "none";
    re2c:variable:yyrecord = st;
    re2c:define:YYFN = ["lex;Status", "st;*State"];
    re2c:define:YYFILL = "return Status.waiting;";
    re2c:define:YYMTAGP = "@@ = add_mtag(st, @@, st.yycursor) catch none;";
    re2c:define:YYMTAGN = "@@ = add_mtag(st, @@, none) catch none;";

    crlf  = '\r\n';
    sp    = ' ';
    htab  = '\t';
    ows   = (sp | htab)*;
    digit = [0-9];
    alpha = [a-zA-Z];
    vchar = [\x1f-\x7e];
    tchar = [-!#$%&'*+.^_`|~] | digit | alpha;

    obs_fold            = #f1 crlf (sp | htab)+ #f2;
    obs_text            = [\x80-\xff];
    field_name          = tchar+;
    field_vchar         = vchar | obs_text;
    field_content       = field_vchar ((sp | htab)+ field_vchar)?;
    field_value_folded  = (field_content* obs_fold field_content*)+;
    header_field_folded = field_value_folded ows;
    token               = tchar+;
    qdtext
        = htab
        | sp
        | [\x21-\x5B\x5D-\x7E] \ '"'
        | obs_text;
    quoted_pair         = '\\' ( htab | sp | vchar | obs_text );
    quoted_string       = '"' ( qdtext | quoted_pair )* '"';
    parameter           = #p1 token #p2 '=' #p3 ( token | quoted_string ) #p4;
    media_type          = @l1 token '/' token @l2 ( ows ';' ows parameter )*;

    <media_type> media_type ows crlf {
        const mt = st.yyinput[st.l1..st.l2];
        std.log.debug("media type: {s}", .{mt});

        const pnames = unwind(st, st.p1, st.p2) catch null;
        std.log.debug("pnames: {any}", .{pnames});

        const pvals = unwind(st, st.p3, st.p4) catch null;
        std.log.debug("pvals: {any}", .{pvals});

        st.token = st.yycursor;
        return lex(st);
    }

    <header> header_field_folded crlf {
        const folds = unwind(st, st.f1, st.f2) catch null;
        std.log.debug("folds: {any}", .{folds});

        st.token = st.yycursor;
        return lex(st);
    }

    <*> * { return Status.bad_packet; }
    <*> $ { return Status.end; }
*/

fn run(expect: Status, packets: []const []const u8) !void {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();

    // Create a "pipe" (open the same file for reading and writing).
    const fname = "pipe";
    var fw = try std.fs.cwd().createFile(fname, .{});
    var fr = try std.fs.cwd().openFile(fname, .{ .mode = .read_only});

    // Initialize lexer state: `state` value is -1, all offsets are at the end
    // of buffer. Use unbuffered reader - lexer does its own buffering.
    const alc = arena.allocator();
    const zerobuf: [0]u8 = undefined;
    var reader = fr.reader(&zerobuf);
    var mt = try std.ArrayList(MtagElem).initCapacity(alc, 0);
    defer mt.deinit(alc);
    var st = State{
        .allocator = alc,
        .file = &reader.interface,
        .yyinput = undefined,
        .yycursor = bufsize,
        .yymarker = bufsize,
        .yylimit = bufsize,
        .token = bufsize,
        .yycond = yycmedia_type,
        .yystate = -1,
        .trie = &mt,
        /*!stags:re2c format = '.@@ = none,\n'; */
        /*!mtags:re2c format = '.@@ = mtag_root,\n'; */
        .l1 = none,
        .l2 = none,
        .f1 = mtag_root,
        .f2 = mtag_root,
        .p1 = mtag_root,
        .p2 = mtag_root,
        .p3 = mtag_root,
        .p4 = mtag_root,
        .yyaccept = 0,
    };
    // Sentinel at `yylimit` offset is set to zero, which triggers YYFILL.
    st.yyinput[st.yylimit] = 0;

    // Main loop. The buffer contains incomplete data which appears packet by
    // packet. When the lexer needs more input it saves its internal state and
    // returns to the caller which should provide more input and resume lexing.
    var status = Status.ready;
    var send: usize = 0;
    while (true) {
        status = lex(&st);
        if (status == Status.end) {
            break;
        } else if (status == Status.waiting) {
            if (send < packets.len) {
                std.log.debug("sending packet {}", .{send});
                try fw.writeAll(packets[send]);
                send += 1;
            }
            status = fill(&st);
            std.log.debug("filled buffer [{s}], status {}", .{st.yyinput, status});
            if (status != Status.ready) {
                break;
            }
        } else if (status == Status.bad_packet) {
            break;
        }
    }

    // Check results.
    try std.testing.expectEqual(status, expect);

    // Cleanup: remove input file.
    fw.close();
    fr.close();
    try std.fs.cwd().deleteFile(fname);
}

test {
    try run(Status.end,
        &[_][]const u8{"ap", "plication/j", "son;", " charset=\"", "utf\\\"-8\"\r", "\n", ""});
}