File: basic_io.hh

package info (click to toggle)
monotone 0.48-3
  • links: PTS
  • area: main
  • in suites: squeeze
  • size: 20,096 kB
  • ctags: 8,077
  • sloc: cpp: 81,000; sh: 6,402; perl: 1,241; lisp: 1,045; makefile: 655; python: 566; sql: 112; ansic: 52
file content (353 lines) | stat: -rw-r--r-- 9,912 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
// Copyright (C) 2004 Graydon Hoare <graydon@pobox.com>
//               2008 Stephen Leake <stephen_leake@stephe-leake.org>
//
// This program is made available under the GNU GPL version 2.0 or
// greater. See the accompanying file COPYING for details.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE.

#ifndef __BASIC_IO_HH__
#define __BASIC_IO_HH__

#include "vector.hh"
#include <map>

#include "paths.hh"
#include "sanity.hh"
#include "vocab.hh"
#include "numeric_vocab.hh"
#include "char_classifiers.hh"

// This file provides the parsing and printing primitives used by the
// higher-level parser and printer routines for the datatypes cset,
// roster/marking_map, and revision.

namespace basic_io
{

  // Keyword symbols, each constructed from the string of the same
  // spelling.  The enclosing namespace is unnamed and this is a header,
  // so every translation unit that includes it gets its own copy of
  // these constants.
  namespace
    {
      namespace syms
        {
          // general format symbol
          symbol const format_version("format_version");

          // common symbols
          symbol const dir("dir");
          symbol const file("file");
          symbol const content("content");
          symbol const attr("attr");

          symbol const content_mark("content_mark");
        }
    }

  // Kinds of token the tokenizer can report.  TOK_NONE is returned when
  // no token could be read.
  enum token_type
    {
      TOK_SYMBOL = 0,
      TOK_STRING = 1,
      TOK_HEX    = 2,
      TOK_NONE   = 3
    };

  // Read cursor over a string held by reference.  It tracks the current
  // line and column, remembers the last character consumed (c) and keeps
  // a one-character lookahead, which is EOF once the end is reached.
  struct
  input_source : public origin_aware
  {
    size_t line;
    size_t col;
    std::string const & in;               // text being read; not owned
    std::string::const_iterator curr;     // next character to consume
    std::string name;
    int lookahead;                        // set by peek()
    char c;                               // last character consumed by advance()

    input_source(std::string const & in, std::string const & nm)
      : line(1),
        col(1),
        in(in),
        curr(in.begin()),
        name(nm),
        lookahead(0),
        c('\0')
    {}

    input_source(std::string const & in, std::string const & nm, origin::type w)
      : origin_aware(w),
        line(1),
        col(1),
        in(in),
        curr(in.begin()),
        name(nm),
        lookahead(0),
        c('\0')
    {}

    // Refresh lookahead from the current position without consuming
    // anything.
    inline void peek()
    {
      if (LIKELY(curr != in.end()))
        {
          // '\xff' has to stay distinguishable from EOF, so the
          // character is widened to an unsigned value (255u).
          lookahead = widen<unsigned int,char>(*curr);
          return;
        }
      lookahead = EOF;
    }

    // Consume one character (when any is left), update the line/column
    // bookkeeping, then refresh the lookahead.
    inline void advance()
    {
      if (LIKELY(curr != in.end()))
        {
          c = *curr;
          ++curr;
          if (c == '\n')
            {
              // a newline starts the next line at column 1
              ++line;
              col = 1;
            }
          else
            ++col;
        }
      peek();
    }

    void err(std::string const & s);
  };

  // Splits the text of an input_source into tokens: symbols, hex strings
  // written as [...], and double-quoted strings.
  //
  // begin/end delimit the text of the token currently being scanned
  // inside the underlying string; mark() starts a new range at the
  // cursor, advance() consumes one character and extends the range, and
  // store() copies the range out.  Calling in.advance() directly moves
  // the cursor without extending the range.
  struct
  tokenizer
  {
    input_source & in;
    std::string::const_iterator begin;
    std::string::const_iterator end;

    tokenizer(input_source & i) : in(i), begin(in.curr), end(in.curr)
    {}

    // Start an empty token range at the current input position.
    inline void mark()
    {
      begin = in.curr;
      end = begin;
    }

    // Consume one input character and extend the token range over it.
    inline void advance()
    {
      in.advance();
      end = in.curr;
    }

    // Copy the marked range [begin, end) into val.
    inline void store(std::string & val)
    {
      val.assign(begin, end);
    }

    // Read the next token, skipping leading whitespace.
    //
    // Returns TOK_SYMBOL, TOK_HEX or TOK_STRING with the token text in
    // val (without the surrounding brackets or quotes, and with string
    // escapes resolved).  Returns TOK_NONE at end of input, or when the
    // next non-space character does not start any token; in both cases
    // val is left untouched and the offending character is not consumed.
    //
    // Malformed input is reported through in.err(), which is defined
    // elsewhere.  NOTE(review): the loops below rely on in.err() not
    // returning normally (presumably it throws) -- confirm.
    inline token_type get_token(std::string & val)
    {
      in.peek();

      // Skip whitespace; stop at the first non-space character or at EOF.
      while (true)
        {
          if (UNLIKELY(in.lookahead == EOF))
            return TOK_NONE;
          if (!is_space(in.lookahead))
            break;
          in.advance();
        }

      if (is_alpha(in.lookahead))
        {
          // Symbol: an alphabetic character followed by any number of
          // alphanumerics or underscores.
          mark();
          while (is_alnum(in.lookahead) || in.lookahead == '_')
            advance();
          store(val);
          return basic_io::TOK_SYMBOL;
        }
      else if (in.lookahead == '[')
        {
          // Hex string: skip the '[' before marking so that it is not
          // part of the stored value.
          in.advance();
          mark();
          while (static_cast<char>(in.lookahead) != ']')
            {
              if (UNLIKELY(in.lookahead == EOF))
                in.err("input stream ended in hex string");
              if (UNLIKELY(!is_xdigit(in.lookahead)))
                in.err("non-hex character in hex string");
              advance();
            }

          store(val);

          // The loop above only exits once the lookahead is ']', so this
          // check cannot fire as written; it is kept as a safeguard.
          if (UNLIKELY(static_cast<char>(in.lookahead) != ']'))
            in.err("hex string did not end with ']'");
          // Consume the closing ']'.
          in.advance();

          return basic_io::TOK_HEX;
        }
      else if (in.lookahead == '"')
        {
          // Quoted string: skip the opening quote, then scan with
          // mark/store until an escape or the closing quote is seen.
          in.advance();
          mark();
          while (static_cast<char>(in.lookahead) != '"')
            {
              if (UNLIKELY(in.lookahead == EOF))
                in.err("input stream ended in string");
              if (UNLIKELY(static_cast<char>(in.lookahead) == '\\'))
                {
                  // Possible escape: we understand escaped quotes and
                  // escaped backslashes. Nothing else. If we happen to
                  // hit an escape, we stop doing the mark/store thing
                  // and switch to copying and appending per-character
                  // until the end of the token.

                  // So first, store what we have *before* the escape.
                  store(val);

                  // Then skip over the escape backslash.
                  in.advance();

                  // Make sure it's an escape we recognize.
                  if (UNLIKELY(!(static_cast<char>(in.lookahead) == '"'
                                 ||
                                 static_cast<char>(in.lookahead) == '\\')))
                    in.err("unrecognized character escape");

                  // Add the escaped character onto the accumulating token.
                  in.advance();
                  val += in.c;

                  // Now enter special slow loop for remainder.
                  while (static_cast<char>(in.lookahead) != '"')
                    {
                      if (UNLIKELY(in.lookahead == EOF))
                        in.err("input stream ended in string");
                      if (UNLIKELY(static_cast<char>(in.lookahead) == '\\'))
                        {
                          // Skip over any further escape marker.
                          in.advance();
                          if (UNLIKELY
                              (!(static_cast<char>(in.lookahead) == '"'
                                 ||
                                 static_cast<char>(in.lookahead) == '\\')))
                            in.err("unrecognized character escape");
                        }
                      // Consume one character (escaped or plain) and
                      // append it; in.c is the character just consumed.
                      in.advance();
                      val += in.c;
                    }
                  // When slow loop completes, return early.
                  if (static_cast<char>(in.lookahead) != '"')
                    in.err("string did not end with '\"'");
                  // Consume the closing quote.
                  in.advance();

                  return basic_io::TOK_STRING;
                }
              advance();
            }

          // No escape was seen: the marked range is the whole string body.
          store(val);

          if (UNLIKELY(static_cast<char>(in.lookahead) != '"'))
            in.err("string did not end with '\"'");
          // Consume the closing quote.
          in.advance();

          return basic_io::TOK_STRING;
        }
      else
        // Not the start of any known token; nothing is consumed.
        return basic_io::TOK_NONE;
    }
   void err(std::string const & s);
  };

  std::string escape(std::string const & s);

  // A stanza: an indent value plus a list of (symbol, string) entries.
  // The constructor and all push_* members are only declared here; their
  // definitions live outside this header.
  // NOTE(review): the per-member notes below are inferred from the
  // declarations alone -- confirm against the implementation.
  struct
  stanza
  {
    stanza();
    // presumably the width used to align keys when the stanza is printed
    size_t indent;
    // (symbol, string) pairs; presumably key and already-formatted value
    std::vector<std::pair<symbol, std::string> > entries;
    // entry consisting of a key only
    void push_symbol(symbol const & k);
    // key plus an id value, given hex-encoded (v) or in binary form
    void push_hex_pair(symbol const & k, hexenc<id> const & v);
    void push_binary_pair(symbol const & k, id const & v);
    // key, a name string n, and a binary id value
    void push_binary_triple(symbol const & k, std::string const & n,
                         id const & v);
    // key plus a string (or symbol) value
    void push_str_pair(symbol const & k, std::string const & v);
    void push_str_pair(symbol const & k, symbol const & v);
    // key plus two string values
    void push_str_triple(symbol const & k, std::string const & n,
                         std::string const & v);
    // key plus a file path value
    void push_file_pair(symbol const & k, file_path const & v);
    // one key (or two symbols k1, k2) followed by several string values
    void push_str_multi(symbol const & k,
                        std::vector<std::string> const & v);
    void push_str_multi(symbol const & k1,
                        symbol const & k2,
                        std::vector<std::string> const & v);
  };


  // Note: printer uses a static buffer; thus only one buffer
  // may be referenced (globally). An invariant will be triggered
  // if more than one basic_io::printer is instantiated.
  // Prints stanzas.  All state is static, i.e. shared by every printer
  // object; the constructor, destructor and print_stanza are defined
  // outside this header.
  struct
  printer
  {
    // output buffer shared by all printer objects
    static std::string buf;
    // NOTE(review): presumably the number of live printer objects, used
    // to enforce the one-instance invariant -- confirm in the
    // implementation
    static int count;
    printer();
    ~printer();
    void print_stanza(stanza const & st);
  };

  // Token-level parser front end.  It holds the current token (text in
  // `token`, kind in `ttype`) read from a tokenizer and offers helpers
  // that check the kind of the current token, optionally copy its text
  // out, and then move on to the next token.
  //
  // err() and tt2str() are defined outside this header; a mismatch is
  // reported through err().
  struct
  parser
  {
    tokenizer & tok;

    // Binds to the tokenizer and reads the first token right away.
    parser(tokenizer & t) : tok(t)
    {
      token.reserve(128);
      advance();
    }

    std::string token;
    token_type ttype;

    void err(std::string const & s);
    std::string tt2str(token_type tt);

    // Replace the current token with the next one from the tokenizer.
    inline void advance()
    {
      ttype = tok.get_token(token);
    }

    // Describes the current token for error messages: its kind, followed
    // by " with value <text>" when the token text is not empty.  Shared
    // by eat() and esym() so both report mismatches the same way.
    inline std::string describe_current()
    {
      std::string description = tt2str(ttype);
      if (!token.empty())
        description += std::string(" with value ") + token;
      return description;
    }

    // Require the current token to be of kind `want`, then advance.
    inline void eat(token_type want)
    {
      if (ttype != want)
        err("wanted "
            + tt2str(want)
            + ", got "
            + describe_current());
      advance();
    }

    // Require a token of the given kind and skip it.
    inline void str() { eat(basic_io::TOK_STRING); }
    inline void sym() { eat(basic_io::TOK_SYMBOL); }
    inline void hex() { eat(basic_io::TOK_HEX); }

    // As above, but also copy the token text into v.  The copy has to
    // happen first because advancing overwrites `token`; as a
    // consequence v is assigned even when the kind check then fails.
    inline void str(std::string & v) { v = token; str(); }
    inline void sym(std::string & v) { v = token; sym(); }
    inline void hex(std::string & v) { v = token; hex(); }

    // True when the current token is a symbol (optionally: the symbol
    // `val`).  Nothing is consumed.
    inline bool symp() { return ttype == basic_io::TOK_SYMBOL; }
    inline bool symp(symbol const & val)
    {
      return ttype == basic_io::TOK_SYMBOL && token == val();
    }

    // Require the current token to be exactly the symbol `val`, then
    // advance.
    inline void esym(symbol const & val)
    {
      if (!symp(val))
        err("wanted symbol '"
            + val()
            + "', got "
            + describe_current());
      advance();
    }
  };

}

#endif // __BASIC_IO_HH__

// Local Variables:
// mode: C++
// fill-column: 76
// c-file-style: "gnu"
// indent-tabs-mode: nil
// End:
// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s: