File: tokenizer.rb

package info (click to toggle)
ruby-hocon 1.4.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 768 kB
  • sloc: ruby: 7,903; makefile: 4
file content (623 lines) | stat: -rw-r--r-- 17,774 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
# encoding: utf-8

require_relative '../../hocon/impl'
require_relative '../../hocon/impl/config_impl_util'
require_relative '../../hocon/impl/tokens'
require_relative '../../hocon/config_error'
require 'stringio'
require 'forwardable'

class Hocon::Impl::Tokenizer
  Tokens = Hocon::Impl::Tokens
  ConfigBugOrBrokenError = Hocon::ConfigError::ConfigBugOrBrokenError

  # Internal exception used to carry a "problem" token out of the
  # tokenizing code; the iterator catches it and turns it back into a
  # token in the output stream.
  class TokenizerProblemError < StandardError
    # The problem token describing what went wrong.
    attr_reader :problem

    def initialize(problem)
      @problem = problem
    end
  end

  # Render a single input character (or -1 for end-of-file) as a
  # human-readable description for error messages.
  #
  # @param codepoint [String, Integer] a one-character string, or -1 at EOF
  # @return [String] printable description of the character
  def self.as_string(codepoint)
    if codepoint == "\n"
      "newline"
    elsif codepoint == "\t"
      "tab"
    elsif codepoint == -1
      "end of file"
    elsif codepoint =~ /[[:cntrl:]]/
      # BUG FIX: the %x directive requires an Integer; the original
      # passed the one-character String itself, which raises TypeError
      # instead of producing the message.  Use the character's codepoint.
      "control character 0x%x" % codepoint.ord
    else
      "%c" % codepoint
    end
  end

  # Tokenizes a Reader. Does not close the reader; you have to arrange to do
  # that after you're done with the returned iterator.
  def self.tokenize(origin, input, syntax)
    # Comments are permitted in every syntax except strict JSON.
    comments_allowed = syntax != Hocon::ConfigSyntax::JSON
    TokenIterator.new(origin, input, comments_allowed)
  end

  # Concatenate the original source text of every token produced by the
  # iterator; consumes the iterator.
  def self.render(tokens)
    pieces = []
    while (token = tokens.next)
      pieces << token.token_text
    end
    pieces.join
  end

  class TokenIterator

    # Buffers whitespace seen between tokens and decides, once the next
    # token is known, whether that whitespace is significant (it joins
    # two simple values, so it becomes unquoted text) or ignorable.
    class WhitespaceSaver
      def initialize
        @whitespace = StringIO.new
        @last_token_was_simple_value = false
      end

      # Buffer one whitespace character.
      def add(c)
        @whitespace << c
      end

      # Flush the buffered whitespace given the token that follows it.
      # Returns a whitespace token, or nil if nothing was buffered.
      def check(t, base_origin, line_number)
        if TokenIterator.simple_value?(t)
          next_is_a_simple_value(base_origin, line_number)
        else
          next_is_not_a_simple_value(base_origin, line_number)
        end
      end

      private

      # The next token is not a simple value: clear the "previous token
      # was a simple value" flag FIRST so the flush below emits ignored
      # whitespace, then flush.
      def next_is_not_a_simple_value(base_origin, line_number)
        @last_token_was_simple_value = false
        create_whitespace_token_from_saver(base_origin, line_number)
      end

      # The next token IS a simple value: flush first (so the emitted
      # token kind reflects the PREVIOUS token), then remember that a
      # simple value was seen.
      def next_is_a_simple_value(base_origin, line_number)
        token = create_whitespace_token_from_saver(base_origin, line_number)
        @last_token_was_simple_value = true
        token
      end

      # Build the pending whitespace token and reset the buffer.
      # Whitespace sandwiched between two simple values is unquoted
      # text; anything else is ignored whitespace.
      def create_whitespace_token_from_saver(base_origin, line_number)
        return nil if @whitespace.length.zero?

        origin = Hocon::Impl::Tokenizer::TokenIterator.line_origin(base_origin, line_number)
        saved_text = String.new(@whitespace.string)
        token =
          if @last_token_was_simple_value
            Tokens.new_unquoted_text(origin, saved_text)
          else
            Tokens.new_ignored_whitespace(origin, saved_text)
          end
        @whitespace.string = ""
        token
      end
    end

    # origin: origin object for generated tokens; input: IO-like reader;
    # allow_comments: false for strict JSON, true for HOCON.
    def initialize(origin, input, allow_comments)
      @origin = origin
      @input = input
      @allow_comments = allow_comments
      @buffer = [] # pushback buffer holding look-ahead characters
      @line_number = 1
      @line_origin = @origin.with_line_number(@line_number)
      # the queue of tokens ready to hand out; always begins with START
      @tokens = [Tokens::START]
      @whitespace_saver = WhitespaceSaver.new
    end

    # this should ONLY be called from nextCharSkippingComments
    # or when inside a quoted string, or when parsing a sequence
    # like ${ or +=, everything else should use
    # nextCharSkippingComments().
    def next_char_raw
      # drain the pushback buffer before touching the input
      return @buffer.pop unless @buffer.empty?

      begin
        @input.readchar.chr
      rescue EOFError
        -1
      end
    end

    # Push a character back so next_char_raw returns it again.  The
    # tokenizer never needs more than a couple of characters of
    # look-ahead; deeper pushback indicates a bug.
    def put_back(c)
      if @buffer.length > 2
        raise ConfigBugOrBrokenError, "bug: putBack() three times, undesirable look-ahead"
      end
      @buffer.push(c)
    end

    # True if c is any HOCON whitespace character.
    def self.whitespace?(c)
      Hocon::Impl::ConfigImplUtil.whitespace?(c)
    end

    # True if c is whitespace other than a newline (newlines are
    # significant and become tokens of their own).
    def self.whitespace_not_newline?(c)
      return false if c == "\n"

      Hocon::Impl::ConfigImplUtil.whitespace?(c)
    end

    # True if c begins a comment ("#" or "//").  Peeks one character
    # ahead to distinguish "//" from a lone "/" and predictably does NOT
    # consume it.  Always false at EOF or when comments are disallowed
    # (strict JSON mode).
    def start_of_comment?(c)
      return false if c == -1 || !@allow_comments
      return true if c == '#'
      # BUG FIX: the original had no else branch here, so ordinary
      # characters returned nil rather than false when comments were
      # allowed; normalize the predicate to booleans.
      return false unless c == '/'

      maybe_second_slash = next_char_raw
      # we want to predictably NOT consume any chars
      put_back(maybe_second_slash)
      maybe_second_slash == '/'
    end

    # get next char, skipping non-newline whitespace
    def next_char_after_whitespace(saver)
      loop do
        c = next_char_raw
        return -1 if c == -1
        # newlines are NOT skipped; only intra-line whitespace is saved
        return c unless self.class.whitespace_not_newline?(c)

        saver.add(c)
      end
    end

    # Build a TokenizerProblemError wrapping a problem token.  Both
    # `what` (the offending text) and `message` are mandatory.
    def self.problem(origin, what, message, suggest_quotes, cause)
      if what.nil? || message.nil?
        raise ConfigBugOrBrokenError, "internal error, creating bad TokenizerProblemError"
      end

      problem_token = Tokens.new_problem(origin, what, message, suggest_quotes, cause)
      TokenizerProblemError.new(problem_token)
    end

    # Derive an origin that points at the given line number.
    def self.line_origin(base_origin, line_number)
      base_origin.with_line_number(line_number)
    end

    # ONE char has always been consumed, either the # or the first /, but not
    # both slashes
    def pull_comment(first_char)
      double_slash = (first_char == '/')
      if double_slash
        # the caller consumed only the first '/'; eat the second one
        second = next_char_raw
        unless second == '/'
          raise ConfigBugOrBrokenError, "called pullComment but // not seen"
        end
      end

      text = StringIO.new
      loop do
        c = next_char_raw
        if (c == -1) || (c == "\n")
          # leave the newline/EOF for the main loop to handle
          put_back(c)
          if double_slash
            return Tokens.new_comment_double_slash(@line_origin, text.string)
          else
            return Tokens.new_comment_hash(@line_origin, text.string)
          end
        end
        text << c
      end
    end

    # chars JSON allows a number to start with
    FIRST_NUMBER_CHARS = "0123456789-"
    # chars JSON allows to be part of a number
    NUMBER_CHARS = "0123456789eE+-."
    # chars that stop an unquoted string
    NOT_IN_UNQUOTED_TEXT = "$\"{}[]:=,+#`^?!@*&\\"


    # The rules here are intended to maximize convenience while
    # avoiding confusion with real valid JSON. Basically anything
    # that parses as JSON is treated the JSON way and otherwise
    # we assume it's a string and let the parser sort it out.
    # Pull an unquoted-text token starting at the current input
    # position.  Consumes characters until EOF, a reserved character,
    # whitespace, or the start of a comment; the terminating character
    # is pushed back for the caller.  Recognizes the literals "true",
    # "false" and "null" when they appear at the very start of the
    # token, returning value tokens for them instead.
    def pull_unquoted_text
      origin = @line_origin
      io = StringIO.new
      c = next_char_raw
      while true
        if (c == -1) or
            (NOT_IN_UNQUOTED_TEXT.index(c)) or
            (self.class.whitespace?(c)) or
            (start_of_comment?(c))
          break
        else
          io << c
        end

        # we parse true/false/null tokens as such no matter
        # what is after them, as long as they are at the
        # start of the unquoted token.
        if io.length == 4
          if io.string == "true"
            return Tokens.new_boolean(origin, true)
          elsif io.string == "null"
            return Tokens.new_null(origin)
          end
        elsif io.length  == 5
          if io.string == "false"
            return Tokens.new_boolean(origin, false)
          end
        end

        c = next_char_raw
      end

      # put back the char that ended the unquoted text
      put_back(c)

      Tokens.new_unquoted_text(origin, io.string)
    end

    # Pull a number token; first_char is the already-consumed first
    # character.  Scans greedily over JSON number characters, then tries
    # to parse; if the text is not actually a valid number (e.g.
    # "1.2.3") it falls back to an unquoted-text token, unless a
    # reserved character was swept up, which is an error.
    def pull_number(first_char)
      sb = StringIO.new
      sb << first_char
      contained_decimal_or_e = false
      c = next_char_raw
      while (c != -1) && NUMBER_CHARS.index(c)
        if (c == '.') || (c == 'e') || (c == 'E')
          contained_decimal_or_e = true
        end
        sb << c
        c = next_char_raw
      end
      # the last character we looked at wasn't part of the number, put it
      # back
      put_back(c)
      s = sb.string
      begin
        if contained_decimal_or_e
          # force floating point representation
          Tokens.new_double(@line_origin, Float(s), s)
        else
          Tokens.new_long(@line_origin, Integer(s), s)
        end
      rescue ArgumentError => e
        # only swallow the "not a number" failure; re-raise anything else
        raise e unless e.message =~ /^invalid value for (Float|Integer)\(\)/

        # not a number after all, see if it's an unquoted string.
        s.each_char do |u|
          if NOT_IN_UNQUOTED_TEXT.index(u)
            # BUG FIX: the original concatenated two fragments without a
            # space, yielding "...'x'is not allowed..."; now matches the
            # identical message used by pull_next_token.
            raise self.class.problem(@line_origin, u,
                                     "Reserved character '#{u}' is not allowed outside quotes",
                                     true, nil)
          end
        end
        # no evil chars so we just decide this was a string and
        # not a number.
        Tokens.new_unquoted_text(@line_origin, s)
      end
    end

    # Process one backslash escape inside a quoted string.  The decoded
    # character is appended to sb; the raw escape text (kept so the
    # token can be re-rendered exactly) is appended to sb_orig.  Raises
    # a TokenizerProblemError (via self.class.problem) on invalid
    # escapes.
    def pull_escape_sequence(sb, sb_orig)
      escaped = next_char_raw

      if escaped == -1
        error_msg = "End of input but backslash in string had nothing after it"
        raise self.class.problem(@line_origin, "", error_msg, false, nil)
      end

      # This is needed so we return the unescaped escape characters back out when rendering
      # the token
      sb_orig << "\\" << escaped

      case escaped
        when "\""
          sb << "\""
        when "\\"
          sb << "\\"
        when "/"
          sb << "/"
        when "b"
          sb << "\b"
        when "f"
          sb << "\f"
        when "n"
          sb << "\n"
        when "r"
          sb << "\r"
        when "t"
          sb << "\t"
        when "u"
          codepoint = ""

          # Grab the 4 hex chars for the unicode character
          4.times do
            c = next_char_raw

            if c == -1
              error_msg = "End of input but expecting 4 hex digits for \\uXXXX escape"
              raise self.class.problem(@line_origin, c, error_msg, false, nil)
            end

            codepoint << c
          end
          sb_orig << codepoint

          # BUG FIX: String#hex silently returns 0 for non-hex input, so
          # the old check `packed == "_"` could never fire and malformed
          # digits were silently turned into "\u0000".  Validate the
          # four characters explicitly before packing.
          unless codepoint =~ /\A\h{4}\z/
            raise self.class.problem(@line_origin, codepoint,
                                     "Malformed hex digits after \\u escape in string: '#{codepoint}'",
                                     false, nil)
          end
          # Convert codepoint to a unicode character
          sb << [codepoint.hex].pack("U")
        else
          error_msg = "backslash followed by '#{escaped}', this is not a valid escape sequence (quoted strings use JSON escaping, so use double-backslash \\ for literal backslash)"
          # BUG FIX: problem() takes (origin, what, message, ...); the
          # original passed the rendered character where the origin
          # belongs and "" as the offending text.
          raise self.class.problem(@line_origin, Hocon::Impl::Tokenizer.as_string(escaped), error_msg, false, nil)
      end
    end

    # Consume the body of a triple-quoted ("""...""") string through the
    # closing quotes.  Called after pull_quoted_string has seen the
    # opening triple quote.  Raw newlines are allowed (line tracking is
    # kept accurate), and any quotes beyond the closing three are kept
    # as content.  Appends value text to sb and original text to sb_orig.
    def append_triple_quoted_string(sb, sb_orig)
      # we are after the opening triple quote and need to consume the
      # close triple
      consecutive_quotes = 0

      while true
        c = next_char_raw

        if c == '"'
          consecutive_quotes += 1
        elsif consecutive_quotes >= 3
          # the last three quotes end the string and the other kept.
          # (the closing quotes were appended to sb as they were read,
          # so trim them off; the terminating char goes back for the
          # main loop)
          sb.string = sb.string[0...-3]
          put_back c
          break
        else
          consecutive_quotes = 0
          if c == -1
            error_msg = "End of input but triple-quoted string was still open"
            raise self.class.problem(@line_origin, c, error_msg, false, nil)
          elsif c == "\n"
            # keep the line number accurate
            @line_number += 1
            @line_origin = @origin.with_line_number(@line_number)
          end
        end

        sb << c
        sb_orig << c
      end
    end

    # Pull a quoted string token; the opening '"' has already been
    # consumed.  Handles JSON escape sequences, rejects unescaped
    # control characters, and hands off to append_triple_quoted_string
    # when the string turns out to be triple-quoted.  Returns a string
    # token carrying both the parsed value and the original text.
    def pull_quoted_string
      # the open quote has already been consumed
      sb = StringIO.new

      # We need a second StringIO to keep track of escape characters.
      # We want to return them exactly as they appeared in the original text,
      # which means we will need a new StringIO to escape escape characters
      # so we can also keep the actual value of the string. This is gross.
      sb_orig = StringIO.new
      sb_orig << '"'

      c = ""
      while c != '"'
        c = next_char_raw
        if c == -1
          raise self.class.problem(@line_origin, c, "End of input but string quote was still open", false, nil)
        end

        if c == "\\"
          pull_escape_sequence(sb, sb_orig)
        elsif c == '"'
          sb_orig << c
          # done!
        elsif c =~ /[[:cntrl:]]/
          raise self.class.problem(@line_origin, c, "JSON does not allow unescaped #{c}" +
                                                   " in quoted strings, use a backslash escape", false, nil)
        else
          sb << c
          sb_orig << c
        end
      end

      # maybe switch to triple-quoted string, sort of hacky...
      # (an empty value so far means we just saw ""; peek for a third
      # quote, otherwise push the peeked char back)
      if sb.length == 0
        third = next_char_raw
        if third == '"'
          sb_orig << third
          append_triple_quoted_string(sb, sb_orig)
        else
          put_back(third)
        end
      end

      Tokens.new_string(@line_origin, sb.string, sb_orig.string)
    end

    # Pull the "+=" token; the initial '+' has already been consumed.
    # Anything other than '=' after '+' is an error.
    def pull_plus_equals
      c = next_char_raw

      unless c == '='
        raise self.class.problem(@line_origin, c,
                                 "'+' not followed by =, '#{c}' not allowed after '+'",
                                 true, nil) # true = suggest quotes
      end

      Tokens::PLUS_EQUALS
    end

    # Pull a substitution token (${path} or ${?path}); the initial '$'
    # has already been consumed.  The expression inside the braces is
    # tokenized but not validated here; nested substitutions are allowed
    # and the parser sorts everything out.
    def pull_substitution
      # the initial '$' has already been consumed
      c = next_char_raw
      if c != '{'
        error_msg = "'$' not followed by {, '#{c}' not allowed after '$'"
        raise self.class.problem(@line_origin, c, error_msg, true, nil) # true = suggest quotes
      end

      # a '?' immediately after '${' marks an optional substitution
      optional = false
      c = next_char_raw

      if c == '?'
        optional = true
      else
        put_back(c)
      end

      # the expression between the braces gets its own whitespace saver
      saver = WhitespaceSaver.new
      expression = []

      while true
        t = pull_next_token(saver)
        # note that we avoid validating the allowed tokens inside
        # the substitution here; we even allow nested substitutions
        # in the tokenizer. The parser sorts it out.

        if t == Tokens::CLOSE_CURLY
          # end the loop, done!
          break
        elsif t == Tokens::EOF
          raise self.class.problem(@line_origin, t, "Substitution ${ was not closed with a }", false, nil)
        else
          whitespace = saver.check(t, @line_origin, @line_number)
          unless whitespace.nil?
            expression << whitespace
          end
          expression << t
        end
      end

      Tokens.new_substitution(@line_origin, optional, expression)
    end

    # Pull the next token from the input, stashing any intra-line
    # whitespace into saver.  Dispatches on the first significant
    # character: punctuation maps directly to singleton tokens, other
    # characters go to the specialized pull_* methods.  Newlines produce
    # line tokens and advance the line counter.
    def pull_next_token(saver)
      c = next_char_after_whitespace(saver)
      if c == -1
        Tokens::EOF
      elsif c == "\n"
        # newline tokens have the just-ended line number
        line = Tokens.new_line(@line_origin)
        @line_number += 1
        @line_origin = @origin.with_line_number(@line_number)
        line
      else
        t = nil
        if start_of_comment?(c)
          t = pull_comment(c)
        else
          t = case c
                when '"' then pull_quoted_string
                when '$' then pull_substitution
                when ':' then Tokens::COLON
                when ',' then Tokens::COMMA
                when '=' then Tokens::EQUALS
                when '{' then Tokens::OPEN_CURLY
                when '}' then Tokens::CLOSE_CURLY
                when '[' then Tokens::OPEN_SQUARE
                when ']' then Tokens::CLOSE_SQUARE
                when '+' then pull_plus_equals
                else nil
              end

          # not punctuation: a number, a reserved character (which is an
          # error), or unquoted text
          if t.nil?
            if FIRST_NUMBER_CHARS.index(c)
              t = pull_number(c)
            elsif NOT_IN_UNQUOTED_TEXT.index(c)
              raise self.class.problem(@line_origin, c, "Reserved character '#{c}' is not allowed outside quotes", true, nil)
            else
              put_back(c)
              t = pull_unquoted_text
            end
          end
        end

        if t.nil?
          raise ConfigBugOrBrokenError, "bug: failed to generate next token"
        end

        t
      end
    end

    # A "simple value" is any token that can participate in value
    # concatenation: a substitution, unquoted text, or a literal value.
    def self.simple_value?(t)
      Tokens.substitution?(t) || Tokens.unquoted_text?(t) || Tokens.value?(t)
    end

    # Pull one more token from the input, pushing any pending
    # whitespace token onto the queue ahead of it.
    def queue_next_token
      token = pull_next_token(@whitespace_saver)
      whitespace = @whitespace_saver.check(token, @origin, @line_number)
      @tokens.push(whitespace) if whitespace
      @tokens.push(token)
    end

    # True while there are tokens left to hand out (EOF is itself a
    # token, so this stays true until EOF has been returned).
    def has_next?
      !@tokens.empty?
    end

    # Return the next token, refilling the queue from the input unless
    # the token just handed out was EOF.  Tokenizer problems become
    # problem tokens in the stream rather than raised exceptions.
    def next
      token = @tokens.shift
      if @tokens.empty? && (token != Tokens::EOF)
        begin
          queue_next_token
        rescue TokenizerProblemError => e
          @tokens.push(e.problem)
        end
        if @tokens.empty?
          raise ConfigBugOrBrokenError, "bug: tokens queue should not be empty here"
        end
      end
      token
    end

    # Token streams are read-only; removing items is never meaningful.
    def remove
      raise ConfigBugOrBrokenError, "Does not make sense to remove items from token stream"
    end

    # Yield each remaining token in order, consuming the stream.
    def each
      while has_next?
        # Have to use self.next instead of next because next is a reserved word
        yield self.next
      end
    end

    # Consume the stream, collecting the block's result for each token.
    def map
      results = []
      each do |token|
        # yield token to the caller's block and keep what it returns
        results << yield(token)
      end
      results
    end

    # Drain the stream into an array of tokens.
    def to_list
      map { |token| token }
    end

  end
end