File: tokenizer.rb

package info (click to toggle)
ruby-hocon 1.4.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 768 kB
  • sloc: ruby: 7,903; makefile: 4
file content (623 lines) | stat: -rw-r--r-- 17,774 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
# encoding: utf-8

require_relative '../../hocon/impl'
require_relative '../../hocon/impl/config_impl_util'
require_relative '../../hocon/impl/tokens'
require_relative '../../hocon/config_error'
require 'stringio'
require 'forwardable'

class Hocon::Impl::Tokenizer
  Tokens = Hocon::Impl::Tokens
  ConfigBugOrBrokenError = Hocon::ConfigError::ConfigBugOrBrokenError

  # Internal exception used to carry a "problem" token out of the
  # tokenizing code; the iterator catches it and turns it back into a
  # token in the output stream.
  class TokenizerProblemError < StandardError
    # The problem token describing what went wrong.
    attr_reader :problem

    def initialize(problem)
      @problem = problem
    end
  end

  # Render a single input character (or -1 for end-of-file) as a
  # human-readable description for error messages.
  #
  # @param codepoint [String, Integer] a one-character string, or -1 at EOF
  # @return [String] printable description of the character
  def self.as_string(codepoint)
    if codepoint == "\n"
      "newline"
    elsif codepoint == "\t"
      "tab"
    elsif codepoint == -1
      "end of file"
    elsif codepoint =~ /[[:cntrl:]]/
      # BUG FIX: the %x directive requires an Integer; the original
      # passed the one-character String itself, which raises TypeError
      # instead of producing the message.  Use the character's codepoint.
      "control character 0x%x" % codepoint.ord
    else
      "%c" % codepoint
    end
  end

  # Tokenizes a Reader. Does not close the reader; you have to arrange to do
  # that after you're done with the returned iterator.
  def self.tokenize(origin, input, syntax)
    # Comments are permitted in every syntax except strict JSON.
    comments_allowed = syntax != Hocon::ConfigSyntax::JSON
    TokenIterator.new(origin, input, comments_allowed)
  end

  # Concatenate the original source text of every token produced by the
  # iterator; consumes the iterator.
  def self.render(tokens)
    pieces = []
    while (token = tokens.next)
      pieces << token.token_text
    end
    pieces.join
  end

  class TokenIterator

    # Buffers whitespace seen between tokens and decides, once the next
    # token is known, whether that whitespace is significant (it joins
    # two simple values, so it becomes unquoted text) or ignorable.
    class WhitespaceSaver
      def initialize
        @whitespace = StringIO.new
        @last_token_was_simple_value = false
      end

      # Buffer one whitespace character.
      def add(c)
        @whitespace << c
      end

      # Flush the buffered whitespace given the token that follows it.
      # Returns a whitespace token, or nil if nothing was buffered.
      def check(t, base_origin, line_number)
        if TokenIterator.simple_value?(t)
          next_is_a_simple_value(base_origin, line_number)
        else
          next_is_not_a_simple_value(base_origin, line_number)
        end
      end

      private

      # The next token is not a simple value: clear the "previous token
      # was a simple value" flag FIRST so the flush below emits ignored
      # whitespace, then flush.
      def next_is_not_a_simple_value(base_origin, line_number)
        @last_token_was_simple_value = false
        create_whitespace_token_from_saver(base_origin, line_number)
      end

      # The next token IS a simple value: flush first (so the emitted
      # token kind reflects the PREVIOUS token), then remember that a
      # simple value was seen.
      def next_is_a_simple_value(base_origin, line_number)
        token = create_whitespace_token_from_saver(base_origin, line_number)
        @last_token_was_simple_value = true
        token
      end

      # Build the pending whitespace token and reset the buffer.
      # Whitespace sandwiched between two simple values is unquoted
      # text; anything else is ignored whitespace.
      def create_whitespace_token_from_saver(base_origin, line_number)
        return nil if @whitespace.length.zero?

        origin = Hocon::Impl::Tokenizer::TokenIterator.line_origin(base_origin, line_number)
        saved_text = String.new(@whitespace.string)
        token =
          if @last_token_was_simple_value
            Tokens.new_unquoted_text(origin, saved_text)
          else
            Tokens.new_ignored_whitespace(origin, saved_text)
          end
        @whitespace.string = ""
        token
      end
    end

    # origin: origin object for generated tokens; input: IO-like reader;
    # allow_comments: false for strict JSON, true for HOCON.
    def initialize(origin, input, allow_comments)
      @origin = origin
      @input = input
      @allow_comments = allow_comments
      @buffer = [] # pushback buffer holding look-ahead characters
      @line_number = 1
      @line_origin = @origin.with_line_number(@line_number)
      # the queue of tokens ready to hand out; always begins with START
      @tokens = [Tokens::START]
      @whitespace_saver = WhitespaceSaver.new
    end

    # this should ONLY be called from nextCharSkippingComments
    # or when inside a quoted string, or when parsing a sequence
    # like ${ or +=, everything else should use
    # nextCharSkippingComments().
    def next_char_raw
      # drain the pushback buffer before touching the input
      return @buffer.pop unless @buffer.empty?

      begin
        @input.readchar.chr
      rescue EOFError
        -1
      end
    end

    # Push a character back so next_char_raw returns it again.  The
    # tokenizer never needs more than a couple of characters of
    # look-ahead; deeper pushback indicates a bug.
    def put_back(c)
      if @buffer.length > 2
        raise ConfigBugOrBrokenError, "bug: putBack() three times, undesirable look-ahead"
      end
      @buffer.push(c)
    end

    # True if c is any HOCON whitespace character.
    def self.whitespace?(c)
      Hocon::Impl::ConfigImplUtil.whitespace?(c)
    end

    # True if c is whitespace other than a newline (newlines are
    # significant and become tokens of their own).
    def self.whitespace_not_newline?(c)
      return false if c == "\n"

      Hocon::Impl::ConfigImplUtil.whitespace?(c)
    end

    # True if c begins a comment ("#" or "//").  Peeks one character
    # ahead to distinguish "//" from a lone "/" and predictably does NOT
    # consume it.  Always false at EOF or when comments are disallowed
    # (strict JSON mode).
    def start_of_comment?(c)
      return false if c == -1 || !@allow_comments
      return true if c == '#'
      # BUG FIX: the original had no else branch here, so ordinary
      # characters returned nil rather than false when comments were
      # allowed; normalize the predicate to booleans.
      return false unless c == '/'

      maybe_second_slash = next_char_raw
      # we want to predictably NOT consume any chars
      put_back(maybe_second_slash)
      maybe_second_slash == '/'
    end

    # get next char, skipping non-newline whitespace
    def next_char_after_whitespace(saver)
      loop do
        c = next_char_raw
        return -1 if c == -1
        # newlines are NOT skipped; only intra-line whitespace is saved
        return c unless self.class.whitespace_not_newline?(c)

        saver.add(c)
      end
    end

    # Build a TokenizerProblemError wrapping a problem token.  Both
    # `what` (the offending text) and `message` are mandatory.
    def self.problem(origin, what, message, suggest_quotes, cause)
      if what.nil? || message.nil?
        raise ConfigBugOrBrokenError, "internal error, creating bad TokenizerProblemError"
      end

      problem_token = Tokens.new_problem(origin, what, message, suggest_quotes, cause)
      TokenizerProblemError.new(problem_token)
    end

    # Derive an origin that points at the given line number.
    def self.line_origin(base_origin, line_number)
      base_origin.with_line_number(line_number)
    end

    # ONE char has always been consumed, either the # or the first /, but not
    # both slashes
    def pull_comment(first_char)
      double_slash = (first_char == '/')
      if double_slash
        # the caller consumed only the first '/'; eat the second one
        second = next_char_raw
        unless second == '/'
          raise ConfigBugOrBrokenError, "called pullComment but // not seen"
        end
      end

      text = StringIO.new
      loop do
        c = next_char_raw
        if (c == -1) || (c == "\n")
          # leave the newline/EOF for the main loop to handle
          put_back(c)
          if double_slash
            return Tokens.new_comment_double_slash(@line_origin, text.string)
          else
            return Tokens.new_comment_hash(@line_origin, text.string)
          end
        end
        text << c
      end
    end

    # chars JSON allows a number to start with
    FIRST_NUMBER_CHARS = "0123456789-"
    # chars JSON allows to be part of a number
    NUMBER_CHARS = "0123456789eE+-."
    # chars that stop an unquoted string
    NOT_IN_UNQUOTED_TEXT = "$\"{}[]:=,+#`^?!@*&\\"


    # The rules here are intended to maximize convenience while
    # avoiding confusion with real valid JSON. Basically anything
    # that parses as JSON is treated the JSON way and otherwise
    # we assume it's a string and let the parser sort it out.
    # Pull an unquoted-text token starting at the current input
    # position.  Consumes characters until EOF, a reserved character,
    # whitespace, or the start of a comment; the terminating character
    # is pushed back for the caller.  Recognizes the literals "true",
    # "false" and "null" when they appear at the very start of the
    # token, returning value tokens for them instead.
    def pull_unquoted_text
      origin = @line_origin
      io = StringIO.new
      c = next_char_raw
      while true
        if (c == -1) or
            (NOT_IN_UNQUOTED_TEXT.index(c)) or
            (self.class.whitespace?(c)) or
            (start_of_comment?(c))
          break
        else
          io << c
        end

        # we parse true/false/null tokens as such no matter
        # what is after them, as long as they are at the
        # start of the unquoted token.
        if io.length == 4
          if io.string == "true"
            return Tokens.new_boolean(origin, true)
          elsif io.string == "null"
            return Tokens.new_null(origin)
          end
        elsif io.length  == 5
          if io.string == "false"
            return Tokens.new_boolean(origin, false)
          end
        end

        c = next_char_raw
      end

      # put back the char that ended the unquoted text
      put_back(c)

      Tokens.new_unquoted_text(origin, io.string)
    end

    # Pull a number token; first_char is the already-consumed first
    # character.  Scans greedily over JSON number characters, then tries
    # to parse; if the text is not actually a valid number (e.g.
    # "1.2.3") it falls back to an unquoted-text token, unless a
    # reserved character was swept up, which is an error.
    def pull_number(first_char)
      sb = StringIO.new
      sb << first_char
      contained_decimal_or_e = false
      c = next_char_raw
      while (c != -1) && NUMBER_CHARS.index(c)
        if (c == '.') || (c == 'e') || (c == 'E')
          contained_decimal_or_e = true
        end
        sb << c
        c = next_char_raw
      end
      # the last character we looked at wasn't part of the number, put it
      # back
      put_back(c)
      s = sb.string
      begin
        if contained_decimal_or_e
          # force floating point representation
          Tokens.new_double(@line_origin, Float(s), s)
        else
          Tokens.new_long(@line_origin, Integer(s), s)
        end
      rescue ArgumentError => e
        # only swallow the "not a number" failure; re-raise anything else
        raise e unless e.message =~ /^invalid value for (Float|Integer)\(\)/

        # not a number after all, see if it's an unquoted string.
        s.each_char do |u|
          if NOT_IN_UNQUOTED_TEXT.index(u)
            # BUG FIX: the original concatenated two fragments without a
            # space, yielding "...'x'is not allowed..."; now matches the
            # identical message used by pull_next_token.
            raise self.class.problem(@line_origin, u,
                                     "Reserved character '#{u}' is not allowed outside quotes",
                                     true, nil)
          end
        end
        # no evil chars so we just decide this was a string and
        # not a number.
        Tokens.new_unquoted_text(@line_origin, s)
      end
    end

    # Process one backslash escape inside a quoted string.  The decoded
    # character is appended to sb; the raw escape text (kept so the
    # token can be re-rendered exactly) is appended to sb_orig.  Raises
    # a TokenizerProblemError (via self.class.problem) on invalid
    # escapes.
    def pull_escape_sequence(sb, sb_orig)
      escaped = next_char_raw

      if escaped == -1
        error_msg = "End of input but backslash in string had nothing after it"
        raise self.class.problem(@line_origin, "", error_msg, false, nil)
      end

      # This is needed so we return the unescaped escape characters back out when rendering
      # the token
      sb_orig << "\\" << escaped

      case escaped
        when "\""
          sb << "\""
        when "\\"
          sb << "\\"
        when "/"
          sb << "/"
        when "b"
          sb << "\b"
        when "f"
          sb << "\f"
        when "n"
          sb << "\n"
        when "r"
          sb << "\r"
        when "t"
          sb << "\t"
        when "u"
          codepoint = ""

          # Grab the 4 hex chars for the unicode character
          4.times do
            c = next_char_raw

            if c == -1
              error_msg = "End of input but expecting 4 hex digits for \\uXXXX escape"
              raise self.class.problem(@line_origin, c, error_msg, false, nil)
            end

            codepoint << c
          end
          sb_orig << codepoint

          # BUG FIX: String#hex silently returns 0 for non-hex input, so
          # the old check `packed == "_"` could never fire and malformed
          # digits were silently turned into "\u0000".  Validate the
          # four characters explicitly before packing.
          unless codepoint =~ /\A\h{4}\z/
            raise self.class.problem(@line_origin, codepoint,
                                     "Malformed hex digits after \\u escape in string: '#{codepoint}'",
                                     false, nil)
          end
          # Convert codepoint to a unicode character
          sb << [codepoint.hex].pack("U")
        else
          error_msg = "backslash followed by '#{escaped}', this is not a valid escape sequence (quoted strings use JSON escaping, so use double-backslash \\ for literal backslash)"
          # BUG FIX: problem() takes (origin, what, message, ...); the
          # original passed the rendered character where the origin
          # belongs and "" as the offending text.
          raise self.class.problem(@line_origin, Hocon::Impl::Tokenizer.as_string(escaped), error_msg, false, nil)
      end
    end

    # Consume the body of a triple-quoted ("""...""") string through the
    # closing quotes.  Called after pull_quoted_string has seen the
    # opening triple quote.  Raw newlines are allowed (line tracking is
    # kept accurate), and any quotes beyond the closing three are kept
    # as content.  Appends value text to sb and original text to sb_orig.
    def append_triple_quoted_string(sb, sb_orig)
      # we are after the opening triple quote and need to consume the
      # close triple
      consecutive_quotes = 0

      while true
        c = next_char_raw

        if c == '"'
          consecutive_quotes += 1
        elsif consecutive_quotes >= 3
          # the last three quotes end the string and the other kept.
          # (the closing quotes were appended to sb as they were read,
          # so trim them off; the terminating char goes back for the
          # main loop)
          sb.string = sb.string[0...-3]
          put_back c
          break
        else
          consecutive_quotes = 0
          if c == -1
            error_msg = "End of input but triple-quoted string was still open"
            raise self.class.problem(@line_origin, c, error_msg, false, nil)
          elsif c == "\n"
            # keep the line number accurate
            @line_number += 1
            @line_origin = @origin.with_line_number(@line_number)
          end
        end

        sb << c
        sb_orig << c
      end
    end

    # Pull a quoted string token; the opening '"' has already been
    # consumed.  Handles JSON escape sequences, rejects unescaped
    # control characters, and hands off to append_triple_quoted_string
    # when the string turns out to be triple-quoted.  Returns a string
    # token carrying both the parsed value and the original text.
    def pull_quoted_string
      # the open quote has already been consumed
      sb = StringIO.new

      # We need a second StringIO to keep track of escape characters.
      # We want to return them exactly as they appeared in the original text,
      # which means we will need a new StringIO to escape escape characters
      # so we can also keep the actual value of the string. This is gross.
      sb_orig = StringIO.new
      sb_orig << '"'

      c = ""
      while c != '"'
        c = next_char_raw
        if c == -1
          raise self.class.problem(@line_origin, c, "End of input but string quote was still open", false, nil)
        end

        if c == "\\"
          pull_escape_sequence(sb, sb_orig)
        elsif c == '"'
          sb_orig << c
          # done!
        elsif c =~ /[[:cntrl:]]/
          raise self.class.problem(@line_origin, c, "JSON does not allow unescaped #{c}" +
                                                   " in quoted strings, use a backslash escape", false, nil)
        else
          sb << c
          sb_orig << c
        end
      end

      # maybe switch to triple-quoted string, sort of hacky...
      # (an empty value so far means we just saw ""; peek for a third
      # quote, otherwise push the peeked char back)
      if sb.length == 0
        third = next_char_raw
        if third == '"'
          sb_orig << third
          append_triple_quoted_string(sb, sb_orig)
        else
          put_back(third)
        end
      end

      Tokens.new_string(@line_origin, sb.string, sb_orig.string)
    end

    # Pull the "+=" token; the initial '+' has already been consumed.
    # Anything other than '=' after '+' is an error.
    def pull_plus_equals
      c = next_char_raw

      unless c == '='
        raise self.class.problem(@line_origin, c,
                                 "'+' not followed by =, '#{c}' not allowed after '+'",
                                 true, nil) # true = suggest quotes
      end

      Tokens::PLUS_EQUALS
    end

    # Pull a substitution token (${path} or ${?path}); the initial '$'
    # has already been consumed.  The expression inside the braces is
    # tokenized but not validated here; nested substitutions are allowed
    # and the parser sorts everything out.
    def pull_substitution
      # the initial '$' has already been consumed
      c = next_char_raw
      if c != '{'
        error_msg = "'$' not followed by {, '#{c}' not allowed after '$'"
        raise self.class.problem(@line_origin, c, error_msg, true, nil) # true = suggest quotes
      end

      # a '?' immediately after '${' marks an optional substitution
      optional = false
      c = next_char_raw

      if c == '?'
        optional = true
      else
        put_back(c)
      end

      # the expression between the braces gets its own whitespace saver
      saver = WhitespaceSaver.new
      expression = []

      while true
        t = pull_next_token(saver)
        # note that we avoid validating the allowed tokens inside
        # the substitution here; we even allow nested substitutions
        # in the tokenizer. The parser sorts it out.

        if t == Tokens::CLOSE_CURLY
          # end the loop, done!
          break
        elsif t == Tokens::EOF
          raise self.class.problem(@line_origin, t, "Substitution ${ was not closed with a }", false, nil)
        else
          whitespace = saver.check(t, @line_origin, @line_number)
          unless whitespace.nil?
            expression << whitespace
          end
          expression << t
        end
      end

      Tokens.new_substitution(@line_origin, optional, expression)
    end

    # Pull the next token from the input, stashing any intra-line
    # whitespace into saver.  Dispatches on the first significant
    # character: punctuation maps directly to singleton tokens, other
    # characters go to the specialized pull_* methods.  Newlines produce
    # line tokens and advance the line counter.
    def pull_next_token(saver)
      c = next_char_after_whitespace(saver)
      if c == -1
        Tokens::EOF
      elsif c == "\n"
        # newline tokens have the just-ended line number
        line = Tokens.new_line(@line_origin)
        @line_number += 1
        @line_origin = @origin.with_line_number(@line_number)
        line
      else
        t = nil
        if start_of_comment?(c)
          t = pull_comment(c)
        else
          t = case c
                when '"' then pull_quoted_string
                when '$' then pull_substitution
                when ':' then Tokens::COLON
                when ',' then Tokens::COMMA
                when '=' then Tokens::EQUALS
                when '{' then Tokens::OPEN_CURLY
                when '}' then Tokens::CLOSE_CURLY
                when '[' then Tokens::OPEN_SQUARE
                when ']' then Tokens::CLOSE_SQUARE
                when '+' then pull_plus_equals
                else nil
              end

          # not punctuation: a number, a reserved character (which is an
          # error), or unquoted text
          if t.nil?
            if FIRST_NUMBER_CHARS.index(c)
              t = pull_number(c)
            elsif NOT_IN_UNQUOTED_TEXT.index(c)
              raise self.class.problem(@line_origin, c, "Reserved character '#{c}' is not allowed outside quotes", true, nil)
            else
              put_back(c)
              t = pull_unquoted_text
            end
          end
        end

        if t.nil?
          raise ConfigBugOrBrokenError, "bug: failed to generate next token"
        end

        t
      end
    end

    # A "simple value" is any token that can participate in value
    # concatenation: a substitution, unquoted text, or a literal value.
    def self.simple_value?(t)
      Tokens.substitution?(t) || Tokens.unquoted_text?(t) || Tokens.value?(t)
    end

    # Pull one more token from the input, pushing any pending
    # whitespace token onto the queue ahead of it.
    def queue_next_token
      token = pull_next_token(@whitespace_saver)
      whitespace = @whitespace_saver.check(token, @origin, @line_number)
      @tokens.push(whitespace) if whitespace
      @tokens.push(token)
    end

    # True while there are tokens left to hand out (EOF is itself a
    # token, so this stays true until EOF has been returned).
    def has_next?
      !@tokens.empty?
    end

    # Return the next token, refilling the queue from the input unless
    # the token just handed out was EOF.  Tokenizer problems become
    # problem tokens in the stream rather than raised exceptions.
    def next
      token = @tokens.shift
      if @tokens.empty? && (token != Tokens::EOF)
        begin
          queue_next_token
        rescue TokenizerProblemError => e
          @tokens.push(e.problem)
        end
        if @tokens.empty?
          raise ConfigBugOrBrokenError, "bug: tokens queue should not be empty here"
        end
      end
      token
    end

    # Token streams are read-only; removing items is never meaningful.
    def remove
      raise ConfigBugOrBrokenError, "Does not make sense to remove items from token stream"
    end

    # Yield each remaining token in order, consuming the stream.
    def each
      while has_next?
        # Have to use self.next instead of next because next is a reserved word
        yield self.next
      end
    end

    # Consume the stream, collecting the block's result for each token.
    def map
      results = []
      each do |token|
        # yield token to the caller's block and keep what it returns
        results << yield(token)
      end
      results
    end

    # Drain the stream into an array of tokens.
    def to_list
      map { |token| token }
    end

  end
end