File: lexer.rb

package info (click to toggle)
ruby-regexp-parser 2.11.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,092 kB
  • sloc: ruby: 6,891; makefile: 6; sh: 3
file content (171 lines) | stat: -rw-r--r-- 5,238 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# frozen_string_literal: true

# A very thin wrapper around the scanner that breaks quantified literal runs,
# collects emitted tokens into an array, calculates their nesting depth,
# normalizes tokens for the parser, and checks if they are implemented by the
# given syntax flavor.
class Regexp::Lexer

  # Token names that open a new nesting level (groups, assertions, sets,
  # conditionals); see #descend.
  OPENING_TOKENS = %i[
    capture passive lookahead nlookahead lookbehind nlookbehind
    atomic options options_switch named absence open
  ].freeze

  # Token names that close a nesting level; see #ascend.
  CLOSING_TOKENS = %i[close].freeze

  # Conditional tokens that are merged with the preceding token into a
  # single :condition token; see #merge_condition.
  CONDITION_TOKENS = %i[condition condition_close].freeze

  # Class-level convenience entry point; builds a fresh Lexer and delegates
  # to the instance-level #lex.
  def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
    new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
  end

  # Lexes +input+ by driving Regexp::Scanner and post-processing its output:
  # normalizes and validates each (type, token) pair against the chosen
  # syntax, tracks group/set/conditional nesting depth, splits quantified
  # literal runs and codepoint lists, and links tokens via previous/next.
  #
  # input          - whatever Regexp::Scanner.scan accepts
  # syntax         - optional syntax flavor name; defaults to Syntax::CURRENT
  # options:       - passed through to the scanner
  # collect_tokens: - when true, accumulate tokens and return them as an Array
  # block          - optional; called with each token as it is emitted
  #
  # Returns the Array of Regexp::Token when collect_tokens is true, else nil.
  def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
    syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT

    # Reset all per-run state so the same Lexer instance can be reused.
    self.block = block
    self.collect_tokens = collect_tokens
    self.tokens = []
    self.prev_token = nil
    self.preprev_token = nil
    self.nesting = 0
    self.set_nesting = 0
    self.conditional_nesting = 0
    self.shift = 0

    Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
      type, token = *syntax.normalize(type, token)
      syntax.check! type, token

      # Closing tokens reduce depth BEFORE the current token is built, so
      # the closer itself is recorded at the outer (shallower) depth.
      ascend(type, token)

      # A quantifier applies only to the last character of a literal run
      # (or the last codepoint of a \u{...} list), so split the previous
      # token in two and emit the unquantified head immediately.
      if (last = prev_token) &&
         type == :quantifier &&
         (
           (last.type == :literal         && (parts = break_literal(last))) ||
           (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
         )
        emit(parts[0])
        last = parts[1]
      end

      # `shift` compensates positions for text synthesized by
      # #break_codepoint_list earlier in this run.
      current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                  nesting, set_nesting, conditional_nesting)

      if type == :conditional && CONDITION_TOKENS.include?(token)
        current = merge_condition(current, last)
      elsif last
        # Emission lags one token behind scanning so that the token BEFORE
        # a quantifier can still be split/modified above.
        last.next = current
        current.previous = last
        emit(last)
      end

      self.preprev_token = last
      self.prev_token = current

      # Opening tokens increase depth AFTER the current token is built, so
      # the opener itself is recorded at the outer depth.
      descend(type, token)
    end

    # Flush the final, still-buffered token.
    emit(prev_token) if prev_token

    collect_tokens ? tokens : nil
  end

  # Hands a finished token to the caller's block and/or the collected array.
  def emit(token)
    if block
      # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
      res = block.call(token)
      tokens << res if collect_tokens
    else
      tokens << token
    end
  end

  class << self
    # Backward-compatible alias for the class-level entry point.
    alias :scan :lex
  end

  private

  attr_accessor :block,
                :collect_tokens, :tokens, :prev_token, :preprev_token,
                :nesting, :set_nesting, :conditional_nesting, :shift

  # Decrements the nesting counter matching the closing token's type.
  def ascend(type, token)
    return unless CLOSING_TOKENS.include?(token)

    case type
    when :group, :assertion
      self.nesting = nesting - 1
    when :set
      self.set_nesting = set_nesting - 1
    when :conditional
      self.conditional_nesting = conditional_nesting - 1
    else
      raise "unhandled nesting type #{type}"
    end
  end

  # Increments the nesting counter matching the opening token's type.
  def descend(type, token)
    return unless OPENING_TOKENS.include?(token)

    case type
    when :group, :assertion
      self.nesting = nesting + 1
    when :set
      self.set_nesting = set_nesting + 1
    when :conditional
      self.conditional_nesting = conditional_nesting + 1
    else
      raise "unhandled nesting type #{type}"
    end
  end

  # called by scan to break a literal run that is longer than one character
  # into two separate tokens when it is followed by a quantifier
  #
  # Returns [head_token, last_char_token], or nil when the literal is a
  # single character (lead is empty) and needs no splitting.
  def break_literal(token)
    # /.\z/mu matches the final character (multibyte-safe), so `lead` is
    # everything before it and `last` is that one character.
    lead, last, _ = token.text.partition(/.\z/mu)
    return if lead.empty?

    token_1 = Regexp::Token.new(:literal, :literal, lead,
              token.ts, (token.te - last.length),
              nesting, set_nesting, conditional_nesting)
    token_2 = Regexp::Token.new(:literal, :literal, last,
              (token.ts + lead.length), token.te,
              nesting, set_nesting, conditional_nesting)

    token_1.previous = preprev_token
    token_1.next = token_2
    token_2.previous = token_1 # .next will be set by #lex
    [token_1, token_2]
  end

  # if a codepoint list is followed by a quantifier, that quantifier applies
  # to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
  # c.f. #break_literal.
  #
  # Returns [head_list_token, last_codepoint_token], or nil when the list
  # holds a single codepoint (no space to split on) and needs no splitting.
  def break_codepoint_list(token)
    lead, _, tail = token.text.rpartition(' ')
    return if lead.empty?

    # Re-close the head list and re-open the tail as its own \u{...} list.
    token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
              token.ts, (token.te - tail.length),
              nesting, set_nesting, conditional_nesting)
    token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
              (token.ts + lead.length + 1), (token.te + 3),
              nesting, set_nesting, conditional_nesting)

    # Splitting grows the text by 3 chars net; all later token positions
    # must be shifted accordingly (applied in #lex).
    self.shift = shift + 3 # one space less, but extra \, u, {, and }

    token_1.previous = preprev_token
    token_1.next = token_2
    token_2.previous = token_1 # .next will be set by #lex
    [token_1, token_2]
  end

  # Fuses a condition-open/close token with the preceding token into one
  # :condition token spanning both. Its .next will be set by #lex.
  def merge_condition(current, last)
    token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
      last.ts, current.te, nesting, set_nesting, conditional_nesting)
    token.previous = preprev_token # .next will be set by #lex
    token
  end

end # module Regexp::Lexer