# frozen_string_literal: true
# A very thin wrapper around the scanner that breaks quantified literal runs,
# collects emitted tokens into an array, calculates their nesting depth,
# normalizes tokens for the parser, and checks whether they are implemented
# by the given syntax flavor.
class Regexp::Lexer
  # Token names that open a new nesting level (group and assertion openers,
  # option groups, set `open`, etc.). Consulted by #descend.
  OPENING_TOKENS = %i[
    capture passive lookahead nlookahead lookbehind nlookbehind
    atomic options options_switch named absence open
  ].freeze

  # Token names that close a nesting level. Consulted by #ascend.
  CLOSING_TOKENS = %i[close].freeze

  # Conditional tokens that are fused with the preceding token
  # via #merge_condition instead of being emitted on their own.
  CONDITION_TOKENS = %i[condition condition_close].freeze

  # Convenience entry point: lexes +input+ with a fresh Lexer instance.
  # See #lex for the parameters.
  def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
    new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
  end

  # Scans +input+ and turns the raw scanner events into linked, depth-annotated
  # Regexp::Token instances.
  #
  # input          - the pattern to lex (whatever Regexp::Scanner.scan accepts)
  # syntax         - optional syntax flavor identifier; when nil, the syntax of
  #                  the running Ruby (Regexp::Syntax::CURRENT) is used
  # options        - forwarded to Regexp::Scanner.scan
  # collect_tokens - when true, tokens are accumulated and returned as an Array
  # block          - optional; called with each token as it is emitted
  #
  # Returns the Array of tokens, or nil when collect_tokens is false.
  def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
    syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT

    self.block = block
    self.collect_tokens = collect_tokens
    self.tokens = []
    self.prev_token = nil
    self.preprev_token = nil
    self.nesting = 0
    self.set_nesting = 0
    self.conditional_nesting = 0
    # running text-offset correction; increased by #break_codepoint_list,
    # which makes the lexed text longer than the scanned text
    self.shift = 0

    Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
      # map the raw scanner token onto the parser's vocabulary
      type, token = *syntax.normalize(type, token)
      # raises if the token is not supported by the chosen syntax flavor
      syntax.check! type, token

      ascend(type, token)

      # a quantifier following a multi-part literal run or codepoint list
      # applies only to its last character/codepoint, so split that last part
      # off into a token of its own before the quantifier token is built
      if (last = prev_token) &&
        type == :quantifier &&
        (
          (last.type == :literal && (parts = break_literal(last))) ||
          (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
        )
        emit(parts[0])
        last = parts[1]
      end

      current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                  nesting, set_nesting, conditional_nesting)

      if type == :conditional && CONDITION_TOKENS.include?(token)
        # fuse the condition with the pending previous token; nothing is
        # emitted here — the merged token becomes the new pending token
        current = merge_condition(current, last)
      elsif last
        # thread tokens into a doubly-linked list, then emit the previous one
        # (each token is only emitted once its successor is known)
        last.next = current
        current.previous = last
        emit(last)
      end

      self.preprev_token = last
      self.prev_token = current

      descend(type, token)
    end

    # flush the final pending token, which the loop above never emits
    emit(prev_token) if prev_token

    collect_tokens ? tokens : nil
  end

  # Hands +token+ to the block (if any) and/or appends it to the collected
  # token Array, depending on the collect_tokens setting.
  def emit(token)
    if block
      # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
      res = block.call(token)
      tokens << res if collect_tokens
    else
      tokens << token
    end
  end

  class << self
    # Backwards-compatible alias for ::lex.
    alias :scan :lex
  end

  private

  attr_accessor :block,
                :collect_tokens, :tokens, :prev_token, :preprev_token,
                :nesting, :set_nesting, :conditional_nesting, :shift

  # Decrements the nesting counter matching +type+ when +token+ closes
  # a group, set, or conditional.
  def ascend(type, token)
    return unless CLOSING_TOKENS.include?(token)

    case type
    when :group, :assertion
      self.nesting = nesting - 1
    when :set
      self.set_nesting = set_nesting - 1
    when :conditional
      self.conditional_nesting = conditional_nesting - 1
    else
      raise "unhandled nesting type #{type}"
    end
  end

  # Increments the nesting counter matching +type+ when +token+ opens
  # a group, set, or conditional.
  def descend(type, token)
    return unless OPENING_TOKENS.include?(token)

    case type
    when :group, :assertion
      self.nesting = nesting + 1
    when :set
      self.set_nesting = set_nesting + 1
    when :conditional
      self.conditional_nesting = conditional_nesting + 1
    else
      raise "unhandled nesting type #{type}"
    end
  end

  # called by scan to break a literal run that is longer than one character
  # into two separate tokens when it is followed by a quantifier
  def break_literal(token)
    # split off the last (possibly multibyte — note the /u flag) character
    lead, last, _ = token.text.partition(/.\z/mu)
    return if lead.empty? # single-character literal; nothing to break

    token_1 = Regexp::Token.new(:literal, :literal, lead,
                                token.ts, (token.te - last.length),
                                nesting, set_nesting, conditional_nesting)
    token_2 = Regexp::Token.new(:literal, :literal, last,
                                (token.ts + lead.length), token.te,
                                nesting, set_nesting, conditional_nesting)

    token_1.previous = preprev_token
    token_1.next = token_2
    token_2.previous = token_1 # .next will be set by #lex
    [token_1, token_2]
  end

  # if a codepoint list is followed by a quantifier, that quantifier applies
  # to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
  # c.f. #break_literal.
  def break_codepoint_list(token)
    # split at the last space: lead keeps all but the final codepoint
    lead, _, tail = token.text.rpartition(' ')
    return if lead.empty? # single-codepoint list; nothing to break

    token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
                                token.ts, (token.te - tail.length),
                                nesting, set_nesting, conditional_nesting)
    token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
                                (token.ts + lead.length + 1), (token.te + 3),
                                nesting, set_nesting, conditional_nesting)

    self.shift = shift + 3 # one space less, but extra \, u, {, and }

    token_1.previous = preprev_token
    token_1.next = token_2
    token_2.previous = token_1 # .next will be set by #lex
    [token_1, token_2]
  end

  # Fuses a conditional condition token with the preceding token into a
  # single :condition token covering both spans of text.
  def merge_condition(current, last)
    token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
                              last.ts, current.te, nesting, set_nesting, conditional_nesting)
    token.previous = preprev_token # .next will be set by #lex
    token
  end
end # class Regexp::Lexer