1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
|
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import (
"strconv"
"strings"
"unicode"
"unicode/utf8"
)
// Compiled program.
// May not belong in this package, but convenient for now.
// A Prog is a compiled regular expression program.
// It is a flat list of instructions together with the index of the
// instruction at which execution begins.
type Prog struct {
// Inst holds the program's instructions. Instructions refer to their
// successors by index into this slice (see Inst.Out).
Inst []Inst
Start int // index of start instruction
NumCap int // number of InstCapture insts in re
}
// An InstOp is an instruction opcode.
type InstOp uint8

// The opcodes, numbered consecutively from zero.
const (
	InstAlt InstOp = iota
	InstAltMatch
	InstCapture
	InstEmptyWidth
	InstMatch
	InstFail
	InstNop
	InstRune
	InstRune1
	InstRuneAny
	InstRuneAnyNotNL
)

// instOpNames maps each opcode to its printable name.
// The table is keyed by opcode so that names and values cannot drift apart.
var instOpNames = []string{
	InstAlt:          "InstAlt",
	InstAltMatch:     "InstAltMatch",
	InstCapture:      "InstCapture",
	InstEmptyWidth:   "InstEmptyWidth",
	InstMatch:        "InstMatch",
	InstFail:         "InstFail",
	InstNop:          "InstNop",
	InstRune:         "InstRune",
	InstRune1:        "InstRune1",
	InstRuneAny:      "InstRuneAny",
	InstRuneAnyNotNL: "InstRuneAnyNotNL",
}

// String returns the name of the opcode, or the empty string
// if i is not one of the declared opcodes.
func (i InstOp) String() string {
	if idx := int(i); idx < len(instOpNames) {
		return instOpNames[idx]
	}
	return ""
}
// An EmptyOp specifies a kind or mixture of zero-width assertions.
// Each constant below occupies its own bit, so several assertions
// can be combined with bitwise OR.
type EmptyOp uint8
const (
EmptyBeginLine EmptyOp = 1 << iota // at beginning of text or just after '\n'
EmptyEndLine // at end of text or just before '\n'
EmptyBeginText // at beginning of text
EmptyEndText // at end of text
EmptyWordBoundary // exactly one of the adjacent runes is a word character (see IsWordChar)
EmptyNoWordBoundary // the adjacent runes are both word characters or both not
)
// EmptyOpContext reports every zero-width assertion that holds
// at the position between the runes r1 and r2.
// A negative r1 (conventionally -1) means the position is at the
// beginning of the text; a negative r2 (conventionally -1) means
// the position is at the end of the text.
func EmptyOpContext(r1, r2 rune) EmptyOp {
	word1, word2 := IsWordChar(r1), IsWordChar(r2)

	// Exactly one of the two word-boundary bits is always set.
	result := EmptyNoWordBoundary
	if word1 != word2 {
		result = EmptyWordBoundary
	}

	// A word character is never '\n' or negative, so the line/text
	// checks only matter for non-word runes.
	if !word1 {
		if r1 == '\n' {
			result |= EmptyBeginLine
		} else if r1 < 0 {
			result |= EmptyBeginText | EmptyBeginLine
		}
	}
	if !word2 {
		if r2 == '\n' {
			result |= EmptyEndLine
		} else if r2 < 0 {
			result |= EmptyEndText | EmptyEndLine
		}
	}
	return result
}
// IsWordChar reports whether r counts as a “word character”
// when evaluating the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
func IsWordChar(r rune) bool {
	switch {
	// Lowercase letters are checked first because they are the most
	// frequent in common input.
	case 'a' <= r && r <= 'z':
		return true
	case 'A' <= r && r <= 'Z':
		return true
	case '0' <= r && r <= '9':
		return true
	case r == '_':
		return true
	}
	return false
}
// An Inst is a single instruction in a regular expression program.
type Inst struct {
Op InstOp // opcode; determines which of the fields below are meaningful
// Out is the index in Prog.Inst of the instruction that follows this one.
Out uint32 // all but InstMatch, InstFail
// Arg is an opcode-specific operand. For InstEmptyWidth it holds an EmptyOp.
// NOTE(review): the matching code in this file also reads Arg as Flags on
// InstRune instructions to test FoldCase, which the list below omits.
Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
// Rune is the operand of the rune-matching instructions. A single element
// is a literal rune; longer slices are read as consecutive lo, hi pairs
// of inclusive ranges (presumably sorted ascending, since MatchRunePos
// searches them that way).
Rune []rune
}
// String returns the textual listing of the program produced by dumpProg.
func (p *Prog) String() string {
	sb := new(strings.Builder)
	dumpProg(sb, p)
	return sb.String()
}
// skipNop starts at pc and follows Out links past any no-op or
// capturing instructions, returning the first instruction that is neither.
func (p *Prog) skipNop(pc uint32) *Inst {
	for {
		inst := &p.Inst[pc]
		if inst.Op != InstNop && inst.Op != InstCapture {
			return inst
		}
		pc = inst.Out
	}
}
// op returns i.Op, except that the specialized rune opcodes
// (InstRune1, InstRuneAny, InstRuneAnyNotNL) are all reported as InstRune.
func (i *Inst) op() InstOp {
	if i.Op == InstRune1 || i.Op == InstRuneAny || i.Op == InstRuneAnyNotNL {
		return InstRune
	}
	return i.Op
}
// Prefix returns a literal string that every match of the regexp
// must begin with. Complete is true if the prefix is the entire match.
func (p *Prog) Prefix() (prefix string, complete bool) {
	inst := p.skipNop(uint32(p.Start))

	// No literal at the start: return before allocating a buffer.
	if inst.op() != InstRune || len(inst.Rune) != 1 {
		return "", inst.Op == InstMatch
	}

	// Collect literal runes until something else is reached.
	var lit strings.Builder
	for {
		if inst.op() != InstRune || len(inst.Rune) != 1 {
			break
		}
		// Case-folded literals and utf8.RuneError end the prefix.
		if Flags(inst.Arg)&FoldCase != 0 || inst.Rune[0] == utf8.RuneError {
			break
		}
		lit.WriteRune(inst.Rune[0])
		inst = p.skipNop(inst.Out)
	}
	return lit.String(), inst.Op == InstMatch
}
// StartCond returns the leading empty-width conditions that must
// hold in any match. It returns ^EmptyOp(0) if no matches are possible.
func (p *Prog) StartCond() EmptyOp {
	var cond EmptyOp
	// Walk the chain of Out links from the start instruction, collecting
	// assertions until an instruction that is not skippable is found.
	for pc := uint32(p.Start); ; pc = p.Inst[pc].Out {
		switch inst := &p.Inst[pc]; inst.Op {
		case InstEmptyWidth:
			cond |= EmptyOp(inst.Arg)
		case InstFail:
			return ^EmptyOp(0)
		case InstCapture, InstNop:
			// Nothing to record; keep following Out.
		default:
			return cond
		}
	}
}
// noMatch is the position MatchRunePos reports when the rune does not match.
const noMatch = -1

// MatchRune reports whether the instruction matches (and consumes) r.
// It should only be called when i.Op == [InstRune].
func (i *Inst) MatchRune(r rune) bool {
	pos := i.MatchRunePos(r)
	return pos != noMatch
}
// MatchRunePos checks whether the instruction matches (and consumes) r.
// On a match it returns the index of the matching rune pair
// (or, when len(i.Rune) == 1, of the rune singleton).
// Otherwise it returns -1.
// MatchRunePos should only be called when i.Op == [InstRune].
func (i *Inst) MatchRunePos(r rune) int {
	ranges := i.Rune
	n := len(ranges)

	if n == 0 {
		return noMatch
	}

	if n == 1 {
		// Special case: a single-rune slice comes from a literal string,
		// not a character class.
		lit := ranges[0]
		if r == lit {
			return 0
		}
		if Flags(i.Arg)&FoldCase == 0 {
			return noMatch
		}
		// Walk the case-folding orbit of the literal.
		for folded := unicode.SimpleFold(lit); folded != lit; folded = unicode.SimpleFold(folded) {
			if r == folded {
				return 0
			}
		}
		return noMatch
	}

	if n == 2 {
		if ranges[0] <= r && r <= ranges[1] {
			return 0
		}
		return noMatch
	}

	if n == 4 || n == 6 || n == 8 {
		// A handful of pairs: scan them linearly.
		// This should handle ASCII well.
		for k := 0; k < n; k += 2 {
			if r < ranges[k] {
				return noMatch
			}
			if r <= ranges[k+1] {
				return k / 2
			}
		}
		return noMatch
	}

	// Anything else: binary search over the pairs.
	lo, hi := 0, n/2
	for lo < hi {
		mid := int(uint(lo+hi) >> 1)
		switch {
		case r < ranges[2*mid]:
			hi = mid
		case r <= ranges[2*mid+1]:
			return mid
		default:
			lo = mid + 1
		}
	}
	return noMatch
}
// MatchEmptyWidth reports whether the instruction matches
// an empty string between the runes before and after.
// It should only be called when i.Op == [InstEmptyWidth].
// It panics if i.Arg is not exactly one of the EmptyOp constants.
func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
	switch kind := EmptyOp(i.Arg); kind {
	case EmptyBeginText:
		return before == -1
	case EmptyBeginLine:
		return before == -1 || before == '\n'
	case EmptyEndText:
		return after == -1
	case EmptyEndLine:
		return after == -1 || after == '\n'
	case EmptyWordBoundary, EmptyNoWordBoundary:
		// A boundary exists when exactly one neighbor is a word character;
		// the result must agree with which assertion was requested.
		atBoundary := IsWordChar(before) != IsWordChar(after)
		return atBoundary == (kind == EmptyWordBoundary)
	}
	panic("unknown empty width arg")
}
// String returns the textual form of the instruction produced by dumpInst.
func (i *Inst) String() string {
	sb := new(strings.Builder)
	dumpInst(sb, i)
	return sb.String()
}
// bw appends each of args to b, in order.
func bw(b *strings.Builder, args ...string) {
	for k := 0; k < len(args); k++ {
		b.WriteString(args[k])
	}
}
// dumpProg writes a listing of p to b, one instruction per line.
// Each line holds the instruction index, right-aligned to a width of
// three columns, a "*" marker if it is the start instruction, a tab,
// and the instruction text produced by dumpInst.
func dumpProg(b *strings.Builder, p *Prog) {
	// Minimum column width of the instruction index.
	const pcWidth = 3

	for j := range p.Inst {
		i := &p.Inst[j]
		pc := strconv.Itoa(j)
		// Left-pad short indexes. The padding is computed rather than
		// sliced from a literal so that it cannot index out of range
		// and does not depend on the exact spacing of a string constant.
		if len(pc) < pcWidth {
			b.WriteString(strings.Repeat(" ", pcWidth-len(pc)))
		}
		if j == p.Start {
			pc += "*"
		}
		bw(b, pc, "\t")
		dumpInst(b, i)
		bw(b, "\n")
	}
}
// u32 formats i as a base-10 string.
func u32(i uint32) string {
	// Every uint32 value fits in an int64, so the signed formatter
	// yields the same digits.
	return strconv.FormatInt(int64(i), 10)
}
// dumpInst writes a one-line textual form of i to b, without a
// trailing newline. The format depends on the opcode; instructions
// that have a successor end with " -> " followed by the Out index.
// Nothing is written for an opcode that is not listed below.
func dumpInst(b *strings.Builder, i *Inst) {
	switch i.Op {
	case InstAlt:
		bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
	case InstAltMatch:
		bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
	case InstCapture:
		bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
	case InstEmptyWidth:
		bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
	case InstMatch:
		bw(b, "match")
	case InstFail:
		bw(b, "fail")
	case InstNop:
		bw(b, "nop -> ", u32(i.Out))
	case InstRune:
		if i.Rune == nil {
			// Shouldn't happen. Write the marker instead of the quoted
			// runes, not in addition to them, so the line stays readable.
			bw(b, "rune <nil>")
		} else {
			bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
		}
		// "/i" marks a case-insensitive rune instruction.
		if Flags(i.Arg)&FoldCase != 0 {
			bw(b, "/i")
		}
		bw(b, " -> ", u32(i.Out))
	case InstRune1:
		bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
	case InstRuneAny:
		bw(b, "any -> ", u32(i.Out))
	case InstRuneAnyNotNL:
		bw(b, "anynotnl -> ", u32(i.Out))
	}
}
|