package lexer
import (
"bytes"
"io"
"io/ioutil"
"regexp"
"unicode/utf8"
)
// eolBytes is the newline marker used to count lines and reset columns
// while tracking lexer positions.
var eolBytes = []byte("\n")
// regexpDefinition is a lexer Definition backed by a compiled regular
// expression whose named capture groups define the token types.
type regexpDefinition struct {
re *regexp.Regexp // compiled token pattern; one alternative per token
symbols map[string]rune // token name -> token type rune (includes "EOF")
}
// Regexp creates a lexer definition from a regular expression.
//
// Each named sub-expression in the regular expression matches a token. Anonymous sub-expressions
// will be matched and discarded.
//
// eg.
//
// def, err := Regexp(`(?P<Ident>[a-z]+)|(\s+)|(?P<Number>\d+)`)
func Regexp(pattern string) (Definition, error) {
re, err := regexp.Compile(pattern)
if err != nil {
return nil, err
}
symbols := map[string]rune{
"EOF": EOF,
}
for i, sym := range re.SubexpNames()[1:] {
if sym != "" {
symbols[sym] = EOF - 1 - rune(i)
}
}
return ®expDefinition{re: re, symbols: symbols}, nil
}
func (d *regexpDefinition) Lex(r io.Reader) (Lexer, error) {
b, err := ioutil.ReadAll(r)
if err != nil {
return nil, err
}
return ®expLexer{
pos: Position{
Filename: NameOfReader(r),
Line: 1,
Column: 1,
},
b: b,
re: d.re,
names: d.re.SubexpNames(),
}, nil
}
// Symbols returns the map of token names to token type runes, including
// the predefined "EOF" entry.
func (d *regexpDefinition) Symbols() map[string]rune {
return d.symbols
}
// regexpLexer tokenises a byte buffer by repeatedly matching a regular
// expression at the head of the remaining input.
type regexpLexer struct {
pos Position // position of the next token to be produced
b []byte // unconsumed remainder of the input
re *regexp.Regexp // token pattern, shared with the Definition
names []string // sub-expression names, indexed by capture group
}
// Next returns the next token from the input, or an EOF token once the
// input is exhausted. Matches of anonymous sub-expressions are consumed
// and discarded; a position where no sub-expression matches (or where
// only a zero-width match is possible) yields an "invalid token" error.
func (r *regexpLexer) Next() (Token, error) {
nextToken:
	for len(r.b) != 0 {
		matches := r.re.FindSubmatchIndex(r.b)
		// Reject no match, a match that does not start at the head of the
		// input, and a zero-width match. The zero-width check is essential:
		// without consuming at least one byte this loop (or the caller's)
		// would never terminate.
		if matches == nil || matches[0] != 0 || matches[1] == 0 {
			rn, _ := utf8.DecodeRune(r.b)
			return Token{}, Errorf(r.pos, "invalid token %q", rn)
		}
		match := r.b[:matches[1]]
		token := Token{
			Pos:   r.pos,
			Value: string(match),
		}

		// Update lexer state.
		r.pos.Offset += matches[1]
		lines := bytes.Count(match, eolBytes)
		r.pos.Line += lines
		// Update column. When the match spans newlines, count runes from the
		// last newline onward; including the newline itself stands in for the
		// implicit column 1 of the new line.
		if lines == 0 {
			r.pos.Column += utf8.RuneCount(match)
		} else {
			r.pos.Column = utf8.RuneCount(match[bytes.LastIndex(match, eolBytes):])
		}
		// Move slice along.
		r.b = r.b[matches[1]:]

		// Finally, assign token type. Capture group i/2 produced the match;
		// EOF-rune(i/2) mirrors the numbering chosen in Regexp. If the group
		// is anonymous, discard the match and continue with the next token.
		for i := 2; i < len(matches); i += 2 {
			if matches[i] != -1 {
				if r.names[i/2] == "" {
					continue nextToken
				}
				token.Type = EOF - rune(i/2)
				break
			}
		}

		return token, nil
	}
	return EOFToken(r.pos), nil
}