package lexer

import (
	"fmt"
	"io"
	"strings"
	"unicode/utf8"
)

// TokenType identifies a class of token, as keyed by symbol name in the
// map returned by Definition.Symbols().
type TokenType int

const (
	// EOF represents an end of file.
	EOF TokenType = -(iota + 1)
)

// EOFToken creates a new EOF token at the given position.
func EOFToken(pos Position) Token {
	return Token{Type: EOF, Pos: pos}
}

// Definition is the main entry point for lexing.
type Definition interface {
	// Symbols returns a map of symbolic names to the corresponding pseudo-runes for those symbols.
	// This is the same approach as used by text/scanner. For example, "EOF" might have the rune
	// value of -1, "Ident" might be -2, and so on.
	Symbols() map[string]TokenType
	// Lex an io.Reader.
	Lex(filename string, r io.Reader) (Lexer, error)
}
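
// A minimal sketch of driving a Definition, assuming "def" holds some
// concrete Definition implementation (a hypothetical value, for
// illustration only):
//
//	lex, err := def.Lex("example.txt", strings.NewReader("hello world"))
//	if err != nil {
//		// handle the error
//	}
//	tokens, err := ConsumeAll(lex)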

// StringDefinition is an optional interface lexer Definitions can implement
// to offer a fast path for lexing strings.
type StringDefinition interface {
	LexString(filename string, input string) (Lexer, error)
}

// BytesDefinition is an optional interface lexer Definitions can implement
// to offer a fast path for lexing byte slices.
type BytesDefinition interface {
	LexBytes(filename string, input []byte) (Lexer, error)
}

// A Lexer returns tokens from a source.
type Lexer interface {
	// Next consumes and returns the next token.
	Next() (Token, error)
}
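
// A sketch of the canonical consumption loop, assuming "lex" is an
// existing Lexer (hypothetical; ConsumeAll below packages up the same
// loop):
//
//	for {
//		token, err := lex.Next()
//		if err != nil {
//			// handle the error
//		}
//		if token.EOF() {
//			break
//		}
//		fmt.Println(token)
//	}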

// SymbolsByRune returns a map of lexer symbol names keyed by rune.
func SymbolsByRune(def Definition) map[TokenType]string {
	symbols := def.Symbols()
	out := make(map[TokenType]string, len(symbols))
	for s, r := range symbols {
		out[r] = s
	}
	return out
}
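
// The reverse mapping is useful when reporting errors, e.g. (a sketch
// assuming "def" and "token" already exist):
//
//	names := SymbolsByRune(def)
//	return fmt.Errorf("unexpected %s %q", names[token.Type], token.Value)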

// NameOfReader attempts to retrieve the filename of a reader.
func NameOfReader(r interface{}) string {
	if nr, ok := r.(interface{ Name() string }); ok {
		return nr.Name()
	}
	return ""
}

// Must takes the result of a Definition constructor call and returns the definition, but panics if
// it errors.
//
// e.g.
//
//	lex = lexer.Must(lexer.Build(`Symbol = "symbol" .`))
func Must(def Definition, err error) Definition {
	if err != nil {
		panic(err)
	}
	return def
}

// ConsumeAll reads all tokens from a Lexer. The returned slice includes
// the terminating EOF token.
func ConsumeAll(lexer Lexer) ([]Token, error) {
	tokens := make([]Token, 0, 1024)
	for {
		token, err := lexer.Next()
		if err != nil {
			return nil, err
		}
		tokens = append(tokens, token)
		if token.Type == EOF {
			return tokens, nil
		}
	}
}

// Position of a token.
type Position struct {
	Filename string
	Offset   int
	Line     int
	Column   int
}

// Advance the Position based on the number of characters and newlines in "span".
func (p *Position) Advance(span string) {
	p.Offset += len(span)
	lines := strings.Count(span, "\n")
	p.Line += lines
	// Update column.
	if lines == 0 {
		p.Column += utf8.RuneCountInString(span)
	} else {
		// The slice deliberately includes the final newline itself, so the
		// rune count lands on the correct 1-based column after it.
		p.Column = utf8.RuneCountInString(span[strings.LastIndex(span, "\n"):])
	}
}
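
// A worked example of Advance, using hypothetical values for illustration:
//
//	pos := Position{Line: 1, Column: 1}
//	pos.Advance("ab\ncd")
//	// pos.Offset == 5, pos.Line == 2 and pos.Column == 3: the next
//	// character would land after "cd" on the new line.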

// Add returns a new Position that is the sum of this position and "pos".
//
// This is useful when parsing values from a parent grammar.
func (p Position) Add(pos Position) Position {
	p.Line += pos.Line - 1
	if pos.Line > 1 {
		p.Column = pos.Column
	} else {
		p.Column += pos.Column - 1
	}
	p.Offset += pos.Offset
	return p
}
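
// For example (a sketch with hypothetical values): a child position on its
// first line is offset along the parent's line, while a child position on
// a later line keeps its own column:
//
//	parent := Position{Line: 3, Column: 10}
//	parent.Add(Position{Line: 1, Column: 5}) // Line: 3, Column: 14
//	parent.Add(Position{Line: 2, Column: 5}) // Line: 4, Column: 5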

// GoString implements fmt.GoStringer.
func (p Position) GoString() string {
	return fmt.Sprintf("Position{Filename: %q, Offset: %d, Line: %d, Column: %d}",
		p.Filename, p.Offset, p.Line, p.Column)
}

// String formats the position as "[filename:]line:column".
func (p Position) String() string {
	filename := p.Filename
	if filename == "" {
		return fmt.Sprintf("%d:%d", p.Line, p.Column)
	}
	return fmt.Sprintf("%s:%d:%d", filename, p.Line, p.Column)
}

// A Token returned by a Lexer.
type Token struct {
	// Type of token. This is the value keyed by symbol as returned by Definition.Symbols().
	Type  TokenType
	Value string
	Pos   Position
}

// EOF returns true if this Token is an EOF token.
func (t Token) EOF() bool {
	return t.Type == EOF
}

func (t Token) String() string {
	if t.EOF() {
		return "<EOF>"
	}
	return t.Value
}

func (t Token) GoString() string {
	if t.Pos == (Position{}) {
		return fmt.Sprintf("Token{%d, %q}", t.Type, t.Value)
	}
	return fmt.Sprintf("Token@%s{%d, %q}", t.Pos.String(), t.Type, t.Value)
}

// MakeSymbolTable builds a lookup table for checking token ID existence.
//
// For each symbolic name in "types", the returned map will contain the corresponding token ID as a key.
func MakeSymbolTable(def Definition, types ...string) (map[TokenType]bool, error) {
	symbols := def.Symbols()
	table := make(map[TokenType]bool, len(types))
	for _, symbol := range types {
		rn, ok := symbols[symbol]
		if !ok {
			return nil, fmt.Errorf("lexer does not support symbol %q", symbol)
		}
		table[rn] = true
	}
	return table, nil
}
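
// A sketch of filtering with a symbol table, assuming "def" is an existing
// Definition with "Whitespace" and "Comment" symbols and "tokens" is a
// previously lexed []Token (all hypothetical names, for illustration):
//
//	skip, err := MakeSymbolTable(def, "Whitespace", "Comment")
//	if err != nil {
//		// handle the error
//	}
//	kept := tokens[:0]
//	for _, token := range tokens {
//		if !skip[token.Type] {
//			kept = append(kept, token)
//		}
//	}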