1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
|
package json
import (
"fmt"
"github.com/apparentlymart/go-textseg/v13/textseg"
"github.com/hashicorp/hcl/v2"
)
//go:generate stringer -type tokenType scanner.go
type tokenType rune
const (
tokenBraceO tokenType = '{'
tokenBraceC tokenType = '}'
tokenBrackO tokenType = '['
tokenBrackC tokenType = ']'
tokenComma tokenType = ','
tokenColon tokenType = ':'
tokenKeyword tokenType = 'K'
tokenString tokenType = 'S'
tokenNumber tokenType = 'N'
tokenEOF tokenType = '␄'
tokenInvalid tokenType = 0
tokenEquals tokenType = '=' // used only for reminding the user of JSON syntax
)
type token struct {
Type tokenType
Bytes []byte
Range hcl.Range
}
// scan returns the primary tokens for the given JSON buffer in sequence.
//
// The responsibility of this pass is to just mark the slices of the buffer
// as being of various types. It is lax in how it interprets the multi-byte
// token types keyword, string and number, preferring to capture erroneous
// extra bytes that we presume the user intended to be part of the token
// so that we can generate more helpful diagnostics in the parser.
func scan(buf []byte, start pos) []token {
var tokens []token
p := start
for {
if len(buf) == 0 {
tokens = append(tokens, token{
Type: tokenEOF,
Bytes: nil,
Range: posRange(p, p),
})
return tokens
}
buf, p = skipWhitespace(buf, p)
if len(buf) == 0 {
tokens = append(tokens, token{
Type: tokenEOF,
Bytes: nil,
Range: posRange(p, p),
})
return tokens
}
start = p
first := buf[0]
switch {
case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':' || first == '=':
p.Pos.Column++
p.Pos.Byte++
tokens = append(tokens, token{
Type: tokenType(first),
Bytes: buf[0:1],
Range: posRange(start, p),
})
buf = buf[1:]
case first == '"':
var tokBuf []byte
tokBuf, buf, p = scanString(buf, p)
tokens = append(tokens, token{
Type: tokenString,
Bytes: tokBuf,
Range: posRange(start, p),
})
case byteCanStartNumber(first):
var tokBuf []byte
tokBuf, buf, p = scanNumber(buf, p)
tokens = append(tokens, token{
Type: tokenNumber,
Bytes: tokBuf,
Range: posRange(start, p),
})
case byteCanStartKeyword(first):
var tokBuf []byte
tokBuf, buf, p = scanKeyword(buf, p)
tokens = append(tokens, token{
Type: tokenKeyword,
Bytes: tokBuf,
Range: posRange(start, p),
})
default:
tokens = append(tokens, token{
Type: tokenInvalid,
Bytes: buf[:1],
Range: start.Range(1, 1),
})
// If we've encountered an invalid then we might as well stop
// scanning since the parser won't proceed beyond this point.
// We insert a synthetic EOF marker here to match the expectations
// of consumers of this data structure.
p.Pos.Column++
p.Pos.Byte++
tokens = append(tokens, token{
Type: tokenEOF,
Bytes: nil,
Range: posRange(p, p),
})
return tokens
}
}
}
func byteCanStartNumber(b byte) bool {
switch b {
// We are slightly more tolerant than JSON requires here since we
// expect the parser will make a stricter interpretation of the
// number bytes, but we specifically don't allow 'e' or 'E' here
// since we want the scanner to treat that as the start of an
// invalid keyword instead, to produce more intelligible error messages.
case '-', '+', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
return true
default:
return false
}
}
func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
// The scanner doesn't check that the sequence of digit-ish bytes is
// in a valid order. The parser must do this when decoding a number
// token.
var i int
p := start
Byte:
for i = 0; i < len(buf); i++ {
switch buf[i] {
case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
p.Pos.Byte++
p.Pos.Column++
default:
break Byte
}
}
return buf[:i], buf[i:], p
}
func byteCanStartKeyword(b byte) bool {
switch {
// We allow any sequence of alphabetical characters here, even though
// JSON is more constrained, so that we can collect what we presume
// the user intended to be a single keyword and then check its validity
// in the parser, where we can generate better diagnostics.
// So e.g. we want to be able to say:
// unrecognized keyword "True". Did you mean "true"?
case isAlphabetical(b):
return true
default:
return false
}
}
func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
var i int
p := start
Byte:
for i = 0; i < len(buf); i++ {
b := buf[i]
switch {
case isAlphabetical(b) || b == '_':
p.Pos.Byte++
p.Pos.Column++
default:
break Byte
}
}
return buf[:i], buf[i:], p
}
func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
// The scanner doesn't validate correct use of escapes, etc. It pays
// attention to escapes only for the purpose of identifying the closing
// quote character. It's the parser's responsibility to do proper
// validation.
//
// The scanner also doesn't specifically detect unterminated string
// literals, though they can be identified in the parser by checking if
// the final byte in a string token is the double-quote character.
// Skip the opening quote symbol
i := 1
p := start
p.Pos.Byte++
p.Pos.Column++
escaping := false
Byte:
for i < len(buf) {
b := buf[i]
switch {
case b == '\\':
escaping = !escaping
p.Pos.Byte++
p.Pos.Column++
i++
case b == '"':
p.Pos.Byte++
p.Pos.Column++
i++
if !escaping {
break Byte
}
escaping = false
case b < 32:
break Byte
default:
// Advance by one grapheme cluster, so that we consider each
// grapheme to be a "column".
// Ignoring error because this scanner cannot produce errors.
advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)
p.Pos.Byte += advance
p.Pos.Column++
i += advance
escaping = false
}
}
return buf[:i], buf[i:], p
}
func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
var i int
p := start
Byte:
for i = 0; i < len(buf); i++ {
switch buf[i] {
case ' ':
p.Pos.Byte++
p.Pos.Column++
case '\n':
p.Pos.Byte++
p.Pos.Column = 1
p.Pos.Line++
case '\r':
// For the purpose of line/column counting we consider a
// carriage return to take up no space, assuming that it will
// be paired up with a newline (on Windows, for example) that
// will account for both of them.
p.Pos.Byte++
case '\t':
// We arbitrarily count a tab as if it were two spaces, because
// we need to choose _some_ number here. This means any system
// that renders code on-screen with markers must itself treat
// tabs as a pair of spaces for rendering purposes, or instead
// use the byte offset and back into its own column position.
p.Pos.Byte++
p.Pos.Column += 2
default:
break Byte
}
}
return buf[i:], p
}
type pos struct {
Filename string
Pos hcl.Pos
}
func (p *pos) Range(byteLen, charLen int) hcl.Range {
start := p.Pos
end := p.Pos
end.Byte += byteLen
end.Column += charLen
return hcl.Range{
Filename: p.Filename,
Start: start,
End: end,
}
}
func posRange(start, end pos) hcl.Range {
return hcl.Range{
Filename: start.Filename,
Start: start.Pos,
End: end.Pos,
}
}
func (t token) GoString() string {
return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
}
func isAlphabetical(b byte) bool {
return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
}
|