b5ce4360cd
This scanner currently lacks correct support for Unicode text segmentation into grapheme clusters, so the "Column" field in positions may be miscounted for multi-byte characters. This will be addressed later.
282 lines
6.7 KiB
Go
282 lines
6.7 KiB
Go
package json
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"github.com/apparentlymart/go-zcl/zcl"
|
|
)
|
|
|
|
//go:generate stringer -type tokenType scanner.go

// tokenType distinguishes the kinds of token produced by scan. For the
// single-byte punctuation tokens the value is the punctuation rune itself;
// the multi-byte token kinds use mnemonic rune values ('K', 'S', 'N').
type tokenType rune
|
|
|
|
const (
	// Single-byte structural tokens, each represented by its own rune.
	tokenBraceO tokenType = '{'
	tokenBraceC tokenType = '}'
	tokenBrackO tokenType = '['
	tokenBrackC tokenType = ']'
	tokenComma tokenType = ','
	tokenColon tokenType = ':'

	// Multi-byte token kinds, using mnemonic rune values.
	tokenKeyword tokenType = 'K'
	tokenString tokenType = 'S'
	tokenNumber tokenType = 'N'

	// tokenInvalid marks a byte that cannot begin any token.
	tokenInvalid tokenType = 0
)
|
|
|
|
// token is a single lexical unit recognized within a JSON buffer, together
// with the source range it was taken from.
type token struct {
	Type tokenType
	// Bytes is the raw slice of the scanned buffer covered by this token;
	// it aliases the input buffer rather than copying it.
	Bytes []byte
	// Range records where the token appears, for use in diagnostics.
	Range zcl.Range
}
|
|
|
|
// scan returns the primary tokens for the given JSON buffer in sequence.
//
// The responsibility of this pass is to just mark the slices of the buffer
// as being of various types. It is lax in how it interprets the multi-byte
// token types keyword, string and number, preferring to capture erroneous
// extra bytes that we presume the user intended to be part of the token
// so that we can generate more helpful diagnostics in the parser.
func scan(buf []byte, start pos) []token {
	var tokens []token
	p := start
	for {
		// Stop once the buffer is exhausted.
		if len(buf) == 0 {
			return tokens
		}

		buf, p = skipWhitespace(buf, p)

		// Whitespace may have consumed the remainder of the buffer.
		if len(buf) == 0 {
			return tokens
		}

		// Remember where the current token begins so we can build its range
		// once the sub-scanner has advanced p past it.
		start = p

		first := buf[0]
		switch {
		case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':':
			// Single-byte punctuation tokens map directly onto their own
			// tokenType values.
			p.Pos.Column++
			p.Pos.Byte++
			tokens = append(tokens, token{
				Type:  tokenType(first),
				Bytes: buf[0:1],
				Range: posRange(start, p),
			})
			buf = buf[1:]
		case first == '"':
			var tokBuf []byte
			tokBuf, buf, p = scanString(buf, p)
			tokens = append(tokens, token{
				Type:  tokenString,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartNumber(first):
			var tokBuf []byte
			tokBuf, buf, p = scanNumber(buf, p)
			tokens = append(tokens, token{
				Type:  tokenNumber,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartKeyword(first):
			var tokBuf []byte
			tokBuf, buf, p = scanKeyword(buf, p)
			tokens = append(tokens, token{
				Type:  tokenKeyword,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		default:
			// Any other byte cannot begin a token; emit a one-byte invalid
			// token covering it.
			tokens = append(tokens, token{
				Type:  tokenInvalid,
				Bytes: buf[:1],
				Range: start.Range(1, 1),
			})
			// If we've encountered an invalid then we might as well stop
			// scanning since the parser won't proceed beyond this point.
			return tokens
		}
	}
}
|
|
|
|
// byteCanStartNumber reports whether b may begin a number token.
//
// We are slightly more tolerant than JSON requires here since we expect
// the parser will make a stricter interpretation of the number bytes, but
// we specifically don't allow 'e' or 'E' here since we want the scanner
// to treat that as the start of an invalid keyword instead, to produce
// more intelligible error messages.
func byteCanStartNumber(b byte) bool {
	if b >= '0' && b <= '9' {
		return true
	}
	return b == '-' || b == '+' || b == '.'
}
|
|
|
|
func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
|
|
// The scanner doesn't check that the sequence of digit-ish bytes is
|
|
// in a valid order. The parser must do this when decoding a number
|
|
// token.
|
|
var i int
|
|
p := start
|
|
Byte:
|
|
for i = 0; i < len(buf); i++ {
|
|
switch buf[i] {
|
|
case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
|
|
p.Pos.Byte++
|
|
p.Pos.Column++
|
|
default:
|
|
break Byte
|
|
}
|
|
}
|
|
return buf[:i], buf[i:], p
|
|
}
|
|
|
|
// byteCanStartKeyword reports whether b is an ASCII letter and can
// therefore begin a keyword token.
//
// We allow any sequence of alphabetical characters here, even though
// JSON is more constrained, so that we can collect what we presume
// the user intended to be a single keyword and then check its validity
// in the parser, where we can generate better diagnostics.
// So e.g. we want to be able to say:
// unrecognized keyword "True". Did you mean "true"?
func byteCanStartKeyword(b byte) bool {
	switch {
	// Each range check combines its bounds with "&&"; the previous
	// "b >= 'a' || b <= 'z'" form was true for every possible byte,
	// which made arbitrary garbage scan as a keyword and left the
	// invalid-token path in scan unreachable.
	case (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z'):
		return true
	default:
		return false
	}
}
|
|
|
|
func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
|
|
var i int
|
|
p := start
|
|
Byte:
|
|
for i = 0; i < len(buf); i++ {
|
|
b := buf[i]
|
|
switch {
|
|
case b >= 'a' || b <= 'z' || b >= 'A' || b <= 'Z' || b == '_':
|
|
p.Pos.Byte++
|
|
p.Pos.Column++
|
|
default:
|
|
break Byte
|
|
}
|
|
}
|
|
return buf[:i], buf[i:], p
|
|
}
|
|
|
|
// scanString consumes a string token (including both quote characters)
// from the front of buf, returning the token bytes, the remaining buffer
// and the updated position.
func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't validate correct use of escapes, etc. It pays
	// attention to escapes only for the purpose of identifying the closing
	// quote character. It's the parser's responsibility to do proper
	// validation.
	//
	// The scanner also doesn't specifically detect unterminated string
	// literals, though they can be identified in the parser by checking if
	// the final byte in a string token is the double-quote character.

	// Skip the opening quote symbol
	i := 1
	p := start
	p.Pos.Byte++
	p.Pos.Column++
	escaping := false
Byte:
	for i < len(buf) {
		b := buf[i]

		switch {
		case b == '\\':
			// Toggling (rather than setting) handles runs of backslashes,
			// so that e.g. a doubled backslash does not escape the byte
			// that follows it.
			escaping = !escaping
			p.Pos.Byte++
			p.Pos.Column++
			i++
		case b == '"':
			p.Pos.Byte++
			p.Pos.Column++
			i++
			// An unescaped quote terminates the token; the closing quote
			// is included in the returned bytes.
			if !escaping {
				break Byte
			}
			escaping = false
		case b < 32:
			// A raw control character (e.g. a newline) ends the token;
			// the parser can detect the missing closing quote.
			break Byte
		default:
			// TODO: Use Unicode Text Segmentation spec to advance
			// Column only once per grapheme cluster, rather than once per
			// byte.
			// Consume one or more UTF-8 codepoints that together form
			// a single grapheme cluster.

			p.Pos.Byte++
			p.Pos.Column++
			i++

			escaping = false
		}
	}
	return buf[:i], buf[i:], p
}
|
|
|
|
func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
|
|
var i int
|
|
p := start
|
|
Byte:
|
|
for i = 0; i < len(buf); i++ {
|
|
switch buf[i] {
|
|
case ' ':
|
|
p.Pos.Byte++
|
|
p.Pos.Column++
|
|
case '\n':
|
|
p.Pos.Byte++
|
|
p.Pos.Column = 1
|
|
p.Pos.Line++
|
|
case '\r':
|
|
// For the purpose of line/column counting we consider a
|
|
// carriage return to take up no space, assuming that it will
|
|
// be paired up with a newline (on Windows, for example) that
|
|
// will account for both of them.
|
|
p.Pos.Byte++
|
|
case '\t':
|
|
// We arbitrarily count a tab as if it were two spaces, because
|
|
// we need to choose _some_ number here. This means any system
|
|
// that renders code on-screen with markers must itself treat
|
|
// tabs as a pair of spaces for rendering purposes, or instead
|
|
// use the byte offset and back into its own column position.
|
|
p.Pos.Byte++
|
|
p.Pos.Column += 2
|
|
default:
|
|
break Byte
|
|
}
|
|
}
|
|
return buf[i:], p
|
|
}
|
|
|
|
// pos tracks a location within a particular file, pairing a zcl.Pos with
// the filename so that full zcl.Range values can be constructed from it.
type pos struct {
	Filename string
	Pos zcl.Pos
}
|
|
|
|
func (p *pos) Range(byteLen, charLen int) zcl.Range {
|
|
start := p.Pos
|
|
end := p.Pos
|
|
end.Byte += byteLen
|
|
end.Column += charLen
|
|
return zcl.Range{
|
|
Filename: p.Filename,
|
|
Start: start,
|
|
End: end,
|
|
}
|
|
}
|
|
|
|
func posRange(start, end pos) zcl.Range {
|
|
return zcl.Range{
|
|
Filename: start.Filename,
|
|
Start: start.Pos,
|
|
End: end.Pos,
|
|
}
|
|
}
|
|
|
|
func (t token) GoString() string {
|
|
return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
|
|
}
|