hcl/json/scanner.go
Alisdair McDiarmid b265bbd046 json: Fix panic when parsing malformed JSON
When scanning JSON, upon encountering an invalid token, we immediately
return. Previously this return happened without inserting an EOF token.
Since other functions assume that a token sequence always ends in EOF,
this could cause a panic.

This commit adds a synthetic EOF token after the invalid token before
returning. While this does not match the real end-of-file of the source
JSON, it is marking the end of the scanned bytes, so it seems reasonable.

Fixes #339
2020-03-25 16:40:36 -04:00

307 lines
7.4 KiB
Go

package json
import (
"fmt"
"github.com/apparentlymart/go-textseg/v12/textseg"
"github.com/hashicorp/hcl/v2"
)
//go:generate stringer -type tokenType scanner.go
type tokenType rune
const (
tokenBraceO tokenType = '{'
tokenBraceC tokenType = '}'
tokenBrackO tokenType = '['
tokenBrackC tokenType = ']'
tokenComma tokenType = ','
tokenColon tokenType = ':'
tokenKeyword tokenType = 'K'
tokenString tokenType = 'S'
tokenNumber tokenType = 'N'
tokenEOF tokenType = '␄'
tokenInvalid tokenType = 0
tokenEquals tokenType = '=' // used only for reminding the user of JSON syntax
)
type token struct {
Type tokenType
Bytes []byte
Range hcl.Range
}
// scan returns the primary tokens for the given JSON buffer in sequence.
//
// The responsibility of this pass is to just mark the slices of the buffer
// as being of various types. It is lax in how it interprets the multi-byte
// token types keyword, string and number, preferring to capture erroneous
// extra bytes that we presume the user intended to be part of the token
// so that we can generate more helpful diagnostics in the parser.
func scan(buf []byte, start pos) []token {
var tokens []token
p := start
for {
if len(buf) == 0 {
tokens = append(tokens, token{
Type: tokenEOF,
Bytes: nil,
Range: posRange(p, p),
})
return tokens
}
buf, p = skipWhitespace(buf, p)
if len(buf) == 0 {
tokens = append(tokens, token{
Type: tokenEOF,
Bytes: nil,
Range: posRange(p, p),
})
return tokens
}
start = p
first := buf[0]
switch {
case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':' || first == '=':
p.Pos.Column++
p.Pos.Byte++
tokens = append(tokens, token{
Type: tokenType(first),
Bytes: buf[0:1],
Range: posRange(start, p),
})
buf = buf[1:]
case first == '"':
var tokBuf []byte
tokBuf, buf, p = scanString(buf, p)
tokens = append(tokens, token{
Type: tokenString,
Bytes: tokBuf,
Range: posRange(start, p),
})
case byteCanStartNumber(first):
var tokBuf []byte
tokBuf, buf, p = scanNumber(buf, p)
tokens = append(tokens, token{
Type: tokenNumber,
Bytes: tokBuf,
Range: posRange(start, p),
})
case byteCanStartKeyword(first):
var tokBuf []byte
tokBuf, buf, p = scanKeyword(buf, p)
tokens = append(tokens, token{
Type: tokenKeyword,
Bytes: tokBuf,
Range: posRange(start, p),
})
default:
tokens = append(tokens, token{
Type: tokenInvalid,
Bytes: buf[:1],
Range: start.Range(1, 1),
})
// If we've encountered an invalid then we might as well stop
// scanning since the parser won't proceed beyond this point.
// We insert a synthetic EOF marker here to match the expectations
// of consumers of this data structure.
p.Pos.Column++
p.Pos.Byte++
tokens = append(tokens, token{
Type: tokenEOF,
Bytes: nil,
Range: posRange(p, p),
})
return tokens
}
}
}
func byteCanStartNumber(b byte) bool {
switch b {
// We are slightly more tolerant than JSON requires here since we
// expect the parser will make a stricter interpretation of the
// number bytes, but we specifically don't allow 'e' or 'E' here
// since we want the scanner to treat that as the start of an
// invalid keyword instead, to produce more intelligible error messages.
case '-', '+', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
return true
default:
return false
}
}
func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
// The scanner doesn't check that the sequence of digit-ish bytes is
// in a valid order. The parser must do this when decoding a number
// token.
var i int
p := start
Byte:
for i = 0; i < len(buf); i++ {
switch buf[i] {
case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
p.Pos.Byte++
p.Pos.Column++
default:
break Byte
}
}
return buf[:i], buf[i:], p
}
func byteCanStartKeyword(b byte) bool {
switch {
// We allow any sequence of alphabetical characters here, even though
// JSON is more constrained, so that we can collect what we presume
// the user intended to be a single keyword and then check its validity
// in the parser, where we can generate better diagnostics.
// So e.g. we want to be able to say:
// unrecognized keyword "True". Did you mean "true"?
case isAlphabetical(b):
return true
default:
return false
}
}
func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
var i int
p := start
Byte:
for i = 0; i < len(buf); i++ {
b := buf[i]
switch {
case isAlphabetical(b) || b == '_':
p.Pos.Byte++
p.Pos.Column++
default:
break Byte
}
}
return buf[:i], buf[i:], p
}
func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
// The scanner doesn't validate correct use of escapes, etc. It pays
// attention to escapes only for the purpose of identifying the closing
// quote character. It's the parser's responsibility to do proper
// validation.
//
// The scanner also doesn't specifically detect unterminated string
// literals, though they can be identified in the parser by checking if
// the final byte in a string token is the double-quote character.
// Skip the opening quote symbol
i := 1
p := start
p.Pos.Byte++
p.Pos.Column++
escaping := false
Byte:
for i < len(buf) {
b := buf[i]
switch {
case b == '\\':
escaping = !escaping
p.Pos.Byte++
p.Pos.Column++
i++
case b == '"':
p.Pos.Byte++
p.Pos.Column++
i++
if !escaping {
break Byte
}
escaping = false
case b < 32:
break Byte
default:
// Advance by one grapheme cluster, so that we consider each
// grapheme to be a "column".
// Ignoring error because this scanner cannot produce errors.
advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)
p.Pos.Byte += advance
p.Pos.Column++
i += advance
escaping = false
}
}
return buf[:i], buf[i:], p
}
func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
var i int
p := start
Byte:
for i = 0; i < len(buf); i++ {
switch buf[i] {
case ' ':
p.Pos.Byte++
p.Pos.Column++
case '\n':
p.Pos.Byte++
p.Pos.Column = 1
p.Pos.Line++
case '\r':
// For the purpose of line/column counting we consider a
// carriage return to take up no space, assuming that it will
// be paired up with a newline (on Windows, for example) that
// will account for both of them.
p.Pos.Byte++
case '\t':
// We arbitrarily count a tab as if it were two spaces, because
// we need to choose _some_ number here. This means any system
// that renders code on-screen with markers must itself treat
// tabs as a pair of spaces for rendering purposes, or instead
// use the byte offset and back into its own column position.
p.Pos.Byte++
p.Pos.Column += 2
default:
break Byte
}
}
return buf[i:], p
}
type pos struct {
Filename string
Pos hcl.Pos
}
func (p *pos) Range(byteLen, charLen int) hcl.Range {
start := p.Pos
end := p.Pos
end.Byte += byteLen
end.Column += charLen
return hcl.Range{
Filename: p.Filename,
Start: start,
End: end,
}
}
func posRange(start, end pos) hcl.Range {
return hcl.Range{
Filename: start.Filename,
Start: start.Pos,
End: end.Pos,
}
}
func (t token) GoString() string {
return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
}
func isAlphabetical(b byte) bool {
return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
}