json: initial scanner implementation

Currently lacking correct support for unicode text segmentation into
grapheme clusters, so it miscounts "Column" in positions. This will be
addressed later.
This commit is contained in:
Martin Atkins 2017-05-15 07:02:13 -07:00
parent b9183e85e4
commit b5ce4360cd
4 changed files with 809 additions and 0 deletions

8
zcl/json/doc.go Normal file
View File

@ -0,0 +1,8 @@
// Package json is the JSON parser for ZCL. It parses JSON files and returns
// implementations of the core ZCL structural interfaces in terms of the
// JSON data inside.
//
// This is not a generic JSON parser. Instead, it deals with the mapping from
// the JSON information model to the ZCL information model, using a number
// of hard-coded structural conventions.
package json

281
zcl/json/scanner.go Normal file
View File

@ -0,0 +1,281 @@
package json
import (
"fmt"
"github.com/apparentlymart/go-zcl/zcl"
)
//go:generate stringer -type tokenType scanner.go

// tokenType identifies the kind of a scanned token. The single-byte
// punctuation tokens use their own byte values as their types, while the
// multi-byte token kinds (keyword, string, number) use mnemonic letter
// values and tokenInvalid is zero.
type tokenType rune

const (
	tokenBraceO  tokenType = '{'
	tokenBraceC  tokenType = '}'
	tokenBrackO  tokenType = '['
	tokenBrackC  tokenType = ']'
	tokenComma   tokenType = ','
	tokenColon   tokenType = ':'
	tokenKeyword tokenType = 'K'
	tokenString  tokenType = 'S'
	tokenNumber  tokenType = 'N'
	tokenInvalid tokenType = 0
)

// token is a single lexeme from the buffer: its type, the raw bytes it
// covers, and its source range for use in diagnostics.
type token struct {
	Type  tokenType
	Bytes []byte
	Range zcl.Range
}
// scan returns the primary tokens for the given JSON buffer in sequence.
//
// The responsibility of this pass is to just mark the slices of the buffer
// as being of various types. It is lax in how it interprets the multi-byte
// token types keyword, string and number, preferring to capture erroneous
// extra bytes that we presume the user intended to be part of the token
// so that we can generate more helpful diagnostics in the parser.
func scan(buf []byte, start pos) []token {
	var tokens []token
	p := start
	for {
		if len(buf) == 0 {
			return tokens
		}

		buf, p = skipWhitespace(buf, p)

		if len(buf) == 0 {
			return tokens
		}

		// The current token begins wherever whitespace skipping left us.
		start = p

		first := buf[0]
		switch {
		case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':':
			// Single-byte punctuation; the byte value doubles as the
			// token type.
			p.Pos.Column++
			p.Pos.Byte++
			tokens = append(tokens, token{
				Type:  tokenType(first),
				Bytes: buf[0:1],
				Range: posRange(start, p),
			})
			buf = buf[1:]
		case first == '"':
			var tokBuf []byte
			tokBuf, buf, p = scanString(buf, p)
			tokens = append(tokens, token{
				Type:  tokenString,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartNumber(first):
			var tokBuf []byte
			tokBuf, buf, p = scanNumber(buf, p)
			tokens = append(tokens, token{
				Type:  tokenNumber,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartKeyword(first):
			var tokBuf []byte
			tokBuf, buf, p = scanKeyword(buf, p)
			tokens = append(tokens, token{
				Type:  tokenKeyword,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		default:
			// Anything else becomes a single-byte invalid token.
			tokens = append(tokens, token{
				Type:  tokenInvalid,
				Bytes: buf[:1],
				Range: start.Range(1, 1),
			})
			// If we've encountered an invalid then we might as well stop
			// scanning since the parser won't proceed beyond this point.
			return tokens
		}
	}
}
// byteCanStartNumber reports whether b is a byte that may begin a
// number token.
//
// We are slightly more tolerant than JSON requires here since we
// expect the parser will make a stricter interpretation of the
// number bytes, but we specifically don't allow 'e' or 'E' here
// since we want the scanner to treat that as the start of an
// invalid keyword instead, to produce more intelligible error messages.
func byteCanStartNumber(b byte) bool {
	if b >= '0' && b <= '9' {
		return true
	}
	return b == '-' || b == '+' || b == '.'
}
// scanNumber consumes a run of number-ish bytes from the front of buf,
// returning the token bytes, the remaining buffer, and the position
// advanced past the consumed bytes.
//
// The scanner doesn't check that the sequence of digit-ish bytes is
// in a valid order. The parser must do this when decoding a number
// token.
func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
	p := start
	end := 0
	for end < len(buf) {
		b := buf[end]
		digitish := (b >= '0' && b <= '9') ||
			b == '-' || b == '+' || b == '.' || b == 'e' || b == 'E'
		if !digitish {
			break
		}
		p.Pos.Byte++
		p.Pos.Column++
		end++
	}
	return buf[:end], buf[end:], p
}
// byteCanStartKeyword reports whether b is a byte that may begin a
// keyword token.
func byteCanStartKeyword(b byte) bool {
	switch {
	// We allow any sequence of alphabetical characters here, even though
	// JSON is more constrained, so that we can collect what we presume
	// the user intended to be a single keyword and then check its validity
	// in the parser, where we can generate better diagnostics.
	// So e.g. we want to be able to say:
	//   unrecognized keyword "True". Did you mean "true"?
	//
	// Each alphabetic range needs "&&" between its bounds; the previous
	// "||" form was true for every byte, which made every otherwise-
	// unrecognized byte scan as a keyword instead of as tokenInvalid.
	case (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z'):
		return true
	default:
		return false
	}
}
// scanKeyword consumes a run of letters and underscores from the front
// of buf, returning the keyword bytes, the remaining buffer, and the
// position advanced past the consumed bytes. Validity of the collected
// keyword is checked later, in the parser.
func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		b := buf[i]
		switch {
		// Each alphabetic range needs "&&" between its bounds; the
		// previous "||" form matched every byte, which made this loop
		// consume the entire remaining buffer as one "keyword".
		case (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}
// scanString consumes a string token from the front of buf, which must
// begin with a double-quote byte. It returns the token bytes (including
// both quote characters when present), the remaining buffer, and the
// position advanced past the consumed bytes.
func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't validate correct use of escapes, etc. It pays
	// attention to escapes only for the purpose of identifying the closing
	// quote character. It's the parser's responsibility to do proper
	// validation.
	//
	// The scanner also doesn't specifically detect unterminated string
	// literals, though they can be identified in the parser by checking if
	// the final byte in a string token is the double-quote character.

	// Skip the opening quote symbol
	i := 1
	p := start
	p.Pos.Byte++
	p.Pos.Column++
	escaping := false
Byte:
	for i < len(buf) {
		b := buf[i]

		switch {
		case b == '\\':
			// Toggle rather than set, so that the second backslash of a
			// "\\" sequence is recognized as completing the escape and
			// the byte after it is not treated as escaped.
			escaping = !escaping
			p.Pos.Byte++
			p.Pos.Column++
			i++
		case b == '"':
			p.Pos.Byte++
			p.Pos.Column++
			i++
			if !escaping {
				// An unescaped quote terminates the string; the quote
				// itself is included in the token bytes.
				break Byte
			}
			escaping = false
		case b < 32:
			// A raw control character (including newline) ends the token
			// early, leaving it without its closing quote so the parser
			// can detect the problem.
			break Byte
		default:
			// TODO: Use Unicode Text Segmentation spec to advance
			// Column only once per grapheme cluster, rather than once per
			// byte.

			// Consume one or more UTF-8 codepoints that together form
			// a single grapheme cluster.
			p.Pos.Byte++
			p.Pos.Column++
			i++
			escaping = false
		}
	}
	return buf[:i], buf[i:], p
}
// skipWhitespace discards spaces, newlines, carriage returns and tabs
// from the front of buf, returning the remaining buffer along with the
// position advanced past the skipped bytes.
func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
	p := start
	n := 0
	for n < len(buf) {
		switch buf[n] {
		case ' ':
			p.Pos.Byte++
			p.Pos.Column++
		case '\n':
			p.Pos.Byte++
			p.Pos.Column = 1
			p.Pos.Line++
		case '\r':
			// For the purpose of line/column counting we consider a
			// carriage return to take up no space, assuming that it will
			// be paired up with a newline (on Windows, for example) that
			// will account for both of them.
			p.Pos.Byte++
		case '\t':
			// We arbitrarily count a tab as if it were two spaces, because
			// we need to choose _some_ number here. This means any system
			// that renders code on-screen with markers must itself treat
			// tabs as a pair of spaces for rendering purposes, or instead
			// use the byte offset and back into its own column position.
			p.Pos.Byte++
			p.Pos.Column += 2
		default:
			// First non-whitespace byte; everything from here on is the
			// caller's to scan.
			return buf[n:], p
		}
		n++
	}
	return buf[n:], p
}
// pos is a position within a particular file, pairing a zcl.Pos with
// the name of the file it belongs to.
type pos struct {
	Filename string
	Pos      zcl.Pos
}

// Range produces a zcl.Range that starts at p and extends byteLen bytes
// and charLen columns. It adjusts only the Column of the end position,
// so it is suitable only for spans that do not cross a line boundary
// (here it is used for single-byte invalid tokens).
func (p *pos) Range(byteLen, charLen int) zcl.Range {
	start := p.Pos
	end := p.Pos
	end.Byte += byteLen
	end.Column += charLen
	return zcl.Range{
		Filename: p.Filename,
		Start:    start,
		End:      end,
	}
}
// posRange produces a zcl.Range spanning from start to end, taking its
// filename from the start position. Both positions are assumed to refer
// to the same file.
func posRange(start, end pos) zcl.Range {
	return zcl.Range{
		Filename: start.Filename,
		Start:    start.Pos,
		End:      end.Pos,
	}
}
// GoString implements fmt.GoStringer, producing a compact %#v rendering
// of a token for use in test failure messages.
func (t token) GoString() string {
	return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
}

462
zcl/json/scanner_test.go Normal file
View File

@ -0,0 +1,462 @@
package json
import (
"bytes"
"fmt"
"reflect"
"testing"
"github.com/apparentlymart/go-zcl/zcl"
)
// TestScan runs the scanner over a table of small JSON inputs, checking
// both the token types/bytes produced and the byte/line/column ranges
// assigned to each token.
func TestScan(t *testing.T) {
	tests := []struct {
		Input string
		Want  []token
	}{
		// An empty buffer produces no tokens at all.
		{
			``,
			nil,
		},
		{
			`{}`,
			[]token{
				{
					Type:  tokenBraceO,
					Bytes: []byte(`{`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
					},
				},
				{
					Type:  tokenBraceC,
					Bytes: []byte(`}`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
						End: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
					},
				},
			},
		},
		{
			`][`,
			[]token{
				{
					Type:  tokenBrackC,
					Bytes: []byte(`]`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
					},
				},
				{
					Type:  tokenBrackO,
					Bytes: []byte(`[`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
						End: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
					},
				},
			},
		},
		{
			`:,`,
			[]token{
				{
					Type:  tokenColon,
					Bytes: []byte(`:`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
					},
				},
				{
					Type:  tokenComma,
					Bytes: []byte(`,`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
						End: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
					},
				},
			},
		},
		{
			`1`,
			[]token{
				{
					Type:  tokenNumber,
					Bytes: []byte(`1`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
					},
				},
			},
		},
		// Leading whitespace moves the token's range but is not included
		// in its bytes.
		{
			`  1`,
			[]token{
				{
					Type:  tokenNumber,
					Bytes: []byte(`1`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
						End: zcl.Pos{
							Byte:   3,
							Line:   1,
							Column: 4,
						},
					},
				},
			},
		},
		{
			`  12`,
			[]token{
				{
					Type:  tokenNumber,
					Bytes: []byte(`12`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
						End: zcl.Pos{
							Byte:   4,
							Line:   1,
							Column: 5,
						},
					},
				},
			},
		},
		{
			`1 2`,
			[]token{
				{
					Type:  tokenNumber,
					Bytes: []byte(`1`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
					},
				},
				{
					Type:  tokenNumber,
					Bytes: []byte(`2`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
						End: zcl.Pos{
							Byte:   3,
							Line:   1,
							Column: 4,
						},
					},
				},
			},
		},
		// Newlines advance Line and reset Column to 1.
		{
			"\n1\n 2",
			[]token{
				{
					Type:  tokenNumber,
					Bytes: []byte(`1`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   1,
							Line:   2,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   2,
							Line:   2,
							Column: 2,
						},
					},
				},
				{
					Type:  tokenNumber,
					Bytes: []byte(`2`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   4,
							Line:   3,
							Column: 2,
						},
						End: zcl.Pos{
							Byte:   5,
							Line:   3,
							Column: 3,
						},
					},
				},
			},
		},
		{
			`-1 2.5`,
			[]token{
				{
					Type:  tokenNumber,
					Bytes: []byte(`-1`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
					},
				},
				{
					Type:  tokenNumber,
					Bytes: []byte(`2.5`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   3,
							Line:   1,
							Column: 4,
						},
						End: zcl.Pos{
							Byte:   6,
							Line:   1,
							Column: 7,
						},
					},
				},
			},
		},
		{
			`true`,
			[]token{
				{
					Type:  tokenKeyword,
					Bytes: []byte(`true`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   4,
							Line:   1,
							Column: 5,
						},
					},
				},
			},
		},
		// String tokens include both quote characters in their bytes.
		{
			`""`,
			[]token{
				{
					Type:  tokenString,
					Bytes: []byte(`""`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
					},
				},
			},
		},
		{
			`"hello"`,
			[]token{
				{
					Type:  tokenString,
					Bytes: []byte(`"hello"`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   7,
							Line:   1,
							Column: 8,
						},
					},
				},
			},
		},
		// An escaped quote must not terminate the string.
		{
			`"he\"llo"`,
			[]token{
				{
					Type:  tokenString,
					Bytes: []byte(`"he\"llo"`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   9,
							Line:   1,
							Column: 10,
						},
					},
				},
			},
		},
		// A double backslash is a complete escape sequence, so the
		// following quote does terminate the string.
		{
			`"hello\\" 1`,
			[]token{
				{
					Type:  tokenString,
					Bytes: []byte(`"hello\\"`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   9,
							Line:   1,
							Column: 10,
						},
					},
				},
				{
					Type:  tokenNumber,
					Bytes: []byte(`1`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   10,
							Line:   1,
							Column: 11,
						},
						End: zcl.Pos{
							Byte:   11,
							Line:   1,
							Column: 12,
						},
					},
				},
			},
		},
	}

	for _, test := range tests {
		t.Run(test.Input, func(t *testing.T) {
			buf := []byte(test.Input)
			start := pos{
				Filename: "",
				Pos: zcl.Pos{
					Byte:   0,
					Line:   1,
					Column: 1,
				},
			}
			got := scan(buf, start)

			if !reflect.DeepEqual(got, test.Want) {
				// Render one token per line so the got/want diff is
				// easier to read than a single %#v dump.
				errMsg := &bytes.Buffer{}
				errMsg.WriteString("wrong result\ngot:\n")
				if len(got) == 0 {
					errMsg.WriteString(" (empty slice)\n")
				}
				for _, tok := range got {
					fmt.Fprintf(errMsg, " - %#v\n", tok)
				}
				errMsg.WriteString("want:\n")
				if len(test.Want) == 0 {
					errMsg.WriteString(" (empty slice)\n")
				}
				for _, tok := range test.Want {
					fmt.Fprintf(errMsg, " - %#v\n", tok)
				}
				t.Error(errMsg.String())
			}
		})
	}
}

View File

@ -0,0 +1,58 @@
// Code generated by "stringer -type tokenType scanner.go"; DO NOT EDIT.
package json
import "fmt"
// Name strings for each tokenType value, emitted by stringer and
// returned by the String method below.
const (
	_tokenType_name_0 = "tokenInvalid"
	_tokenType_name_1 = "tokenComma"
	_tokenType_name_2 = "tokenColon"
	_tokenType_name_3 = "tokenKeyword"
	_tokenType_name_4 = "tokenNumber"
	_tokenType_name_5 = "tokenString"
	_tokenType_name_6 = "tokenBrackO"
	_tokenType_name_7 = "tokenBrackC"
	_tokenType_name_8 = "tokenBraceO"
	_tokenType_name_9 = "tokenBraceC"
)

// Index tables emitted by stringer. NOTE(review): the switch-based
// String implementation below does not reference these.
var (
	_tokenType_index_0 = [...]uint8{0, 12}
	_tokenType_index_1 = [...]uint8{0, 10}
	_tokenType_index_2 = [...]uint8{0, 10}
	_tokenType_index_3 = [...]uint8{0, 12}
	_tokenType_index_4 = [...]uint8{0, 11}
	_tokenType_index_5 = [...]uint8{0, 11}
	_tokenType_index_6 = [...]uint8{0, 11}
	_tokenType_index_7 = [...]uint8{0, 11}
	_tokenType_index_8 = [...]uint8{0, 11}
	_tokenType_index_9 = [...]uint8{0, 11}
)
// String returns the name of the tokenType constant whose value is i,
// falling back to a "tokenType(n)" form for values with no constant.
// The numeric case values are the rune values of the constants declared
// in scanner.go (e.g. 44 is ',', 123 is '{').
func (i tokenType) String() string {
	switch {
	case i == 0:
		return _tokenType_name_0
	case i == 44:
		return _tokenType_name_1
	case i == 58:
		return _tokenType_name_2
	case i == 75:
		return _tokenType_name_3
	case i == 78:
		return _tokenType_name_4
	case i == 83:
		return _tokenType_name_5
	case i == 91:
		return _tokenType_name_6
	case i == 93:
		return _tokenType_name_7
	case i == 123:
		return _tokenType_name_8
	case i == 125:
		return _tokenType_name_9
	default:
		return fmt.Sprintf("tokenType(%d)", i)
	}
}