json: initial scanner implementation

Currently lacking correct support for unicode text segmentation into
grapheme clusters, so it miscounts "Column" in positions. This will be
addressed later.
This commit is contained in:
Martin Atkins 2017-05-15 07:02:13 -07:00
parent b9183e85e4
commit b5ce4360cd
4 changed files with 809 additions and 0 deletions

8
zcl/json/doc.go Normal file
View File

@ -0,0 +1,8 @@
// Package json is the JSON parser for ZCL. It parses JSON files and returns
// implementations of the core ZCL structural interfaces in terms of the
// JSON data inside.
//
// This is not a generic JSON parser. Instead, it deals with the mapping from
// the JSON information model to the ZCL information model, using a number
// of hard-coded structural conventions.
package json

281
zcl/json/scanner.go Normal file
View File

@ -0,0 +1,281 @@
package json
import (
"fmt"
"github.com/apparentlymart/go-zcl/zcl"
)
//go:generate stringer -type tokenType scanner.go

// tokenType identifies the kind of a scanned token. The single-byte
// punctuation tokens use their own byte values as their types, while the
// multi-byte token kinds (keyword, string, number) use mnemonic letter
// values and tokenInvalid is zero.
type tokenType rune

const (
	tokenBraceO  tokenType = '{'
	tokenBraceC  tokenType = '}'
	tokenBrackO  tokenType = '['
	tokenBrackC  tokenType = ']'
	tokenComma   tokenType = ','
	tokenColon   tokenType = ':'
	tokenKeyword tokenType = 'K'
	tokenString  tokenType = 'S'
	tokenNumber  tokenType = 'N'
	tokenInvalid tokenType = 0
)

// token is a single lexeme from the buffer: its type, the raw bytes it
// covers, and its source range for use in diagnostics.
type token struct {
	Type  tokenType
	Bytes []byte
	Range zcl.Range
}
// scan returns the primary tokens for the given JSON buffer in sequence.
//
// The responsibility of this pass is to just mark the slices of the buffer
// as being of various types. It is lax in how it interprets the multi-byte
// token types keyword, string and number, preferring to capture erroneous
// extra bytes that we presume the user intended to be part of the token
// so that we can generate more helpful diagnostics in the parser.
func scan(buf []byte, start pos) []token {
	var tokens []token
	p := start
	for {
		if len(buf) == 0 {
			return tokens
		}

		buf, p = skipWhitespace(buf, p)

		if len(buf) == 0 {
			return tokens
		}

		// The current token begins wherever whitespace skipping left us.
		start = p

		first := buf[0]
		switch {
		case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':':
			// Single-byte punctuation; the byte value doubles as the
			// token type.
			p.Pos.Column++
			p.Pos.Byte++
			tokens = append(tokens, token{
				Type:  tokenType(first),
				Bytes: buf[0:1],
				Range: posRange(start, p),
			})
			buf = buf[1:]
		case first == '"':
			var tokBuf []byte
			tokBuf, buf, p = scanString(buf, p)
			tokens = append(tokens, token{
				Type:  tokenString,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartNumber(first):
			var tokBuf []byte
			tokBuf, buf, p = scanNumber(buf, p)
			tokens = append(tokens, token{
				Type:  tokenNumber,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartKeyword(first):
			var tokBuf []byte
			tokBuf, buf, p = scanKeyword(buf, p)
			tokens = append(tokens, token{
				Type:  tokenKeyword,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		default:
			// Anything else becomes a single-byte invalid token.
			tokens = append(tokens, token{
				Type:  tokenInvalid,
				Bytes: buf[:1],
				Range: start.Range(1, 1),
			})
			// If we've encountered an invalid then we might as well stop
			// scanning since the parser won't proceed beyond this point.
			return tokens
		}
	}
}
// byteCanStartNumber reports whether b is a byte that may begin a
// number token.
//
// We are slightly more tolerant than JSON requires here since we
// expect the parser will make a stricter interpretation of the
// number bytes, but we specifically don't allow 'e' or 'E' here
// since we want the scanner to treat that as the start of an
// invalid keyword instead, to produce more intelligible error messages.
func byteCanStartNumber(b byte) bool {
	if b >= '0' && b <= '9' {
		return true
	}
	return b == '-' || b == '+' || b == '.'
}
// scanNumber consumes a run of number-ish bytes from the front of buf,
// returning the token bytes, the remaining buffer, and the position
// advanced past the consumed bytes.
//
// The scanner doesn't check that the sequence of digit-ish bytes is
// in a valid order. The parser must do this when decoding a number
// token.
func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
	p := start
	end := 0
	for end < len(buf) {
		b := buf[end]
		digitish := (b >= '0' && b <= '9') ||
			b == '-' || b == '+' || b == '.' || b == 'e' || b == 'E'
		if !digitish {
			break
		}
		p.Pos.Byte++
		p.Pos.Column++
		end++
	}
	return buf[:end], buf[end:], p
}
// byteCanStartKeyword reports whether b is a byte that may begin a
// keyword token.
func byteCanStartKeyword(b byte) bool {
	switch {
	// We allow any sequence of alphabetical characters here, even though
	// JSON is more constrained, so that we can collect what we presume
	// the user intended to be a single keyword and then check its validity
	// in the parser, where we can generate better diagnostics.
	// So e.g. we want to be able to say:
	//   unrecognized keyword "True". Did you mean "true"?
	//
	// Each alphabetic range needs "&&" between its bounds; the previous
	// "||" form was true for every byte, which made every otherwise-
	// unrecognized byte scan as a keyword instead of as tokenInvalid.
	case (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z'):
		return true
	default:
		return false
	}
}
// scanKeyword consumes a run of letters and underscores from the front
// of buf, returning the keyword bytes, the remaining buffer, and the
// position advanced past the consumed bytes. Validity of the collected
// keyword is checked later, in the parser.
func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		b := buf[i]
		switch {
		// Each alphabetic range needs "&&" between its bounds; the
		// previous "||" form matched every byte, which made this loop
		// consume the entire remaining buffer as one "keyword".
		case (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}
// scanString consumes a string token from the front of buf, which must
// begin with a double-quote byte. It returns the token bytes (including
// both quote characters when present), the remaining buffer, and the
// position advanced past the consumed bytes.
func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't validate correct use of escapes, etc. It pays
	// attention to escapes only for the purpose of identifying the closing
	// quote character. It's the parser's responsibility to do proper
	// validation.
	//
	// The scanner also doesn't specifically detect unterminated string
	// literals, though they can be identified in the parser by checking if
	// the final byte in a string token is the double-quote character.

	// Skip the opening quote symbol
	i := 1
	p := start
	p.Pos.Byte++
	p.Pos.Column++
	escaping := false
Byte:
	for i < len(buf) {
		b := buf[i]

		switch {
		case b == '\\':
			// Toggle rather than set, so that the second backslash of a
			// "\\" sequence is recognized as completing the escape and
			// the byte after it is not treated as escaped.
			escaping = !escaping
			p.Pos.Byte++
			p.Pos.Column++
			i++
		case b == '"':
			p.Pos.Byte++
			p.Pos.Column++
			i++
			if !escaping {
				// An unescaped quote terminates the string; the quote
				// itself is included in the token bytes.
				break Byte
			}
			escaping = false
		case b < 32:
			// A raw control character (including newline) ends the token
			// early, leaving it without its closing quote so the parser
			// can detect the problem.
			break Byte
		default:
			// TODO: Use Unicode Text Segmentation spec to advance
			// Column only once per grapheme cluster, rather than once per
			// byte.

			// Consume one or more UTF-8 codepoints that together form
			// a single grapheme cluster.
			p.Pos.Byte++
			p.Pos.Column++
			i++
			escaping = false
		}
	}
	return buf[:i], buf[i:], p
}
// skipWhitespace discards spaces, newlines, carriage returns and tabs
// from the front of buf, returning the remaining buffer along with the
// position advanced past the skipped bytes.
func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
	p := start
	n := 0
	for n < len(buf) {
		switch buf[n] {
		case ' ':
			p.Pos.Byte++
			p.Pos.Column++
		case '\n':
			p.Pos.Byte++
			p.Pos.Column = 1
			p.Pos.Line++
		case '\r':
			// For the purpose of line/column counting we consider a
			// carriage return to take up no space, assuming that it will
			// be paired up with a newline (on Windows, for example) that
			// will account for both of them.
			p.Pos.Byte++
		case '\t':
			// We arbitrarily count a tab as if it were two spaces, because
			// we need to choose _some_ number here. This means any system
			// that renders code on-screen with markers must itself treat
			// tabs as a pair of spaces for rendering purposes, or instead
			// use the byte offset and back into its own column position.
			p.Pos.Byte++
			p.Pos.Column += 2
		default:
			// First non-whitespace byte; everything from here on is the
			// caller's to scan.
			return buf[n:], p
		}
		n++
	}
	return buf[n:], p
}
// pos is a position within a particular file, pairing a zcl.Pos with
// the name of the file it belongs to.
type pos struct {
	Filename string
	Pos      zcl.Pos
}

// Range produces a zcl.Range that starts at p and extends byteLen bytes
// and charLen columns. It adjusts only the Column of the end position,
// so it is suitable only for spans that do not cross a line boundary
// (here it is used for single-byte invalid tokens).
func (p *pos) Range(byteLen, charLen int) zcl.Range {
	start := p.Pos
	end := p.Pos
	end.Byte += byteLen
	end.Column += charLen
	return zcl.Range{
		Filename: p.Filename,
		Start:    start,
		End:      end,
	}
}
// posRange produces a zcl.Range spanning from start to end, taking its
// filename from the start position. Both positions are assumed to refer
// to the same file.
func posRange(start, end pos) zcl.Range {
	return zcl.Range{
		Filename: start.Filename,
		Start:    start.Pos,
		End:      end.Pos,
	}
}
// GoString implements fmt.GoStringer, producing a compact %#v rendering
// of a token for use in test failure messages.
func (t token) GoString() string {
	return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
}

462
zcl/json/scanner_test.go Normal file
View File

@ -0,0 +1,462 @@
package json
import (
"bytes"
"fmt"
"reflect"
"testing"
"github.com/apparentlymart/go-zcl/zcl"
)
// TestScan runs the scanner over a table of small JSON inputs, checking
// both the token types/bytes produced and the byte/line/column ranges
// assigned to each token.
func TestScan(t *testing.T) {
	tests := []struct {
		Input string
		Want  []token
	}{
		// An empty buffer produces no tokens at all.
		{
			``,
			nil,
		},
		{
			`{}`,
			[]token{
				{
					Type:  tokenBraceO,
					Bytes: []byte(`{`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
					},
				},
				{
					Type:  tokenBraceC,
					Bytes: []byte(`}`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
						End: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
					},
				},
			},
		},
		{
			`][`,
			[]token{
				{
					Type:  tokenBrackC,
					Bytes: []byte(`]`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
					},
				},
				{
					Type:  tokenBrackO,
					Bytes: []byte(`[`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
						End: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
					},
				},
			},
		},
		{
			`:,`,
			[]token{
				{
					Type:  tokenColon,
					Bytes: []byte(`:`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
					},
				},
				{
					Type:  tokenComma,
					Bytes: []byte(`,`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
						End: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
					},
				},
			},
		},
		{
			`1`,
			[]token{
				{
					Type:  tokenNumber,
					Bytes: []byte(`1`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
					},
				},
			},
		},
		// Leading whitespace moves the token's range but is not included
		// in its bytes.
		{
			`  1`,
			[]token{
				{
					Type:  tokenNumber,
					Bytes: []byte(`1`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
						End: zcl.Pos{
							Byte:   3,
							Line:   1,
							Column: 4,
						},
					},
				},
			},
		},
		{
			`  12`,
			[]token{
				{
					Type:  tokenNumber,
					Bytes: []byte(`12`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
						End: zcl.Pos{
							Byte:   4,
							Line:   1,
							Column: 5,
						},
					},
				},
			},
		},
		{
			`1 2`,
			[]token{
				{
					Type:  tokenNumber,
					Bytes: []byte(`1`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   1,
							Line:   1,
							Column: 2,
						},
					},
				},
				{
					Type:  tokenNumber,
					Bytes: []byte(`2`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
						End: zcl.Pos{
							Byte:   3,
							Line:   1,
							Column: 4,
						},
					},
				},
			},
		},
		// Newlines advance Line and reset Column to 1.
		{
			"\n1\n 2",
			[]token{
				{
					Type:  tokenNumber,
					Bytes: []byte(`1`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   1,
							Line:   2,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   2,
							Line:   2,
							Column: 2,
						},
					},
				},
				{
					Type:  tokenNumber,
					Bytes: []byte(`2`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   4,
							Line:   3,
							Column: 2,
						},
						End: zcl.Pos{
							Byte:   5,
							Line:   3,
							Column: 3,
						},
					},
				},
			},
		},
		{
			`-1 2.5`,
			[]token{
				{
					Type:  tokenNumber,
					Bytes: []byte(`-1`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
					},
				},
				{
					Type:  tokenNumber,
					Bytes: []byte(`2.5`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   3,
							Line:   1,
							Column: 4,
						},
						End: zcl.Pos{
							Byte:   6,
							Line:   1,
							Column: 7,
						},
					},
				},
			},
		},
		{
			`true`,
			[]token{
				{
					Type:  tokenKeyword,
					Bytes: []byte(`true`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   4,
							Line:   1,
							Column: 5,
						},
					},
				},
			},
		},
		// String tokens include both quote characters in their bytes.
		{
			`""`,
			[]token{
				{
					Type:  tokenString,
					Bytes: []byte(`""`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   2,
							Line:   1,
							Column: 3,
						},
					},
				},
			},
		},
		{
			`"hello"`,
			[]token{
				{
					Type:  tokenString,
					Bytes: []byte(`"hello"`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   7,
							Line:   1,
							Column: 8,
						},
					},
				},
			},
		},
		// An escaped quote must not terminate the string.
		{
			`"he\"llo"`,
			[]token{
				{
					Type:  tokenString,
					Bytes: []byte(`"he\"llo"`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   9,
							Line:   1,
							Column: 10,
						},
					},
				},
			},
		},
		// A double backslash is a complete escape sequence, so the
		// following quote does terminate the string.
		{
			`"hello\\" 1`,
			[]token{
				{
					Type:  tokenString,
					Bytes: []byte(`"hello\\"`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   0,
							Line:   1,
							Column: 1,
						},
						End: zcl.Pos{
							Byte:   9,
							Line:   1,
							Column: 10,
						},
					},
				},
				{
					Type:  tokenNumber,
					Bytes: []byte(`1`),
					Range: zcl.Range{
						Start: zcl.Pos{
							Byte:   10,
							Line:   1,
							Column: 11,
						},
						End: zcl.Pos{
							Byte:   11,
							Line:   1,
							Column: 12,
						},
					},
				},
			},
		},
	}

	for _, test := range tests {
		t.Run(test.Input, func(t *testing.T) {
			buf := []byte(test.Input)
			start := pos{
				Filename: "",
				Pos: zcl.Pos{
					Byte:   0,
					Line:   1,
					Column: 1,
				},
			}
			got := scan(buf, start)

			if !reflect.DeepEqual(got, test.Want) {
				// Render one token per line so the got/want diff is
				// easier to read than a single %#v dump.
				errMsg := &bytes.Buffer{}
				errMsg.WriteString("wrong result\ngot:\n")
				if len(got) == 0 {
					errMsg.WriteString(" (empty slice)\n")
				}
				for _, tok := range got {
					fmt.Fprintf(errMsg, " - %#v\n", tok)
				}
				errMsg.WriteString("want:\n")
				if len(test.Want) == 0 {
					errMsg.WriteString(" (empty slice)\n")
				}
				for _, tok := range test.Want {
					fmt.Fprintf(errMsg, " - %#v\n", tok)
				}
				t.Error(errMsg.String())
			}
		})
	}
}

View File

@ -0,0 +1,58 @@
// Code generated by "stringer -type tokenType scanner.go"; DO NOT EDIT.
package json
import "fmt"
// Name strings for each tokenType value, emitted by stringer and
// returned by the String method below.
const (
	_tokenType_name_0 = "tokenInvalid"
	_tokenType_name_1 = "tokenComma"
	_tokenType_name_2 = "tokenColon"
	_tokenType_name_3 = "tokenKeyword"
	_tokenType_name_4 = "tokenNumber"
	_tokenType_name_5 = "tokenString"
	_tokenType_name_6 = "tokenBrackO"
	_tokenType_name_7 = "tokenBrackC"
	_tokenType_name_8 = "tokenBraceO"
	_tokenType_name_9 = "tokenBraceC"
)

// Index tables emitted by stringer. NOTE(review): the switch-based
// String implementation below does not reference these.
var (
	_tokenType_index_0 = [...]uint8{0, 12}
	_tokenType_index_1 = [...]uint8{0, 10}
	_tokenType_index_2 = [...]uint8{0, 10}
	_tokenType_index_3 = [...]uint8{0, 12}
	_tokenType_index_4 = [...]uint8{0, 11}
	_tokenType_index_5 = [...]uint8{0, 11}
	_tokenType_index_6 = [...]uint8{0, 11}
	_tokenType_index_7 = [...]uint8{0, 11}
	_tokenType_index_8 = [...]uint8{0, 11}
	_tokenType_index_9 = [...]uint8{0, 11}
)
// String returns the name of the tokenType constant whose value is i,
// falling back to a "tokenType(n)" form for values with no constant.
// The numeric case values are the rune values of the constants declared
// in scanner.go (e.g. 44 is ',', 123 is '{').
func (i tokenType) String() string {
	switch {
	case i == 0:
		return _tokenType_name_0
	case i == 44:
		return _tokenType_name_1
	case i == 58:
		return _tokenType_name_2
	case i == 75:
		return _tokenType_name_3
	case i == 78:
		return _tokenType_name_4
	case i == 83:
		return _tokenType_name_5
	case i == 91:
		return _tokenType_name_6
	case i == 93:
		return _tokenType_name_7
	case i == 123:
		return _tokenType_name_8
	case i == 125:
		return _tokenType_name_9
	default:
		return fmt.Sprintf("tokenType(%d)", i)
	}
}