json: initial scanner implementation

Currently lacking correct support for Unicode text segmentation into grapheme clusters, so the scanner miscounts the "Column" field of source positions. This will be addressed later.
2017-05-15 07:02:13 -07:00 · 2017-05-15 07:02:13 -07:00 · b5ce4360cd
commit b5ce4360cd
parent b9183e85e4
4 changed files with 809 additions and 0 deletions
--- a/zcl/json/doc.go
+++ b/zcl/json/doc.go
@ -0,0 +1,8 @@
 // Package json is the JSON parser for ZCL. It parses JSON files and returns
 // implementations of the core ZCL structural interfaces in terms of the
 // JSON data inside.
 //
 // This is not a generic JSON parser. Instead, it deals with the mapping from
 // the JSON information model to the ZCL information model, using a number
 // of hard-coded structural conventions.
 package json
--- a/zcl/json/scanner.go
+++ b/zcl/json/scanner.go
@ -0,0 +1,281 @@
 package json
 import (
 	"fmt"
 	"github.com/apparentlymart/go-zcl/zcl"
 )
 //go:generate stringer -type tokenType scanner.go
 // tokenType identifies the kind of a token produced by the scanner.
 //
 // The punctuation token types use the punctuation byte itself as their
 // value, which is what allows scan to produce them with a plain
 // tokenType(first) conversion. The multi-byte token types use a mnemonic
 // letter instead, and the zero value marks an invalid token.
 type tokenType rune
 const (
 	// Single-byte punctuation tokens.
 	tokenBraceO  tokenType = '{'
 	tokenBraceC  tokenType = '}'
 	tokenBrackO  tokenType = '['
 	tokenBrackC  tokenType = ']'
 	tokenComma   tokenType = ','
 	tokenColon   tokenType = ':'
 	// Multi-byte tokens, scanned leniently and validated by the parser.
 	tokenKeyword tokenType = 'K'
 	tokenString  tokenType = 'S'
 	tokenNumber  tokenType = 'N'
 	// tokenInvalid marks a byte that cannot start any token.
 	tokenInvalid tokenType = 0
 )
 // token is a single lexical element recognized by scan.
 type token struct {
 	// Type is the kind of token.
 	Type  tokenType
 	// Bytes holds the bytes of the token. scan sets this to a sub-slice of
 	// the buffer it was given rather than to a copy.
 	Bytes []byte
 	// Range is the source range the token covers.
 	Range zcl.Range
 }
 // scan splits the given JSON buffer into its sequence of primary tokens,
 // tracking source positions beginning at start.
 //
 // This pass only labels slices of the buffer with a token type. For the
 // multi-byte token types (keyword, string and number) it is deliberately
 // permissive, swallowing erroneous extra bytes that the user presumably
 // meant to be part of the token so that the parser can produce a more
 // helpful diagnostic.
 //
 // Scanning ends when the buffer is exhausted or directly after the first
 // invalid token, whichever comes first.
 func scan(buf []byte, start pos) []token {
 	var out []token
 	cur := start
 	for {
 		// skipWhitespace copes with an empty buffer, so a single length
 		// check after it covers both "nothing left" situations.
 		buf, cur = skipWhitespace(buf, cur)
 		if len(buf) == 0 {
 			return out
 		}
 		begin := cur
 		lead := buf[0]
 		var kind tokenType
 		var text []byte
 		switch lead {
 		case '{', '}', '[', ']', ',', ':':
 			// Punctuation: the byte value doubles as the token type.
 			kind, text = tokenType(lead), buf[0:1]
 			buf = buf[1:]
 			cur.Pos.Column++
 			cur.Pos.Byte++
 		case '"':
 			kind = tokenString
 			text, buf, cur = scanString(buf, cur)
 		default:
 			switch {
 			case byteCanStartNumber(lead):
 				kind = tokenNumber
 				text, buf, cur = scanNumber(buf, cur)
 			case byteCanStartKeyword(lead):
 				kind = tokenKeyword
 				text, buf, cur = scanKeyword(buf, cur)
 			default:
 				// There is no point in scanning past an invalid byte,
 				// because the parser will not proceed beyond it.
 				return append(out, token{
 					Type:  tokenInvalid,
 					Bytes: buf[:1],
 					Range: begin.Range(1, 1),
 				})
 			}
 		}
 		out = append(out, token{
 			Type:  kind,
 			Bytes: text,
 			Range: posRange(begin, cur),
 		})
 	}
 }
 // byteCanStartNumber reports whether b may be the first byte of a number
 // token: an ASCII digit, a sign or a decimal point.
 //
 // This accepts more than JSON does, on the expectation that the parser
 // will interpret the number bytes strictly. 'e' and 'E' are intentionally
 // left out so that the scanner treats them as the start of an (invalid)
 // keyword, which leads to more intelligible error messages.
 func byteCanStartNumber(b byte) bool {
 	if b >= '0' && b <= '9' {
 		return true
 	}
 	return b == '-' || b == '+' || b == '.'
 }
 // scanNumber consumes the leading run of number-like bytes (digits, signs,
 // decimal points and exponent markers) from buf.
 //
 // It returns the consumed bytes, the remainder of the buffer and the
 // position just past the consumed bytes. No attempt is made to check that
 // the bytes appear in a valid order; the parser must do that when it
 // decodes the number token.
 func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
 	end := len(buf)
 	for idx, b := range buf {
 		digit := b >= '0' && b <= '9'
 		other := b == '-' || b == '+' || b == '.' || b == 'e' || b == 'E'
 		if !digit && !other {
 			end = idx
 			break
 		}
 	}
 	// Each consumed byte occupies exactly one byte and one column.
 	p := start
 	p.Pos.Byte += end
 	p.Pos.Column += end
 	return buf[:end], buf[end:], p
 }
 // byteCanStartKeyword reports whether b may be the first byte of a keyword
 // token, which is the case for any ASCII letter.
 //
 // We allow any sequence of alphabetical characters here, even though
 // JSON is more constrained, so that we can collect what we presume
 // the user intended to be a single keyword and then check its validity
 // in the parser, where we can generate better diagnostics.
 // So e.g. we want to be able to say:
 //   unrecognized keyword "True". Did you mean "true"?
 func byteCanStartKeyword(b byte) bool {
 	// Each range test must combine its two bounds with &&. Joining them
 	// with || accepts every possible byte, which would make the scanner's
 	// "invalid token" case unreachable.
 	lower := b >= 'a' && b <= 'z'
 	upper := b >= 'A' && b <= 'Z'
 	return lower || upper
 }
 // scanKeyword consumes a keyword-like token from the start of buf: its
 // first byte followed by any run of ASCII letters and underscores.
 //
 // It returns the consumed bytes, the remainder of the buffer and the
 // position just past the consumed bytes. Whether the bytes spell a real
 // JSON keyword is left for the parser to decide.
 //
 // The first byte is consumed unconditionally. The caller has already
 // decided to treat it as the start of a keyword, and consuming it
 // guarantees that the scan loop always makes progress.
 func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
 	p := start
 	i := 0
 	for ; i < len(buf); i++ {
 		b := buf[i]
 		// Both bounds of each range are joined with &&; using || here
 		// would match every byte and swallow the rest of the buffer.
 		letter := (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
 		if i > 0 && !letter && b != '_' {
 			break
 		}
 		p.Pos.Byte++
 		p.Pos.Column++
 	}
 	return buf[:i], buf[i:], p
 }
 // scanString consumes a string token from the start of buf, whose first
 // byte is expected to be the opening double-quote.
 //
 // It returns the consumed bytes (including the quotes), the remainder of
 // the buffer and the position just past the consumed bytes.
 //
 // The scanner doesn't validate correct use of escapes, etc. It pays
 // attention to escapes only for the purpose of identifying the closing
 // quote character. It's the parser's responsibility to do proper
 // validation.
 //
 // The scanner also doesn't specifically detect unterminated string
 // literals, though they can be identified in the parser by checking if
 // the final byte in a string token is the double-quote character. A
 // control byte (less than 32) ends the token without being consumed.
 func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
 	if len(buf) == 0 {
 		// Nothing to scan; avoid slicing past the end of the buffer.
 		return buf, buf, start
 	}
 	// Skip the opening quote symbol
 	i := 1
 	p := start
 	p.Pos.Byte++
 	p.Pos.Column++
 	escaping := false
 Byte:
 	for i < len(buf) {
 		b := buf[i]
 		switch {
 		case b == '\\':
 			// A backslash escapes the next byte unless it is itself
 			// escaped, hence the toggle.
 			escaping = !escaping
 			p.Pos.Byte++
 			p.Pos.Column++
 			i++
 		case b == '"':
 			p.Pos.Byte++
 			p.Pos.Column++
 			i++
 			if !escaping {
 				break Byte
 			}
 			escaping = false
 		case b < 32:
 			break Byte
 		default:
 			// TODO: Use Unicode Text Segmentation spec to advance
 			// Column only once per grapheme cluster.
 			//
 			// Until then, advance Column once per code point rather than
 			// once per byte: UTF-8 continuation bytes have the bit
 			// pattern 10xxxxxx and belong to a character that has
 			// already been counted.
 			p.Pos.Byte++
 			if b&0xC0 != 0x80 {
 				p.Pos.Column++
 			}
 			i++
 			escaping = false
 		}
 	}
 	return buf[:i], buf[i:], p
 }
 // skipWhitespace drops any leading spaces, tabs, carriage returns and
 // newlines from buf, returning what is left of the buffer along with the
 // position of its first byte.
 func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
 	p := start
 	n := 0
 	for n < len(buf) {
 		switch buf[n] {
 		case ' ':
 			p.Pos.Column++
 		case '\t':
 			// A tab is arbitrarily counted as two columns, since some
 			// width has to be chosen. Anything rendering source with
 			// position markers must either render tabs as two spaces
 			// too or derive its own column from the byte offset.
 			p.Pos.Column += 2
 		case '\n':
 			p.Pos.Line++
 			p.Pos.Column = 1
 		case '\r':
 			// A carriage return takes up no column of its own. It is
 			// assumed to be paired with a newline (as on Windows),
 			// which accounts for both of them.
 		default:
 			return buf[n:], p
 		}
 		// Every whitespace byte advances the byte offset by one.
 		p.Pos.Byte++
 		n++
 	}
 	return buf[n:], p
 }
 // pos is the scanner's notion of a source location: a zcl.Pos together
 // with the name of the file it belongs to, so that a zcl.Range can be
 // built from it.
 type pos struct {
 	// Filename is copied into the Filename field of any range built from
 	// this position.
 	Filename string
 	// Pos holds the byte offset, line and column.
 	Pos      zcl.Pos
 }
 // Range returns the range that starts at p and extends byteLen bytes and
 // charLen columns beyond it, staying on the same line. The receiver is
 // left unmodified.
 func (p *pos) Range(byteLen, charLen int) zcl.Range {
 	rng := zcl.Range{
 		Filename: p.Filename,
 		Start:    p.Pos,
 		End:      p.Pos,
 	}
 	rng.End.Byte += byteLen
 	rng.End.Column += charLen
 	return rng
 }
 // posRange returns the range running from start to end. The filename is
 // taken from start; the filename of end is ignored.
 func posRange(start, end pos) zcl.Range {
 	var rng zcl.Range
 	rng.Filename = start.Filename
 	rng.Start, rng.End = start.Pos, end.Pos
 	return rng
 }
 // GoString implements fmt.GoStringer so that tokens formatted with %#v
 // show their type name, their bytes as a quoted string and their range.
 func (t token) GoString() string {
 	const layout = "json.token{json.%s, []byte(%q), %#v}"
 	return fmt.Sprintf(layout, t.Type, t.Bytes, t.Range)
 }
--- a/zcl/json/scanner_test.go
+++ b/zcl/json/scanner_test.go
@ -0,0 +1,462 @@
 package json
 import (
 	"bytes"
 	"fmt"
 	"reflect"
 	"testing"
 	"github.com/apparentlymart/go-zcl/zcl"
 )
 func TestScan(t *testing.T) {
 	tests := []struct {
 		Input string
 		Want  []token
 	}{
 		{
 			``,
 			nil,
 		},
 		{
 			`{}`,
 			[]token{
 				{
 					Type:  tokenBraceO,
 					Bytes: []byte(`{`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   1,
 							Line:   1,
 							Column: 2,
 						},
 					},
 				},
 				{
 					Type:  tokenBraceC,
 					Bytes: []byte(`}`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   1,
 							Line:   1,
 							Column: 2,
 						},
 						End: zcl.Pos{
 							Byte:   2,
 							Line:   1,
 							Column: 3,
 						},
 					},
 				},
 			},
 		},
 		{
 			`][`,
 			[]token{
 				{
 					Type:  tokenBrackC,
 					Bytes: []byte(`]`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   1,
 							Line:   1,
 							Column: 2,
 						},
 					},
 				},
 				{
 					Type:  tokenBrackO,
 					Bytes: []byte(`[`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   1,
 							Line:   1,
 							Column: 2,
 						},
 						End: zcl.Pos{
 							Byte:   2,
 							Line:   1,
 							Column: 3,
 						},
 					},
 				},
 			},
 		},
 		{
 			`:,`,
 			[]token{
 				{
 					Type:  tokenColon,
 					Bytes: []byte(`:`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   1,
 							Line:   1,
 							Column: 2,
 						},
 					},
 				},
 				{
 					Type:  tokenComma,
 					Bytes: []byte(`,`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   1,
 							Line:   1,
 							Column: 2,
 						},
 						End: zcl.Pos{
 							Byte:   2,
 							Line:   1,
 							Column: 3,
 						},
 					},
 				},
 			},
 		},
 		{
 			`1`,
 			[]token{
 				{
 					Type:  tokenNumber,
 					Bytes: []byte(`1`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   1,
 							Line:   1,
 							Column: 2,
 						},
 					},
 				},
 			},
 		},
 		{
 			`  1`,
 			[]token{
 				{
 					Type:  tokenNumber,
 					Bytes: []byte(`1`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   2,
 							Line:   1,
 							Column: 3,
 						},
 						End: zcl.Pos{
 							Byte:   3,
 							Line:   1,
 							Column: 4,
 						},
 					},
 				},
 			},
 		},
 		{
 			`  12`,
 			[]token{
 				{
 					Type:  tokenNumber,
 					Bytes: []byte(`12`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   2,
 							Line:   1,
 							Column: 3,
 						},
 						End: zcl.Pos{
 							Byte:   4,
 							Line:   1,
 							Column: 5,
 						},
 					},
 				},
 			},
 		},
 		{
 			`1 2`,
 			[]token{
 				{
 					Type:  tokenNumber,
 					Bytes: []byte(`1`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   1,
 							Line:   1,
 							Column: 2,
 						},
 					},
 				},
 				{
 					Type:  tokenNumber,
 					Bytes: []byte(`2`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   2,
 							Line:   1,
 							Column: 3,
 						},
 						End: zcl.Pos{
 							Byte:   3,
 							Line:   1,
 							Column: 4,
 						},
 					},
 				},
 			},
 		},
 		{
 			"\n1\n 2",
 			[]token{
 				{
 					Type:  tokenNumber,
 					Bytes: []byte(`1`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   1,
 							Line:   2,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   2,
 							Line:   2,
 							Column: 2,
 						},
 					},
 				},
 				{
 					Type:  tokenNumber,
 					Bytes: []byte(`2`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   4,
 							Line:   3,
 							Column: 2,
 						},
 						End: zcl.Pos{
 							Byte:   5,
 							Line:   3,
 							Column: 3,
 						},
 					},
 				},
 			},
 		},
 		{
 			`-1 2.5`,
 			[]token{
 				{
 					Type:  tokenNumber,
 					Bytes: []byte(`-1`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   2,
 							Line:   1,
 							Column: 3,
 						},
 					},
 				},
 				{
 					Type:  tokenNumber,
 					Bytes: []byte(`2.5`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   3,
 							Line:   1,
 							Column: 4,
 						},
 						End: zcl.Pos{
 							Byte:   6,
 							Line:   1,
 							Column: 7,
 						},
 					},
 				},
 			},
 		},
 		{
 			`true`,
 			[]token{
 				{
 					Type:  tokenKeyword,
 					Bytes: []byte(`true`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   4,
 							Line:   1,
 							Column: 5,
 						},
 					},
 				},
 			},
 		},
 		{
 			`""`,
 			[]token{
 				{
 					Type:  tokenString,
 					Bytes: []byte(`""`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   2,
 							Line:   1,
 							Column: 3,
 						},
 					},
 				},
 			},
 		},
 		{
 			`"hello"`,
 			[]token{
 				{
 					Type:  tokenString,
 					Bytes: []byte(`"hello"`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   7,
 							Line:   1,
 							Column: 8,
 						},
 					},
 				},
 			},
 		},
 		{
 			`"he\"llo"`,
 			[]token{
 				{
 					Type:  tokenString,
 					Bytes: []byte(`"he\"llo"`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   9,
 							Line:   1,
 							Column: 10,
 						},
 					},
 				},
 			},
 		},
 		{
 			`"hello\\" 1`,
 			[]token{
 				{
 					Type:  tokenString,
 					Bytes: []byte(`"hello\\"`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   9,
 							Line:   1,
 							Column: 10,
 						},
 					},
 				},
 				{
 					Type:  tokenNumber,
 					Bytes: []byte(`1`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   10,
 							Line:   1,
 							Column: 11,
 						},
 						End: zcl.Pos{
 							Byte:   11,
 							Line:   1,
 							Column: 12,
 						},
 					},
 				},
 			},
 		},
 	}
 	for _, test := range tests {
 		t.Run(test.Input, func(t *testing.T) {
 			buf := []byte(test.Input)
 			start := pos{
 				Filename: "",
 				Pos: zcl.Pos{
 					Byte:   0,
 					Line:   1,
 					Column: 1,
 				},
 			}
 			got := scan(buf, start)
 			if !reflect.DeepEqual(got, test.Want) {
 				errMsg := &bytes.Buffer{}
 				errMsg.WriteString("wrong result\ngot:\n")
 				if len(got) == 0 {
 					errMsg.WriteString("  (empty slice)\n")
 				}
 				for _, tok := range got {
 					fmt.Fprintf(errMsg, "  - %#v\n", tok)
 				}
 				errMsg.WriteString("want:\n")
 				if len(test.Want) == 0 {
 					errMsg.WriteString("  (empty slice)\n")
 				}
 				for _, tok := range test.Want {
 					fmt.Fprintf(errMsg, "  - %#v\n", tok)
 				}
 				t.Error(errMsg.String())
 			}
 		})
 	}
 }
--- a/zcl/json/tokentype_string.go
+++ b/zcl/json/tokentype_string.go
@ -0,0 +1,58 @@
 // Code generated by "stringer -type tokenType scanner.go"; DO NOT EDIT.
 package json
 import "fmt"
 // Generated name tables for tokenType. Each _tokenType_name_N constant is
 // the string that String returns for one tokenType value.
 const (
 	_tokenType_name_0 = "tokenInvalid"
 	_tokenType_name_1 = "tokenComma"
 	_tokenType_name_2 = "tokenColon"
 	_tokenType_name_3 = "tokenKeyword"
 	_tokenType_name_4 = "tokenNumber"
 	_tokenType_name_5 = "tokenString"
 	_tokenType_name_6 = "tokenBrackO"
 	_tokenType_name_7 = "tokenBrackC"
 	_tokenType_name_8 = "tokenBraceO"
 	_tokenType_name_9 = "tokenBraceC"
 )
 // Each _tokenType_index_N holds the start and end offsets (0 and the name
 // length) of the single name stored in the matching _tokenType_name_N.
 // String does not consult these tables.
 var (
 	_tokenType_index_0 = [...]uint8{0, 12}
 	_tokenType_index_1 = [...]uint8{0, 10}
 	_tokenType_index_2 = [...]uint8{0, 10}
 	_tokenType_index_3 = [...]uint8{0, 12}
 	_tokenType_index_4 = [...]uint8{0, 11}
 	_tokenType_index_5 = [...]uint8{0, 11}
 	_tokenType_index_6 = [...]uint8{0, 11}
 	_tokenType_index_7 = [...]uint8{0, 11}
 	_tokenType_index_8 = [...]uint8{0, 11}
 	_tokenType_index_9 = [...]uint8{0, 11}
 )
 // String returns the name of the tokenType constant whose value is i, or
 // "tokenType(N)" with N in decimal when i matches none of them.
 //
 // The numeric cases are the code points of the characters used to define
 // the constants (for example 44 is ',' and 123 is '{').
 func (i tokenType) String() string {
 	switch {
 	case i == 0:
 		return _tokenType_name_0
 	case i == 44:
 		return _tokenType_name_1
 	case i == 58:
 		return _tokenType_name_2
 	case i == 75:
 		return _tokenType_name_3
 	case i == 78:
 		return _tokenType_name_4
 	case i == 83:
 		return _tokenType_name_5
 	case i == 91:
 		return _tokenType_name_6
 	case i == 93:
 		return _tokenType_name_7
 	case i == 123:
 		return _tokenType_name_8
 	case i == 125:
 		return _tokenType_name_9
 	default:
 		return fmt.Sprintf("tokenType(%d)", i)
 	}
 }