diff --git a/zcl/json/doc.go b/zcl/json/doc.go new file mode 100644 index 0000000..ca8aca9 --- /dev/null +++ b/zcl/json/doc.go @@ -0,0 +1,8 @@ +// Package json is the JSON parser for ZCL. It parses JSON files and returns +// implementations of the core ZCL structural interfaces in terms of the +// JSON data inside. +// +// This is not a generic JSON parser. Instead, it deals with the mapping from +// the JSON information model to the ZCL information model, using a number +// of hard-coded structural conventions. +package json diff --git a/zcl/json/scanner.go b/zcl/json/scanner.go new file mode 100644 index 0000000..077b52b --- /dev/null +++ b/zcl/json/scanner.go @@ -0,0 +1,281 @@ +package json + +import ( + "fmt" + + "github.com/apparentlymart/go-zcl/zcl" +) + +//go:generate stringer -type tokenType scanner.go +type tokenType rune + +const ( + tokenBraceO tokenType = '{' + tokenBraceC tokenType = '}' + tokenBrackO tokenType = '[' + tokenBrackC tokenType = ']' + tokenComma tokenType = ',' + tokenColon tokenType = ':' + tokenKeyword tokenType = 'K' + tokenString tokenType = 'S' + tokenNumber tokenType = 'N' + tokenInvalid tokenType = 0 +) + +type token struct { + Type tokenType + Bytes []byte + Range zcl.Range +} + +// scan returns the primary tokens for the given JSON buffer in sequence. +// +// The responsibility of this pass is to just mark the slices of the buffer +// as being of various types. It is lax in how it interprets the multi-byte +// token types keyword, string and number, preferring to capture erroneous +// extra bytes that we presume the user intended to be part of the token +// so that we can generate more helpful diagnostics in the parser. 
+func scan(buf []byte, start pos) []token { + var tokens []token + p := start + for { + if len(buf) == 0 { + return tokens + } + + buf, p = skipWhitespace(buf, p) + + if len(buf) == 0 { + return tokens + } + + start = p + + first := buf[0] + switch { + case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':': + p.Pos.Column++ + p.Pos.Byte++ + tokens = append(tokens, token{ + Type: tokenType(first), + Bytes: buf[0:1], + Range: posRange(start, p), + }) + buf = buf[1:] + case first == '"': + var tokBuf []byte + tokBuf, buf, p = scanString(buf, p) + tokens = append(tokens, token{ + Type: tokenString, + Bytes: tokBuf, + Range: posRange(start, p), + }) + case byteCanStartNumber(first): + var tokBuf []byte + tokBuf, buf, p = scanNumber(buf, p) + tokens = append(tokens, token{ + Type: tokenNumber, + Bytes: tokBuf, + Range: posRange(start, p), + }) + case byteCanStartKeyword(first): + var tokBuf []byte + tokBuf, buf, p = scanKeyword(buf, p) + tokens = append(tokens, token{ + Type: tokenKeyword, + Bytes: tokBuf, + Range: posRange(start, p), + }) + default: + tokens = append(tokens, token{ + Type: tokenInvalid, + Bytes: buf[:1], + Range: start.Range(1, 1), + }) + // If we've encountered an invalid then we might as well stop + // scanning since the parser won't proceed beyond this point. + return tokens + } + } +} + +func byteCanStartNumber(b byte) bool { + switch b { + // We are slightly more tolerant than JSON requires here since we + // expect the parser will make a stricter interpretation of the + // number bytes, but we specifically don't allow 'e' or 'E' here + // since we want the scanner to treat that as the start of an + // invalid keyword instead, to produce more intelligible error messages. 
+ case '-', '+', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return true + default: + return false + } +} + +func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) { + // The scanner doesn't check that the sequence of digit-ish bytes is + // in a valid order. The parser must do this when decoding a number + // token. + var i int + p := start +Byte: + for i = 0; i < len(buf); i++ { + switch buf[i] { + case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + p.Pos.Byte++ + p.Pos.Column++ + default: + break Byte + } + } + return buf[:i], buf[i:], p +} + +func byteCanStartKeyword(b byte) bool { + switch { + // We allow any sequence of alphabetical characters here, even though + // JSON is more constrained, so that we can collect what we presume + // the user intended to be a single keyword and then check its validity + // in the parser, where we can generate better diagnostics. + // So e.g. we want to be able to say: + // unrecognized keyword "True". Did you mean "true"? + case (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z'): // alphabetic only; '&&' within each range, '||' between ranges + return true + default: + return false + } +} + +func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) { + var i int + p := start +Byte: + for i = 0; i < len(buf); i++ { + b := buf[i] + switch { + case (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_': // stop at the first non-identifier byte + p.Pos.Byte++ + p.Pos.Column++ + default: + break Byte + } + } + return buf[:i], buf[i:], p +} + +func scanString(buf []byte, start pos) ([]byte, []byte, pos) { + // The scanner doesn't validate correct use of escapes, etc. It pays + // attention to escapes only for the purpose of identifying the closing + // quote character. It's the parser's responsibility to do proper + // validation. + // + // The scanner also doesn't specifically detect unterminated string + // literals, though they can be identified in the parser by checking if + // the final byte in a string token is the double-quote character. 
+ + // Skip the opening quote symbol + i := 1 + p := start + p.Pos.Byte++ + p.Pos.Column++ + escaping := false +Byte: + for i < len(buf) { + b := buf[i] + + switch { + case b == '\\': + escaping = !escaping + p.Pos.Byte++ + p.Pos.Column++ + i++ + case b == '"': + p.Pos.Byte++ + p.Pos.Column++ + i++ + if !escaping { + break Byte + } + escaping = false + case b < 32: + break Byte + default: + // TODO: Use Unicode Text Segmentation spec to advance + // Column only once per grapheme cluster, rather than once per + // byte. + // Consume one or more UTF-8 codepoints that together form + // a single grapheme cluster. + + p.Pos.Byte++ + p.Pos.Column++ + i++ + + escaping = false + } + } + return buf[:i], buf[i:], p +} + +func skipWhitespace(buf []byte, start pos) ([]byte, pos) { + var i int + p := start +Byte: + for i = 0; i < len(buf); i++ { + switch buf[i] { + case ' ': + p.Pos.Byte++ + p.Pos.Column++ + case '\n': + p.Pos.Byte++ + p.Pos.Column = 1 + p.Pos.Line++ + case '\r': + // For the purpose of line/column counting we consider a + // carriage return to take up no space, assuming that it will + // be paired up with a newline (on Windows, for example) that + // will account for both of them. + p.Pos.Byte++ + case '\t': + // We arbitrarily count a tab as if it were two spaces, because + // we need to choose _some_ number here. This means any system + // that renders code on-screen with markers must itself treat + // tabs as a pair of spaces for rendering purposes, or instead + // use the byte offset and back into its own column position. 
+ p.Pos.Byte++ + p.Pos.Column += 2 + default: + break Byte + } + } + return buf[i:], p +} + +type pos struct { + Filename string + Pos zcl.Pos +} + +func (p *pos) Range(byteLen, charLen int) zcl.Range { + start := p.Pos + end := p.Pos + end.Byte += byteLen + end.Column += charLen + return zcl.Range{ + Filename: p.Filename, + Start: start, + End: end, + } +} + +func posRange(start, end pos) zcl.Range { + return zcl.Range{ + Filename: start.Filename, + Start: start.Pos, + End: end.Pos, + } +} + +func (t token) GoString() string { + return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range) +} diff --git a/zcl/json/scanner_test.go b/zcl/json/scanner_test.go new file mode 100644 index 0000000..205865f --- /dev/null +++ b/zcl/json/scanner_test.go @@ -0,0 +1,462 @@ +package json + +import ( + "bytes" + "fmt" + "reflect" + "testing" + + "github.com/apparentlymart/go-zcl/zcl" +) + +func TestScan(t *testing.T) { + tests := []struct { + Input string + Want []token + }{ + { + ``, + nil, + }, + { + `{}`, + []token{ + { + Type: tokenBraceO, + Bytes: []byte(`{`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 0, + Line: 1, + Column: 1, + }, + End: zcl.Pos{ + Byte: 1, + Line: 1, + Column: 2, + }, + }, + }, + { + Type: tokenBraceC, + Bytes: []byte(`}`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 1, + Line: 1, + Column: 2, + }, + End: zcl.Pos{ + Byte: 2, + Line: 1, + Column: 3, + }, + }, + }, + }, + }, + { + `][`, + []token{ + { + Type: tokenBrackC, + Bytes: []byte(`]`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 0, + Line: 1, + Column: 1, + }, + End: zcl.Pos{ + Byte: 1, + Line: 1, + Column: 2, + }, + }, + }, + { + Type: tokenBrackO, + Bytes: []byte(`[`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 1, + Line: 1, + Column: 2, + }, + End: zcl.Pos{ + Byte: 2, + Line: 1, + Column: 3, + }, + }, + }, + }, + }, + { + `:,`, + []token{ + { + Type: tokenColon, + Bytes: []byte(`:`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 0, + Line: 1, + Column: 1, + }, 
+ End: zcl.Pos{ + Byte: 1, + Line: 1, + Column: 2, + }, + }, + }, + { + Type: tokenComma, + Bytes: []byte(`,`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 1, + Line: 1, + Column: 2, + }, + End: zcl.Pos{ + Byte: 2, + Line: 1, + Column: 3, + }, + }, + }, + }, + }, + { + `1`, + []token{ + { + Type: tokenNumber, + Bytes: []byte(`1`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 0, + Line: 1, + Column: 1, + }, + End: zcl.Pos{ + Byte: 1, + Line: 1, + Column: 2, + }, + }, + }, + }, + }, + { + ` 1`, + []token{ + { + Type: tokenNumber, + Bytes: []byte(`1`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 2, + Line: 1, + Column: 3, + }, + End: zcl.Pos{ + Byte: 3, + Line: 1, + Column: 4, + }, + }, + }, + }, + }, + { + ` 12`, + []token{ + { + Type: tokenNumber, + Bytes: []byte(`12`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 2, + Line: 1, + Column: 3, + }, + End: zcl.Pos{ + Byte: 4, + Line: 1, + Column: 5, + }, + }, + }, + }, + }, + { + `1 2`, + []token{ + { + Type: tokenNumber, + Bytes: []byte(`1`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 0, + Line: 1, + Column: 1, + }, + End: zcl.Pos{ + Byte: 1, + Line: 1, + Column: 2, + }, + }, + }, + { + Type: tokenNumber, + Bytes: []byte(`2`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 2, + Line: 1, + Column: 3, + }, + End: zcl.Pos{ + Byte: 3, + Line: 1, + Column: 4, + }, + }, + }, + }, + }, + { + "\n1\n 2", + []token{ + { + Type: tokenNumber, + Bytes: []byte(`1`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 1, + Line: 2, + Column: 1, + }, + End: zcl.Pos{ + Byte: 2, + Line: 2, + Column: 2, + }, + }, + }, + { + Type: tokenNumber, + Bytes: []byte(`2`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 4, + Line: 3, + Column: 2, + }, + End: zcl.Pos{ + Byte: 5, + Line: 3, + Column: 3, + }, + }, + }, + }, + }, + { + `-1 2.5`, + []token{ + { + Type: tokenNumber, + Bytes: []byte(`-1`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 0, + Line: 1, + Column: 1, + }, + End: zcl.Pos{ + Byte: 2, + Line: 1, + Column: 3, + }, + }, + }, + { + 
Type: tokenNumber, + Bytes: []byte(`2.5`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 3, + Line: 1, + Column: 4, + }, + End: zcl.Pos{ + Byte: 6, + Line: 1, + Column: 7, + }, + }, + }, + }, + }, + { + `true`, + []token{ + { + Type: tokenKeyword, + Bytes: []byte(`true`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 0, + Line: 1, + Column: 1, + }, + End: zcl.Pos{ + Byte: 4, + Line: 1, + Column: 5, + }, + }, + }, + }, + }, + { + `""`, + []token{ + { + Type: tokenString, + Bytes: []byte(`""`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 0, + Line: 1, + Column: 1, + }, + End: zcl.Pos{ + Byte: 2, + Line: 1, + Column: 3, + }, + }, + }, + }, + }, + { + `"hello"`, + []token{ + { + Type: tokenString, + Bytes: []byte(`"hello"`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 0, + Line: 1, + Column: 1, + }, + End: zcl.Pos{ + Byte: 7, + Line: 1, + Column: 8, + }, + }, + }, + }, + }, + { + `"he\"llo"`, + []token{ + { + Type: tokenString, + Bytes: []byte(`"he\"llo"`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 0, + Line: 1, + Column: 1, + }, + End: zcl.Pos{ + Byte: 9, + Line: 1, + Column: 10, + }, + }, + }, + }, + }, + { + `"hello\\" 1`, + []token{ + { + Type: tokenString, + Bytes: []byte(`"hello\\"`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 0, + Line: 1, + Column: 1, + }, + End: zcl.Pos{ + Byte: 9, + Line: 1, + Column: 10, + }, + }, + }, + { + Type: tokenNumber, + Bytes: []byte(`1`), + Range: zcl.Range{ + Start: zcl.Pos{ + Byte: 10, + Line: 1, + Column: 11, + }, + End: zcl.Pos{ + Byte: 11, + Line: 1, + Column: 12, + }, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.Input, func(t *testing.T) { + buf := []byte(test.Input) + start := pos{ + Filename: "", + Pos: zcl.Pos{ + Byte: 0, + Line: 1, + Column: 1, + }, + } + got := scan(buf, start) + + if !reflect.DeepEqual(got, test.Want) { + errMsg := &bytes.Buffer{} + errMsg.WriteString("wrong result\ngot:\n") + if len(got) == 0 { + errMsg.WriteString(" (empty slice)\n") + } + for _, tok := range got { 
+ fmt.Fprintf(errMsg, " - %#v\n", tok) + } + errMsg.WriteString("want:\n") + if len(test.Want) == 0 { + errMsg.WriteString(" (empty slice)\n") + } + for _, tok := range test.Want { + fmt.Fprintf(errMsg, " - %#v\n", tok) + } + t.Error(errMsg.String()) + } + }) + } +} diff --git a/zcl/json/tokentype_string.go b/zcl/json/tokentype_string.go new file mode 100644 index 0000000..ed1b57e --- /dev/null +++ b/zcl/json/tokentype_string.go @@ -0,0 +1,58 @@ +// Code generated by "stringer -type tokenType scanner.go"; DO NOT EDIT. + +package json + +import "fmt" + +const ( + _tokenType_name_0 = "tokenInvalid" + _tokenType_name_1 = "tokenComma" + _tokenType_name_2 = "tokenColon" + _tokenType_name_3 = "tokenKeyword" + _tokenType_name_4 = "tokenNumber" + _tokenType_name_5 = "tokenString" + _tokenType_name_6 = "tokenBrackO" + _tokenType_name_7 = "tokenBrackC" + _tokenType_name_8 = "tokenBraceO" + _tokenType_name_9 = "tokenBraceC" +) + +var ( + _tokenType_index_0 = [...]uint8{0, 12} + _tokenType_index_1 = [...]uint8{0, 10} + _tokenType_index_2 = [...]uint8{0, 10} + _tokenType_index_3 = [...]uint8{0, 12} + _tokenType_index_4 = [...]uint8{0, 11} + _tokenType_index_5 = [...]uint8{0, 11} + _tokenType_index_6 = [...]uint8{0, 11} + _tokenType_index_7 = [...]uint8{0, 11} + _tokenType_index_8 = [...]uint8{0, 11} + _tokenType_index_9 = [...]uint8{0, 11} +) + +func (i tokenType) String() string { + switch { + case i == 0: + return _tokenType_name_0 + case i == 44: + return _tokenType_name_1 + case i == 58: + return _tokenType_name_2 + case i == 75: + return _tokenType_name_3 + case i == 78: + return _tokenType_name_4 + case i == 83: + return _tokenType_name_5 + case i == 91: + return _tokenType_name_6 + case i == 93: + return _tokenType_name_7 + case i == 123: + return _tokenType_name_8 + case i == 125: + return _tokenType_name_9 + default: + return fmt.Sprintf("tokenType(%d)", i) + } +}