package json

import (
	"fmt"

	"github.com/apparentlymart/go-textseg/v13/textseg"
	"github.com/hashicorp/hcl/v2"
)

//go:generate stringer -type tokenType scanner.go

type tokenType rune

const (
	tokenBraceO  tokenType = '{'
	tokenBraceC  tokenType = '}'
	tokenBrackO  tokenType = '['
	tokenBrackC  tokenType = ']'
	tokenComma   tokenType = ','
	tokenColon   tokenType = ':'
	tokenKeyword tokenType = 'K'
	tokenString  tokenType = 'S'
	tokenNumber  tokenType = 'N'
	tokenEOF     tokenType = '␄'
	tokenInvalid tokenType = 0
	tokenEquals  tokenType = '=' // used only for reminding the user of JSON syntax
)

type token struct {
	Type  tokenType
	Bytes []byte
	Range hcl.Range
}

// scan returns the primary tokens for the given JSON buffer in sequence.
//
// The responsibility of this pass is just to mark slices of the buffer as
// being of various types. It is lax in how it interprets the multi-byte
// token types keyword, string and number, preferring to capture erroneous
// extra bytes that we presume the user intended to be part of the token
// so that we can generate more helpful diagnostics in the parser.
func scan(buf []byte, start pos) []token {
	var tokens []token
	p := start
	for {
		if len(buf) == 0 {
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}

		buf, p = skipWhitespace(buf, p)

		if len(buf) == 0 {
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}

		start = p

		first := buf[0]
		switch {
		case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':' || first == '=':
			p.Pos.Column++
			p.Pos.Byte++
			tokens = append(tokens, token{
				Type:  tokenType(first),
				Bytes: buf[0:1],
				Range: posRange(start, p),
			})
			buf = buf[1:]
		case first == '"':
			var tokBuf []byte
			tokBuf, buf, p = scanString(buf, p)
			tokens = append(tokens, token{
				Type:  tokenString,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartNumber(first):
			var tokBuf []byte
			tokBuf, buf, p = scanNumber(buf, p)
			tokens = append(tokens, token{
				Type:  tokenNumber,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartKeyword(first):
			var tokBuf []byte
			tokBuf, buf, p = scanKeyword(buf, p)
			tokens = append(tokens, token{
				Type:  tokenKeyword,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		default:
			tokens = append(tokens, token{
				Type:  tokenInvalid,
				Bytes: buf[:1],
				Range: start.Range(1, 1),
			})
			// If we've encountered an invalid token then we might as well
			// stop scanning, since the parser won't proceed beyond this
			// point. We insert a synthetic EOF marker here to match the
			// expectations of consumers of this data structure.
			p.Pos.Column++
			p.Pos.Byte++
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}
	}
}
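// The function below is an illustrative sketch only, not part of the
// scanner itself: it shows how code in this package might drive scan and
// walk the resulting token stream. The function name, filename, and input
// literal are assumptions made for illustration.
func exampleScanUsage() {
	start := pos{
		Filename: "example.json",
		Pos:      hcl.Pos{Line: 1, Column: 1, Byte: 0},
	}
	for _, tok := range scan([]byte(`{"enabled": true}`), start) {
		// With the generated stringer, tok.Type prints as e.g. "tokenKeyword".
		fmt.Printf("%v %q\n", tok.Type, tok.Bytes)
	}
	// The tokens arrive in order: tokenBraceO, tokenString, tokenColon,
	// tokenKeyword, tokenBraceC, and finally a synthetic tokenEOF.
}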
func byteCanStartNumber(b byte) bool {
	switch b {
	// We are slightly more tolerant than JSON requires here since we
	// expect the parser will make a stricter interpretation of the
	// number bytes, but we specifically don't allow 'e' or 'E' here
	// since we want the scanner to treat that as the start of an
	// invalid keyword instead, to produce more intelligible error messages.
	case '-', '+', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return true
	default:
		return false
	}
}

func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't check that the sequence of digit-ish bytes is
	// in a valid order. The parser must do this when decoding a number
	// token.
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		switch buf[i] {
		case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}

func byteCanStartKeyword(b byte) bool {
	switch {
	// We allow any sequence of alphabetical characters here, even though
	// JSON is more constrained, so that we can collect what we presume
	// the user intended to be a single keyword and then check its validity
	// in the parser, where we can generate better diagnostics.
	// So e.g. we want to be able to say:
	//   unrecognized keyword "True". Did you mean "true"?
	case isAlphabetical(b):
		return true
	default:
		return false
	}
}

func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		b := buf[i]
		switch {
		case isAlphabetical(b) || b == '_':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}

func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't validate correct use of escapes, etc. It pays
	// attention to escapes only for the purpose of identifying the closing
	// quote character. It's the parser's responsibility to do proper
	// validation.
	//
	// The scanner also doesn't specifically detect unterminated string
	// literals, though they can be identified in the parser by checking if
	// the final byte in a string token is the double-quote character.

	// Skip the opening quote symbol
	i := 1
	p := start
	p.Pos.Byte++
	p.Pos.Column++
	escaping := false
Byte:
	for i < len(buf) {
		b := buf[i]
		switch {
		case b == '\\':
			escaping = !escaping
			p.Pos.Byte++
			p.Pos.Column++
			i++
		case b == '"':
			p.Pos.Byte++
			p.Pos.Column++
			i++
			if !escaping {
				break Byte
			}
			escaping = false
		case b < 32:
			break Byte
		default:
			// Advance by one grapheme cluster, so that we consider each
			// grapheme to be a "column".
			// Ignoring error because this scanner cannot produce errors.
			advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)
			p.Pos.Byte += advance
			p.Pos.Column++
			i += advance
			escaping = false
		}
	}
	return buf[:i], buf[i:], p
}

func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		switch buf[i] {
		case ' ':
			p.Pos.Byte++
			p.Pos.Column++
		case '\n':
			p.Pos.Byte++
			p.Pos.Column = 1
			p.Pos.Line++
		case '\r':
			// For the purpose of line/column counting we consider a
			// carriage return to take up no space, assuming that it will
			// be paired up with a newline (on Windows, for example) that
			// will account for both of them.
			p.Pos.Byte++
		case '\t':
			// We arbitrarily count a tab as if it were two spaces, because
			// we need to choose _some_ number here. This means any system
			// that renders code on-screen with markers must itself treat
			// tabs as a pair of spaces for rendering purposes, or instead
			// use the byte offset and back into its own column position.
			p.Pos.Byte++
			p.Pos.Column += 2
		default:
			break Byte
		}
	}
	return buf[i:], p
}

type pos struct {
	Filename string
	Pos      hcl.Pos
}

func (p *pos) Range(byteLen, charLen int) hcl.Range {
	start := p.Pos
	end := p.Pos
	end.Byte += byteLen
	end.Column += charLen
	return hcl.Range{
		Filename: p.Filename,
		Start:    start,
		End:      end,
	}
}

func posRange(start, end pos) hcl.Range {
	return hcl.Range{
		Filename: start.Filename,
		Start:    start.Pos,
		End:      end.Pos,
	}
}

func (t token) GoString() string {
	return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
}

func isAlphabetical(b byte) bool {
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
}
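// A minimal sketch, not part of the original file: following the heuristic
// described in the scanString comment above, a parser-side check can
// classify a string token as unterminated by inspecting its final byte.
// The helper name is hypothetical.
func isUnterminatedString(tok token) bool {
	if tok.Type != tokenString {
		return false
	}
	b := tok.Bytes
	// A terminated token is at least two bytes long (`""`), and scanString
	// stops at a quote only once it is unescaped, so checking the last
	// byte suffices in the common case. (A backslash-escaped quote
	// immediately before EOF can still defeat this check, since the
	// scanner consumes it without terminating the string.)
	return len(b) < 2 || b[len(b)-1] != '"'
}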
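// Another hypothetical sketch, included only to make the position
// bookkeeping in skipWhitespace concrete: '\r' advances the byte offset
// alone, '\n' resets the column and increments the line, and '\t' counts
// as two columns.
func exampleWhitespacePositions() {
	start := pos{Filename: "example.json", Pos: hcl.Pos{Line: 1, Column: 1, Byte: 0}}
	rest, p := skipWhitespace([]byte("\t \r\n{"), start)
	// rest now begins at '{', and p.Pos is Line: 2, Column: 1, Byte: 4.
	fmt.Printf("%q %#v\n", rest, p.Pos)
}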