hcl/zcl/json/scanner.go

package json

import (
	"fmt"

	"github.com/apparentlymart/go-zcl/zcl"
)

// tokenType identifies the kind of a token produced by scan. It is a rune so
// that single-byte punctuation tokens can use the punctuation character
// itself as their type value.
//
//go:generate stringer -type tokenType scanner.go
type tokenType rune

// Token types recognized by the scanner.
const (
	// Single-byte punctuation tokens. Each value is the punctuation
	// character itself, which lets scan convert the byte directly with
	// tokenType(first).
	tokenBraceO  tokenType = '{'
	tokenBraceC  tokenType = '}'
	tokenBrackO  tokenType = '['
	tokenBrackC  tokenType = ']'
	tokenComma   tokenType = ','
	tokenColon   tokenType = ':'

	// Multi-byte tokens, identified by an arbitrary mnemonic letter.
	tokenKeyword tokenType = 'K'
	tokenString  tokenType = 'S'
	tokenNumber  tokenType = 'N'

	// tokenInvalid marks a byte that cannot begin any token. It is the zero
	// value of tokenType.
	tokenInvalid tokenType = 0
)

// token is a single lexical element produced by scan.
type token struct {
	// Type is the kind of token.
	Type tokenType
	// Bytes is the raw source text of the token. scan populates it with a
	// sub-slice of the buffer it was given rather than a copy.
	Bytes []byte
	// Range is the source range that the token covers.
	Range zcl.Range
}

// scan splits the given JSON buffer into its primary tokens, returned in
// source order. Positions are tracked starting from the given start position.
//
// This pass only classifies slices of the buffer. For the multi-byte token
// types (keyword, string and number) it is deliberately permissive, capturing
// stray bytes that the user presumably meant to be part of the token, so that
// the parser can report a more helpful diagnostic about the whole token.
//
// Scanning stops after the first invalid token, which is always the last
// element of the result when present.
func scan(buf []byte, start pos) []token {
	var out []token
	cur := start
	for len(buf) > 0 {
		buf, cur = skipWhitespace(buf, cur)
		if len(buf) == 0 {
			break
		}

		// Remember where this token begins so its range can be built once
		// the token-specific scanner has advanced cur past it.
		begin := cur
		var kind tokenType
		var raw []byte

		switch c := buf[0]; {
		case c == '{', c == '}', c == '[', c == ']', c == ',', c == ':':
			// Punctuation is always exactly one byte, and its token type is
			// the byte value itself.
			kind = tokenType(c)
			raw, buf = buf[0:1], buf[1:]
			cur.Pos.Column++
			cur.Pos.Byte++
		case c == '"':
			kind = tokenString
			raw, buf, cur = scanString(buf, cur)
		case byteCanStartNumber(c):
			kind = tokenNumber
			raw, buf, cur = scanNumber(buf, cur)
		case byteCanStartKeyword(c):
			kind = tokenKeyword
			raw, buf, cur = scanKeyword(buf, cur)
		default:
			// The parser won't proceed beyond an invalid token, so there is
			// no point in scanning the remainder of the buffer.
			return append(out, token{
				Type:  tokenInvalid,
				Bytes: buf[:1],
				Range: begin.Range(1, 1),
			})
		}

		out = append(out, token{
			Type:  kind,
			Bytes: raw,
			Range: posRange(begin, cur),
		})
	}
	return out
}

// byteCanStartNumber reports whether b may be the first byte of a number
// token.
//
// This is slightly more tolerant than JSON itself (which permits neither a
// leading '+' nor a leading '.') because the parser is expected to validate
// the number bytes strictly. 'e' and 'E' are intentionally excluded so that
// the scanner treats them as the start of an (invalid) keyword instead, which
// yields more intelligible error messages.
func byteCanStartNumber(b byte) bool {
	if b >= '0' && b <= '9' {
		return true
	}
	return b == '-' || b == '+' || b == '.'
}

// scanNumber consumes the longest prefix of buf made up of bytes that can
// appear anywhere in a number: digits, signs, the decimal point and the
// exponent markers 'e' and 'E'.
//
// It returns the consumed bytes, the remainder of the buffer, and the position
// immediately after the consumed bytes. No check is made that the bytes form
// a well-ordered number; the parser must do that when decoding the token.
func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
	n := 0
	for n < len(buf) {
		c := buf[n]
		isDigit := c >= '0' && c <= '9'
		isMark := c == '-' || c == '+' || c == '.' || c == 'e' || c == 'E'
		if !isDigit && !isMark {
			break
		}
		n++
	}

	// Every accepted byte is a single-column ASCII character, so both
	// counters advance by the number of bytes consumed.
	end := start
	end.Pos.Byte += n
	end.Pos.Column += n
	return buf[:n], buf[n:], end
}

// byteCanStartKeyword reports whether b may be the first byte of a keyword
// token, which is the case for any ASCII letter.
//
// We allow any alphabetical character here, even though JSON is more
// constrained, so that we can collect what we presume the user intended to
// be a single keyword and then check its validity in the parser, where we
// can generate better diagnostics. So e.g. we want to be able to say:
//
//	unrecognized keyword "True". Did you mean "true"?
func byteCanStartKeyword(b byte) bool {
	// Each range needs both of its bounds: joining all four comparisons with
	// "||" would accept every possible byte.
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
}

// scanKeyword consumes the longest prefix of buf made up of ASCII letters and
// underscores.
//
// It returns the consumed bytes, the remainder of the buffer, and the position
// immediately after the consumed bytes. The scanner does not check that the
// bytes spell a valid JSON keyword; the parser does that so it can produce a
// diagnostic that mentions the whole word the user wrote.
func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		b := buf[i]
		switch {
		// Each letter range needs both bounds; combining the comparisons
		// with "||" alone would match every byte and swallow the whole
		// remaining buffer into the keyword.
		case (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}

// scanString consumes a string token from the front of buf, whose first byte
// is taken to be the opening quote without being inspected.
//
// It returns the consumed bytes (including the quotes), the remainder of the
// buffer, and the position immediately after the consumed bytes.
//
// Escapes are not validated here. Backslashes are tracked only so that an
// escaped quote is not mistaken for the closing quote; proper validation is
// the parser's job. Scanning also stops, without consuming the byte, at any
// control character (a byte below 32) and at the end of the buffer, so an
// unterminated literal yields a token with no closing quote, which the parser
// is expected to detect.
func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
	// Account for the opening quote symbol.
	end := start
	end.Pos.Byte++
	end.Pos.Column++

	n := 1
	escaped := false
	for n < len(buf) {
		c := buf[n]
		if c < 32 {
			// Control characters can't appear in a string literal, so the
			// token ends just before this byte.
			break
		}

		// TODO: Use Unicode Text Segmentation spec to advance Column only
		// once per grapheme cluster, rather than once per byte.
		n++
		end.Pos.Byte++
		end.Pos.Column++

		if c == '"' && !escaped {
			// An unescaped quote closes the literal.
			break
		}

		// A backslash escapes the following byte unless it was itself
		// escaped; any other byte ends a pending escape.
		escaped = c == '\\' && !escaped
	}
	return buf[:n], buf[n:], end
}

// skipWhitespace drops any leading spaces, tabs, carriage returns and newlines
// from buf, returning the remaining bytes together with the position of the
// first of them.
func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
	cur := start
	skipped := 0
Scan:
	for _, c := range buf {
		switch c {
		case '\n':
			cur.Pos.Line++
			cur.Pos.Column = 1
		case '\t':
			// We arbitrarily count a tab as if it were two spaces, because
			// we need to choose _some_ number here. Any system that renders
			// code on-screen with position markers must therefore either
			// treat tabs as a pair of spaces too, or work from the byte
			// offset and derive its own column position.
			cur.Pos.Column += 2
		case ' ':
			cur.Pos.Column++
		case '\r':
			// A carriage return occupies no column: we assume it is paired
			// with a newline (as on Windows) that accounts for both.
		default:
			break Scan
		}
		skipped++
	}

	// Every skipped character is exactly one byte long.
	cur.Pos.Byte += skipped
	return buf[skipped:], cur
}

// pos is the scanner's notion of a source location: a zcl.Pos together with
// the name of the file it belongs to, which is what is needed to build a
// zcl.Range.
type pos struct {
	// Filename is copied into the Filename field of ranges built from this
	// position.
	Filename string
	// Pos is the location within the file.
	Pos zcl.Pos
}

// Range returns a range in p's file that starts at p and ends byteLen bytes
// and charLen columns further along. The end position keeps every other
// field of p unchanged, so the range is not meant to span a line break.
func (p *pos) Range(byteLen, charLen int) zcl.Range {
	rng := zcl.Range{
		Filename: p.Filename,
		Start:    p.Pos,
		End:      p.Pos,
	}
	rng.End.Byte += byteLen
	rng.End.Column += charLen
	return rng
}

// posRange builds the range that runs from start to end. The filename is
// taken from start; end's filename is ignored.
func posRange(start, end pos) zcl.Range {
	var rng zcl.Range
	rng.Filename = start.Filename
	rng.Start = start.Pos
	rng.End = end.Pos
	return rng
}

// GoString implements fmt.GoStringer, rendering the token in a Go-like syntax
// (as used by the %#v verb) showing its type name, its bytes as a quoted
// string and its source range.
func (t token) GoString() string {
	const layout = "json.token{json.%s, []byte(%q), %#v}"
	return fmt.Sprintf(layout, t.Type, t.Bytes, t.Range)
}
json: initial scanner implementation Currently lacking correct support for unicode text segmentation into grapheme clusters, so it miscounts "Column" in positions. This will be addressed later. 2017-05-15 14:02:13 +00:00			`package json`

			`import (`
			`"fmt"`

			`"github.com/apparentlymart/go-zcl/zcl"`
			`)`

			`//go:generate stringer -type tokenType scanner.go`
			`type tokenType rune`

			`const (`
			`tokenBraceO tokenType = '{'`
			`tokenBraceC tokenType = '}'`
			`tokenBrackO tokenType = '['`
			`tokenBrackC tokenType = ']'`
			`tokenComma tokenType = ','`
			`tokenColon tokenType = ':'`
			`tokenKeyword tokenType = 'K'`
			`tokenString tokenType = 'S'`
			`tokenNumber tokenType = 'N'`
			`tokenInvalid tokenType = 0`
			`)`

			`type token struct {`
			`Type tokenType`
			`Bytes []byte`
			`Range zcl.Range`
			`}`

			`// scan returns the primary tokens for the given JSON buffer in sequence.`
			`//`
			`// The responsibility of this pass is to just mark the slices of the buffer`
			`// as being of various types. It is lax in how it interprets the multi-byte`
			`// token types keyword, string and number, preferring to capture erroneous`
			`// extra bytes that we presume the user intended to be part of the token`
			`// so that we can generate more helpful diagnostics in the parser.`
			`func scan(buf []byte, start pos) []token {`
			`var tokens []token`
			`p := start`
			`for {`
			`if len(buf) == 0 {`
			`return tokens`
			`}`

			`buf, p = skipWhitespace(buf, p)`

			`if len(buf) == 0 {`
			`return tokens`
			`}`

			`start = p`

			`first := buf[0]`
			`switch {`
			`case first == '{' \|\| first == '}' \|\| first == '[' \|\| first == ']' \|\| first == ',' \|\| first == ':':`
			`p.Pos.Column++`
			`p.Pos.Byte++`
			`tokens = append(tokens, token{`
			`Type: tokenType(first),`
			`Bytes: buf[0:1],`
			`Range: posRange(start, p),`
			`})`
			`buf = buf[1:]`
			`case first == '"':`
			`var tokBuf []byte`
			`tokBuf, buf, p = scanString(buf, p)`
			`tokens = append(tokens, token{`
			`Type: tokenString,`
			`Bytes: tokBuf,`
			`Range: posRange(start, p),`
			`})`
			`case byteCanStartNumber(first):`
			`var tokBuf []byte`
			`tokBuf, buf, p = scanNumber(buf, p)`
			`tokens = append(tokens, token{`
			`Type: tokenNumber,`
			`Bytes: tokBuf,`
			`Range: posRange(start, p),`
			`})`
			`case byteCanStartKeyword(first):`
			`var tokBuf []byte`
			`tokBuf, buf, p = scanKeyword(buf, p)`
			`tokens = append(tokens, token{`
			`Type: tokenKeyword,`
			`Bytes: tokBuf,`
			`Range: posRange(start, p),`
			`})`
			`default:`
			`tokens = append(tokens, token{`
			`Type: tokenInvalid,`
			`Bytes: buf[:1],`
			`Range: start.Range(1, 1),`
			`})`
			`// If we've encountered an invalid then we might as well stop`
			`// scanning since the parser won't proceed beyond this point.`
			`return tokens`
			`}`
			`}`
			`}`

			`func byteCanStartNumber(b byte) bool {`
			`switch b {`
			`// We are slightly more tolerant than JSON requires here since we`
			`// expect the parser will make a stricter interpretation of the`
			`// number bytes, but we specifically don't allow 'e' or 'E' here`
			`// since we want the scanner to treat that as the start of an`
			`// invalid keyword instead, to produce more intelligible error messages.`
			`case '-', '+', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':`
			`return true`
			`default:`
			`return false`
			`}`
			`}`

			`func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {`
			`// The scanner doesn't check that the sequence of digit-ish bytes is`
			`// in a valid order. The parser must do this when decoding a number`
			`// token.`
			`var i int`
			`p := start`
			`Byte:`
			`for i = 0; i < len(buf); i++ {`
			`switch buf[i] {`
			`case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':`
			`p.Pos.Byte++`
			`p.Pos.Column++`
			`default:`
			`break Byte`
			`}`
			`}`
			`return buf[:i], buf[i:], p`
			`}`

			`func byteCanStartKeyword(b byte) bool {`
			`switch {`
			`// We allow any sequence of alphabetical characters here, even though`
			`// JSON is more constrained, so that we can collect what we presume`
			`// the user intended to be a single keyword and then check its validity`
			`// in the parser, where we can generate better diagnostics.`
			`// So e.g. we want to be able to say:`
			`// unrecognized keyword "True". Did you mean "true"?`
			`case b >= 'a' \|\| b <= 'z' \|\| b >= 'A' \|\| b <= 'Z':`
			`return true`
			`default:`
			`return false`
			`}`
			`}`

			`func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {`
			`var i int`
			`p := start`
			`Byte:`
			`for i = 0; i < len(buf); i++ {`
			`b := buf[i]`
			`switch {`
			`case b >= 'a' \|\| b <= 'z' \|\| b >= 'A' \|\| b <= 'Z' \|\| b == '_':`
			`p.Pos.Byte++`
			`p.Pos.Column++`
			`default:`
			`break Byte`
			`}`
			`}`
			`return buf[:i], buf[i:], p`
			`}`

			`func scanString(buf []byte, start pos) ([]byte, []byte, pos) {`
			`// The scanner doesn't validate correct use of escapes, etc. It pays`
			`// attention to escapes only for the purpose of identifying the closing`
			`// quote character. It's the parser's responsibility to do proper`
			`// validation.`
			`//`
			`// The scanner also doesn't specifically detect unterminated string`
			`// literals, though they can be identified in the parser by checking if`
			`// the final byte in a string token is the double-quote character.`

			`// Skip the opening quote symbol`
			`i := 1`
			`p := start`
			`p.Pos.Byte++`
			`p.Pos.Column++`
			`escaping := false`
			`Byte:`
			`for i < len(buf) {`
			`b := buf[i]`

			`switch {`
			`case b == '\\':`
			`escaping = !escaping`
			`p.Pos.Byte++`
			`p.Pos.Column++`
			`i++`
			`case b == '"':`
			`p.Pos.Byte++`
			`p.Pos.Column++`
			`i++`
			`if !escaping {`
			`break Byte`
			`}`
			`escaping = false`
			`case b < 32:`
			`break Byte`
			`default:`
			`// TODO: Use Unicode Text Segmentation spec to advance`
			`// Column only once per grapheme cluster, rather than once per`
			`// byte.`
			`// Consume one or more UTF-8 codepoints that together form`
			`// a single grapheme cluster.`

			`p.Pos.Byte++`
			`p.Pos.Column++`
			`i++`

			`escaping = false`
			`}`
			`}`
			`return buf[:i], buf[i:], p`
			`}`

			`func skipWhitespace(buf []byte, start pos) ([]byte, pos) {`
			`var i int`
			`p := start`
			`Byte:`
			`for i = 0; i < len(buf); i++ {`
			`switch buf[i] {`
			`case ' ':`
			`p.Pos.Byte++`
			`p.Pos.Column++`
			`case '\n':`
			`p.Pos.Byte++`
			`p.Pos.Column = 1`
			`p.Pos.Line++`
			`case '\r':`
			`// For the purpose of line/column counting we consider a`
			`// carriage return to take up no space, assuming that it will`
			`// be paired up with a newline (on Windows, for example) that`
			`// will account for both of them.`
			`p.Pos.Byte++`
			`case '\t':`
			`// We arbitrarily count a tab as if it were two spaces, because`
			`// we need to choose _some_ number here. This means any system`
			`// that renders code on-screen with markers must itself treat`
			`// tabs as a pair of spaces for rendering purposes, or instead`
			`// use the byte offset and back into its own column position.`
			`p.Pos.Byte++`
			`p.Pos.Column += 2`
			`default:`
			`break Byte`
			`}`
			`}`
			`return buf[i:], p`
			`}`

			`type pos struct {`
			`Filename string`
			`Pos zcl.Pos`
			`}`

			`func (p *pos) Range(byteLen, charLen int) zcl.Range {`
			`start := p.Pos`
			`end := p.Pos`
			`end.Byte += byteLen`
			`end.Column += charLen`
			`return zcl.Range{`
			`Filename: p.Filename,`
			`Start: start,`
			`End: end,`
			`}`
			`}`

			`func posRange(start, end pos) zcl.Range {`
			`return zcl.Range{`
			`Filename: start.Filename,`
			`Start: start.Pos,`
			`End: end.Pos,`
			`}`
			`}`

			`func (t token) GoString() string {`
			`return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)`
			`}`