hcl/zcl/zclsyntax/token.go

package zclsyntax

import (
	"fmt"

	"github.com/apparentlymart/go-textseg/textseg"
	"github.com/zclconf/go-zcl/zcl"
)

// Token represents a sequence of bytes from some zcl code that has been
// tagged with a type and its range within the source file.
//
// Tokens produced by tokenAccum.emitToken carry a Bytes slice that is a
// sub-slice of the accumulator's source buffer rather than a copy.
type Token struct {
	Type  TokenType // which kind of token this is; one of the Token* constants
	Bytes []byte    // the raw source bytes that make up the token
	Range zcl.Range // filename plus start and end positions of the token
}

// Tokens is a slice of Token, such as the token stream that
// checkInvalidTokens inspects.
type Tokens []Token

// TokenType is an enumeration used for the Type field on Token.
//
// Its underlying type is rune so that single-character tokens can use their
// own character as their value; the remaining values are arbitrary runes
// chosen to be mnemonic in debug output.
type TokenType rune

//go:generate stringer -type TokenType -output token_type_string.go

const (
	// Single-character tokens are represented by their own character, for
	// convenience in producing these within the scanner. However, the values
	// are otherwise arbitrary and just intended to be mnemonic for humans
	// who might see them in debug output.

	TokenOBrace   TokenType = '{'
	TokenCBrace   TokenType = '}'
	TokenOBrack   TokenType = '['
	TokenCBrack   TokenType = ']'
	TokenOParen   TokenType = '('
	TokenCParen   TokenType = ')'
	TokenOQuote   TokenType = '«'
	TokenCQuote   TokenType = '»'
	TokenOHeredoc TokenType = 'H'
	TokenCHeredoc TokenType = 'h'

	TokenStar    TokenType = '*'
	TokenSlash   TokenType = '/'
	TokenPlus    TokenType = '+'
	TokenMinus   TokenType = '-'
	TokenPercent TokenType = '%'

	TokenEqual         TokenType = '='
	TokenNotEqual      TokenType = '≠'
	TokenLessThan      TokenType = '<'
	TokenLessThanEq    TokenType = '≤'
	TokenGreaterThan   TokenType = '>'
	TokenGreaterThanEq TokenType = '≥'

	TokenAnd  TokenType = '∧'
	TokenOr   TokenType = '∨'
	TokenBang TokenType = '!'

	TokenDot   TokenType = '.'
	TokenComma TokenType = ','

	TokenQuestion TokenType = '?'
	TokenColon    TokenType = ':'

	TokenTemplateInterp  TokenType = '∫'
	TokenTemplateControl TokenType = 'λ'
	TokenTemplateSeqEnd  TokenType = '∎'

	TokenQuotedLit TokenType = 'Q' // might contain backslash escapes
	TokenStringLit TokenType = 'S' // cannot contain backslash escapes
	TokenNumberLit TokenType = 'N'
	TokenIdent     TokenType = 'I'

	TokenComment TokenType = 'C'

	TokenNewline TokenType = '\n'
	TokenEOF     TokenType = '␄'

	// The rest are not used in the language but recognized by the scanner so
	// we can generate good diagnostics in the parser when users try to write
	// things that might work in other languages they are familiar with, or
	// simply make incorrect assumptions about the zcl language.

	TokenBitwiseAnd TokenType = '&'
	TokenBitwiseOr  TokenType = '|'
	TokenBitwiseNot TokenType = '~'
	TokenBitwiseXor TokenType = '^'
	TokenStarStar   TokenType = '➚'
	TokenBacktick   TokenType = '`'
	TokenSemicolon  TokenType = ';'
	TokenTabs       TokenType = '␉'
	TokenInvalid    TokenType = '<27>'
	TokenBadUTF8    TokenType = '💩'

	// TokenNil is a placeholder for when a token is required but none is
	// available, e.g. when reporting errors. The scanner will never produce
	// this as part of a token stream.
	TokenNil TokenType = '\x00'
)

// GoString returns the token type's name qualified with the package name,
// e.g. "zclsyntax.TokenIdent". The method has the signature of
// fmt.GoStringer, so it is what the %#v verb prints for a TokenType.
func (t TokenType) GoString() string {
	name := t.String()
	return fmt.Sprint("zclsyntax.", name)
}

// scanMode enumerates the modes of the scanner.
//
// NOTE(review): the scanner that consumes these values is not in this file;
// the per-constant descriptions below are inferred from the names and
// should be confirmed against the scanner.
type scanMode int

const (
	// scanNormal is the zero value; presumably ordinary configuration
	// scanning.
	scanNormal scanMode = iota
	// scanTemplate presumably starts the scanner in template-scanning state.
	scanTemplate
)

// tokenAccum collects the tokens recognized in a source buffer. Tokens are
// added with emitToken, which also keeps Pos up to date.
type tokenAccum struct {
	Filename string  // copied into the Range of every emitted token
	Bytes    []byte  // source buffer that emitToken's byte offsets index into
	Pos      zcl.Pos // end position of the most recently emitted token
	Tokens   []Token // tokens emitted so far, in emission order
}

// emitToken appends a token of type ty covering f.Bytes[startOfs:endOfs] to
// f.Tokens and advances f.Pos to the end of that token.
//
// startOfs and endOfs are byte offsets into f.Bytes. The token's start
// position is derived from f.Pos (the end of the previously emitted token),
// and its end position is found by walking the token's bytes one grapheme
// cluster at a time: a newline cluster ("\n" or "\r\n") moves to column 1 of
// the next line, and any other cluster advances the column by one.
//
// The emitted Token's Bytes field aliases f.Bytes; it is not a copy.
func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) {
	// Walk through our buffer to figure out how much we need to adjust
	// the start pos to get our end pos.

	start := f.Pos
	start.Column += startOfs - f.Pos.Byte // Safe because only ASCII spaces can be in the offset
	start.Byte = startOfs

	end := start
	end.Byte = endOfs
	tokBytes := f.Bytes[startOfs:endOfs]
	for b := tokBytes; len(b) > 0; {
		// The error result is ignored, as it always has been here.
		// NOTE(review): confirm ScanGraphemeClusters cannot fail when given
		// an in-memory buffer with atEOF set.
		advance, seq, _ := textseg.ScanGraphemeClusters(b, true)

		// A CR LF pair is a single two-byte grapheme cluster, so it must be
		// recognized explicitly; otherwise Windows-style line endings would
		// be counted as one column instead of a line break.
		isLF := len(seq) == 1 && seq[0] == '\n'
		isCRLF := len(seq) == 2 && seq[0] == '\r' && seq[1] == '\n'
		if isLF || isCRLF {
			end.Line++
			end.Column = 1
		} else {
			end.Column++
		}
		b = b[advance:]
	}

	f.Pos = end

	f.Tokens = append(f.Tokens, Token{
		Type:  ty,
		Bytes: tokBytes,
		Range: zcl.Range{
			Filename: f.Filename,
			Start:    start,
			End:      end,
		},
	})
}

// heredocInProgress holds state about a heredoc that is being scanned.
//
// NOTE(review): nothing in this file reads or writes this type; the field
// descriptions below are inferred from the names and should be confirmed
// against the scanner.
type heredocInProgress struct {
	Marker      []byte // presumably the heredoc's delimiter identifier
	StartOfLine bool   // presumably whether the scanner is at the start of a line
}

// checkInvalidTokens does a simple pass across the given tokens and generates
// diagnostics for tokens that should _never_ appear in ZCL source. This
// is intended to avoid the need for the parser to have special support
// for them all over.
//
// Returns no diagnostics if everything seems acceptable. Otherwise, returns
// one or more error diagnostics, though tries to limit repetition of the
// same information:
//
//   - bitwise operators are reported at most four times;
//   - "**", ";", tabs and invalid UTF-8 are each reported once;
//   - backticks are reported for the first and third occurrence only, so that
//     the opening and closing backtick of one quoted string are not both
//     reported;
//   - every TokenInvalid is reported.
//
// Each diagnostic's Subject points at its own copy of the offending token's
// range.
func checkInvalidTokens(tokens Tokens) zcl.Diagnostics {
	var diags zcl.Diagnostics

	// errorAt builds an error diagnostic whose Subject is a private copy of
	// rng. Taking the address of the range loop variable's field instead
	// would, before Go 1.22, make every diagnostic share the same storage
	// and end up describing the last token visited.
	errorAt := func(rng zcl.Range, summary, detail string) *zcl.Diagnostic {
		return &zcl.Diagnostic{
			Severity: zcl.DiagError,
			Summary:  summary,
			Detail:   detail,
			Subject:  &rng,
		}
	}

	toldBitwise := 0
	toldExponent := 0
	seenBackticks := 0 // counts every backtick, reported or not
	toldSemicolon := 0
	toldTabs := 0
	toldBadUTF8 := 0

	for _, tok := range tokens {
		switch tok.Type {
		case TokenBitwiseAnd, TokenBitwiseOr, TokenBitwiseXor, TokenBitwiseNot:
			if toldBitwise < 4 {
				// XOR has no boolean counterpart to suggest.
				var suggestion string
				switch tok.Type {
				case TokenBitwiseAnd:
					suggestion = " Did you mean boolean AND (\"&&\")?"
				case TokenBitwiseOr:
					suggestion = " Did you mean boolean OR (\"||\")?"
				case TokenBitwiseNot:
					suggestion = " Did you mean boolean NOT (\"!\")?"
				}

				diags = append(diags, errorAt(
					tok.Range,
					"Unsupported operator",
					fmt.Sprintf("Bitwise operators are not supported.%s", suggestion),
				))
				toldBitwise++
			}
		case TokenStarStar:
			if toldExponent < 1 {
				diags = append(diags, errorAt(
					tok.Range,
					"Unsupported operator",
					"\"**\" is not a supported operator. Exponentiation is not supported as an operator.",
				))

				toldExponent++
			}
		case TokenBacktick:
			// Only report for alternating (even) backticks, so we won't report both start and ends of the same
			// backtick-quoted string. The counter must advance on every
			// backtick, reported or not, for the alternation to work.
			if seenBackticks < 4 && (seenBackticks%2) == 0 {
				diags = append(diags, errorAt(
					tok.Range,
					"Invalid character",
					"The \"`\" character is not valid. To create a multi-line string, use the \"heredoc\" syntax, like \"<<EOT\".",
				))
			}
			seenBackticks++
		case TokenSemicolon:
			if toldSemicolon < 1 {
				diags = append(diags, errorAt(
					tok.Range,
					"Invalid character",
					"The \";\" character is not valid. Use newlines to separate attributes and blocks, and commas to separate items in collection values.",
				))

				toldSemicolon++
			}
		case TokenTabs:
			if toldTabs < 1 {
				diags = append(diags, errorAt(
					tok.Range,
					"Invalid character",
					"Tab characters may not be used. The recommended indentation style is two spaces per indent.",
				))

				toldTabs++
			}
		case TokenBadUTF8:
			if toldBadUTF8 < 1 {
				diags = append(diags, errorAt(
					tok.Range,
					"Invalid character encoding",
					"All input files must be UTF-8 encoded. Ensure that UTF-8 encoding is selected in your editor.",
				))

				toldBadUTF8++
			}
		case TokenInvalid:
			// Reported every time; this must not touch the tab counter, or
			// an invalid character would suppress a later tab diagnostic.
			diags = append(diags, errorAt(
				tok.Range,
				"Invalid character",
				"This character is not used within the language.",
			))
		}
	}
	return diags
}
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								package zclsyntax
 								import (
-												zclsyntax: use stringer for TokenType stringification

											
										
										
											2017-05-28 14:38:17 +00:00
+									"fmt"
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									"github.com/apparentlymart/go-textseg/textseg"
 									"github.com/zclconf/go-zcl/zcl"
 								)
 								// Token represents a sequence of bytes from some zcl code that has been
 								// tagged with a type and its range within the source file.
 								type Token struct {
 									Type  TokenType
 									Bytes []byte
 									Range zcl.Range
 								}
-												zclsyntax: public interface to the scanner

This LexConfig, LexExpression and LexTemplate set of functions allow
outside callers to use the scanner in isolation, skipping the parser.
This may be useful for use-cases such as syntax highlighting, separate
parsers (such as the one in zclwrite), and so forth. Most callers should
use the parser (once implemented) though, to get a semantic AST.

											
										
										
											2017-05-29 23:17:07 +00:00
+								// Tokens is a slice of Token.
 								type Tokens []Token
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								// TokenType is an enumeration used for the Type field on Token.
 								type TokenType rune
-												zclsyntax: use stringer for TokenType stringification

											
										
										
											2017-05-28 14:38:17 +00:00
+								//go:generate stringer -type TokenType -output token_type_string.go
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								const (
 									// Single-character tokens are represented by their own character, for
 									// convenience in producing these within the scanner. However, the values
 									// are otherwise arbitrary and just intended to be mnemonic for humans
 									// who might see them in debug output.
-												zclsyntax: heredoc to be separate start/end tokens

Just as we have OQuote and CQuote, we need the same for heredocs so that
we can parse their contents as templates that may span multiple tokens.

											
										
										
											2017-05-28 16:36:32 +00:00
+									TokenOBrace   TokenType = '{'
 									TokenCBrace   TokenType = '}'
 									TokenOBrack   TokenType = '['
 									TokenCBrack   TokenType = ']'
 									TokenOParen   TokenType = '('
 									TokenCParen   TokenType = ')'
 									TokenOQuote   TokenType = '«'
 									TokenCQuote   TokenType = '»'
 									TokenOHeredoc TokenType = 'H'
 									TokenCHeredoc TokenType = 'h'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
-												zclsyntax: add TokenPercent

This was missed on the first pass, and is needed for the modulo operator.

											
										
										
											2017-05-31 14:31:49 +00:00
+									TokenStar    TokenType = '*'
 									TokenSlash   TokenType = '/'
 									TokenPlus    TokenType = '+'
 									TokenMinus   TokenType = '-'
 									TokenPercent TokenType = '%'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
 									TokenEqual         TokenType = '='
 									TokenNotEqual      TokenType = '≠'
 									TokenLessThan      TokenType = '<'
 									TokenLessThanEq    TokenType = '≤'
 									TokenGreaterThan   TokenType = '>'
 									TokenGreaterThanEq TokenType = '≥'
 									TokenAnd  TokenType = '∧'
 									TokenOr   TokenType = '∨'
 									TokenBang TokenType = '!'
-												zclsyntax: add TokenComma TokenType

											
										
										
											2017-06-02 14:40:42 +00:00
+									TokenDot   TokenType = '.'
 									TokenComma TokenType = ','
-												zclsyntax: add TokenPercent

This was missed on the first pass, and is needed for the modulo operator.

											
										
										
											2017-05-31 14:31:49 +00:00
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									TokenQuestion TokenType = '?'
 									TokenColon    TokenType = ':'
 									TokenTemplateInterp  TokenType = '∫'
 									TokenTemplateControl TokenType = 'λ'
-												zclsyntax: include a token for the end of a template sequence

Although this end symbol appears as just a close-brace in source, it's
worth differentiating it because the scanner must differentiate it anyway
(to recognize moving back into template-scanning mode) and it avoids the
parser from having to similarly re-recognize the difference.

											
										
										
											2017-05-28 14:20:39 +00:00
+									TokenTemplateSeqEnd  TokenType = '∎'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
-												zclsyntax: differentiate quoted and unquoted string literals

The context where a string literal was found affects what sort of escaping
it can have, so we need to distinguish these cases so that we will only
look for and handle backslash escapes in quoted strings.

											
										
										
											2017-05-31 02:03:25 +00:00
+									TokenQuotedLit TokenType = 'Q' // might contain backslash escapes
 									TokenStringLit TokenType = 'S' // cannot contain backslash escapes
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									TokenNumberLit TokenType = 'N'
 									TokenIdent     TokenType = 'I'
-												zclsyntax: scanning of comments

											
										
										
											2017-05-29 16:13:35 +00:00
+									TokenComment TokenType = 'C'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									TokenNewline TokenType = '\n'
 									TokenEOF     TokenType = '␄'
 									// The rest are not used in the language but recognized by the scanner so
 									// we can generate good diagnostics in the parser when users try to write
 									// things that might work in other languages they are familiar with, or
 									// simply make incorrect assumptions about the zcl language.
 									TokenBitwiseAnd TokenType = '&'
 									TokenBitwiseOr  TokenType = '|'
 									TokenBitwiseNot TokenType = '~'
 									TokenBitwiseXor TokenType = '^'
 									TokenStarStar   TokenType = '➚'
 									TokenBacktick   TokenType = '`'
 									TokenSemicolon  TokenType = ';'
-												zclsyntax: re-organize and simplify the scanner

											
										
										
											2017-05-28 15:38:13 +00:00
+									TokenTabs       TokenType = '␉'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									TokenInvalid    TokenType = '\uFFFD'
 									TokenBadUTF8    TokenType = '💩'
-												zclsyntax: initial pass at body parsing

Only able to parse empty bodies so far.

											
										
										
											2017-05-30 02:28:10 +00:00
 									// TokenNil is a placeholder for when a token is required but none is
 									// available, e.g. when reporting errors. The scanner will never produce
 									// this as part of a token stream.
 									TokenNil TokenType = '\x00'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								)
-												zclsyntax: use stringer for TokenType stringification

											
										
										
											2017-05-28 14:38:17 +00:00
+								func (t TokenType) GoString() string {
 									return fmt.Sprintf("zclsyntax.%s", t.String())
 								}
-												zclsyntax: allow scanner to support multiple modes

A scanner "mode" decides which state it starts in, allowing us to start
in template mode for parsing top-level templates. However, currently the
only mode implemented is "normal" mode, which is the behavior we had
before.

											
										
										
											2017-05-28 22:44:22 +00:00
+								type scanMode int
 								const (
 									scanNormal scanMode = iota
 									scanTemplate
 								)
-												zclsyntax: scanner to return whole token slice at once

On reflection, it seems easier to maintain the necessary state we need
by doing all of the scanning in a single pass, since we can then just
use local variables within the scanner function.

											
										
										
											2017-05-28 14:11:24 +00:00
+								type tokenAccum struct {
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									Filename string
 									Bytes    []byte
-												zclsyntax: re-organize and simplify the scanner

											
										
										
											2017-05-28 15:38:13 +00:00
+									Pos      zcl.Pos
-												zclsyntax: scanner to return whole token slice at once

On reflection, it seems easier to maintain the necessary state we need
by doing all of the scanning in a single pass, since we can then just
use local variables within the scanner function.

											
										
										
											2017-05-28 14:11:24 +00:00
+									Tokens   []Token
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								}
-												zclsyntax: re-organize and simplify the scanner

											
										
										
											2017-05-28 15:38:13 +00:00
+								func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) {
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									// Walk through our buffer to figure out how much we need to adjust
 									// the start pos to get our end pos.
-												zclsyntax: re-organize and simplify the scanner

											
										
										
											2017-05-28 15:38:13 +00:00
+									start := f.Pos
 									start.Column += startOfs - f.Pos.Byte // Safe because only ASCII spaces can be in the offset
 									start.Byte = startOfs
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
 									end := start
-												zclsyntax: re-organize and simplify the scanner

											
										
										
											2017-05-28 15:38:13 +00:00
+									end.Byte = endOfs
 									b := f.Bytes[startOfs:endOfs]
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									for len(b) > 0 {
 										advance, seq, _ := textseg.ScanGraphemeClusters(b, true)
 										if len(seq) == 1 && seq[0] == '\n' {
 											end.Line++
 											end.Column = 1
 										} else {
 											end.Column++
 										}
 										b = b[advance:]
 									}
-												zclsyntax: re-organize and simplify the scanner

											
										
										
											2017-05-28 15:38:13 +00:00
+									f.Pos = end
-												zclsyntax: scanner to return whole token slice at once

On reflection, it seems easier to maintain the necessary state we need
by doing all of the scanning in a single pass, since we can then just
use local variables within the scanner function.

											
										
										
											2017-05-28 14:11:24 +00:00
+									f.Tokens = append(f.Tokens, Token{
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+										Type:  ty,
 										Bytes: f.Bytes[startOfs:endOfs],
 										Range: zcl.Range{
 											Filename: f.Filename,
 											Start:    start,
 											End:      end,
 										},
-												zclsyntax: scanner to return whole token slice at once

On reflection, it seems easier to maintain the necessary state we need
by doing all of the scanning in a single pass, since we can then just
use local variables within the scanner function.

											
										
										
											2017-05-28 14:11:24 +00:00
+									})
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								}
-												zclsyntax: heredoc support in the scanner

											
										
										
											2017-05-29 15:55:53 +00:00
 								type heredocInProgress struct {
 									Marker      []byte
 									StartOfLine bool
 								}
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
 								// checkInvalidTokens does a simple pass across the given tokens and generates
 								// diagnostics for tokens that should _never_ appear in ZCL source. This
 								// is intended to avoid the need for the parser to have special support
 								// for them all over.
 								//
 								// Returns a diagnostics with no errors if everything seems acceptable.
 								// Otherwise, returns zero or more error diagnostics, though tries to limit
 								// repetition of the same information.
 								func checkInvalidTokens(tokens Tokens) zcl.Diagnostics {
 									var diags zcl.Diagnostics
 									toldBitwise := 0
 									toldExponent := 0
 									toldBacktick := 0
 									toldSemicolon := 0
 									toldTabs := 0
 									toldBadUTF8 := 0
 									for _, tok := range tokens {
 										switch tok.Type {
 										case TokenBitwiseAnd, TokenBitwiseOr, TokenBitwiseXor, TokenBitwiseNot:
 											if toldBitwise < 4 {
 												var suggestion string
 												switch tok.Type {
 												case TokenBitwiseAnd:
 													suggestion = " Did you mean boolean AND (\"&&\")?"
 												case TokenBitwiseOr:
 													suggestion = " Did you mean boolean OR (\"&&\")?"
 												case TokenBitwiseNot:
 													suggestion = " Did you mean boolean NOT (\"!\")?"
 												}
 												diags = append(diags, &zcl.Diagnostic{
 													Severity: zcl.DiagError,
 													Summary:  "Unsupported operator",
 													Detail:   fmt.Sprintf("Bitwise operators are not supported.%s", suggestion),
 													Subject:  &tok.Range,
 												})
 												toldBitwise++
 											}
 										case TokenStarStar:
 											if toldExponent < 1 {
 												diags = append(diags, &zcl.Diagnostic{
 													Severity: zcl.DiagError,
 													Summary:  "Unsupported operator",
 													Detail:   "\"**\" is not a supported operator. Exponentiation is not supported as an operator.",
 													Subject:  &tok.Range,
 												})
 												toldExponent++
 											}
 										case TokenBacktick:
 											// Only report for alternating (even) backticks, so we won't report both start and ends of the same
 											// backtick-quoted string.
 											if toldExponent < 4 && (toldExponent%2) == 0 {
 												diags = append(diags, &zcl.Diagnostic{
 													Severity: zcl.DiagError,
 													Summary:  "Invalid character",
 													Detail:   "The \"`\" character is not valid. To create a multi-line string, use the \"heredoc\" syntax, like \"<<EOT\".",
 													Subject:  &tok.Range,
 												})
 												toldBacktick++
 											}
 										case TokenSemicolon:
 											if toldSemicolon < 1 {
 												diags = append(diags, &zcl.Diagnostic{
 													Severity: zcl.DiagError,
 													Summary:  "Invalid character",
 													Detail:   "The \";\" character is not valid. Use newlines to separate attributes and blocks, and commas to separate items in collection values.",
 													Subject:  &tok.Range,
 												})
 												toldSemicolon++
 											}
 										case TokenTabs:
 											if toldTabs < 1 {
 												diags = append(diags, &zcl.Diagnostic{
 													Severity: zcl.DiagError,
 													Summary:  "Invalid character",
 													Detail:   "Tab characters may not be used. The recommended indentation style is two spaces per indent.",
 													Subject:  &tok.Range,
 												})
 												toldTabs++
 											}
 										case TokenBadUTF8:
 											if toldBadUTF8 < 1 {
 												diags = append(diags, &zcl.Diagnostic{
 													Severity: zcl.DiagError,
 													Summary:  "Invalid character encoding",
 													Detail:   "All input files must be UTF-8 encoded. Ensure that UTF-8 encoding is selected in your editor.",
 													Subject:  &tok.Range,
 												})
 												toldBadUTF8++
 											}
 										case TokenInvalid:
 											diags = append(diags, &zcl.Diagnostic{
 												Severity: zcl.DiagError,
 												Summary:  "Invalid character",
 												Detail:   "This character is not used within the language.",
 												Subject:  &tok.Range,
 											})
 											toldTabs++
 										}
 									}
 									return diags
 								}