// hcl/hclsyntax/token.go

package hclsyntax

import (
	"bytes"
	"fmt"

	"github.com/apparentlymart/go-textseg/v13/textseg"
	"github.com/hashicorp/hcl/v2"
)

// Token represents a sequence of bytes from some HCL code that has been
// tagged with a type and its range within the source file.
type Token struct {
	Type  TokenType // classification of the token; one of the Token* constants
	Bytes []byte    // raw source bytes covered by the token
	Range hcl.Range // location of the token: filename plus start and end positions
}

// Tokens is a slice of Token.
type Tokens []Token

// TokenType is an enumeration used for the Type field on Token.
//
// Each value is a rune chosen to be mnemonic in debug output; apart from
// being distinct from one another the values carry no meaning.
type TokenType rune

const (
	// Single-character tokens are represented by their own character, for
	// convenience in producing these within the scanner. However, the values
	// are otherwise arbitrary and just intended to be mnemonic for humans
	// who might see them in debug output.

	TokenOBrace   TokenType = '{'
	TokenCBrace   TokenType = '}'
	TokenOBrack   TokenType = '['
	TokenCBrack   TokenType = ']'
	TokenOParen   TokenType = '('
	TokenCParen   TokenType = ')'
	TokenOQuote   TokenType = '«'
	TokenCQuote   TokenType = '»'
	TokenOHeredoc TokenType = 'H'
	TokenCHeredoc TokenType = 'h'

	TokenStar    TokenType = '*'
	TokenSlash   TokenType = '/'
	TokenPlus    TokenType = '+'
	TokenMinus   TokenType = '-'
	TokenPercent TokenType = '%'

	TokenEqual         TokenType = '='
	TokenEqualOp       TokenType = '≔'
	TokenNotEqual      TokenType = '≠'
	TokenLessThan      TokenType = '<'
	TokenLessThanEq    TokenType = '≤'
	TokenGreaterThan   TokenType = '>'
	TokenGreaterThanEq TokenType = '≥'

	TokenAnd  TokenType = '∧'
	TokenOr   TokenType = '∨'
	TokenBang TokenType = '!'

	TokenDot   TokenType = '.'
	TokenComma TokenType = ','

	TokenEllipsis TokenType = '…'
	TokenFatArrow TokenType = '⇒'

	TokenQuestion TokenType = '?'
	TokenColon    TokenType = ':'

	TokenTemplateInterp  TokenType = '∫'
	TokenTemplateControl TokenType = 'λ'
	TokenTemplateSeqEnd  TokenType = '∎'

	TokenQuotedLit TokenType = 'Q' // might contain backslash escapes
	TokenStringLit TokenType = 'S' // cannot contain backslash escapes
	TokenNumberLit TokenType = 'N'
	TokenIdent     TokenType = 'I'

	TokenComment TokenType = 'C'

	TokenNewline TokenType = '\n'
	TokenEOF     TokenType = '␄'

	// The rest are not used in the language but recognized by the scanner so
	// we can generate good diagnostics in the parser when users try to write
	// things that might work in other languages they are familiar with, or
	// simply make incorrect assumptions about the HCL language.

	TokenBitwiseAnd    TokenType = '&'
	TokenBitwiseOr     TokenType = '|'
	TokenBitwiseNot    TokenType = '~'
	TokenBitwiseXor    TokenType = '^'
	TokenStarStar      TokenType = '➚'
	TokenApostrophe    TokenType = '\''
	TokenBacktick      TokenType = '`'
	TokenSemicolon     TokenType = ';'
	TokenTabs          TokenType = '␉'
	TokenInvalid       TokenType = '\uFFFD' // U+FFFD REPLACEMENT CHARACTER, escaped so it survives re-encoding
	TokenBadUTF8       TokenType = '💩'
	TokenQuotedNewline TokenType = '␤'

	// TokenNil is a placeholder for when a token is required but none is
	// available, e.g. when reporting errors. The scanner will never produce
	// this as part of a token stream.
	TokenNil TokenType = '\x00'
)

// GoString implements fmt.GoStringer. It returns the token type's String
// form qualified with the package name, which is what the %#v verb prints.
func (t TokenType) GoString() string {
	name := t.String()
	return fmt.Sprintf("hclsyntax.%s", name)
}

// scanMode enumerates the modes the scanner can be run in.
// NOTE(review): the scanner that consumes these values is not in this file;
// the per-mode notes below are inferred from the names — confirm there.
type scanMode int

const (
	// scanNormal is the zero value of scanMode.
	scanNormal scanMode = iota
	// scanTemplate presumably starts scanning in template context — TODO confirm.
	scanTemplate
	// scanIdentOnly presumably restricts scanning to a single identifier — TODO confirm.
	scanIdentOnly
)

// tokenAccum collects tokens as they are emitted and tracks the source
// position reached so far, so that emitToken can give each token a range.
type tokenAccum struct {
	Filename  string  // copied into the Range of every emitted token
	Bytes     []byte  // buffer being scanned; emitToken offsets index into it
	Pos       hcl.Pos // position reached so far; emitToken sets it to the end of each emitted token
	Tokens    []Token // tokens emitted so far, in emission order
	StartByte int     // added to buffer offsets to obtain the byte positions stored in ranges
}

// emitToken appends a token of type ty covering f.Bytes[startOfs:endOfs] and
// moves f.Pos to the end of that token.
//
// startOfs and endOfs are offsets into f.Bytes; f.StartByte is added to them
// to produce the byte positions recorded in the token's range. Columns are
// counted in grapheme clusters, and both "\n" and "\r\n" begin a new line.
func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) {
	raw := f.Bytes[startOfs:endOfs]

	// The gap between the previous token and this one is added to the column
	// byte-for-byte, which is safe because only ASCII spaces can be in it.
	first := f.Pos
	first.Column += startOfs + f.StartByte - f.Pos.Byte
	first.Byte = startOfs + f.StartByte

	// Walk the token one grapheme cluster at a time to find where it ends.
	last := first
	last.Byte = endOfs + f.StartByte
	for rest := raw; len(rest) > 0; {
		n, cluster, _ := textseg.ScanGraphemeClusters(rest, true)
		switch string(cluster) {
		case "\n", "\r\n":
			last.Line++
			last.Column = 1
		default:
			last.Column++
		}
		rest = rest[n:]
	}

	f.Pos = last

	tok := Token{
		Type:  ty,
		Bytes: raw,
		Range: hcl.Range{
			Filename: f.Filename,
			Start:    first,
			End:      last,
		},
	}
	f.Tokens = append(f.Tokens, tok)
}

// heredocInProgress holds state about a heredoc.
// NOTE(review): the code that reads these fields is not in this file; the
// field notes below are inferred from the names — confirm against the scanner.
type heredocInProgress struct {
	Marker      []byte // presumably the heredoc's delimiter text — TODO confirm
	StartOfLine bool   // presumably whether the scanner is at the start of a line — TODO confirm
}

// tokenOpensFlushHeredoc reports whether tok is an opening heredoc token
// whose bytes begin with the "<<-" introducer.
func tokenOpensFlushHeredoc(tok Token) bool {
	return tok.Type == TokenOHeredoc && bytes.HasPrefix(tok.Bytes, []byte("<<-"))
}

// checkInvalidTokens makes a single pass over tokens and produces an error
// diagnostic for each kind of token that must never appear in HCL source,
// so the parser does not need special handling for them in every context.
//
// The result is empty when no such tokens are present. Repeats of the same
// problem are throttled: exponent operators, semicolons, tabs and bad UTF-8
// are reported once; bitwise operators at most four times; backticks and
// apostrophes on their first and third occurrence only. Quoted newlines and
// other invalid characters are reported every time they occur.
func checkInvalidTokens(tokens Tokens) hcl.Diagnostics {
	var diags hcl.Diagnostics

	// fail records one error-severity diagnostic located at subject.
	fail := func(subject *hcl.Range, summary, detail string) {
		diags = append(diags, &hcl.Diagnostic{
			Severity: hcl.DiagError,
			Summary:  summary,
			Detail:   detail,
			Subject:  subject,
		})
	}

	// Throttling state: counters where the count matters, flags where a
	// single report is enough.
	var bitwiseCount, backtickCount, apostropheCount int
	var saidExponent, saidSemicolon, saidTabs, saidBadUTF8 bool

	for i := range tokens {
		// Take a per-iteration copy so diagnostics can safely retain a
		// pointer to its range.
		tok := tokens[i]
		where := &tok.Range

		switch tok.Type {
		case TokenBitwiseAnd, TokenBitwiseOr, TokenBitwiseXor, TokenBitwiseNot:
			if bitwiseCount >= 4 {
				continue
			}
			bitwiseCount++

			detail := "Bitwise operators are not supported."
			switch tok.Type {
			case TokenBitwiseAnd:
				detail += " Did you mean boolean AND (\"&&\")?"
			case TokenBitwiseOr:
				detail += " Did you mean boolean OR (\"||\")?"
			case TokenBitwiseNot:
				detail += " Did you mean boolean NOT (\"!\")?"
			}
			fail(where, "Unsupported operator", detail)

		case TokenStarStar:
			if saidExponent {
				continue
			}
			saidExponent = true
			fail(where, "Unsupported operator", "\"**\" is not a supported operator. Exponentiation is not supported as an operator.")

		case TokenBacktick:
			// Report only even-numbered occurrences so the opening and closing
			// backtick of one quoted string do not both produce an error.
			if backtickCount%2 == 0 {
				fail(where, "Invalid character", "The \"`\" character is not valid. To create a multi-line string, use the \"heredoc\" syntax, like \"<<EOT\".")
			}
			if backtickCount <= 2 {
				backtickCount++
			}

		case TokenApostrophe:
			// Same alternating scheme as for backticks.
			if apostropheCount%2 == 0 {
				fail(where, "Invalid character", "Single quotes are not valid. Use double quotes (\") to enclose strings.")
			}
			if apostropheCount <= 2 {
				apostropheCount++
			}

		case TokenSemicolon:
			if saidSemicolon {
				continue
			}
			saidSemicolon = true
			fail(where, "Invalid character", "The \";\" character is not valid. Use newlines to separate arguments and blocks, and commas to separate items in collection values.")

		case TokenTabs:
			if saidTabs {
				continue
			}
			saidTabs = true
			fail(where, "Invalid character", "Tab characters may not be used. The recommended indentation style is two spaces per indent.")

		case TokenBadUTF8:
			if saidBadUTF8 {
				continue
			}
			saidBadUTF8 = true
			fail(where, "Invalid character encoding", "All input files must be UTF-8 encoded. Ensure that UTF-8 encoding is selected in your editor.")

		case TokenQuotedNewline:
			fail(where, "Invalid multi-line string", "Quoted strings may not be split over multiple lines. To produce a multi-line string, either use the \\n escape to represent a newline character or use the \"heredoc\" multi-line template syntax.")

		case TokenInvalid:
			detail := "This character is not used within the language."
			if chars := string(tok.Bytes); chars == "“" || chars == "”" {
				detail = "\"Curly quotes\" are not valid here. These can sometimes be inadvertently introduced when sharing code via documents or discussion forums. It might help to replace the character with a \"straight quote\"."
			}
			fail(where, "Invalid character", detail)
		}
	}

	return diags
}

// utf8BOM is the three-byte UTF-8 byte order mark.
var utf8BOM = []byte{0xef, 0xbb, 0xbf}

// stripUTF8BOM returns src with a leading UTF-8 byte order mark
// (0xEF 0xBB 0xBF) removed. The result is a sub-slice of src, so it shares
// the same backing array.
//
// When src does not start with a BOM it is returned unchanged.
func stripUTF8BOM(src []byte) []byte {
	return bytes.TrimPrefix(src, utf8BOM)
}
-												Move the zcl package and its two parsing subpackages to "hcl" names

This is a super-invasive update since the "zcl" package in particular
is referenced all over.

There are probably still a few zcl references hanging around in comments,
etc but this takes care of most of it.

											
										
										
											2017-09-11 23:40:37 +00:00
+								package hclsyntax
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
 								import (
-												hcl/hclsyntax: Fix up parsing of flush heredocs

This was implemented a long time ago in the original template parser, but
it was missed in the rewrite of the template parser to make it use a
two-stage parsing strategy.

It's implemented as a post-processing step on the result of the first
stage of parsing, which produces a flat sequence of literal strings,
interpolation markers, and control markers, and prior to the second stage
which matches opening and closing control markers to produce an expression
AST.

It's important to do this at parse time rather than eval time since it is
the static layout of the source code that decides the indentation level,
and so an interpolation marker at the start of a line that itself produces
spaces does not affect the result.

											
										
										
											2018-12-14 01:22:41 +00:00
+									"bytes"
-												zclsyntax: use stringer for TokenType stringification

											
										
										
											2017-05-28 14:38:17 +00:00
+									"fmt"
-												Use Unicode 13 text segmentation rules

HCL uses a number of upstream libraries that implement algorithms defined
in Unicode. This commit is updating those libraries all to versions that
have Unicode 13 support.

The main implication of this for HCL directly is that when it returns
column numbers in source locations it will count characters using the
Unicode 13 definition of "character", which includes various new
multi-codeunit characters added in Unicode 13.

These new version dependencies will also make Unicode 13 support available
for other functionality that HCL callers might use, such as the stdlib
functions in upstream cty, even though HCL itself does not directly use
those.

											
										
										
											2021-02-23 00:31:36 +00:00
+									"github.com/apparentlymart/go-textseg/v13/textseg"
-												Unfold the "hcl" directory up into the root

The main HCL package is more visible this way, and so it's easier than
having to pick it out from dozens of other package directories.

											
										
										
											2019-09-09 23:08:19 +00:00
+									"github.com/hashicorp/hcl/v2"
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								)
-												More miscellaneous renaming of ZCL to HCL.

											
										
										
											2017-09-12 01:36:56 +00:00
+								// Token represents a sequence of bytes from some HCL code that has been
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								// tagged with a type and its range within the source file.
 								type Token struct {
 									Type  TokenType
 									Bytes []byte
-												Move the zcl package and its two parsing subpackages to "hcl" names

This is a super-invasive update since the "zcl" package in particular
is referenced all over.

There are probably still a few zcl references hanging around in comments,
etc but this takes care of most of it.

											
										
										
											2017-09-11 23:40:37 +00:00
+									Range hcl.Range
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								}
-												zclsyntax: public interface to the scanner

This LexConfig, LexExpression and LexTemplate set of functions allow
outside callers to use the scanner in isolation, skipping the parser.
This may be useful for use-cases such as syntax highlighting, separate
parsers (such as the one in zclwrite), and so forth. Most callers should
use the parser (once implemented) though, to get a semantic AST.

											
										
										
											2017-05-29 23:17:07 +00:00
+								// Tokens is a slice of Token.
 								type Tokens []Token
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								// TokenType is an enumeration used for the Type field on Token.
 								type TokenType rune
 								const (
 									// Single-character tokens are represented by their own character, for
 									// convenience in producing these within the scanner. However, the values
 									// are otherwise arbitrary and just intended to be mnemonic for humans
 									// who might see them in debug output.
-												zclsyntax: heredoc to be separate start/end tokens

Just as we have OQuote and CQuote, we need the same for heredocs so that
we can parse their contents as templates that may span multiple tokens.

											
										
										
											2017-05-28 16:36:32 +00:00
+									TokenOBrace   TokenType = '{'
 									TokenCBrace   TokenType = '}'
 									TokenOBrack   TokenType = '['
 									TokenCBrack   TokenType = ']'
 									TokenOParen   TokenType = '('
 									TokenCParen   TokenType = ')'
 									TokenOQuote   TokenType = '«'
 									TokenCQuote   TokenType = '»'
 									TokenOHeredoc TokenType = 'H'
 									TokenCHeredoc TokenType = 'h'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
-												zclsyntax: add TokenPercent

This was missed on the first pass, and is needed for the modulo operator.

											
										
										
											2017-05-31 14:31:49 +00:00
+									TokenStar    TokenType = '*'
 									TokenSlash   TokenType = '/'
 									TokenPlus    TokenType = '+'
 									TokenMinus   TokenType = '-'
 									TokenPercent TokenType = '%'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
 									TokenEqual         TokenType = '='
-												zclsyntax: separate tokens for assign =  and equality test ==

											
										
										
											2017-06-04 23:00:40 +00:00
+									TokenEqualOp       TokenType = '≔'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									TokenNotEqual      TokenType = '≠'
 									TokenLessThan      TokenType = '<'
 									TokenLessThanEq    TokenType = '≤'
 									TokenGreaterThan   TokenType = '>'
 									TokenGreaterThanEq TokenType = '≥'
 									TokenAnd  TokenType = '∧'
 									TokenOr   TokenType = '∨'
 									TokenBang TokenType = '!'
-												zclsyntax: add TokenComma TokenType

											
										
										
											2017-06-02 14:40:42 +00:00
+									TokenDot   TokenType = '.'
 									TokenComma TokenType = ','
-												zclsyntax: add TokenPercent

This was missed on the first pass, and is needed for the modulo operator.

											
										
										
											2017-05-31 14:31:49 +00:00
-												zclsyntax: Ellipsis and "fat arrow" tokens

These will be used in the "for" expression, and later ellipsis will also
be used within calls to expand tuples as args.

											
										
										
											2017-06-13 15:50:20 +00:00
+									TokenEllipsis TokenType = '…'
 									TokenFatArrow TokenType = '⇒'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									TokenQuestion TokenType = '?'
 									TokenColon    TokenType = ':'
 									TokenTemplateInterp  TokenType = '∫'
 									TokenTemplateControl TokenType = 'λ'
-												zclsyntax: include a token for the end of a template sequence

Although this end symbol appears as just a close-brace in source, it's
worth differentiating it because the scanner must differentiate it anyway
(to recognize moving back into template-scanning mode) and it avoids the
parser from having to similarly re-recognize the difference.

											
										
										
											2017-05-28 14:20:39 +00:00
+									TokenTemplateSeqEnd  TokenType = '∎'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
-												zclsyntax: differentiate quoted and unquoted string literals

The context where a string literal was found affects what sort of escaping
it can have, so we need to distinguish these cases so that we will only
look for and handle backslash escapes in quoted strings.

											
										
										
											2017-05-31 02:03:25 +00:00
+									TokenQuotedLit TokenType = 'Q' // might contain backslash escapes
 									TokenStringLit TokenType = 'S' // cannot contain backslash escapes
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									TokenNumberLit TokenType = 'N'
 									TokenIdent     TokenType = 'I'
-												zclsyntax: scanning of comments

											
										
										
											2017-05-29 16:13:35 +00:00
+									TokenComment TokenType = 'C'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									TokenNewline TokenType = '\n'
 									TokenEOF     TokenType = '␄'
 									// The rest are not used in the language but recognized by the scanner so
 									// we can generate good diagnostics in the parser when users try to write
 									// things that might work in other languages they are familiar with, or
-												More miscellaneous renaming of ZCL to HCL.

											
										
										
											2017-09-12 01:36:56 +00:00
+									// simply make incorrect assumptions about the HCL language.
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
-												hcl/hclsyntax: Correct scanning of literal $ and % before quotes

The TemplateStringLiteral production was not quite right, causing a
literal $ or % immediately followed by " to consume the quotes and any
following characters on the line if there were any more characters on the
line.

Now we match things more precisely, but at the expense of generating some
redundant extra tokens when escapes and literal dollar/percent signs are
present. Those extra tokens don't matter in practice because the resulting
strings get concatenated together anyway, which is proven by the fact
that this changeset includes changes only to the scanner and parser tests,
and not to any of the expression result tests.

While here, I also improved the error message for when the user attempts
to split a quoted string over multiple lines. Previously it was just using
the generic "invalid character" message, which isn't particularly
actionable. Now we'll give the user a couple options of what to do
instead.

											
										
										
											2019-05-03 21:00:54 +00:00
+									TokenBitwiseAnd    TokenType = '&'
 									TokenBitwiseOr     TokenType = '|'
 									TokenBitwiseNot    TokenType = '~'
 									TokenBitwiseXor    TokenType = '^'
 									TokenStarStar      TokenType = '➚'
 									TokenApostrophe    TokenType = '\''
 									TokenBacktick      TokenType = '`'
 									TokenSemicolon     TokenType = ';'
 									TokenTabs          TokenType = '␉'
 									TokenInvalid       TokenType = '\uFFFD'
 									TokenBadUTF8       TokenType = '💩'
 									TokenQuotedNewline TokenType = '␤'
-												zclsyntax: initial pass at body parsing

Only able to parse empty bodies so far.

											
										
										
											2017-05-30 02:28:10 +00:00
 									// TokenNil is a placeholder for when a token is required but none is
 									// available, e.g. when reporting errors. The scanner will never produce
 									// this as part of a token stream.
 									TokenNil TokenType = '\x00'
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								)
-												zclsyntax: use stringer for TokenType stringification

											
										
										
											2017-05-28 14:38:17 +00:00
+								func (t TokenType) GoString() string {
-												More miscellaneous renaming of ZCL to HCL.

											
										
										
											2017-09-12 01:36:56 +00:00
+									return fmt.Sprintf("hclsyntax.%s", t.String())
-												zclsyntax: use stringer for TokenType stringification

											
										
										
											2017-05-28 14:38:17 +00:00
+								}
-												zclsyntax: allow scanner to support multiple modes

A scanner "mode" decides which state it starts in, allowing us to start
in template mode for parsing top-level templates. However, currently the
only mode implemented is "normal" mode, which is the behavior we had
before.

											
										
										
											2017-05-28 22:44:22 +00:00
+								type scanMode int
 								const (
 									scanNormal scanMode = iota
 									scanTemplate
-												hclsyntax: ValidIdentifier function

Calling applications often need to validate strings provided by the user
that will eventually be variable or attribute names in the evaluation
scope, to ensure that they will be evaluable.

Rather than having each application specify its own different subset of
the full set we support (which is derived from Unicode specifications),
we provide a simple function to let callers easily check the validity
of a potential identifier using exactly the same scanning rules we use
within the expression scanner.

To achieve this we actually invoke the scanner and then assert on its
result, which is a pretty expensive way to just check one string but it's
easy to do with code we already have in place and we don't expect this
sort of validation to be going on in a tight loop.

											
										
										
											2018-02-02 16:09:40 +00:00
+									scanIdentOnly
-												zclsyntax: allow scanner to support multiple modes

A scanner "mode" decides which state it starts in, allowing us to start
in template mode for parsing top-level templates. However, currently the
only mode implemented is "normal" mode, which is the behavior we had
before.

											
										
										
											2017-05-28 22:44:22 +00:00
+								)
-												zclsyntax: scanner to return whole token slice at once

On reflection, it seems easier to maintain the necessary state we need
by doing all of the scanning in a single pass, since we can then just
use local variables within the scanner function.

											
										
										
											2017-05-28 14:11:24 +00:00
+								type tokenAccum struct {
-												hcl/hclsyntax: Properly support scanning from a non-zero start offset

Although our API had a place to provide a start position for scanning, it
didn't actually work in practice because the scanner wasn't aware of it
and so it would immediately undo the effect of that start offset when
making the first position adjustment.

Now we'll remember the byte offset we started at and offset the indices
the generated scanner produces so that they are treated as relative
to that start byte instead of byte zero.

Since we rarely start with a non-zero pos this doesn't affect much, but
one specific thing it affects is the positions of native syntax templates
inside JSON syntax strings.

											
										
										
											2018-12-13 00:38:20 +00:00
+									Filename  string
 									Bytes     []byte
 									Pos       hcl.Pos
 									Tokens    []Token
 									StartByte int
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								}
-												zclsyntax: re-organize and simplify the scanner

											
										
										
											2017-05-28 15:38:13 +00:00
+								func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) {
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									// Walk through our buffer to figure out how much we need to adjust
 									// the start pos to get our end pos.
-												zclsyntax: re-organize and simplify the scanner

											
										
										
											2017-05-28 15:38:13 +00:00
+									start := f.Pos
-												hcl/hclsyntax: Properly support scanning from a non-zero start offset

Although our API had a place to provide a start position for scanning, it
didn't actually work in practice because the scanner wasn't aware of it
and so it would immediately undo the effect of that start offset when
making the first position adjustment.

Now we'll remember the byte offset we started at and offset the indices
the generated scanner produces so that they are treated as relative
to that start byte instead of byte zero.

Since we rarely start with a non-zero pos this doesn't affect much, but
one specific thing it affects is the positions of native syntax templates
inside JSON syntax strings.

											
										
										
											2018-12-13 00:38:20 +00:00
+									start.Column += startOfs + f.StartByte - f.Pos.Byte // Safe because only ASCII spaces can be in the offset
 									start.Byte = startOfs + f.StartByte
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
 									end := start
-												hcl/hclsyntax: Properly support scanning from a non-zero start offset

Although our API had a place to provide a start position for scanning, it
didn't actually work in practice because the scanner wasn't aware of it
and so it would immediately undo the effect of that start offset when
making the first position adjustment.

Now we'll remember the byte offset we started at and offset the indices
the generated scanner produces so that they are treated as relative
to that start byte instead of byte zero.

Since we rarely start with a non-zero pos this doesn't affect much, but
one specific thing it affects is the positions of native syntax templates
inside JSON syntax strings.

											
										
										
											2018-12-13 00:38:20 +00:00
+									end.Byte = endOfs + f.StartByte
-												zclsyntax: re-organize and simplify the scanner

											
										
										
											2017-05-28 15:38:13 +00:00
+									b := f.Bytes[startOfs:endOfs]
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+									for len(b) > 0 {
 										advance, seq, _ := textseg.ScanGraphemeClusters(b, true)
-												hclsyntax: count \r\n line endings properly in source ranges

Previously we were only counting a \n as starting a new line, so input
using \r\n endings would get treated as one long line for source-range
purposes.

Now we also consider \r\n to be a newline marker, resetting the column
count to zero and incrementing the line just as we would do for a single
\n. This is made easier because the unicode definition of "grapheme
cluster" considers \r\n to be a single character, so we don't need to
do anything special in order to match it.

											
										
										
											2018-03-08 16:30:58 +00:00
+										if (len(seq) == 1 && seq[0] == '\n') || (len(seq) == 2 && seq[0] == '\r' && seq[1] == '\n') {
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+											end.Line++
 											end.Column = 1
 										} else {
 											end.Column++
 										}
 										b = b[advance:]
 									}
-												zclsyntax: re-organize and simplify the scanner

											
										
										
											2017-05-28 15:38:13 +00:00
+									f.Pos = end
-												zclsyntax: scanner to return whole token slice at once

On reflection, it seems easier to maintain the necessary state we need
by doing all of the scanning in a single pass, since we can then just
use local variables within the scanner function.

											
										
										
											2017-05-28 14:11:24 +00:00
+									f.Tokens = append(f.Tokens, Token{
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+										Type:  ty,
 										Bytes: f.Bytes[startOfs:endOfs],
-												Move the zcl package and its two parsing subpackages to "hcl" names

This is a super-invasive update since the "zcl" package in particular
is referenced all over.

There are probably still a few zcl references hanging around in comments,
etc but this takes care of most of it.

											
										
										
											2017-09-11 23:40:37 +00:00
+										Range: hcl.Range{
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+											Filename: f.Filename,
 											Start:    start,
 											End:      end,
 										},
-												zclsyntax: scanner to return whole token slice at once

On reflection, it seems easier to maintain the necessary state we need
by doing all of the scanning in a single pass, since we can then just
use local variables within the scanner function.

											
										
										
											2017-05-28 14:11:24 +00:00
+									})
-												zclsyntax: define the initial set of language tokens for the scanner

											
										
										
											2017-05-28 02:00:00 +00:00
+								}
-												zclsyntax: heredoc support in the scanner

											
										
										
											2017-05-29 15:55:53 +00:00
 								// heredocInProgress groups two pieces of state about a heredoc.
 								// NOTE(review): the code that reads and writes these fields is not visible
 								// in this part of the file; the field notes below are inferred from the
 								// names and should be confirmed against the scanner.
 								type heredocInProgress struct {
 									// Marker is presumably the delimiter that terminates the heredoc — TODO confirm.
 									Marker      []byte
 									// StartOfLine presumably tracks whether scanning is at the start of a line — TODO confirm.
 									StartOfLine bool
 								}
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately, when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
-												hcl/hclsyntax: Fix up parsing of flush heredocs

This was implemented a long time ago in the original template parser, but
it was missed in the rewrite of the template parser to make it use a
two-stage parsing strategy.

It's implemented as a post-processing step on the result of the first
stage of parsing, which produces a flat sequence of literal strings,
interpolation markers, and control markers, and prior to the second stage
which matches opening and closing control markers to produce an expression
AST.

It's important to do this at parse time rather than eval time since it is
the static layout of the source code that decides the indentation level,
and so an interpolation marker at the start of a line that itself produces
spaces does not affect the result.

											
										
										
											2018-12-14 01:22:41 +00:00
+								func tokenOpensFlushHeredoc(tok Token) bool {
 									if tok.Type != TokenOHeredoc {
 										return false
 									}
 									return bytes.HasPrefix(tok.Bytes, []byte{'<', '<', '-'})
 								}
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately, when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+								// checkInvalidTokens does a simple pass across the given tokens and generates
-												More miscellaneous renaming of ZCL to HCL.

											
										
										
											2017-09-12 01:36:56 +00:00
+								// diagnostics for tokens that should _never_ appear in HCL source. This
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+								// is intended to avoid the need for the parser to have special support
 								// for them all over.
 								//
 								// Returns a diagnostics with no errors if everything seems acceptable.
 								// Otherwise, returns zero or more error diagnostics, though tries to limit
 								// repetition of the same information.
-												Move the zcl package and its two parsing subpackages to "hcl" names

This is a super-invasive update since the "zcl" package in particular
is referenced all over.

There are probably still a few zcl references hanging around in comments,
etc but this takes care of most of it.

											
										
										
											2017-09-11 23:40:37 +00:00
+								func checkInvalidTokens(tokens Tokens) hcl.Diagnostics {
 									var diags hcl.Diagnostics
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
 									toldBitwise := 0
 									toldExponent := 0
 									toldBacktick := 0
-												hcl/hclsyntax: Produce better error message for invalid apostrophe

											
										
										
											2019-01-25 13:51:43 +00:00
+									toldApostrophe := 0
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+									toldSemicolon := 0
 									toldTabs := 0
 									toldBadUTF8 := 0
 									for _, tok := range tokens {
-												hcl/hclsyntax: Fix token range reporting for invalid characters

											
										
										
											2019-01-25 13:45:53 +00:00
+										// copy token so it's safe to point to it
 										tok := tok
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+										switch tok.Type {
 										case TokenBitwiseAnd, TokenBitwiseOr, TokenBitwiseXor, TokenBitwiseNot:
 											if toldBitwise < 4 {
 												var suggestion string
 												switch tok.Type {
 												case TokenBitwiseAnd:
 													suggestion = " Did you mean boolean AND (\"&&\")?"
 												case TokenBitwiseOr:
-												Fix a wrong error message for bitwise OR

Fixes #376

											
										
										
											2020-05-21 01:12:28 +00:00
+													suggestion = " Did you mean boolean OR (\"||\")?"
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+												case TokenBitwiseNot:
 													suggestion = " Did you mean boolean NOT (\"!\")?"
 												}
-												Move the zcl package and its two parsing subpackages to "hcl" names

This is a super-invasive update since the "zcl" package in particular
is referenced all over.

There are probably still a few zcl references hanging around in comments,
etc but this takes care of most of it.

											
										
										
											2017-09-11 23:40:37 +00:00
+												diags = append(diags, &hcl.Diagnostic{
 													Severity: hcl.DiagError,
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+													Summary:  "Unsupported operator",
 													Detail:   fmt.Sprintf("Bitwise operators are not supported.%s", suggestion),
 													Subject:  &tok.Range,
 												})
 												toldBitwise++
 											}
 										case TokenStarStar:
 											if toldExponent < 1 {
-												Move the zcl package and its two parsing subpackages to "hcl" names

This is a super-invasive update since the "zcl" package in particular
is referenced all over.

There are probably still a few zcl references hanging around in comments,
etc but this takes care of most of it.

											
										
										
											2017-09-11 23:40:37 +00:00
+												diags = append(diags, &hcl.Diagnostic{
 													Severity: hcl.DiagError,
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+													Summary:  "Unsupported operator",
 													Detail:   "\"**\" is not a supported operator. Exponentiation is not supported as an operator.",
 													Subject:  &tok.Range,
 												})
 												toldExponent++
 											}
 										case TokenBacktick:
 											// Only report for alternating (even) backticks, so we won't report both start and ends of the same
 											// backtick-quoted string.
-												hcl/hclsyntax: Fix backtick and tab duplicate detection

											
										
										
											2019-01-25 13:49:30 +00:00
+											if (toldBacktick % 2) == 0 {
-												Move the zcl package and its two parsing subpackages to "hcl" names

This is a super-invasive update since the "zcl" package in particular
is referenced all over.

There are probably still a few zcl references hanging around in comments,
etc but this takes care of most of it.

											
										
										
											2017-09-11 23:40:37 +00:00
+												diags = append(diags, &hcl.Diagnostic{
 													Severity: hcl.DiagError,
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+													Summary:  "Invalid character",
 													Detail:   "The \"`\" character is not valid. To create a multi-line string, use the \"heredoc\" syntax, like \"<<EOT\".",
 													Subject:  &tok.Range,
 												})
-												hcl/hclsyntax: Fix backtick and tab duplicate detection

											
										
										
											2019-01-25 13:49:30 +00:00
+											}
 											if toldBacktick <= 2 {
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+												toldBacktick++
 											}
-												hcl/hclsyntax: Produce better error message for invalid apostrophe

											
										
										
											2019-01-25 13:51:43 +00:00
+										case TokenApostrophe:
 											if (toldApostrophe % 2) == 0 {
 												newDiag := &hcl.Diagnostic{
 													Severity: hcl.DiagError,
 													Summary:  "Invalid character",
-												hcl/hclsyntax: Clarify character in error message

Per PR feedback https://github.com/hashicorp/hcl2/pull/33#discussion_r251081391
											
										
										
											2019-01-28 10:32:56 +00:00
+													Detail:   "Single quotes are not valid. Use double quotes (\") to enclose strings.",
-												hcl/hclsyntax: Produce better error message for invalid apostrophe

											
										
										
											2019-01-25 13:51:43 +00:00
+													Subject:  &tok.Range,
 												}
 												diags = append(diags, newDiag)
 											}
 											if toldApostrophe <= 2 {
 												toldApostrophe++
 											}
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+										case TokenSemicolon:
 											if toldSemicolon < 1 {
-												Move the zcl package and its two parsing subpackages to "hcl" names

This is a super-invasive update since the "zcl" package in particular
is referenced all over.

There are probably still a few zcl references hanging around in comments,
etc but this takes care of most of it.

											
										
										
											2017-09-11 23:40:37 +00:00
+												diags = append(diags, &hcl.Diagnostic{
 													Severity: hcl.DiagError,
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+													Summary:  "Invalid character",
-												Fix "attribute" vs "argument" terminology in diagnostics

During implementation of HCL in other applications, it became clear that
the overloading of the word "attribute" to mean both a key/value pair in
a body and an element within an object value creates confusion.

It's too late to change that in the HCL Go API now, but here we at least
update the diagnostic messages. The new convention is that a key/value
pair within a block is now called an "argument", while an element of an
object is still called an "attribute".

It is unfortunate that the Go-facing API still uses the word "attribute"
for both, but the user experience is the most important thing and in
practice many applications will treat block arguments as one way to set
the attributes of some object anyway, and in that case arguments can be
thought of as the subset of attributes of an object whose values come
from that object's associated block.

This also includes a few other minor terminology tweaks in the diagnostic
messages the reflect how our lexicon has evolved during development and
authoring of user-facing documentation.

											
										
										
											2018-07-18 22:41:35 +00:00
+													Detail:   "The \";\" character is not valid. Use newlines to separate arguments and blocks, and commas to separate items in collection values.",
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+													Subject:  &tok.Range,
 												})
 												toldSemicolon++
 											}
 										case TokenTabs:
 											if toldTabs < 1 {
-												Move the zcl package and its two parsing subpackages to "hcl" names

This is a super-invasive update since the "zcl" package in particular
is referenced all over.

There are probably still a few zcl references hanging around in comments,
etc but this takes care of most of it.

											
										
										
											2017-09-11 23:40:37 +00:00
+												diags = append(diags, &hcl.Diagnostic{
 													Severity: hcl.DiagError,
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+													Summary:  "Invalid character",
 													Detail:   "Tab characters may not be used. The recommended indentation style is two spaces per indent.",
 													Subject:  &tok.Range,
 												})
 												toldTabs++
 											}
 										case TokenBadUTF8:
 											if toldBadUTF8 < 1 {
-												Move the zcl package and its two parsing subpackages to "hcl" names

This is a super-invasive update since the "zcl" package in particular
is referenced all over.

There are probably still a few zcl references hanging around in comments,
etc but this takes care of most of it.

											
										
										
											2017-09-11 23:40:37 +00:00
+												diags = append(diags, &hcl.Diagnostic{
 													Severity: hcl.DiagError,
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+													Summary:  "Invalid character encoding",
 													Detail:   "All input files must be UTF-8 encoded. Ensure that UTF-8 encoding is selected in your editor.",
 													Subject:  &tok.Range,
 												})
 												toldBadUTF8++
 											}
-												hcl/hclsyntax: Correct scanning of literal $ and % before quotes

The TemplateStringLiteral production was not quite right, causing a
literal $ or % immediately followed by " to consume the quotes and any
following characters on the line if there were any more characters on the
line.

Now we match things more precisely, but at the expense of generating some
redundant extra tokens when escapes and literal dollar/percent signs are
present. Those extra tokens don't matter in practice because the resulting
strings get concatenated together anyway, which is proven by the fact
that this changeset includes changes only to the scanner and parser tests,
and not to any of the expression result tests.

While here, I also improved the error message for when the user attempts
to split a quoted string over multiple lines. Previously it was just using
the generic "invalid character" message, which isn't particularly
actionable. Now we'll give the user a couple options of what to do
instead.

											
										
										
											2019-05-03 21:00:54 +00:00
+										case TokenQuotedNewline:
 											diags = append(diags, &hcl.Diagnostic{
 												Severity: hcl.DiagError,
 												Summary:  "Invalid multi-line string",
 												Detail:   "Quoted strings may not be split over multiple lines. To produce a multi-line string, either use the \\n escape to represent a newline character or use the \"heredoc\" multi-line template syntax.",
 												Subject:  &tok.Range,
 											})
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+										case TokenInvalid:
-												hclsyntax: Tailored error for "curly quotes"

It seems to be somewhat common for someone to share HCL code via a forum
or a document and have the well-meaning word processor or CMS replace the
straight quotes with curly quotes, which then lead to confusing errors
when someone copies the result and tries to use it as valid HCL
configuration.

Here we add a special hint for that, giving a tailored error message
instead of the generic "This character is not used within the language"
error message.

HCL has always had some of these special hints implemented here, and they
were originally implemented with special token types to allow the parser
handle them. However, we later refactored to do the check all at once
inside the Lex* family of functions, prior to parsing, so it's now
relatively straightforward to handle it as a special case of TokenInvalid
rather than an entirely new token type. Perhaps later we'll rework the
existing ones to also just use TokenInvalid, but that's a decision for
another day.

											
										
										
											2020-08-24 17:43:03 +00:00
+											chars := string(tok.Bytes)
 											switch chars {
 											case "“", "”":
 												diags = append(diags, &hcl.Diagnostic{
 													Severity: hcl.DiagError,
 													Summary:  "Invalid character",
 													Detail:   "\"Curly quotes\" are not valid here. These can sometimes be inadvertently introduced when sharing code via documents or discussion forums. It might help to replace the character with a \"straight quote\".",
 													Subject:  &tok.Range,
 												})
 											default:
 												diags = append(diags, &hcl.Diagnostic{
 													Severity: hcl.DiagError,
 													Summary:  "Invalid character",
 													Detail:   "This character is not used within the language.",
 													Subject:  &tok.Range,
 												})
 											}
-												zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.

											
										
										
											2017-06-04 14:34:26 +00:00
+										}
 									}
 									return diags
 								}
-												hcl/hclsyntax: Accept and ignore UTF-8 byte order marks

A BOM is pointless in a UTF-8 file because it has a fixed encoding
agnostic of host byte ordering, but since Windows tends to use UTF-16
internally lots of Windows software will tend to generate redundant BOM
sequences at the start of UTF-8 files too.

By tolerating a leading BOM we can make life easier for those using such
Windows software, without any significant loss for normal use. This
slightly violates some of our normal assumptions about token positioning
since the BOM occupies bytes but not visible columns, but we'll just
accept that this may cause some slightly-odd behavior for use-cases such
as the diagnostic renderer and hclwrite.

											
										
										
											2018-12-19 23:52:15 +00:00
 								// utf8BOM holds the three bytes of a UTF-8 byte order mark.
 								var utf8BOM = []byte("\xef\xbb\xbf")

 								// stripUTF8BOM returns src with a leading UTF-8 byte order mark
 								// (0xEF 0xBB 0xBF) removed, if one is present. The result is a sub-slice
 								// of src that shares its backing array.
 								//
 								// When src does not start with a BOM it is returned unchanged.
 								func stripUTF8BOM(src []byte) []byte {
 									return bytes.TrimPrefix(src, utf8BOM)
 								}