2017-09-11 23:40:37 +00:00
package hclsyntax
2017-05-28 02:00:00 +00:00
import (
2018-12-14 01:22:41 +00:00
"bytes"
2017-05-28 14:38:17 +00:00
"fmt"
2020-03-07 01:17:12 +00:00
"github.com/apparentlymart/go-textseg/v12/textseg"
2019-09-09 23:08:19 +00:00
"github.com/hashicorp/hcl/v2"
2017-05-28 02:00:00 +00:00
)
2017-09-12 01:36:56 +00:00
// Token represents a sequence of bytes from some HCL code that has been
// tagged with a type and its range within the source file.
type Token struct {
	// Type classifies the token, e.g. TokenOBrace or TokenIdent.
	Type TokenType
	// Bytes is the raw source bytes that make up this token.
	Bytes []byte
	// Range is the location of these bytes within the source file.
	Range hcl.Range
}
2017-05-29 23:17:07 +00:00
// Tokens is a slice of Token, representing a contiguous token stream.
type Tokens []Token
2017-05-28 02:00:00 +00:00
// TokenType is an enumeration used for the Type field on Token.
type TokenType rune

const (
	// Single-character tokens are represented by their own character, for
	// convenience in producing these within the scanner. However, the values
	// are otherwise arbitrary and just intended to be mnemonic for humans
	// who might see them in debug output.

	TokenOBrace   TokenType = '{'
	TokenCBrace   TokenType = '}'
	TokenOBrack   TokenType = '['
	TokenCBrack   TokenType = ']'
	TokenOParen   TokenType = '('
	TokenCParen   TokenType = ')'
	TokenOQuote   TokenType = '«'
	TokenCQuote   TokenType = '»'
	TokenOHeredoc TokenType = 'H'
	TokenCHeredoc TokenType = 'h'

	TokenStar    TokenType = '*'
	TokenSlash   TokenType = '/'
	TokenPlus    TokenType = '+'
	TokenMinus   TokenType = '-'
	TokenPercent TokenType = '%'

	TokenEqual         TokenType = '='
	TokenEqualOp       TokenType = '≔'
	TokenNotEqual      TokenType = '≠'
	TokenLessThan      TokenType = '<'
	TokenLessThanEq    TokenType = '≤'
	TokenGreaterThan   TokenType = '>'
	TokenGreaterThanEq TokenType = '≥'

	TokenAnd  TokenType = '∧'
	TokenOr   TokenType = '∨'
	TokenBang TokenType = '!'

	TokenDot   TokenType = '.'
	TokenComma TokenType = ','

	TokenEllipsis TokenType = '…'
	TokenFatArrow TokenType = '⇒'

	TokenQuestion TokenType = '?'
	TokenColon    TokenType = ':'

	TokenTemplateInterp  TokenType = '∫'
	TokenTemplateControl TokenType = 'λ'
	TokenTemplateSeqEnd  TokenType = '∎'

	TokenQuotedLit TokenType = 'Q' // might contain backslash escapes
	TokenStringLit TokenType = 'S' // cannot contain backslash escapes

	TokenNumberLit TokenType = 'N'
	TokenIdent     TokenType = 'I'

	TokenComment TokenType = 'C'

	TokenNewline TokenType = '\n'
	TokenEOF     TokenType = '␄'

	// The rest are not used in the language but recognized by the scanner so
	// we can generate good diagnostics in the parser when users try to write
	// things that might work in other languages they are familiar with, or
	// simply make incorrect assumptions about the HCL language.

	TokenBitwiseAnd    TokenType = '&'
	TokenBitwiseOr     TokenType = '|'
	TokenBitwiseNot    TokenType = '~'
	TokenBitwiseXor    TokenType = '^'
	TokenStarStar      TokenType = '➚'
	TokenApostrophe    TokenType = '\''
	TokenBacktick      TokenType = '`'
	TokenSemicolon     TokenType = ';'
	TokenTabs          TokenType = '␉'
	TokenInvalid       TokenType = '�'
	TokenBadUTF8       TokenType = '💩'
	TokenQuotedNewline TokenType = '␤'

	// TokenNil is a placeholder for when a token is required but none is
	// available, e.g. when reporting errors. The scanner will never produce
	// this as part of a token stream.
	TokenNil TokenType = '\x00'
)
2017-05-28 14:38:17 +00:00
func ( t TokenType ) GoString ( ) string {
2017-09-12 01:36:56 +00:00
return fmt . Sprintf ( "hclsyntax.%s" , t . String ( ) )
2017-05-28 14:38:17 +00:00
}
2017-05-28 22:44:22 +00:00
// scanMode is an enumeration selecting between the scanner's distinct
// scanning behaviors.
type scanMode int

const (
	// scanNormal is the default mode, presumably for the main HCL syntax.
	scanNormal scanMode = iota
	// scanTemplate is for string template content — confirm against the scanner.
	scanTemplate
	// scanIdentOnly is for inputs expected to contain only an identifier —
	// confirm against the scanner.
	scanIdentOnly
)
2017-05-28 14:11:24 +00:00
// tokenAccum accumulates tokens produced during scanning, tracking the
// source position reached so far so that each emitted token can be given
// an accurate hcl.Range.
type tokenAccum struct {
	Filename  string  // filename recorded in each token's Range
	Bytes     []byte  // the source buffer that token offsets index into
	Pos       hcl.Pos // position at the end of the most recently emitted token
	Tokens    []Token // the tokens accumulated so far
	StartByte int     // byte offset of Bytes[0] within the overall source
}
2017-05-28 15:38:13 +00:00
// emitToken appends a new token of the given type to f.Tokens, covering
// the byte range [startOfs, endOfs) within f.Bytes, and advances f.Pos
// to the end of the new token.
//
// The end position is computed by walking the token's bytes one grapheme
// cluster at a time, so columns count perceived characters rather than
// bytes, and both LF and CRLF sequences advance the line counter.
func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) {
	// Walk through our buffer to figure out how much we need to adjust
	// the start pos to get our end pos.

	start := f.Pos
	start.Column += startOfs + f.StartByte - f.Pos.Byte // Safe because only ASCII spaces can be in the offset
	start.Byte = startOfs + f.StartByte

	end := start
	end.Byte = endOfs + f.StartByte
	b := f.Bytes[startOfs:endOfs]
	for len(b) > 0 {
		advance, seq, _ := textseg.ScanGraphemeClusters(b, true)
		// A newline grapheme (bare LF or CRLF pair) starts a new line;
		// anything else occupies one column.
		if (len(seq) == 1 && seq[0] == '\n') || (len(seq) == 2 && seq[0] == '\r' && seq[1] == '\n') {
			end.Line++
			end.Column = 1
		} else {
			end.Column++
		}
		b = b[advance:]
	}

	f.Pos = end

	f.Tokens = append(f.Tokens, Token{
		Type:  ty,
		Bytes: f.Bytes[startOfs:endOfs],
		Range: hcl.Range{
			Filename: f.Filename,
			Start:    start,
			End:      end,
		},
	})
}
2017-05-29 15:55:53 +00:00
// heredocInProgress tracks scanner state for a heredoc that has been
// opened but not yet terminated.
type heredocInProgress struct {
	// Marker is presumably the delimiter word that will terminate the
	// heredoc — confirm against the scanner that consumes this.
	Marker []byte
	// StartOfLine presumably records whether the scanner is currently at
	// the start of a line, where the terminating marker may appear —
	// confirm against the scanner.
	StartOfLine bool
}
2017-06-04 14:34:26 +00:00
2018-12-14 01:22:41 +00:00
// tokenOpensFlushHeredoc reports whether the given token opens a
// "flush" heredoc, i.e. one introduced with "<<-" rather than "<<".
func tokenOpensFlushHeredoc(tok Token) bool {
	return tok.Type == TokenOHeredoc && bytes.HasPrefix(tok.Bytes, []byte("<<-"))
}
2017-06-04 14:34:26 +00:00
// checkInvalidTokens does a simple pass across the given tokens and generates
2017-09-12 01:36:56 +00:00
// diagnostics for tokens that should _never_ appear in HCL source. This
2017-06-04 14:34:26 +00:00
// is intended to avoid the need for the parser to have special support
// for them all over.
//
// Returns a diagnostics with no errors if everything seems acceptable.
// Otherwise, returns zero or more error diagnostics, though tries to limit
// repetition of the same information.
2017-09-11 23:40:37 +00:00
func checkInvalidTokens ( tokens Tokens ) hcl . Diagnostics {
var diags hcl . Diagnostics
2017-06-04 14:34:26 +00:00
toldBitwise := 0
toldExponent := 0
toldBacktick := 0
2019-01-25 13:51:43 +00:00
toldApostrophe := 0
2017-06-04 14:34:26 +00:00
toldSemicolon := 0
toldTabs := 0
toldBadUTF8 := 0
for _ , tok := range tokens {
2019-01-25 13:45:53 +00:00
// copy token so it's safe to point to it
tok := tok
2017-06-04 14:34:26 +00:00
switch tok . Type {
case TokenBitwiseAnd , TokenBitwiseOr , TokenBitwiseXor , TokenBitwiseNot :
if toldBitwise < 4 {
var suggestion string
switch tok . Type {
case TokenBitwiseAnd :
suggestion = " Did you mean boolean AND (\"&&\")?"
case TokenBitwiseOr :
2020-05-21 01:12:28 +00:00
suggestion = " Did you mean boolean OR (\"||\")?"
2017-06-04 14:34:26 +00:00
case TokenBitwiseNot :
suggestion = " Did you mean boolean NOT (\"!\")?"
}
2017-09-11 23:40:37 +00:00
diags = append ( diags , & hcl . Diagnostic {
Severity : hcl . DiagError ,
2017-06-04 14:34:26 +00:00
Summary : "Unsupported operator" ,
Detail : fmt . Sprintf ( "Bitwise operators are not supported.%s" , suggestion ) ,
Subject : & tok . Range ,
} )
toldBitwise ++
}
case TokenStarStar :
if toldExponent < 1 {
2017-09-11 23:40:37 +00:00
diags = append ( diags , & hcl . Diagnostic {
Severity : hcl . DiagError ,
2017-06-04 14:34:26 +00:00
Summary : "Unsupported operator" ,
Detail : "\"**\" is not a supported operator. Exponentiation is not supported as an operator." ,
Subject : & tok . Range ,
} )
toldExponent ++
}
case TokenBacktick :
// Only report for alternating (even) backticks, so we won't report both start and ends of the same
// backtick-quoted string.
2019-01-25 13:49:30 +00:00
if ( toldBacktick % 2 ) == 0 {
2017-09-11 23:40:37 +00:00
diags = append ( diags , & hcl . Diagnostic {
Severity : hcl . DiagError ,
2017-06-04 14:34:26 +00:00
Summary : "Invalid character" ,
Detail : "The \"`\" character is not valid. To create a multi-line string, use the \"heredoc\" syntax, like \"<<EOT\"." ,
Subject : & tok . Range ,
} )
2019-01-25 13:49:30 +00:00
}
if toldBacktick <= 2 {
2017-06-04 14:34:26 +00:00
toldBacktick ++
}
2019-01-25 13:51:43 +00:00
case TokenApostrophe :
if ( toldApostrophe % 2 ) == 0 {
newDiag := & hcl . Diagnostic {
Severity : hcl . DiagError ,
Summary : "Invalid character" ,
2019-01-28 10:32:56 +00:00
Detail : "Single quotes are not valid. Use double quotes (\") to enclose strings." ,
2019-01-25 13:51:43 +00:00
Subject : & tok . Range ,
}
diags = append ( diags , newDiag )
}
if toldApostrophe <= 2 {
toldApostrophe ++
}
2017-06-04 14:34:26 +00:00
case TokenSemicolon :
if toldSemicolon < 1 {
2017-09-11 23:40:37 +00:00
diags = append ( diags , & hcl . Diagnostic {
Severity : hcl . DiagError ,
2017-06-04 14:34:26 +00:00
Summary : "Invalid character" ,
2018-07-18 22:41:35 +00:00
Detail : "The \";\" character is not valid. Use newlines to separate arguments and blocks, and commas to separate items in collection values." ,
2017-06-04 14:34:26 +00:00
Subject : & tok . Range ,
} )
toldSemicolon ++
}
case TokenTabs :
if toldTabs < 1 {
2017-09-11 23:40:37 +00:00
diags = append ( diags , & hcl . Diagnostic {
Severity : hcl . DiagError ,
2017-06-04 14:34:26 +00:00
Summary : "Invalid character" ,
Detail : "Tab characters may not be used. The recommended indentation style is two spaces per indent." ,
Subject : & tok . Range ,
} )
toldTabs ++
}
case TokenBadUTF8 :
if toldBadUTF8 < 1 {
2017-09-11 23:40:37 +00:00
diags = append ( diags , & hcl . Diagnostic {
Severity : hcl . DiagError ,
2017-06-04 14:34:26 +00:00
Summary : "Invalid character encoding" ,
Detail : "All input files must be UTF-8 encoded. Ensure that UTF-8 encoding is selected in your editor." ,
Subject : & tok . Range ,
} )
toldBadUTF8 ++
}
2019-05-03 21:00:54 +00:00
case TokenQuotedNewline :
diags = append ( diags , & hcl . Diagnostic {
Severity : hcl . DiagError ,
Summary : "Invalid multi-line string" ,
Detail : "Quoted strings may not be split over multiple lines. To produce a multi-line string, either use the \\n escape to represent a newline character or use the \"heredoc\" multi-line template syntax." ,
Subject : & tok . Range ,
} )
2017-06-04 14:34:26 +00:00
case TokenInvalid :
2020-08-24 17:43:03 +00:00
chars := string ( tok . Bytes )
switch chars {
case "“" , "”" :
diags = append ( diags , & hcl . Diagnostic {
Severity : hcl . DiagError ,
Summary : "Invalid character" ,
Detail : "\"Curly quotes\" are not valid here. These can sometimes be inadvertently introduced when sharing code via documents or discussion forums. It might help to replace the character with a \"straight quote\"." ,
Subject : & tok . Range ,
} )
default :
diags = append ( diags , & hcl . Diagnostic {
Severity : hcl . DiagError ,
Summary : "Invalid character" ,
Detail : "This character is not used within the language." ,
Subject : & tok . Range ,
} )
}
2017-06-04 14:34:26 +00:00
}
}
return diags
}
2018-12-19 23:52:15 +00:00
// utf8BOM is the UTF-8 encoding of the Unicode byte order mark.
var utf8BOM = []byte{0xef, 0xbb, 0xbf}

// stripUTF8BOM checks whether the given buffer begins with a UTF-8 byte order
// mark (0xEF 0xBB 0xBF) and, if so, returns a truncated slice with the same
// backing array but with the BOM skipped.
//
// If there is no BOM present, the given slice is returned verbatim.
func stripUTF8BOM(src []byte) []byte {
	if !bytes.HasPrefix(src, utf8BOM) {
		return src
	}
	return src[len(utf8BOM):]
}