zclsyntax: generate lexer diagnostics

There are certain tokens that are _never_ valid, so we might as well
catch them early in the Lex... functions rather than having to handle
them in many different contexts within the parser.

Unfortunately for now when such errors occur they tend to be echoed by
more confusing errors coming from the parser, but we'll accept that for
now.
This commit is contained in:
Martin Atkins 2017-06-04 07:34:26 -07:00
parent f220c26836
commit e100bf4723
6 changed files with 197 additions and 28 deletions

View File

@ -404,6 +404,39 @@ block "valid" {}
},
},
},
{
` `,
2, // tabs not allowed, and body item is required here
&Body{
Attributes: Attributes{},
Blocks: Blocks{},
SrcRange: zcl.Range{
Start: zcl.Pos{Line: 1, Column: 1, Byte: 0},
End: zcl.Pos{Line: 1, Column: 2, Byte: 1},
},
EndRange: zcl.Range{
Start: zcl.Pos{Line: 1, Column: 2, Byte: 1},
End: zcl.Pos{Line: 1, Column: 2, Byte: 1},
},
},
},
{
`\x81`,
2, // invalid UTF-8, and body item is required here
&Body{
Attributes: Attributes{},
Blocks: Blocks{},
SrcRange: zcl.Range{
Start: zcl.Pos{Line: 1, Column: 1, Byte: 0},
End: zcl.Pos{Line: 1, Column: 2, Byte: 1},
},
EndRange: zcl.Range{
Start: zcl.Pos{Line: 1, Column: 2, Byte: 1},
End: zcl.Pos{Line: 1, Column: 2, Byte: 1},
},
},
},
}
prettyConfig := &pretty.Config{

View File

@ -15,10 +15,11 @@ import (
// should be served using the zcl.Body interface to ensure compatibility with
// other configuration syntaxes, such as JSON.
func ParseConfig(src []byte, filename string, start zcl.Pos) (*zcl.File, zcl.Diagnostics) {
tokens := LexConfig(src, filename, start)
tokens, diags := LexConfig(src, filename, start)
peeker := newPeeker(tokens, false)
parser := &parser{peeker: peeker}
body, diags := parser.ParseBody(TokenEOF)
body, parseDiags := parser.ParseBody(TokenEOF)
diags = append(diags, parseDiags...)
return &zcl.File{
Body: body,
Bytes: src,
@ -28,7 +29,7 @@ func ParseConfig(src []byte, filename string, start zcl.Pos) (*zcl.File, zcl.Dia
// ParseExpression parses the given buffer as a standalone zcl expression,
// returning it as an instance of Expression.
func ParseExpression(src []byte, filename string, start zcl.Pos) (Expression, zcl.Diagnostics) {
tokens := LexExpression(src, filename, start)
tokens, diags := LexExpression(src, filename, start)
peeker := newPeeker(tokens, false)
parser := &parser{peeker: peeker}
@ -36,7 +37,8 @@ func ParseExpression(src []byte, filename string, start zcl.Pos) (Expression, zc
// they were wrapped in parentheses.
parser.PushIncludeNewlines(false)
expr, diags := parser.ParseExpression()
expr, parseDiags := parser.ParseExpression()
diags = append(diags, parseDiags...)
next := parser.Peek()
if next.Type != TokenEOF && !parser.recovery {
@ -54,28 +56,51 @@ func ParseExpression(src []byte, filename string, start zcl.Pos) (Expression, zc
// ParseTemplate parses the given buffer as a standalone zcl template,
// returning it as an instance of Expression.
func ParseTemplate(src []byte, filename string, start zcl.Pos) (Expression, zcl.Diagnostics) {
tokens := LexTemplate(src, filename, start)
tokens, diags := LexTemplate(src, filename, start)
peeker := newPeeker(tokens, false)
parser := &parser{peeker: peeker}
return parser.ParseTemplate(TokenEOF)
expr, parseDiags := parser.ParseTemplate(TokenEOF)
diags = append(diags, parseDiags...)
return expr, diags
}
// LexConfig performs lexical analysis on the given buffer, treating it as a
// whole zcl config file, and returns the resulting tokens.
func LexConfig(src []byte, filename string, start zcl.Pos) Tokens {
return scanTokens(src, filename, start, scanNormal)
//
// Only minimal validation is done during lexical analysis, so the returned
// diagnostics may include errors about lexical issues such as bad character
// encodings or unrecognized characters, but full parsing is required to
// detect _all_ syntax errors.
func LexConfig(src []byte, filename string, start zcl.Pos) (Tokens, zcl.Diagnostics) {
tokens := scanTokens(src, filename, start, scanNormal)
diags := checkInvalidTokens(tokens)
return tokens, diags
}
// LexExpression performs lexical analysis on the given buffer, treating it as
// a standalone zcl expression, and returns the resulting tokens.
func LexExpression(src []byte, filename string, start zcl.Pos) Tokens {
//
// Only minimal validation is done during lexical analysis, so the returned
// diagnostics may include errors about lexical issues such as bad character
// encodings or unrecognized characters, but full parsing is required to
// detect _all_ syntax errors.
func LexExpression(src []byte, filename string, start zcl.Pos) (Tokens, zcl.Diagnostics) {
// This is actually just the same thing as LexConfig, since configs
// and expressions lex in the same way.
return scanTokens(src, filename, start, scanNormal)
tokens := scanTokens(src, filename, start, scanNormal)
diags := checkInvalidTokens(tokens)
return tokens, diags
}
// LexTemplate performs lexical analysis on the given buffer, treating it as a
// standalone zcl template, and returns the resulting tokens.
func LexTemplate(src []byte, filename string, start zcl.Pos) Tokens {
return scanTokens(src, filename, start, scanTemplate)
//
// Only minimal validation is done during lexical analysis, so the returned
// diagnostics may include errors about lexical issues such as bad character
// encodings or unrecognized characters, but full parsing is required to
// detect _all_ syntax errors.
func LexTemplate(src []byte, filename string, start zcl.Pos) (Tokens, zcl.Diagnostics) {
tokens := scanTokens(src, filename, start, scanTemplate)
diags := checkInvalidTokens(tokens)
return tokens, diags
}

View File

@ -2284,7 +2284,7 @@ var _zcltok_trans_actions []byte = []byte{
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 121, 95, 0, 0,
0, 0, 0, 0, 121, 97, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
@ -2300,18 +2300,18 @@ var _zcltok_trans_actions []byte = []byte{
0, 29, 0, 49, 35, 47, 128, 0,
0, 0, 0, 69, 55, 67, 134, 0,
0, 0, 0, 0, 71, 0, 0, 0,
91, 149, 0, 83, 146, 5, 0, 85,
0, 87, 0, 97, 0, 0, 0, 0,
91, 152, 0, 83, 146, 5, 0, 85,
0, 87, 0, 95, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 5, 5, 5, 152, 152, 152,
152, 152, 152, 5, 5, 152, 5, 109,
111, 99, 107, 73, 79, 105, 101, 0,
0, 0, 5, 5, 5, 149, 149, 149,
149, 149, 149, 5, 5, 149, 5, 109,
113, 99, 107, 73, 79, 105, 101, 0,
0, 77, 75, 103, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 81,
89, 113, 0, 0, 0, 0, 0, 0,
89, 111, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
@ -3552,14 +3552,14 @@ func scanTokens(data []byte, filename string, start zcl.Pos, mode scanMode) []To
te = p + 1
{
token(TokenInvalid)
token(TokenBadUTF8)
}
case 58:
// line 261 "scan_tokens.rl"
te = p + 1
{
token(TokenBadUTF8)
token(TokenInvalid)
}
case 59:
// line 237 "scan_tokens.rl"
@ -3613,7 +3613,7 @@ func scanTokens(data []byte, filename string, start zcl.Pos, mode scanMode) []To
te = p
p--
{
token(TokenInvalid)
token(TokenBadUTF8)
}
case 66:
// line 261 "scan_tokens.rl"
@ -3621,7 +3621,7 @@ func scanTokens(data []byte, filename string, start zcl.Pos, mode scanMode) []To
te = p
p--
{
token(TokenBadUTF8)
token(TokenInvalid)
}
case 67:
// line 238 "scan_tokens.rl"
@ -3645,7 +3645,7 @@ func scanTokens(data []byte, filename string, start zcl.Pos, mode scanMode) []To
selfToken()
}
case 70:
// line 261 "scan_tokens.rl"
// line 260 "scan_tokens.rl"
p = (te) - 1
{
@ -3673,12 +3673,12 @@ func scanTokens(data []byte, filename string, start zcl.Pos, mode scanMode) []To
case 33:
{
p = (te) - 1
token(TokenInvalid)
token(TokenBadUTF8)
}
case 34:
{
p = (te) - 1
token(TokenBadUTF8)
token(TokenInvalid)
}
}

View File

@ -257,8 +257,8 @@ func scanTokens(data []byte, filename string, start zcl.Pos, mode scanMode) []To
BeginHeredocTmpl => beginHeredocTemplate;
Tabs => { token(TokenTabs) };
AnyUTF8 => { token(TokenInvalid) };
BrokenUTF8 => { token(TokenBadUTF8) };
AnyUTF8 => { token(TokenInvalid) };
*|;
}%%

View File

@ -156,3 +156,114 @@ type heredocInProgress struct {
Marker []byte
StartOfLine bool
}
// checkInvalidTokens does a simple pass across the given tokens and generates
// diagnostics for tokens that should _never_ appear in ZCL source. This
// is intended to avoid the need for the parser to have special support
// for them all over.
//
// Returns an empty diagnostics if everything seems acceptable.
// Otherwise, returns one or more error diagnostics, though tries to limit
// repetition of the same information.
func checkInvalidTokens(tokens Tokens) zcl.Diagnostics {
	var diags zcl.Diagnostics

	// Per-problem counters used to cap how many times we repeat the same
	// kind of complaint, so a file full of (say) tabs doesn't produce a
	// diagnostic per occurrence.
	toldBitwise := 0
	toldExponent := 0
	toldBacktick := 0
	toldSemicolon := 0
	toldTabs := 0
	toldBadUTF8 := 0

	for _, tok := range tokens {
		switch tok.Type {
		case TokenBitwiseAnd, TokenBitwiseOr, TokenBitwiseXor, TokenBitwiseNot:
			if toldBitwise < 4 {
				var suggestion string
				switch tok.Type {
				case TokenBitwiseAnd:
					suggestion = " Did you mean boolean AND (\"&&\")?"
				case TokenBitwiseOr:
					// BUG FIX: previously suggested "&&" for bitwise OR.
					suggestion = " Did you mean boolean OR (\"||\")?"
				case TokenBitwiseNot:
					suggestion = " Did you mean boolean NOT (\"!\")?"
				}

				diags = append(diags, &zcl.Diagnostic{
					Severity: zcl.DiagError,
					Summary:  "Unsupported operator",
					Detail:   fmt.Sprintf("Bitwise operators are not supported.%s", suggestion),
					Subject:  &tok.Range,
				})
				toldBitwise++
			}
		case TokenStarStar:
			if toldExponent < 1 {
				diags = append(diags, &zcl.Diagnostic{
					Severity: zcl.DiagError,
					Summary:  "Unsupported operator",
					Detail:   "\"**\" is not a supported operator. Exponentiation is not supported as an operator.",
					Subject:  &tok.Range,
				})
				toldExponent++
			}
		case TokenBacktick:
			// Only report for alternating (even) backticks, so we won't report both start and ends of the same
			// backtick-quoted string.
			// BUG FIX: this case previously read and capped the unrelated
			// toldExponent counter; it must track its own toldBacktick count.
			if toldBacktick < 4 && (toldBacktick%2) == 0 {
				diags = append(diags, &zcl.Diagnostic{
					Severity: zcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "The \"`\" character is not valid. To create a multi-line string, use the \"heredoc\" syntax, like \"<<EOT\".",
					Subject:  &tok.Range,
				})
			}
			// Count every backtick (reported or not) so the even/odd pairing
			// of open/close markers stays aligned with the token stream.
			toldBacktick++
		case TokenSemicolon:
			if toldSemicolon < 1 {
				diags = append(diags, &zcl.Diagnostic{
					Severity: zcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "The \";\" character is not valid. Use newlines to separate attributes and blocks, and commas to separate items in collection values.",
					Subject:  &tok.Range,
				})
				toldSemicolon++
			}
		case TokenTabs:
			if toldTabs < 1 {
				diags = append(diags, &zcl.Diagnostic{
					Severity: zcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "Tab characters may not be used. The recommended indentation style is two spaces per indent.",
					Subject:  &tok.Range,
				})
				toldTabs++
			}
		case TokenBadUTF8:
			if toldBadUTF8 < 1 {
				diags = append(diags, &zcl.Diagnostic{
					Severity: zcl.DiagError,
					Summary:  "Invalid character encoding",
					Detail:   "All input files must be UTF-8 encoded. Ensure that UTF-8 encoding is selected in your editor.",
					Subject:  &tok.Range,
				})
				toldBadUTF8++
			}
		case TokenInvalid:
			// No cap here: each invalid character is individually useful to
			// report. BUG FIX: previously this case incremented toldTabs,
			// which wrongly suppressed later tab diagnostics.
			diags = append(diags, &zcl.Diagnostic{
				Severity: zcl.DiagError,
				Summary:  "Invalid character",
				Detail:   "This character is not used within the language.",
				Subject:  &tok.Range,
			})
		}
	}

	return diags
}

View File

@ -8,7 +8,7 @@ import (
// lexConfig uses the zclsyntax scanner to get a token stream and then
// rewrites it into this package's token model.
func lexConfig(src []byte) Tokens {
mainTokens := zclsyntax.LexConfig(src, "", zcl.Pos{Byte: 0, Line: 1, Column: 1})
mainTokens, _ := zclsyntax.LexConfig(src, "", zcl.Pos{Byte: 0, Line: 1, Column: 1})
ret := make(Tokens, len(mainTokens))
var lastByteOffset int
for i, mainToken := range mainTokens {