diff --git a/hcl/hclsyntax/scan_tokens.go b/hcl/hclsyntax/scan_tokens.go index 2645186..2ed038d 100644 --- a/hcl/hclsyntax/scan_tokens.go +++ b/hcl/hclsyntax/scan_tokens.go @@ -4304,6 +4304,10 @@ const hcltok_en_main int = 1464 // line 16 "scan_tokens.rl" func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []Token { + stripData := stripUTF8BOM(data) + start.Byte += len(data) - len(stripData) + data = stripData + f := &tokenAccum{ Filename: filename, Bytes: data, @@ -4311,7 +4315,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To StartByte: start.Byte, } - // line 295 "scan_tokens.rl" + // line 299 "scan_tokens.rl" // Ragel state p := 0 // "Pointer" into data @@ -4339,7 +4343,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To var retBraces []int // stack of brace levels that cause us to use fret var heredocs []heredocInProgress // stack of heredocs we're currently processing - // line 330 "scan_tokens.rl" + // line 334 "scan_tokens.rl" // Make Go compiler happy _ = ts @@ -4359,7 +4363,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To f.emitToken(TokenType(b[0]), ts, te) } - // line 4371 "scan_tokens.go" + // line 4375 "scan_tokens.go" { top = 0 ts = 0 @@ -4367,7 +4371,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To act = 0 } - // line 4379 "scan_tokens.go" + // line 4383 "scan_tokens.go" { var _klen int var _trans int @@ -4392,7 +4396,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To ts = p - // line 4403 "scan_tokens.go" + // line 4407 "scan_tokens.go" } } @@ -4464,22 +4468,22 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To _acts++ switch _hcltok_actions[_acts-1] { case 0: - // line 219 "scan_tokens.rl" + // line 223 "scan_tokens.rl" p-- case 1: - // line 220 "scan_tokens.rl" + // line 224 "scan_tokens.rl" p-- case 2: - // line 
225 "scan_tokens.rl" + // line 229 "scan_tokens.rl" p-- case 3: - // line 226 "scan_tokens.rl" + // line 230 "scan_tokens.rl" p-- @@ -4489,7 +4493,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To te = p + 1 case 8: - // line 156 "scan_tokens.rl" + // line 160 "scan_tokens.rl" te = p + 1 { @@ -4508,7 +4512,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 9: - // line 166 "scan_tokens.rl" + // line 170 "scan_tokens.rl" te = p + 1 { @@ -4527,7 +4531,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 10: - // line 80 "scan_tokens.rl" + // line 84 "scan_tokens.rl" te = p + 1 { @@ -4541,21 +4545,21 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } case 11: - // line 240 "scan_tokens.rl" + // line 244 "scan_tokens.rl" te = p + 1 { token(TokenInvalid) } case 12: - // line 241 "scan_tokens.rl" + // line 245 "scan_tokens.rl" te = p + 1 { token(TokenBadUTF8) } case 13: - // line 156 "scan_tokens.rl" + // line 160 "scan_tokens.rl" te = p p-- @@ -4575,7 +4579,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 14: - // line 166 "scan_tokens.rl" + // line 170 "scan_tokens.rl" te = p p-- @@ -4595,7 +4599,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 15: - // line 239 "scan_tokens.rl" + // line 243 "scan_tokens.rl" te = p p-- @@ -4603,7 +4607,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenQuotedLit) } case 16: - // line 240 "scan_tokens.rl" + // line 244 "scan_tokens.rl" te = p p-- @@ -4611,7 +4615,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenInvalid) } case 17: - // line 241 "scan_tokens.rl" + // line 245 "scan_tokens.rl" te = p p-- @@ -4619,29 +4623,29 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode 
scanMode) []To token(TokenBadUTF8) } case 18: - // line 239 "scan_tokens.rl" + // line 243 "scan_tokens.rl" p = (te) - 1 { token(TokenQuotedLit) } case 19: - // line 241 "scan_tokens.rl" + // line 245 "scan_tokens.rl" p = (te) - 1 { token(TokenBadUTF8) } case 20: - // line 144 "scan_tokens.rl" + // line 148 "scan_tokens.rl" act = 10 case 21: - // line 249 "scan_tokens.rl" + // line 253 "scan_tokens.rl" act = 11 case 22: - // line 156 "scan_tokens.rl" + // line 160 "scan_tokens.rl" te = p + 1 { @@ -4660,7 +4664,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 23: - // line 166 "scan_tokens.rl" + // line 170 "scan_tokens.rl" te = p + 1 { @@ -4679,7 +4683,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 24: - // line 107 "scan_tokens.rl" + // line 111 "scan_tokens.rl" te = p + 1 { @@ -4725,14 +4729,14 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenStringLit) } case 25: - // line 249 "scan_tokens.rl" + // line 253 "scan_tokens.rl" te = p + 1 { token(TokenBadUTF8) } case 26: - // line 156 "scan_tokens.rl" + // line 160 "scan_tokens.rl" te = p p-- @@ -4752,7 +4756,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 27: - // line 166 "scan_tokens.rl" + // line 170 "scan_tokens.rl" te = p p-- @@ -4772,7 +4776,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 28: - // line 144 "scan_tokens.rl" + // line 148 "scan_tokens.rl" te = p p-- @@ -4784,7 +4788,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenStringLit) } case 29: - // line 249 "scan_tokens.rl" + // line 253 "scan_tokens.rl" te = p p-- @@ -4792,7 +4796,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenBadUTF8) } case 30: - // line 144 "scan_tokens.rl" + // line 148 "scan_tokens.rl" p = (te) - 1 { @@ 
-4829,15 +4833,15 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } case 32: - // line 152 "scan_tokens.rl" + // line 156 "scan_tokens.rl" act = 14 case 33: - // line 256 "scan_tokens.rl" + // line 260 "scan_tokens.rl" act = 15 case 34: - // line 156 "scan_tokens.rl" + // line 160 "scan_tokens.rl" te = p + 1 { @@ -4856,7 +4860,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 35: - // line 166 "scan_tokens.rl" + // line 170 "scan_tokens.rl" te = p + 1 { @@ -4875,21 +4879,21 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 36: - // line 152 "scan_tokens.rl" + // line 156 "scan_tokens.rl" te = p + 1 { token(TokenStringLit) } case 37: - // line 256 "scan_tokens.rl" + // line 260 "scan_tokens.rl" te = p + 1 { token(TokenBadUTF8) } case 38: - // line 156 "scan_tokens.rl" + // line 160 "scan_tokens.rl" te = p p-- @@ -4909,7 +4913,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 39: - // line 166 "scan_tokens.rl" + // line 170 "scan_tokens.rl" te = p p-- @@ -4929,7 +4933,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 40: - // line 152 "scan_tokens.rl" + // line 156 "scan_tokens.rl" te = p p-- @@ -4937,7 +4941,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenStringLit) } case 41: - // line 256 "scan_tokens.rl" + // line 260 "scan_tokens.rl" te = p p-- @@ -4945,7 +4949,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenBadUTF8) } case 42: - // line 152 "scan_tokens.rl" + // line 156 "scan_tokens.rl" p = (te) - 1 { @@ -4974,29 +4978,29 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } case 44: - // line 260 "scan_tokens.rl" + // line 264 "scan_tokens.rl" act = 16 case 45: - // line 261 "scan_tokens.rl" + // line 265 "scan_tokens.rl" 
act = 17 case 46: - // line 261 "scan_tokens.rl" + // line 265 "scan_tokens.rl" te = p + 1 { token(TokenBadUTF8) } case 47: - // line 262 "scan_tokens.rl" + // line 266 "scan_tokens.rl" te = p + 1 { token(TokenInvalid) } case 48: - // line 260 "scan_tokens.rl" + // line 264 "scan_tokens.rl" te = p p-- @@ -5004,7 +5008,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenIdent) } case 49: - // line 261 "scan_tokens.rl" + // line 265 "scan_tokens.rl" te = p p-- @@ -5012,14 +5016,14 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenBadUTF8) } case 50: - // line 260 "scan_tokens.rl" + // line 264 "scan_tokens.rl" p = (te) - 1 { token(TokenIdent) } case 51: - // line 261 "scan_tokens.rl" + // line 265 "scan_tokens.rl" p = (te) - 1 { @@ -5042,100 +5046,100 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } case 53: - // line 268 "scan_tokens.rl" + // line 272 "scan_tokens.rl" act = 21 case 54: - // line 281 "scan_tokens.rl" + // line 285 "scan_tokens.rl" act = 32 case 55: - // line 291 "scan_tokens.rl" + // line 295 "scan_tokens.rl" act = 38 case 56: - // line 292 "scan_tokens.rl" + // line 296 "scan_tokens.rl" act = 39 case 57: - // line 270 "scan_tokens.rl" + // line 274 "scan_tokens.rl" te = p + 1 { token(TokenComment) } case 58: - // line 271 "scan_tokens.rl" + // line 275 "scan_tokens.rl" te = p + 1 { token(TokenNewline) } case 59: - // line 273 "scan_tokens.rl" + // line 277 "scan_tokens.rl" te = p + 1 { token(TokenEqualOp) } case 60: - // line 274 "scan_tokens.rl" + // line 278 "scan_tokens.rl" te = p + 1 { token(TokenNotEqual) } case 61: - // line 275 "scan_tokens.rl" + // line 279 "scan_tokens.rl" te = p + 1 { token(TokenGreaterThanEq) } case 62: - // line 276 "scan_tokens.rl" + // line 280 "scan_tokens.rl" te = p + 1 { token(TokenLessThanEq) } case 63: - // line 277 "scan_tokens.rl" + // line 281 "scan_tokens.rl" te = p + 1 { 
token(TokenAnd) } case 64: - // line 278 "scan_tokens.rl" + // line 282 "scan_tokens.rl" te = p + 1 { token(TokenOr) } case 65: - // line 279 "scan_tokens.rl" + // line 283 "scan_tokens.rl" te = p + 1 { token(TokenEllipsis) } case 66: - // line 280 "scan_tokens.rl" + // line 284 "scan_tokens.rl" te = p + 1 { token(TokenFatArrow) } case 67: - // line 281 "scan_tokens.rl" + // line 285 "scan_tokens.rl" te = p + 1 { selfToken() } case 68: - // line 176 "scan_tokens.rl" + // line 180 "scan_tokens.rl" te = p + 1 { @@ -5143,7 +5147,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To braces++ } case 69: - // line 181 "scan_tokens.rl" + // line 185 "scan_tokens.rl" te = p + 1 { @@ -5164,7 +5168,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 70: - // line 193 "scan_tokens.rl" + // line 197 "scan_tokens.rl" te = p + 1 { @@ -5194,7 +5198,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 71: - // line 75 "scan_tokens.rl" + // line 79 "scan_tokens.rl" te = p + 1 { @@ -5208,7 +5212,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 72: - // line 85 "scan_tokens.rl" + // line 89 "scan_tokens.rl" te = p + 1 { @@ -5239,27 +5243,27 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } case 73: - // line 291 "scan_tokens.rl" + // line 295 "scan_tokens.rl" te = p + 1 { token(TokenBadUTF8) } case 74: - // line 292 "scan_tokens.rl" + // line 296 "scan_tokens.rl" te = p + 1 { token(TokenInvalid) } case 75: - // line 266 "scan_tokens.rl" + // line 270 "scan_tokens.rl" te = p p-- case 76: - // line 267 "scan_tokens.rl" + // line 271 "scan_tokens.rl" te = p p-- @@ -5267,7 +5271,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenNumberLit) } case 77: - // line 268 "scan_tokens.rl" + // line 272 "scan_tokens.rl" te = p p-- @@ -5275,7 +5279,7 @@ 
func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenIdent) } case 78: - // line 281 "scan_tokens.rl" + // line 285 "scan_tokens.rl" te = p p-- @@ -5283,7 +5287,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To selfToken() } case 79: - // line 291 "scan_tokens.rl" + // line 295 "scan_tokens.rl" te = p p-- @@ -5291,7 +5295,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenBadUTF8) } case 80: - // line 292 "scan_tokens.rl" + // line 296 "scan_tokens.rl" te = p p-- @@ -5299,28 +5303,28 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To token(TokenInvalid) } case 81: - // line 267 "scan_tokens.rl" + // line 271 "scan_tokens.rl" p = (te) - 1 { token(TokenNumberLit) } case 82: - // line 268 "scan_tokens.rl" + // line 272 "scan_tokens.rl" p = (te) - 1 { token(TokenIdent) } case 83: - // line 281 "scan_tokens.rl" + // line 285 "scan_tokens.rl" p = (te) - 1 { selfToken() } case 84: - // line 291 "scan_tokens.rl" + // line 295 "scan_tokens.rl" p = (te) - 1 { @@ -5352,7 +5356,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } - // line 5218 "scan_tokens.go" + // line 5222 "scan_tokens.go" } } @@ -5373,7 +5377,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To act = 0 - // line 5238 "scan_tokens.go" + // line 5242 "scan_tokens.go" } } @@ -5399,7 +5403,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To } } - // line 353 "scan_tokens.rl" + // line 357 "scan_tokens.rl" // If we fall out here without being in a final state then we've // encountered something that the scanner can't match, which we'll diff --git a/hcl/hclsyntax/scan_tokens.rl b/hcl/hclsyntax/scan_tokens.rl index f9c3bf7..a7ae54f 100644 --- a/hcl/hclsyntax/scan_tokens.rl +++ b/hcl/hclsyntax/scan_tokens.rl @@ -16,6 +16,10 @@ import ( }%% func scanTokens(data 
[]byte, filename string, start hcl.Pos, mode scanMode) []Token { + stripData := stripUTF8BOM(data) + start.Byte += len(data) - len(stripData) + data = stripData + f := &tokenAccum{ Filename: filename, Bytes: data, diff --git a/hcl/hclsyntax/scan_tokens_test.go b/hcl/hclsyntax/scan_tokens_test.go index ce21b1c..cc4e042 100644 --- a/hcl/hclsyntax/scan_tokens_test.go +++ b/hcl/hclsyntax/scan_tokens_test.go @@ -72,6 +72,71 @@ func TestScanTokens_normal(t *testing.T) { }, }, + // Byte-order mark + { + "\xef\xbb\xbf", // Leading UTF-8 byte-order mark is ignored... + []Token{ + { + Type: TokenEOF, + Bytes: []byte{}, + Range: hcl.Range{ // ...but its bytes still count when producing ranges + Start: hcl.Pos{Byte: 3, Line: 1, Column: 1}, + End: hcl.Pos{Byte: 3, Line: 1, Column: 1}, + }, + }, + }, + }, + { + " \xef\xbb\xbf", // Non-leading BOM is invalid + []Token{ + { + Type: TokenInvalid, + Bytes: utf8BOM, + Range: hcl.Range{ + Start: hcl.Pos{Byte: 1, Line: 1, Column: 2}, + End: hcl.Pos{Byte: 4, Line: 1, Column: 3}, + }, + }, + { + Type: TokenEOF, + Bytes: []byte{}, + Range: hcl.Range{ + Start: hcl.Pos{Byte: 4, Line: 1, Column: 3}, + End: hcl.Pos{Byte: 4, Line: 1, Column: 3}, + }, + }, + }, + }, + { + "\xfe\xff", // UTF-16 BOM is invalid + []Token{ + { + Type: TokenBadUTF8, + Bytes: []byte{0xfe}, + Range: hcl.Range{ + Start: hcl.Pos{Byte: 0, Line: 1, Column: 1}, + End: hcl.Pos{Byte: 1, Line: 1, Column: 2}, + }, + }, + { + Type: TokenBadUTF8, + Bytes: []byte{0xff}, + Range: hcl.Range{ + Start: hcl.Pos{Byte: 1, Line: 1, Column: 2}, + End: hcl.Pos{Byte: 2, Line: 1, Column: 3}, + }, + }, + { + Type: TokenEOF, + Bytes: []byte{}, + Range: hcl.Range{ + Start: hcl.Pos{Byte: 2, Line: 1, Column: 3}, + End: hcl.Pos{Byte: 2, Line: 1, Column: 3}, + }, + }, + }, + }, + // TokenNumberLit { `1`, diff --git a/hcl/hclsyntax/token.go b/hcl/hclsyntax/token.go index 7b53bd3..1967c22 100644 --- a/hcl/hclsyntax/token.go +++ b/hcl/hclsyntax/token.go @@ -279,3 +279,17 @@ func 
checkInvalidTokens(tokens Tokens) hcl.Diagnostics {
 	}
 	return diags
 }
+
+// utf8BOM is the UTF-8 encoding of the byte order mark.
+var utf8BOM = []byte{0xef, 0xbb, 0xbf}
+
+// stripUTF8BOM returns src without a leading UTF-8 byte order mark
+// (0xEF 0xBB 0xBF). When the mark is present the result is a sub-slice
+// that shares src's backing array, so no bytes are copied; when it is
+// absent, src is returned verbatim.
+//
+// bytes.TrimPrefix is used so that the number of skipped bytes always
+// follows len(utf8BOM) instead of a separately hard-coded 3.
+func stripUTF8BOM(src []byte) []byte {
+	return bytes.TrimPrefix(src, utf8BOM)
+}