hcl/hclsyntax: Accept and ignore UTF-8 byte order marks

A BOM is pointless in a UTF-8 file because UTF-8 has a fixed encoding that is
agnostic of host byte ordering, but since Windows tends to use UTF-16
internally, a lot of Windows software generates redundant BOM sequences at
the start of UTF-8 files too.

By tolerating a leading BOM we can make life easier for those using such
Windows software, without any significant loss for normal use. This
slightly violates some of our normal assumptions about token positioning
since the BOM occupies bytes but not visible columns, but we'll just
accept that this may cause some slightly-odd behavior for use-cases such
as the diagnostic renderer and hclwrite.
This commit is contained in:
Martin Atkins 2018-12-19 15:52:15 -08:00
parent 62acf2ce82
commit 291f7fbe43
4 changed files with 173 additions and 86 deletions

View File

@ -4304,6 +4304,10 @@ const hcltok_en_main int = 1464
// line 16 "scan_tokens.rl"
func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []Token {
stripData := stripUTF8BOM(data)
start.Byte += len(data) - len(stripData)
data = stripData
f := &tokenAccum{
Filename: filename,
Bytes: data,
@ -4311,7 +4315,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
StartByte: start.Byte,
}
// line 295 "scan_tokens.rl"
// line 299 "scan_tokens.rl"
// Ragel state
p := 0 // "Pointer" into data
@ -4339,7 +4343,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
var retBraces []int // stack of brace levels that cause us to use fret
var heredocs []heredocInProgress // stack of heredocs we're currently processing
// line 330 "scan_tokens.rl"
// line 334 "scan_tokens.rl"
// Make Go compiler happy
_ = ts
@ -4359,7 +4363,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
f.emitToken(TokenType(b[0]), ts, te)
}
// line 4371 "scan_tokens.go"
// line 4375 "scan_tokens.go"
{
top = 0
ts = 0
@ -4367,7 +4371,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
act = 0
}
// line 4379 "scan_tokens.go"
// line 4383 "scan_tokens.go"
{
var _klen int
var _trans int
@ -4392,7 +4396,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
ts = p
// line 4403 "scan_tokens.go"
// line 4407 "scan_tokens.go"
}
}
@ -4464,22 +4468,22 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
_acts++
switch _hcltok_actions[_acts-1] {
case 0:
// line 219 "scan_tokens.rl"
// line 223 "scan_tokens.rl"
p--
case 1:
// line 220 "scan_tokens.rl"
// line 224 "scan_tokens.rl"
p--
case 2:
// line 225 "scan_tokens.rl"
// line 229 "scan_tokens.rl"
p--
case 3:
// line 226 "scan_tokens.rl"
// line 230 "scan_tokens.rl"
p--
@ -4489,7 +4493,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
te = p + 1
case 8:
// line 156 "scan_tokens.rl"
// line 160 "scan_tokens.rl"
te = p + 1
{
@ -4508,7 +4512,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 9:
// line 166 "scan_tokens.rl"
// line 170 "scan_tokens.rl"
te = p + 1
{
@ -4527,7 +4531,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 10:
// line 80 "scan_tokens.rl"
// line 84 "scan_tokens.rl"
te = p + 1
{
@ -4541,21 +4545,21 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
case 11:
// line 240 "scan_tokens.rl"
// line 244 "scan_tokens.rl"
te = p + 1
{
token(TokenInvalid)
}
case 12:
// line 241 "scan_tokens.rl"
// line 245 "scan_tokens.rl"
te = p + 1
{
token(TokenBadUTF8)
}
case 13:
// line 156 "scan_tokens.rl"
// line 160 "scan_tokens.rl"
te = p
p--
@ -4575,7 +4579,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 14:
// line 166 "scan_tokens.rl"
// line 170 "scan_tokens.rl"
te = p
p--
@ -4595,7 +4599,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 15:
// line 239 "scan_tokens.rl"
// line 243 "scan_tokens.rl"
te = p
p--
@ -4603,7 +4607,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenQuotedLit)
}
case 16:
// line 240 "scan_tokens.rl"
// line 244 "scan_tokens.rl"
te = p
p--
@ -4611,7 +4615,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenInvalid)
}
case 17:
// line 241 "scan_tokens.rl"
// line 245 "scan_tokens.rl"
te = p
p--
@ -4619,29 +4623,29 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenBadUTF8)
}
case 18:
// line 239 "scan_tokens.rl"
// line 243 "scan_tokens.rl"
p = (te) - 1
{
token(TokenQuotedLit)
}
case 19:
// line 241 "scan_tokens.rl"
// line 245 "scan_tokens.rl"
p = (te) - 1
{
token(TokenBadUTF8)
}
case 20:
// line 144 "scan_tokens.rl"
// line 148 "scan_tokens.rl"
act = 10
case 21:
// line 249 "scan_tokens.rl"
// line 253 "scan_tokens.rl"
act = 11
case 22:
// line 156 "scan_tokens.rl"
// line 160 "scan_tokens.rl"
te = p + 1
{
@ -4660,7 +4664,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 23:
// line 166 "scan_tokens.rl"
// line 170 "scan_tokens.rl"
te = p + 1
{
@ -4679,7 +4683,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 24:
// line 107 "scan_tokens.rl"
// line 111 "scan_tokens.rl"
te = p + 1
{
@ -4725,14 +4729,14 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenStringLit)
}
case 25:
// line 249 "scan_tokens.rl"
// line 253 "scan_tokens.rl"
te = p + 1
{
token(TokenBadUTF8)
}
case 26:
// line 156 "scan_tokens.rl"
// line 160 "scan_tokens.rl"
te = p
p--
@ -4752,7 +4756,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 27:
// line 166 "scan_tokens.rl"
// line 170 "scan_tokens.rl"
te = p
p--
@ -4772,7 +4776,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 28:
// line 144 "scan_tokens.rl"
// line 148 "scan_tokens.rl"
te = p
p--
@ -4784,7 +4788,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenStringLit)
}
case 29:
// line 249 "scan_tokens.rl"
// line 253 "scan_tokens.rl"
te = p
p--
@ -4792,7 +4796,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenBadUTF8)
}
case 30:
// line 144 "scan_tokens.rl"
// line 148 "scan_tokens.rl"
p = (te) - 1
{
@ -4829,15 +4833,15 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
case 32:
// line 152 "scan_tokens.rl"
// line 156 "scan_tokens.rl"
act = 14
case 33:
// line 256 "scan_tokens.rl"
// line 260 "scan_tokens.rl"
act = 15
case 34:
// line 156 "scan_tokens.rl"
// line 160 "scan_tokens.rl"
te = p + 1
{
@ -4856,7 +4860,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 35:
// line 166 "scan_tokens.rl"
// line 170 "scan_tokens.rl"
te = p + 1
{
@ -4875,21 +4879,21 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 36:
// line 152 "scan_tokens.rl"
// line 156 "scan_tokens.rl"
te = p + 1
{
token(TokenStringLit)
}
case 37:
// line 256 "scan_tokens.rl"
// line 260 "scan_tokens.rl"
te = p + 1
{
token(TokenBadUTF8)
}
case 38:
// line 156 "scan_tokens.rl"
// line 160 "scan_tokens.rl"
te = p
p--
@ -4909,7 +4913,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 39:
// line 166 "scan_tokens.rl"
// line 170 "scan_tokens.rl"
te = p
p--
@ -4929,7 +4933,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 40:
// line 152 "scan_tokens.rl"
// line 156 "scan_tokens.rl"
te = p
p--
@ -4937,7 +4941,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenStringLit)
}
case 41:
// line 256 "scan_tokens.rl"
// line 260 "scan_tokens.rl"
te = p
p--
@ -4945,7 +4949,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenBadUTF8)
}
case 42:
// line 152 "scan_tokens.rl"
// line 156 "scan_tokens.rl"
p = (te) - 1
{
@ -4974,29 +4978,29 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
case 44:
// line 260 "scan_tokens.rl"
// line 264 "scan_tokens.rl"
act = 16
case 45:
// line 261 "scan_tokens.rl"
// line 265 "scan_tokens.rl"
act = 17
case 46:
// line 261 "scan_tokens.rl"
// line 265 "scan_tokens.rl"
te = p + 1
{
token(TokenBadUTF8)
}
case 47:
// line 262 "scan_tokens.rl"
// line 266 "scan_tokens.rl"
te = p + 1
{
token(TokenInvalid)
}
case 48:
// line 260 "scan_tokens.rl"
// line 264 "scan_tokens.rl"
te = p
p--
@ -5004,7 +5008,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenIdent)
}
case 49:
// line 261 "scan_tokens.rl"
// line 265 "scan_tokens.rl"
te = p
p--
@ -5012,14 +5016,14 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenBadUTF8)
}
case 50:
// line 260 "scan_tokens.rl"
// line 264 "scan_tokens.rl"
p = (te) - 1
{
token(TokenIdent)
}
case 51:
// line 261 "scan_tokens.rl"
// line 265 "scan_tokens.rl"
p = (te) - 1
{
@ -5042,100 +5046,100 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
case 53:
// line 268 "scan_tokens.rl"
// line 272 "scan_tokens.rl"
act = 21
case 54:
// line 281 "scan_tokens.rl"
// line 285 "scan_tokens.rl"
act = 32
case 55:
// line 291 "scan_tokens.rl"
// line 295 "scan_tokens.rl"
act = 38
case 56:
// line 292 "scan_tokens.rl"
// line 296 "scan_tokens.rl"
act = 39
case 57:
// line 270 "scan_tokens.rl"
// line 274 "scan_tokens.rl"
te = p + 1
{
token(TokenComment)
}
case 58:
// line 271 "scan_tokens.rl"
// line 275 "scan_tokens.rl"
te = p + 1
{
token(TokenNewline)
}
case 59:
// line 273 "scan_tokens.rl"
// line 277 "scan_tokens.rl"
te = p + 1
{
token(TokenEqualOp)
}
case 60:
// line 274 "scan_tokens.rl"
// line 278 "scan_tokens.rl"
te = p + 1
{
token(TokenNotEqual)
}
case 61:
// line 275 "scan_tokens.rl"
// line 279 "scan_tokens.rl"
te = p + 1
{
token(TokenGreaterThanEq)
}
case 62:
// line 276 "scan_tokens.rl"
// line 280 "scan_tokens.rl"
te = p + 1
{
token(TokenLessThanEq)
}
case 63:
// line 277 "scan_tokens.rl"
// line 281 "scan_tokens.rl"
te = p + 1
{
token(TokenAnd)
}
case 64:
// line 278 "scan_tokens.rl"
// line 282 "scan_tokens.rl"
te = p + 1
{
token(TokenOr)
}
case 65:
// line 279 "scan_tokens.rl"
// line 283 "scan_tokens.rl"
te = p + 1
{
token(TokenEllipsis)
}
case 66:
// line 280 "scan_tokens.rl"
// line 284 "scan_tokens.rl"
te = p + 1
{
token(TokenFatArrow)
}
case 67:
// line 281 "scan_tokens.rl"
// line 285 "scan_tokens.rl"
te = p + 1
{
selfToken()
}
case 68:
// line 176 "scan_tokens.rl"
// line 180 "scan_tokens.rl"
te = p + 1
{
@ -5143,7 +5147,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
braces++
}
case 69:
// line 181 "scan_tokens.rl"
// line 185 "scan_tokens.rl"
te = p + 1
{
@ -5164,7 +5168,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 70:
// line 193 "scan_tokens.rl"
// line 197 "scan_tokens.rl"
te = p + 1
{
@ -5194,7 +5198,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 71:
// line 75 "scan_tokens.rl"
// line 79 "scan_tokens.rl"
te = p + 1
{
@ -5208,7 +5212,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 72:
// line 85 "scan_tokens.rl"
// line 89 "scan_tokens.rl"
te = p + 1
{
@ -5239,27 +5243,27 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
case 73:
// line 291 "scan_tokens.rl"
// line 295 "scan_tokens.rl"
te = p + 1
{
token(TokenBadUTF8)
}
case 74:
// line 292 "scan_tokens.rl"
// line 296 "scan_tokens.rl"
te = p + 1
{
token(TokenInvalid)
}
case 75:
// line 266 "scan_tokens.rl"
// line 270 "scan_tokens.rl"
te = p
p--
case 76:
// line 267 "scan_tokens.rl"
// line 271 "scan_tokens.rl"
te = p
p--
@ -5267,7 +5271,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenNumberLit)
}
case 77:
// line 268 "scan_tokens.rl"
// line 272 "scan_tokens.rl"
te = p
p--
@ -5275,7 +5279,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenIdent)
}
case 78:
// line 281 "scan_tokens.rl"
// line 285 "scan_tokens.rl"
te = p
p--
@ -5283,7 +5287,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
selfToken()
}
case 79:
// line 291 "scan_tokens.rl"
// line 295 "scan_tokens.rl"
te = p
p--
@ -5291,7 +5295,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenBadUTF8)
}
case 80:
// line 292 "scan_tokens.rl"
// line 296 "scan_tokens.rl"
te = p
p--
@ -5299,28 +5303,28 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
token(TokenInvalid)
}
case 81:
// line 267 "scan_tokens.rl"
// line 271 "scan_tokens.rl"
p = (te) - 1
{
token(TokenNumberLit)
}
case 82:
// line 268 "scan_tokens.rl"
// line 272 "scan_tokens.rl"
p = (te) - 1
{
token(TokenIdent)
}
case 83:
// line 281 "scan_tokens.rl"
// line 285 "scan_tokens.rl"
p = (te) - 1
{
selfToken()
}
case 84:
// line 291 "scan_tokens.rl"
// line 295 "scan_tokens.rl"
p = (te) - 1
{
@ -5352,7 +5356,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
// line 5218 "scan_tokens.go"
// line 5222 "scan_tokens.go"
}
}
@ -5373,7 +5377,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
act = 0
// line 5238 "scan_tokens.go"
// line 5242 "scan_tokens.go"
}
}
@ -5399,7 +5403,7 @@ func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []To
}
}
// line 353 "scan_tokens.rl"
// line 357 "scan_tokens.rl"
// If we fall out here without being in a final state then we've
// encountered something that the scanner can't match, which we'll

View File

@ -16,6 +16,10 @@ import (
}%%
func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []Token {
stripData := stripUTF8BOM(data)
start.Byte += len(data) - len(stripData)
data = stripData
f := &tokenAccum{
Filename: filename,
Bytes: data,

View File

@ -72,6 +72,71 @@ func TestScanTokens_normal(t *testing.T) {
},
},
// Byte-order mark
{
"\xef\xbb\xbf", // Leading UTF-8 byte-order mark is ignored...
[]Token{
{
Type: TokenEOF,
Bytes: []byte{},
Range: hcl.Range{ // ...but its bytes still count when producing ranges
Start: hcl.Pos{Byte: 3, Line: 1, Column: 1},
End: hcl.Pos{Byte: 3, Line: 1, Column: 1},
},
},
},
},
{
" \xef\xbb\xbf", // Non-leading BOM is invalid
[]Token{
{
Type: TokenInvalid,
Bytes: utf8BOM,
Range: hcl.Range{
Start: hcl.Pos{Byte: 1, Line: 1, Column: 2},
End: hcl.Pos{Byte: 4, Line: 1, Column: 3},
},
},
{
Type: TokenEOF,
Bytes: []byte{},
Range: hcl.Range{
Start: hcl.Pos{Byte: 4, Line: 1, Column: 3},
End: hcl.Pos{Byte: 4, Line: 1, Column: 3},
},
},
},
},
{
"\xfe\xff", // UTF-16 BOM is invalid
[]Token{
{
Type: TokenBadUTF8,
Bytes: []byte{0xfe},
Range: hcl.Range{
Start: hcl.Pos{Byte: 0, Line: 1, Column: 1},
End: hcl.Pos{Byte: 1, Line: 1, Column: 2},
},
},
{
Type: TokenBadUTF8,
Bytes: []byte{0xff},
Range: hcl.Range{
Start: hcl.Pos{Byte: 1, Line: 1, Column: 2},
End: hcl.Pos{Byte: 2, Line: 1, Column: 3},
},
},
{
Type: TokenEOF,
Bytes: []byte{},
Range: hcl.Range{
Start: hcl.Pos{Byte: 2, Line: 1, Column: 3},
End: hcl.Pos{Byte: 2, Line: 1, Column: 3},
},
},
},
},
// TokenNumberLit
{
`1`,

View File

@ -279,3 +279,17 @@ func checkInvalidTokens(tokens Tokens) hcl.Diagnostics {
}
return diags
}
// utf8BOM is the UTF-8 encoding of the Unicode byte order mark (U+FEFF).
var utf8BOM = []byte{0xef, 0xbb, 0xbf}

// stripUTF8BOM returns src without a leading UTF-8 byte order mark
// (0xEF 0xBB 0xBF), sharing the same backing array as the input.
//
// When src does not start with a BOM, it is returned unchanged.
func stripUTF8BOM(src []byte) []byte {
	return bytes.TrimPrefix(src, utf8BOM)
}