hclsyntax: rewrite string literal decoder with ragel

Fuzz testing revealed that there were a few different crashers in the
string literal decoder, which was previously a rather unwieldy
hand-written scanner with manually-implemented lookahead.

Rather than continuing to hand-tweak that code, here instead we use
ragel (which we were already using for the main scanner anyway) to
partition our string literals into tokens that are easier for our
decoder to wrangle.

As a bonus, this also makes our source ranges in our diagnostics more
accurate.
This commit is contained in:
Martin Atkins 2018-02-04 18:35:42 -08:00
parent 93a7008e3d
commit cfd802163b
7 changed files with 897 additions and 233 deletions

View File

@ -217,6 +217,18 @@ trim`,
cty.StringVal("a\nb\nc\n"),
0,
},
{
`\n`, // backslash escapes are not interpreted in template literals
nil,
cty.StringVal("\\n"),
0,
},
{
`\uu1234`, // backslash escapes are not interpreted in template literals
nil, // (this is intentionally an invalid one to ensure we don't produce an error)
cty.StringVal("\\uu1234"),
0,
},
}
for _, test := range tests {

View File

@ -4,4 +4,6 @@ package hclsyntax
//go:generate ruby unicode2ragel.rb --url=http://www.unicode.org/Public/9.0.0/ucd/DerivedCoreProperties.txt -m UnicodeDerived -p ID_Start,ID_Continue -o unicode_derived.rl
//go:generate ragel -Z scan_tokens.rl
//go:generate gofmt -w scan_tokens.go
//go:generate ragel -Z scan_string_lit.rl
//go:generate gofmt -w scan_string_lit.go
//go:generate stringer -type TokenType -output token_type_string.go

View File

@ -1,7 +1,6 @@
package hclsyntax
import (
"bufio"
"bytes"
"fmt"
"strconv"
@ -1478,256 +1477,149 @@ func (p *parser) decodeStringLit(tok Token) (string, hcl.Diagnostics) {
var diags hcl.Diagnostics
ret := make([]byte, 0, len(tok.Bytes))
var esc []byte
slices := scanStringLit(tok.Bytes, quoted)
sc := bufio.NewScanner(bytes.NewReader(tok.Bytes))
sc.Split(textseg.ScanGraphemeClusters)
// We will mutate rng constantly as we walk through our token slices below.
// Any diagnostics must take a copy of this rng rather than simply pointing
// to it, e.g. by using rng.Ptr() rather than &rng.
rng := tok.Range
rng.End = rng.Start
pos := tok.Range.Start
newPos := pos
Character:
for sc.Scan() {
pos = newPos
ch := sc.Bytes()
// Adjust position based on our new character.
// \r\n is considered to be a single character in text segmentation,
if (len(ch) == 1 && ch[0] == '\n') || (len(ch) == 2 && ch[1] == '\n') {
newPos.Line++
newPos.Column = 0
} else {
newPos.Column++
Slices:
for _, slice := range slices {
if len(slice) == 0 {
continue
}
newPos.Byte += len(ch)
if len(esc) > 0 {
switch esc[0] {
case '\\':
// Advance the start of our range to where the previous token ended
rng.Start = rng.End
if len(esc) >= 2 {
switch esc[1] {
case 'u', 'U':
// Our new character must be an ASCII hex digit
_, err := strconv.ParseInt(string(ch), 16, 0)
if err != nil {
var detail string
switch esc[1] {
case 'u':
detail = "Escape sequence \\u must be followed by exactly four hexidecimal digits."
case 'U':
detail = "Escape sequence \\U must be followed by exactly eight hexidecimal digits."
}
diags = append(diags, &hcl.Diagnostic{
Severity: hcl.DiagError,
Summary: "Invalid escape sequence",
Detail: detail,
Subject: &hcl.Range{
Filename: tok.Range.Filename,
Start: hcl.Pos{
Line: pos.Line,
Column: pos.Column,
Byte: pos.Byte,
},
End: hcl.Pos{
Line: pos.Line,
Column: pos.Column + 1,
Byte: pos.Byte + len(ch),
},
},
})
ret = append(ret, esc...)
ret = append(ret, ch...)
esc = esc[:0]
continue Character
}
esc = append(esc, ch...)
var complete bool
switch esc[1] {
case 'u':
complete = (len(esc) == 6) // four digits plus our \u introducer
case 'U':
complete = (len(esc) == 10) // eight digits plus our \U introducer
}
if !complete {
// Keep accumulating more digits, then
continue Character
}
digits := string(esc[2:])
valInt, err := strconv.ParseInt(digits, 16, 32)
if err != nil {
// Should never happen because we validated our digits
// as they arrived, above.
panic(err)
}
r := rune(valInt)
rl := utf8.RuneLen(r)
// Make room in our ret buffer for the extra characters
for i := 0; i < rl; i++ {
ret = append(ret, 0)
}
// Fill those extra characters with the canonical UTF-8
// representation of our rune.
utf8.EncodeRune(ret[len(ret)-rl:], r)
// ...and now finally we're finished escaping!
esc = esc[:0]
continue Character
}
}
if len(ch) == 1 {
switch ch[0] {
case 'n':
ret = append(ret, '\n')
esc = esc[:0]
continue Character
case 'r':
ret = append(ret, '\r')
esc = esc[:0]
continue Character
case 't':
ret = append(ret, '\t')
esc = esc[:0]
continue Character
case '"':
ret = append(ret, '"')
esc = esc[:0]
continue Character
case '\\':
ret = append(ret, '\\')
esc = esc[:0]
continue Character
case 'u', 'U':
// For these, we'll continue working on them until
// we accumulate the expected number of digits.
esc = append(esc, ch...)
continue Character
}
}
var detail string
switch {
case len(ch) == 1 && (ch[0] == '$' || ch[0] == '%'):
detail = fmt.Sprintf(
"The characters \"\\%s\" do not form a recognized escape sequence. To escape a \"%s{\" template sequence, use \"%s%s{\".",
ch, ch, ch, ch,
)
default:
detail = fmt.Sprintf("The characters \"\\%s\" do not form a recognized escape sequence.", ch)
}
// Advance the end of our range to after our token.
b := slice
for len(b) > 0 {
adv, ch, _ := textseg.ScanGraphemeClusters(b, true)
rng.End.Byte += adv
switch ch[0] {
case '\r', '\n':
rng.End.Line++
rng.End.Column = 1
default:
rng.End.Column++
}
b = b[adv:]
}
TokenType:
switch slice[0] {
case '\\':
if !quoted {
// If we're not in quoted mode then just treat this token as
// normal. (Slices can still start with backslash even if we're
// not specifically looking for backslash sequences.)
break TokenType
}
if len(slice) < 2 {
diags = append(diags, &hcl.Diagnostic{
Severity: hcl.DiagError,
Summary: "Invalid escape sequence",
Detail: detail,
Subject: &hcl.Range{
Filename: tok.Range.Filename,
Start: hcl.Pos{
Line: pos.Line,
Column: pos.Column - 1, // safe because we know the previous character must be a backslash
Byte: pos.Byte - 1,
},
End: hcl.Pos{
Line: pos.Line,
Column: pos.Column + 1, // safe because we know the previous character must be a backslash
Byte: pos.Byte + len(ch),
},
},
Detail: "Backslash must be followed by an escape sequence selector character.",
Subject: rng.Ptr(),
})
ret = append(ret, ch...)
esc = esc[:0]
continue Character
break TokenType
}
case '$', '%':
switch len(esc) {
case 1:
if len(ch) == 1 && ch[0] == esc[0] {
esc = append(esc, ch[0])
continue Character
}
switch slice[1] {
// Any other character means this wasn't an escape sequence
// after all.
ret = append(ret, esc...)
ret = append(ret, ch...)
esc = esc[:0]
case 2:
if len(ch) == 1 && ch[0] == '{' {
// successful escape sequence
ret = append(ret, esc[0])
} else {
// not an escape sequence, so just output literal
ret = append(ret, esc...)
}
ret = append(ret, ch...)
esc = esc[:0]
default:
// should never happen
panic("have invalid escape sequence >2 characters")
case 'n':
ret = append(ret, '\n')
continue Slices
case 'r':
ret = append(ret, '\r')
continue Slices
case 't':
ret = append(ret, '\t')
continue Slices
case '"':
ret = append(ret, '"')
continue Slices
case '\\':
ret = append(ret, '\\')
continue Slices
case 'u', 'U':
if slice[1] == 'u' && len(slice) != 6 {
diags = append(diags, &hcl.Diagnostic{
Severity: hcl.DiagError,
Summary: "Invalid escape sequence",
Detail: "The \\u escape sequence must be followed by four hexadecimal digits.",
Subject: rng.Ptr(),
})
break TokenType
} else if slice[1] == 'U' && len(slice) != 10 {
diags = append(diags, &hcl.Diagnostic{
Severity: hcl.DiagError,
Summary: "Invalid escape sequence",
Detail: "The \\U escape sequence must be followed by eight hexadecimal digits.",
Subject: rng.Ptr(),
})
break TokenType
}
}
} else {
if len(ch) == 1 {
switch ch[0] {
case '\\':
if quoted { // ignore backslashes in unquoted mode
esc = append(esc, '\\')
continue Character
}
case '$':
esc = append(esc, '$')
continue Character
case '%':
esc = append(esc, '%')
continue Character
numHex := string(slice[2:])
num, err := strconv.ParseUint(numHex, 16, 32)
if err != nil {
// Should never happen because the scanner won't match
// a sequence of digits that isn't valid.
panic(err)
}
r := rune(num)
l := utf8.RuneLen(r)
if l == -1 {
diags = append(diags, &hcl.Diagnostic{
Severity: hcl.DiagError,
Summary: "Invalid escape sequence",
Detail: fmt.Sprintf("Cannot encode character U+%04x in UTF-8.", num),
Subject: rng.Ptr(),
})
break TokenType
}
for i := 0; i < l; i++ {
ret = append(ret, 0)
}
rb := ret[len(ret)-l:]
utf8.EncodeRune(rb, r)
continue Slices
default:
diags = append(diags, &hcl.Diagnostic{
Severity: hcl.DiagError,
Summary: "Invalid escape sequence",
Detail: fmt.Sprintf("The symbol %q is not a valid escape sequence selector.", slice[1:]),
Subject: rng.Ptr(),
})
ret = append(ret, slice[1:]...)
continue Slices
}
ret = append(ret, ch...)
case '$', '%':
if len(slice) != 3 {
// Not long enough to be our escape sequence, so it's literal.
break TokenType
}
if slice[1] == slice[0] && slice[2] == '{' {
ret = append(ret, slice[0])
ret = append(ret, '{')
continue Slices
}
break TokenType
}
}
// if we still have an outstanding "esc" when we fall out here then
// the literal ended with an unterminated escape sequence, which we
// must now deal with.
if len(esc) > 0 {
if esc[0] == '\\' {
// An incomplete backslash sequence is an error, since it suggests
// that e.g. the user started writing a \uXXXX sequence but didn't
// provide enough hex digits.
diags = append(diags, &hcl.Diagnostic{
Severity: hcl.DiagError,
Summary: "Invalid escape sequence",
Detail: fmt.Sprintf("The characters %q do not form a complete escape sequence.", esc),
Subject: &hcl.Range{
Filename: tok.Range.Filename,
Start: hcl.Pos{
Line: pos.Line,
Column: pos.Column,
Byte: pos.Byte,
},
End: hcl.Pos{
Line: pos.Line,
Column: pos.Column + len(esc),
Byte: pos.Byte + len(esc),
},
},
})
}
// This might also be an incomplete $${ or %%{ escape sequence, but
// that's treated as a literal rather than an error since those only
// count as escape sequences when all three characters are present.
ret = append(ret, esc...)
esc = nil
// If we fall out here or break out of here from the switch above
// then this slice is just a literal.
ret = append(ret, slice...)
}
return string(ret), diags

View File

@ -765,6 +765,56 @@ block "valid" {}
},
},
},
{
"a = \"\\uu2022\"\n",
1, // \u must be followed by four hex digits
&Body{
Attributes: Attributes{
"a": {
Name: "a",
Expr: &TemplateExpr{
Parts: []Expression{
&LiteralValueExpr{
Val: cty.StringVal("\\uu2022"),
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 6, Byte: 5},
End: hcl.Pos{Line: 1, Column: 13, Byte: 12},
},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 5, Byte: 4},
End: hcl.Pos{Line: 1, Column: 14, Byte: 13},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 14, Byte: 13},
},
NameRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 2, Byte: 1},
},
EqualsRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 3, Byte: 2},
End: hcl.Pos{Line: 1, Column: 4, Byte: 3},
},
},
},
Blocks: Blocks{},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 2, Column: 1, Byte: 14},
},
EndRange: hcl.Range{
Start: hcl.Pos{Line: 2, Column: 1, Byte: 14},
End: hcl.Pos{Line: 2, Column: 1, Byte: 14},
},
},
},
{
"a = \"\\U0001d11e\"\n",
0,
@ -968,6 +1018,106 @@ block "valid" {}
},
},
},
{
"a = \"\\U00300000\"\n",
1, // Invalid unicode character (can't encode in UTF-8)
&Body{
Attributes: Attributes{
"a": {
Name: "a",
Expr: &TemplateExpr{
Parts: []Expression{
&LiteralValueExpr{
Val: cty.StringVal("\\U00300000"),
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 6, Byte: 5},
End: hcl.Pos{Line: 1, Column: 16, Byte: 15},
},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 5, Byte: 4},
End: hcl.Pos{Line: 1, Column: 17, Byte: 16},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 17, Byte: 16},
},
NameRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 2, Byte: 1},
},
EqualsRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 3, Byte: 2},
End: hcl.Pos{Line: 1, Column: 4, Byte: 3},
},
},
},
Blocks: Blocks{},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 2, Column: 1, Byte: 17},
},
EndRange: hcl.Range{
Start: hcl.Pos{Line: 2, Column: 1, Byte: 17},
End: hcl.Pos{Line: 2, Column: 1, Byte: 17},
},
},
},
{
"a = \"\\Ub2705550\"\n",
1, // Invalid unicode character (can't encode in UTF-8)
&Body{
Attributes: Attributes{
"a": {
Name: "a",
Expr: &TemplateExpr{
Parts: []Expression{
&LiteralValueExpr{
Val: cty.StringVal("\\Ub2705550"),
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 6, Byte: 5},
End: hcl.Pos{Line: 1, Column: 16, Byte: 15},
},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 5, Byte: 4},
End: hcl.Pos{Line: 1, Column: 17, Byte: 16},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 17, Byte: 16},
},
NameRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 2, Byte: 1},
},
EqualsRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 3, Byte: 2},
End: hcl.Pos{Line: 1, Column: 4, Byte: 3},
},
},
},
Blocks: Blocks{},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 2, Column: 1, Byte: 17},
},
EndRange: hcl.Range{
Start: hcl.Pos{Line: 2, Column: 1, Byte: 17},
End: hcl.Pos{Line: 2, Column: 1, Byte: 17},
},
},
},
{
"a = foo.bar\n",
0,

View File

@ -0,0 +1,301 @@
// line 1 "scan_string_lit.rl"
package hclsyntax
// This file is generated from scan_string_lit.rl. DO NOT EDIT.
// line 9 "scan_string_lit.go"
var _hclstrtok_actions []byte = []byte{
0, 1, 0, 1, 1, 2, 1, 0,
}
var _hclstrtok_key_offsets []byte = []byte{
0, 0, 2, 4, 6, 10, 14, 18,
22, 27, 31, 36, 41, 46, 51, 57,
62, 74, 85, 96, 107, 118, 129, 140,
151,
}
var _hclstrtok_trans_keys []byte = []byte{
128, 191, 128, 191, 128, 191, 10, 13,
36, 37, 10, 13, 36, 37, 10, 13,
36, 37, 10, 13, 36, 37, 10, 13,
36, 37, 123, 10, 13, 36, 37, 10,
13, 36, 37, 92, 10, 13, 36, 37,
92, 10, 13, 36, 37, 92, 10, 13,
36, 37, 92, 10, 13, 36, 37, 92,
123, 10, 13, 36, 37, 92, 85, 117,
128, 191, 192, 223, 224, 239, 240, 247,
248, 255, 10, 13, 36, 37, 92, 48,
57, 65, 70, 97, 102, 10, 13, 36,
37, 92, 48, 57, 65, 70, 97, 102,
10, 13, 36, 37, 92, 48, 57, 65,
70, 97, 102, 10, 13, 36, 37, 92,
48, 57, 65, 70, 97, 102, 10, 13,
36, 37, 92, 48, 57, 65, 70, 97,
102, 10, 13, 36, 37, 92, 48, 57,
65, 70, 97, 102, 10, 13, 36, 37,
92, 48, 57, 65, 70, 97, 102, 10,
13, 36, 37, 92, 48, 57, 65, 70,
97, 102,
}
var _hclstrtok_single_lengths []byte = []byte{
0, 0, 0, 0, 4, 4, 4, 4,
5, 4, 5, 5, 5, 5, 6, 5,
2, 5, 5, 5, 5, 5, 5, 5,
5,
}
var _hclstrtok_range_lengths []byte = []byte{
0, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
5, 3, 3, 3, 3, 3, 3, 3,
3,
}
var _hclstrtok_index_offsets []byte = []byte{
0, 0, 2, 4, 6, 11, 16, 21,
26, 32, 37, 43, 49, 55, 61, 68,
74, 82, 91, 100, 109, 118, 127, 136,
145,
}
var _hclstrtok_indicies []byte = []byte{
0, 1, 2, 1, 3, 1, 5, 6,
7, 8, 4, 10, 11, 12, 13, 9,
14, 11, 12, 13, 9, 10, 11, 15,
13, 9, 10, 11, 12, 13, 14, 9,
10, 11, 12, 15, 9, 17, 18, 19,
20, 21, 16, 23, 24, 25, 26, 27,
22, 0, 24, 25, 26, 27, 22, 23,
24, 28, 26, 27, 22, 23, 24, 25,
26, 27, 0, 22, 23, 24, 25, 28,
27, 22, 29, 30, 22, 2, 3, 31,
22, 0, 23, 24, 25, 26, 27, 32,
32, 32, 22, 23, 24, 25, 26, 27,
33, 33, 33, 22, 23, 24, 25, 26,
27, 34, 34, 34, 22, 23, 24, 25,
26, 27, 30, 30, 30, 22, 23, 24,
25, 26, 27, 35, 35, 35, 22, 23,
24, 25, 26, 27, 36, 36, 36, 22,
23, 24, 25, 26, 27, 37, 37, 37,
22, 23, 24, 25, 26, 27, 0, 0,
0, 22,
}
var _hclstrtok_trans_targs []byte = []byte{
11, 0, 1, 2, 4, 5, 6, 7,
9, 4, 5, 6, 7, 9, 5, 8,
10, 11, 12, 13, 15, 16, 10, 11,
12, 13, 15, 16, 14, 17, 21, 3,
18, 19, 20, 22, 23, 24,
}
var _hclstrtok_trans_actions []byte = []byte{
0, 0, 0, 0, 0, 1, 1, 1,
1, 3, 5, 5, 5, 5, 0, 0,
0, 1, 1, 1, 1, 1, 3, 5,
5, 5, 5, 5, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
}
var _hclstrtok_eof_actions []byte = []byte{
0, 0, 0, 0, 0, 3, 3, 3,
3, 3, 0, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
3,
}
const hclstrtok_start int = 4
const hclstrtok_first_final int = 4
const hclstrtok_error int = 0
const hclstrtok_en_quoted int = 10
const hclstrtok_en_unquoted int = 4
// line 10 "scan_string_lit.rl"

// scanStringLit partitions data into a sequence of byte slices by running
// the Ragel-generated "hclstrtok" state machine over it (the grammar itself
// lives in scan_string_lit.rl; this file is generated output).
//
// The quoted flag selects the machine's entry state: hclstrtok_en_quoted
// when true, hclstrtok_en_unquoted otherwise.
//
// Every returned slice is a sub-slice of data (no bytes are copied). Token
// slices are emitted by the machine's "End" action; any bytes skipped
// between tokens are emitted as their own slice by the "Begin" action or by
// the trailing flush after the machine stops.
func scanStringLit(data []byte, quoted bool) [][]byte {
var ret [][]byte
// line 61 "scan_string_lit.rl"
// Ragel state
p := 0 // "Pointer" into data
pe := len(data) // End-of-data "pointer"
// ts is set to p when a token begins (action 0 below).
ts := 0
// te is set to p when a token ends (action 1 below); data[te:p] is
// therefore the run of bytes seen since the last emitted token.
te := 0
eof := pe
var cs int // current state
switch {
case quoted:
cs = hclstrtok_en_quoted
default:
cs = hclstrtok_en_unquoted
}
// Make Go compiler happy
_ = ts
_ = eof
/*token := func () {
ret = append(ret, data[ts:te])
}*/
// line 154 "scan_string_lit.go"
{
}
// line 158 "scan_string_lit.go"
{
var _klen int
var _trans int
var _acts int
var _nacts uint
var _keys int
// Empty input: skip straight to the end-of-input handling.
if p == pe {
goto _test_eof
}
// State 0 is hclstrtok_error; nothing further can match.
if cs == 0 {
goto _out
}
_resume:
_keys = int(_hclstrtok_key_offsets[cs])
_trans = int(_hclstrtok_index_offsets[cs])
// First try the current state's single-byte keys, using a binary search
// over _hclstrtok_trans_keys.
_klen = int(_hclstrtok_single_lengths[cs])
if _klen > 0 {
_lower := int(_keys)
var _mid int
_upper := int(_keys + _klen - 1)
for {
if _upper < _lower {
break
}
_mid = _lower + ((_upper - _lower) >> 1)
switch {
case data[p] < _hclstrtok_trans_keys[_mid]:
_upper = _mid - 1
case data[p] > _hclstrtok_trans_keys[_mid]:
_lower = _mid + 1
default:
_trans += int(_mid - int(_keys))
goto _match
}
}
_keys += _klen
_trans += _klen
}
// Then try the state's range keys, which are stored as (low, high) pairs;
// the search steps two entries at a time.
_klen = int(_hclstrtok_range_lengths[cs])
if _klen > 0 {
_lower := int(_keys)
var _mid int
_upper := int(_keys + (_klen << 1) - 2)
for {
if _upper < _lower {
break
}
_mid = _lower + (((_upper - _lower) >> 1) & ^1)
switch {
case data[p] < _hclstrtok_trans_keys[_mid]:
_upper = _mid - 2
case data[p] > _hclstrtok_trans_keys[_mid+1]:
_lower = _mid + 2
default:
_trans += int((_mid - int(_keys)) >> 1)
goto _match
}
}
// No key matched: _trans now indexes the state's default transition.
_trans += _klen
}
_match:
// Resolve the transition, move to its target state, and run any actions
// attached to it.
_trans = int(_hclstrtok_indicies[_trans])
cs = int(_hclstrtok_trans_targs[_trans])
if _hclstrtok_trans_actions[_trans] == 0 {
goto _again
}
_acts = int(_hclstrtok_trans_actions[_trans])
_nacts = uint(_hclstrtok_actions[_acts])
_acts++
for ; _nacts > 0; _nacts-- {
_acts++
switch _hclstrtok_actions[_acts-1] {
case 0:
// line 40 "scan_string_lit.rl"
// If te is behind p then we've skipped over some literal
// characters which we must now return.
if te < p {
ret = append(ret, data[te:p])
}
ts = p
case 1:
// line 48 "scan_string_lit.rl"
te = p
ret = append(ret, data[ts:te])
// line 255 "scan_string_lit.go"
}
}
_again:
// Stop without advancing p if we landed in the error state, so that p
// still points at the byte that could not be matched.
if cs == 0 {
goto _out
}
p++
if p != pe {
goto _resume
}
_test_eof:
{
}
// At end of input, run the current state's EOF actions (only action 1,
// which closes and emits a token, is handled here).
if p == eof {
__acts := _hclstrtok_eof_actions[cs]
__nacts := uint(_hclstrtok_actions[__acts])
__acts++
for ; __nacts > 0; __nacts-- {
__acts++
switch _hclstrtok_actions[__acts-1] {
case 1:
// line 48 "scan_string_lit.rl"
te = p
ret = append(ret, data[ts:te])
// line 281 "scan_string_lit.go"
}
}
}
_out:
{
}
}
// line 89 "scan_string_lit.rl"
if te < p {
// Collect any leftover literal characters at the end of the input
ret = append(ret, data[te:p])
}
// If we fall out here without being in a final state then we've
// encountered something that the scanner can't match, which should
// be impossible (the scanner matches all bytes _somehow_) but we'll
// tolerate it and let the caller deal with it.
if cs < hclstrtok_first_final {
ret = append(ret, data[p:len(data)])
}
return ret
}

View File

@ -0,0 +1,105 @@
package hclsyntax
// This file is generated from scan_string_lit.rl. DO NOT EDIT.
%%{
# (except you are actually in scan_string_lit.rl here, so edit away!)
machine hclstrtok;
write data;
}%%
// scanStringLit partitions the raw bytes of a string literal into a
// sequence of slices that are easier for a decoder to process one at a time.
//
// In quoted mode the tokens are backslash escapes (including possibly
// incomplete \u and \U sequences), template escapes built from "$" or "%",
// and newlines. In unquoted mode only template escapes and newlines are
// tokens, so backslashes are left inside the literal runs.
//
// Runs of bytes between tokens are returned as their own slices. All
// returned slices are sub-slices of data; nothing is copied. Tokens are not
// validated here: the caller is expected to check each one.
func scanStringLit(data []byte, quoted bool) [][]byte {
var ret [][]byte
%%{
include UnicodeDerived "unicode_derived.rl";
UTF8Cont = 0x80 .. 0xBF;
AnyUTF8 = (
0x00..0x7F |
0xC0..0xDF . UTF8Cont |
0xE0..0xEF . UTF8Cont . UTF8Cont |
0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
);
BadUTF8 = any - AnyUTF8;
Hex = ('0'..'9' | 'a'..'f' | 'A'..'F');
# Our goal with these patterns is to capture user intent as best as
# possible, even if the input is invalid. The caller will then verify
# whether each token is valid and generate suitable error messages
# if not.
UnicodeEscapeShort = "\\u" . Hex{0,4};
UnicodeEscapeLong = "\\U" . Hex{0,8};
UnicodeEscape = (UnicodeEscapeShort | UnicodeEscapeLong);
# A lone trailing backslash is also accepted here (the selector
# character is optional) so that the caller can report it.
SimpleEscape = "\\" . (AnyUTF8 - ('U'|'u'))?;
# Matches "$", "$$" or "$${" (and likewise for "%"); only the full
# three-character form is a real escape, but the shorter prefixes are
# still tokenized.
TemplateEscape = ("$" . ("$" . ("{"?))?) | ("%" . ("%" . ("{"?))?);
Newline = ("\r\n" | "\r" | "\n");
# Begin runs when a token starts: it first flushes any literal bytes
# seen since the previous token ended, then records the token start.
action Begin {
// If te is behind p then we've skipped over some literal
// characters which we must now return.
if te < p {
ret = append(ret, data[te:p])
}
ts = p;
}
# End runs when a token finishes: it records the token end and emits
# the token's bytes.
action End {
te = p;
ret = append(ret, data[ts:te]);
}
# ">Begin" attaches Begin as an entering action and "%End" attaches End
# as a leaving action on each token machine.
QuotedToken = (UnicodeEscape | SimpleEscape | TemplateEscape | Newline) >Begin %End;
UnquotedToken = (TemplateEscape | Newline) >Begin %End;
QuotedLiteral = (any - ("\\" | "$" | "%" | "\r" | "\n"));
UnquotedLiteral = (any - ("$" | "%" | "\r" | "\n"));
# "**" is Ragel's longest-match Kleene star, so a token consumes as many
# bytes as it can before the machine moves on.
quoted := (QuotedToken | QuotedLiteral)**;
unquoted := (UnquotedToken | UnquotedLiteral)**;
}%%
// Ragel state
p := 0 // "Pointer" into data
pe := len(data) // End-of-data "pointer"
ts := 0
te := 0
eof := pe
var cs int // current state
// Pick the machine entry point that matches the requested mode.
switch {
case quoted:
cs = hclstrtok_en_quoted
default:
cs = hclstrtok_en_unquoted
}
// Make Go compiler happy
_ = ts
_ = eof
/*token := func () {
ret = append(ret, data[ts:te])
}*/
%%{
write init nocs;
write exec;
}%%
if te < p {
// Collect any leftover literal characters at the end of the input
ret = append(ret, data[te:p])
}
// If we fall out here without being in a final state then we've
// encountered something that the scanner can't match, which should
// be impossible (the scanner matches all bytes _somehow_) but we'll
// tolerate it and let the caller deal with it.
if cs < hclstrtok_first_final {
ret = append(ret, data[p:len(data)])
}
return ret
}

View File

@ -0,0 +1,202 @@
package hclsyntax
import (
"reflect"
"testing"
"github.com/davecgh/go-spew/spew"
)
// TestScanStringLit checks how scanStringLit partitions a variety of inputs,
// running every case once in quoted mode and once in unquoted mode and
// comparing the resulting slices (as strings) against the expected lists.
func TestScanStringLit(t *testing.T) {
	type scanCase struct {
		Input        string
		WantQuoted   []string
		WantUnquoted []string
	}

	cases := []scanCase{
		{
			``,
			[]string{},
			[]string{},
		},
		{
			`hello`,
			[]string{`hello`},
			[]string{`hello`},
		},
		{
			`hello world`,
			[]string{`hello world`},
			[]string{`hello world`},
		},
		{
			`hello\nworld`,
			[]string{`hello`, `\n`, `world`},
			[]string{`hello\nworld`},
		},
		{
			`hello\🥁world`,
			[]string{`hello`, `\🥁`, `world`},
			[]string{`hello\🥁world`},
		},
		{
			`hello\uabcdworld`,
			[]string{`hello`, `\uabcd`, `world`},
			[]string{`hello\uabcdworld`},
		},
		{
			`hello\uabcdabcdworld`,
			[]string{`hello`, `\uabcd`, `abcdworld`},
			[]string{`hello\uabcdabcdworld`},
		},
		{
			`hello\uabcworld`,
			[]string{`hello`, `\uabc`, `world`},
			[]string{`hello\uabcworld`},
		},
		{
			`hello\U01234567world`,
			[]string{`hello`, `\U01234567`, `world`},
			[]string{`hello\U01234567world`},
		},
		{
			`hello\U012345670123world`,
			[]string{`hello`, `\U01234567`, `0123world`},
			[]string{`hello\U012345670123world`},
		},
		{
			`hello\Uabcdworld`,
			[]string{`hello`, `\Uabcd`, `world`},
			[]string{`hello\Uabcdworld`},
		},
		{
			`hello\Uabcworld`,
			[]string{`hello`, `\Uabc`, `world`},
			[]string{`hello\Uabcworld`},
		},
		{
			`hello\uworld`,
			[]string{`hello`, `\u`, `world`},
			[]string{`hello\uworld`},
		},
		{
			`hello\Uworld`,
			[]string{`hello`, `\U`, `world`},
			[]string{`hello\Uworld`},
		},
		{
			`hello\u`,
			[]string{`hello`, `\u`},
			[]string{`hello\u`},
		},
		{
			`hello\U`,
			[]string{`hello`, `\U`},
			[]string{`hello\U`},
		},
		{
			`hello\`,
			[]string{`hello`, `\`},
			[]string{`hello\`},
		},
		{
			`hello$${world}`,
			[]string{`hello`, `$${`, `world}`},
			[]string{`hello`, `$${`, `world}`},
		},
		{
			`hello$$world`,
			[]string{`hello`, `$$`, `world`},
			[]string{`hello`, `$$`, `world`},
		},
		{
			`hello$world`,
			[]string{`hello`, `$`, `world`},
			[]string{`hello`, `$`, `world`},
		},
		{
			`hello$`,
			[]string{`hello`, `$`},
			[]string{`hello`, `$`},
		},
		{
			`hello$${`,
			[]string{`hello`, `$${`},
			[]string{`hello`, `$${`},
		},
		{
			`hello%%{world}`,
			[]string{`hello`, `%%{`, `world}`},
			[]string{`hello`, `%%{`, `world}`},
		},
		{
			`hello%%world`,
			[]string{`hello`, `%%`, `world`},
			[]string{`hello`, `%%`, `world`},
		},
		{
			`hello%world`,
			[]string{`hello`, `%`, `world`},
			[]string{`hello`, `%`, `world`},
		},
		{
			`hello%`,
			[]string{`hello`, `%`},
			[]string{`hello`, `%`},
		},
		{
			`hello%%{`,
			[]string{`hello`, `%%{`},
			[]string{`hello`, `%%{`},
		},
		{
			`hello\${world}`,
			[]string{`hello`, `\$`, `{world}`},
			[]string{`hello\`, `$`, `{world}`},
		},
		{
			`hello\%{world}`,
			[]string{`hello`, `\%`, `{world}`},
			[]string{`hello\`, `%`, `{world}`},
		},
		{
			"hello\nworld",
			[]string{`hello`, "\n", `world`},
			[]string{`hello`, "\n", `world`},
		},
		{
			"hello\rworld",
			[]string{`hello`, "\r", `world`},
			[]string{`hello`, "\r", `world`},
		},
		{
			"hello\r\nworld",
			[]string{`hello`, "\r\n", `world`},
			[]string{`hello`, "\r\n", `world`},
		},
	}

	// verify scans input in the requested mode and reports a test error if
	// the resulting slices, converted to strings, differ from want.
	verify := func(t *testing.T, input string, quoted bool, want []string) {
		parts := scanStringLit([]byte(input), quoted)
		// Start from a non-nil slice so that an empty result still compares
		// equal to an empty (non-nil) expectation under reflect.DeepEqual.
		got := make([]string, 0, len(parts))
		for _, part := range parts {
			got = append(got, string(part))
		}
		if reflect.DeepEqual(got, want) {
			return
		}
		t.Errorf("wrong result\ngot: %swant: %s", spew.Sdump(got), spew.Sdump(want))
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.Input, func(t *testing.T) {
			t.Run("quoted", func(t *testing.T) {
				verify(t, tc.Input, true, tc.WantQuoted)
			})
			t.Run("unquoted", func(t *testing.T) {
				verify(t, tc.Input, false, tc.WantUnquoted)
			})
		})
	}
}