hclsyntax: rewrite string literal decoder with ragel

Fuzz testing revealed that there were a few different crashers in the string literal decoder, which was previously a rather unwieldy hand-written scanner with manually-implemented lookahead. Rather than continuing to hand-tweak that code, here instead we use ragel (which we were already using for the main scanner anyway) to partition our string literals into tokens that are easier for our decoder to wrangle. As a bonus, this also makes our source ranges in our diagnostics more accurate.
2018-02-04 18:35:42 -08:00 · 2018-02-04 18:35:42 -08:00 · cfd802163b
commit cfd802163b
parent 93a7008e3d
7 changed files with 897 additions and 233 deletions
--- a/hcl/hclsyntax/expression_template_test.go
+++ b/hcl/hclsyntax/expression_template_test.go
@ -217,6 +217,18 @@ trim`,
 			cty.StringVal("a\nb\nc\n"),
 			0,
 		},
+		{
+			`\n`, // backslash escapes are not interpreted in template literals
+			nil,
+			cty.StringVal("\\n"),
+			0,
+		},
+		{
+			`\uu1234`, // backslash escapes are not interpreted in template literals
+			nil,       // (this is intentionally an invalid one to ensure we don't produce an error)
+			cty.StringVal("\\uu1234"),
+			0,
+		},
 	}

 	for _, test := range tests {
--- a/hcl/hclsyntax/generate.go
+++ b/hcl/hclsyntax/generate.go
@ -4,4 +4,6 @@ package hclsyntax
 //go:generate ruby unicode2ragel.rb --url=http://www.unicode.org/Public/9.0.0/ucd/DerivedCoreProperties.txt -m UnicodeDerived -p ID_Start,ID_Continue -o unicode_derived.rl
 //go:generate ragel -Z scan_tokens.rl
 //go:generate gofmt -w scan_tokens.go
+//go:generate ragel -Z scan_string_lit.rl
+//go:generate gofmt -w scan_string_lit.go
 //go:generate stringer -type TokenType -output token_type_string.go
--- a/hcl/hclsyntax/parser.go
+++ b/hcl/hclsyntax/parser.go
@ -1,7 +1,6 @@
 package hclsyntax

 import (
-	"bufio"
 	"bytes"
 	"fmt"
 	"strconv"
@ -1478,256 +1477,149 @@ func (p *parser) decodeStringLit(tok Token) (string, hcl.Diagnostics) {
 	var diags hcl.Diagnostics

 	ret := make([]byte, 0, len(tok.Bytes))
-	var esc []byte
+	slices := scanStringLit(tok.Bytes, quoted)

-	sc := bufio.NewScanner(bytes.NewReader(tok.Bytes))
-	sc.Split(textseg.ScanGraphemeClusters)
+	// We will mutate rng constantly as we walk through our token slices below.
+	// Any diagnostics must take a copy of this rng rather than simply pointing
+	// to it, e.g. by using rng.Ptr() rather than &rng.
+	rng := tok.Range
+	rng.End = rng.Start

-	pos := tok.Range.Start
-	newPos := pos
-Character:
-	for sc.Scan() {
-		pos = newPos
-		ch := sc.Bytes()
-
-		// Adjust position based on our new character.
-		// \r\n is considered to be a single character in text segmentation,
-		if (len(ch) == 1 && ch[0] == '\n') || (len(ch) == 2 && ch[1] == '\n') {
-			newPos.Line++
-			newPos.Column = 0
-		} else {
-			newPos.Column++
+Slices:
+	for _, slice := range slices {
+		if len(slice) == 0 {
+			continue
 		}
-		newPos.Byte += len(ch)

-		if len(esc) > 0 {
-			switch esc[0] {
-			case '\\':
+		// Advance the start of our range to where the previous token ended
+		rng.Start = rng.End

-				if len(esc) >= 2 {
-					switch esc[1] {
-					case 'u', 'U':
-						// Our new character must be an ASCII hex digit
-						_, err := strconv.ParseInt(string(ch), 16, 0)
-						if err != nil {
-							var detail string
-							switch esc[1] {
-							case 'u':
-								detail = "Escape sequence \\u must be followed by exactly four hexidecimal digits."
-							case 'U':
-								detail = "Escape sequence \\U must be followed by exactly eight hexidecimal digits."
-							}
-							diags = append(diags, &hcl.Diagnostic{
-								Severity: hcl.DiagError,
-								Summary:  "Invalid escape sequence",
-								Detail:   detail,
-								Subject: &hcl.Range{
-									Filename: tok.Range.Filename,
-									Start: hcl.Pos{
-										Line:   pos.Line,
-										Column: pos.Column,
-										Byte:   pos.Byte,
-									},
-									End: hcl.Pos{
-										Line:   pos.Line,
-										Column: pos.Column + 1,
-										Byte:   pos.Byte + len(ch),
-									},
-								},
-							})
-							ret = append(ret, esc...)
-							ret = append(ret, ch...)
-							esc = esc[:0]
-							continue Character
-						}
-
-						esc = append(esc, ch...)
-
-						var complete bool
-						switch esc[1] {
-						case 'u':
-							complete = (len(esc) == 6) // four digits plus our \u introducer
-						case 'U':
-							complete = (len(esc) == 10) // eight digits plus our \U introducer
-						}
-						if !complete {
-							// Keep accumulating more digits, then
-							continue Character
-						}
-
-						digits := string(esc[2:])
-						valInt, err := strconv.ParseInt(digits, 16, 32)
-						if err != nil {
-							// Should never happen because we validated our digits
-							// as they arrived, above.
-							panic(err)
-						}
-						r := rune(valInt)
-						rl := utf8.RuneLen(r)
-
-						// Make room in our ret buffer for the extra characters
-						for i := 0; i < rl; i++ {
-							ret = append(ret, 0)
-						}
-
-						// Fill those extra characters with the canonical UTF-8
-						// representation of our rune.
-						utf8.EncodeRune(ret[len(ret)-rl:], r)
-
-						// ...and now finally we're finished escaping!
-						esc = esc[:0]
-
-						continue Character
-					}
-				}
-
-				if len(ch) == 1 {
-					switch ch[0] {
-
-					case 'n':
-						ret = append(ret, '\n')
-						esc = esc[:0]
-						continue Character
-					case 'r':
-						ret = append(ret, '\r')
-						esc = esc[:0]
-						continue Character
-					case 't':
-						ret = append(ret, '\t')
-						esc = esc[:0]
-						continue Character
-					case '"':
-						ret = append(ret, '"')
-						esc = esc[:0]
-						continue Character
-					case '\\':
-						ret = append(ret, '\\')
-						esc = esc[:0]
-						continue Character
-					case 'u', 'U':
-						// For these, we'll continue working on them until
-						// we accumulate the expected number of digits.
-						esc = append(esc, ch...)
-						continue Character
-					}
-				}
-
-				var detail string
-				switch {
-				case len(ch) == 1 && (ch[0] == '$' || ch[0] == '%'):
-					detail = fmt.Sprintf(
-						"The characters \"\\%s\" do not form a recognized escape sequence. To escape a \"%s{\" template sequence, use \"%s%s{\".",
-						ch, ch, ch, ch,
-					)
-				default:
-					detail = fmt.Sprintf("The characters \"\\%s\" do not form a recognized escape sequence.", ch)
-				}
+		// Advance the end of our range to after our token.
+		b := slice
+		for len(b) > 0 {
+			adv, ch, _ := textseg.ScanGraphemeClusters(b, true)
+			rng.End.Byte += adv
+			switch ch[0] {
+			case '\r', '\n':
+				rng.End.Line++
+				rng.End.Column = 1
+			default:
+				rng.End.Column++
+			}
+			b = b[adv:]
+		}

+	TokenType:
+		switch slice[0] {
+		case '\\':
+			if !quoted {
+				// If we're not in quoted mode then just treat this token as
+				// normal. (Slices can still start with backslash even if we're
+				// not specifically looking for backslash sequences.)
+				break TokenType
+			}
+			if len(slice) < 2 {
 				diags = append(diags, &hcl.Diagnostic{
 					Severity: hcl.DiagError,
 					Summary:  "Invalid escape sequence",
-					Detail:   detail,
-					Subject: &hcl.Range{
-						Filename: tok.Range.Filename,
-						Start: hcl.Pos{
-							Line:   pos.Line,
-							Column: pos.Column - 1, // safe because we know the previous character must be a backslash
-							Byte:   pos.Byte - 1,
-						},
-						End: hcl.Pos{
-							Line:   pos.Line,
-							Column: pos.Column + 1, // safe because we know the previous character must be a backslash
-							Byte:   pos.Byte + len(ch),
-						},
-					},
+					Detail:   "Backslash must be followed by an escape sequence selector character.",
+					Subject:  rng.Ptr(),
 				})
-				ret = append(ret, ch...)
-				esc = esc[:0]
-				continue Character
+				break TokenType
+			}

-			case '$', '%':
-				switch len(esc) {
-				case 1:
-					if len(ch) == 1 && ch[0] == esc[0] {
-						esc = append(esc, ch[0])
-						continue Character
-					}
+			switch slice[1] {

-					// Any other character means this wasn't an escape sequence
-					// after all.
-					ret = append(ret, esc...)
-					ret = append(ret, ch...)
-					esc = esc[:0]
-				case 2:
-					if len(ch) == 1 && ch[0] == '{' {
-						// successful escape sequence
-						ret = append(ret, esc[0])
-					} else {
-						// not an escape sequence, so just output literal
-						ret = append(ret, esc...)
-					}
-					ret = append(ret, ch...)
-					esc = esc[:0]
-				default:
-					// should never happen
-					panic("have invalid escape sequence >2 characters")
+			case 'n':
+				ret = append(ret, '\n')
+				continue Slices
+			case 'r':
+				ret = append(ret, '\r')
+				continue Slices
+			case 't':
+				ret = append(ret, '\t')
+				continue Slices
+			case '"':
+				ret = append(ret, '"')
+				continue Slices
+			case '\\':
+				ret = append(ret, '\\')
+				continue Slices
+			case 'u', 'U':
+				if slice[1] == 'u' && len(slice) != 6 {
+					diags = append(diags, &hcl.Diagnostic{
+						Severity: hcl.DiagError,
+						Summary:  "Invalid escape sequence",
+						Detail:   "The \\u escape sequence must be followed by four hexadecimal digits.",
+						Subject:  rng.Ptr(),
+					})
+					break TokenType
+				} else if slice[1] == 'U' && len(slice) != 10 {
+					diags = append(diags, &hcl.Diagnostic{
+						Severity: hcl.DiagError,
+						Summary:  "Invalid escape sequence",
+						Detail:   "The \\U escape sequence must be followed by eight hexadecimal digits.",
+						Subject:  rng.Ptr(),
+					})
+					break TokenType
 				}

-			}
-		} else {
-			if len(ch) == 1 {
-				switch ch[0] {
-				case '\\':
-					if quoted { // ignore backslashes in unquoted mode
-						esc = append(esc, '\\')
-						continue Character
-					}
-				case '$':
-					esc = append(esc, '$')
-					continue Character
-				case '%':
-					esc = append(esc, '%')
-					continue Character
+				numHex := string(slice[2:])
+				num, err := strconv.ParseUint(numHex, 16, 32)
+				if err != nil {
+					// Should never happen because the scanner won't match
+					// a sequence of digits that isn't valid.
+					panic(err)
 				}
+
+				r := rune(num)
+				l := utf8.RuneLen(r)
+				if l == -1 {
+					diags = append(diags, &hcl.Diagnostic{
+						Severity: hcl.DiagError,
+						Summary:  "Invalid escape sequence",
+						Detail:   fmt.Sprintf("Cannot encode character U+%04x in UTF-8.", num),
+						Subject:  rng.Ptr(),
+					})
+					break TokenType
+				}
+				for i := 0; i < l; i++ {
+					ret = append(ret, 0)
+				}
+				rb := ret[len(ret)-l:]
+				utf8.EncodeRune(rb, r)
+
+				continue Slices
+
+			default:
+				diags = append(diags, &hcl.Diagnostic{
+					Severity: hcl.DiagError,
+					Summary:  "Invalid escape sequence",
+					Detail:   fmt.Sprintf("The symbol %q is not a valid escape sequence selector.", slice[1:]),
+					Subject:  rng.Ptr(),
+				})
+				ret = append(ret, slice[1:]...)
+				continue Slices
 			}
-			ret = append(ret, ch...)
+
+		case '$', '%':
+			if len(slice) != 3 {
+				// Not long enough to be our escape sequence, so it's literal.
+				break TokenType
+			}
+
+			if slice[1] == slice[0] && slice[2] == '{' {
+				ret = append(ret, slice[0])
+				ret = append(ret, '{')
+				continue Slices
+			}
+
+			break TokenType
 		}
-	}

-	// if we still have an outstanding "esc" when we fall out here then
-	// the literal ended with an unterminated escape sequence, which we
-	// must now deal with.
-	if len(esc) > 0 {
-		if esc[0] == '\\' {
-			// An incomplete backslash sequence is an error, since it suggests
-			// that e.g. the user started writing a \uXXXX sequence but didn't
-			// provide enough hex digits.
-			diags = append(diags, &hcl.Diagnostic{
-				Severity: hcl.DiagError,
-				Summary:  "Invalid escape sequence",
-				Detail:   fmt.Sprintf("The characters %q do not form a complete escape sequence.", esc),
-				Subject: &hcl.Range{
-					Filename: tok.Range.Filename,
-					Start: hcl.Pos{
-						Line:   pos.Line,
-						Column: pos.Column,
-						Byte:   pos.Byte,
-					},
-					End: hcl.Pos{
-						Line:   pos.Line,
-						Column: pos.Column + len(esc),
-						Byte:   pos.Byte + len(esc),
-					},
-				},
-			})
-
-		}
-		// This might also be an incomplete $${ or %%{ escape sequence, but
-		// that's treated as a literal rather than an error since those only
-		// count as escape sequences when all three characters are present.
-
-		ret = append(ret, esc...)
-		esc = nil
+		// If we fall out here or break out of here from the switch above
+		// then this slice is just a literal.
+		ret = append(ret, slice...)
 	}

 	return string(ret), diags
--- a/hcl/hclsyntax/parser_test.go
+++ b/hcl/hclsyntax/parser_test.go
@ -765,6 +765,56 @@ block "valid" {}
 				},
 			},
 		},
+		{
+			"a = \"\\uu2022\"\n",
+			1, // \u must be followed by four hex digits
+			&Body{
+				Attributes: Attributes{
+					"a": {
+						Name: "a",
+						Expr: &TemplateExpr{
+							Parts: []Expression{
+								&LiteralValueExpr{
+									Val: cty.StringVal("\\uu2022"),
+
+									SrcRange: hcl.Range{
+										Start: hcl.Pos{Line: 1, Column: 6, Byte: 5},
+										End:   hcl.Pos{Line: 1, Column: 13, Byte: 12},
+									},
+								},
+							},
+
+							SrcRange: hcl.Range{
+								Start: hcl.Pos{Line: 1, Column: 5, Byte: 4},
+								End:   hcl.Pos{Line: 1, Column: 14, Byte: 13},
+							},
+						},
+
+						SrcRange: hcl.Range{
+							Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
+							End:   hcl.Pos{Line: 1, Column: 14, Byte: 13},
+						},
+						NameRange: hcl.Range{
+							Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
+							End:   hcl.Pos{Line: 1, Column: 2, Byte: 1},
+						},
+						EqualsRange: hcl.Range{
+							Start: hcl.Pos{Line: 1, Column: 3, Byte: 2},
+							End:   hcl.Pos{Line: 1, Column: 4, Byte: 3},
+						},
+					},
+				},
+				Blocks: Blocks{},
+				SrcRange: hcl.Range{
+					Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
+					End:   hcl.Pos{Line: 2, Column: 1, Byte: 14},
+				},
+				EndRange: hcl.Range{
+					Start: hcl.Pos{Line: 2, Column: 1, Byte: 14},
+					End:   hcl.Pos{Line: 2, Column: 1, Byte: 14},
+				},
+			},
+		},
 		{
 			"a = \"\\U0001d11e\"\n",
 			0,
@ -968,6 +1018,106 @@ block "valid" {}
 				},
 			},
 		},
+		{
+			"a = \"\\U00300000\"\n",
+			1, // Invalid unicode character (can't encode in UTF-8)
+			&Body{
+				Attributes: Attributes{
+					"a": {
+						Name: "a",
+						Expr: &TemplateExpr{
+							Parts: []Expression{
+								&LiteralValueExpr{
+									Val: cty.StringVal("\\U00300000"),
+
+									SrcRange: hcl.Range{
+										Start: hcl.Pos{Line: 1, Column: 6, Byte: 5},
+										End:   hcl.Pos{Line: 1, Column: 16, Byte: 15},
+									},
+								},
+							},
+
+							SrcRange: hcl.Range{
+								Start: hcl.Pos{Line: 1, Column: 5, Byte: 4},
+								End:   hcl.Pos{Line: 1, Column: 17, Byte: 16},
+							},
+						},
+
+						SrcRange: hcl.Range{
+							Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
+							End:   hcl.Pos{Line: 1, Column: 17, Byte: 16},
+						},
+						NameRange: hcl.Range{
+							Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
+							End:   hcl.Pos{Line: 1, Column: 2, Byte: 1},
+						},
+						EqualsRange: hcl.Range{
+							Start: hcl.Pos{Line: 1, Column: 3, Byte: 2},
+							End:   hcl.Pos{Line: 1, Column: 4, Byte: 3},
+						},
+					},
+				},
+				Blocks: Blocks{},
+				SrcRange: hcl.Range{
+					Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
+					End:   hcl.Pos{Line: 2, Column: 1, Byte: 17},
+				},
+				EndRange: hcl.Range{
+					Start: hcl.Pos{Line: 2, Column: 1, Byte: 17},
+					End:   hcl.Pos{Line: 2, Column: 1, Byte: 17},
+				},
+			},
+		},
+		{
+			"a = \"\\Ub2705550\"\n",
+			1, // Invalid unicode character (can't encode in UTF-8)
+			&Body{
+				Attributes: Attributes{
+					"a": {
+						Name: "a",
+						Expr: &TemplateExpr{
+							Parts: []Expression{
+								&LiteralValueExpr{
+									Val: cty.StringVal("\\Ub2705550"),
+
+									SrcRange: hcl.Range{
+										Start: hcl.Pos{Line: 1, Column: 6, Byte: 5},
+										End:   hcl.Pos{Line: 1, Column: 16, Byte: 15},
+									},
+								},
+							},
+
+							SrcRange: hcl.Range{
+								Start: hcl.Pos{Line: 1, Column: 5, Byte: 4},
+								End:   hcl.Pos{Line: 1, Column: 17, Byte: 16},
+							},
+						},
+
+						SrcRange: hcl.Range{
+							Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
+							End:   hcl.Pos{Line: 1, Column: 17, Byte: 16},
+						},
+						NameRange: hcl.Range{
+							Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
+							End:   hcl.Pos{Line: 1, Column: 2, Byte: 1},
+						},
+						EqualsRange: hcl.Range{
+							Start: hcl.Pos{Line: 1, Column: 3, Byte: 2},
+							End:   hcl.Pos{Line: 1, Column: 4, Byte: 3},
+						},
+					},
+				},
+				Blocks: Blocks{},
+				SrcRange: hcl.Range{
+					Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
+					End:   hcl.Pos{Line: 2, Column: 1, Byte: 17},
+				},
+				EndRange: hcl.Range{
+					Start: hcl.Pos{Line: 2, Column: 1, Byte: 17},
+					End:   hcl.Pos{Line: 2, Column: 1, Byte: 17},
+				},
+			},
+		},
 		{
 			"a = foo.bar\n",
 			0,
--- a/hcl/hclsyntax/scan_string_lit.go
+++ b/hcl/hclsyntax/scan_string_lit.go
@ -0,0 +1,301 @@
+// line 1 "scan_string_lit.rl"
+
+package hclsyntax
+
+// This file is generated from scan_string_lit.rl. DO NOT EDIT.
+
+// line 9 "scan_string_lit.go"
+var _hclstrtok_actions []byte = []byte{
+	0, 1, 0, 1, 1, 2, 1, 0,
+}
+
+var _hclstrtok_key_offsets []byte = []byte{
+	0, 0, 2, 4, 6, 10, 14, 18,
+	22, 27, 31, 36, 41, 46, 51, 57,
+	62, 74, 85, 96, 107, 118, 129, 140,
+	151,
+}
+
+var _hclstrtok_trans_keys []byte = []byte{
+	128, 191, 128, 191, 128, 191, 10, 13,
+	36, 37, 10, 13, 36, 37, 10, 13,
+	36, 37, 10, 13, 36, 37, 10, 13,
+	36, 37, 123, 10, 13, 36, 37, 10,
+	13, 36, 37, 92, 10, 13, 36, 37,
+	92, 10, 13, 36, 37, 92, 10, 13,
+	36, 37, 92, 10, 13, 36, 37, 92,
+	123, 10, 13, 36, 37, 92, 85, 117,
+	128, 191, 192, 223, 224, 239, 240, 247,
+	248, 255, 10, 13, 36, 37, 92, 48,
+	57, 65, 70, 97, 102, 10, 13, 36,
+	37, 92, 48, 57, 65, 70, 97, 102,
+	10, 13, 36, 37, 92, 48, 57, 65,
+	70, 97, 102, 10, 13, 36, 37, 92,
+	48, 57, 65, 70, 97, 102, 10, 13,
+	36, 37, 92, 48, 57, 65, 70, 97,
+	102, 10, 13, 36, 37, 92, 48, 57,
+	65, 70, 97, 102, 10, 13, 36, 37,
+	92, 48, 57, 65, 70, 97, 102, 10,
+	13, 36, 37, 92, 48, 57, 65, 70,
+	97, 102,
+}
+
+var _hclstrtok_single_lengths []byte = []byte{
+	0, 0, 0, 0, 4, 4, 4, 4,
+	5, 4, 5, 5, 5, 5, 6, 5,
+	2, 5, 5, 5, 5, 5, 5, 5,
+	5,
+}
+
+var _hclstrtok_range_lengths []byte = []byte{
+	0, 1, 1, 1, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	5, 3, 3, 3, 3, 3, 3, 3,
+	3,
+}
+
+var _hclstrtok_index_offsets []byte = []byte{
+	0, 0, 2, 4, 6, 11, 16, 21,
+	26, 32, 37, 43, 49, 55, 61, 68,
+	74, 82, 91, 100, 109, 118, 127, 136,
+	145,
+}
+
+var _hclstrtok_indicies []byte = []byte{
+	0, 1, 2, 1, 3, 1, 5, 6,
+	7, 8, 4, 10, 11, 12, 13, 9,
+	14, 11, 12, 13, 9, 10, 11, 15,
+	13, 9, 10, 11, 12, 13, 14, 9,
+	10, 11, 12, 15, 9, 17, 18, 19,
+	20, 21, 16, 23, 24, 25, 26, 27,
+	22, 0, 24, 25, 26, 27, 22, 23,
+	24, 28, 26, 27, 22, 23, 24, 25,
+	26, 27, 0, 22, 23, 24, 25, 28,
+	27, 22, 29, 30, 22, 2, 3, 31,
+	22, 0, 23, 24, 25, 26, 27, 32,
+	32, 32, 22, 23, 24, 25, 26, 27,
+	33, 33, 33, 22, 23, 24, 25, 26,
+	27, 34, 34, 34, 22, 23, 24, 25,
+	26, 27, 30, 30, 30, 22, 23, 24,
+	25, 26, 27, 35, 35, 35, 22, 23,
+	24, 25, 26, 27, 36, 36, 36, 22,
+	23, 24, 25, 26, 27, 37, 37, 37,
+	22, 23, 24, 25, 26, 27, 0, 0,
+	0, 22,
+}
+
+var _hclstrtok_trans_targs []byte = []byte{
+	11, 0, 1, 2, 4, 5, 6, 7,
+	9, 4, 5, 6, 7, 9, 5, 8,
+	10, 11, 12, 13, 15, 16, 10, 11,
+	12, 13, 15, 16, 14, 17, 21, 3,
+	18, 19, 20, 22, 23, 24,
+}
+
+var _hclstrtok_trans_actions []byte = []byte{
+	0, 0, 0, 0, 0, 1, 1, 1,
+	1, 3, 5, 5, 5, 5, 0, 0,
+	0, 1, 1, 1, 1, 1, 3, 5,
+	5, 5, 5, 5, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0,
+}
+
+var _hclstrtok_eof_actions []byte = []byte{
+	0, 0, 0, 0, 0, 3, 3, 3,
+	3, 3, 0, 3, 3, 3, 3, 3,
+	3, 3, 3, 3, 3, 3, 3, 3,
+	3,
+}
+
+const hclstrtok_start int = 4
+const hclstrtok_first_final int = 4
+const hclstrtok_error int = 0
+
+const hclstrtok_en_quoted int = 10
+const hclstrtok_en_unquoted int = 4
+
+// line 10 "scan_string_lit.rl"
+
+func scanStringLit(data []byte, quoted bool) [][]byte {
+	var ret [][]byte
+
+	// line 61 "scan_string_lit.rl"
+
+	// Ragel state
+	p := 0          // "Pointer" into data
+	pe := len(data) // End-of-data "pointer"
+	ts := 0
+	te := 0
+	eof := pe
+
+	var cs int // current state
+	switch {
+	case quoted:
+		cs = hclstrtok_en_quoted
+	default:
+		cs = hclstrtok_en_unquoted
+	}
+
+	// Make Go compiler happy
+	_ = ts
+	_ = eof
+
+	/*token := func () {
+	    ret = append(ret, data[ts:te])
+	}*/
+
+	// line 154 "scan_string_lit.go"
+	{
+	}
+
+	// line 158 "scan_string_lit.go"
+	{
+		var _klen int
+		var _trans int
+		var _acts int
+		var _nacts uint
+		var _keys int
+		if p == pe {
+			goto _test_eof
+		}
+		if cs == 0 {
+			goto _out
+		}
+	_resume:
+		_keys = int(_hclstrtok_key_offsets[cs])
+		_trans = int(_hclstrtok_index_offsets[cs])
+
+		_klen = int(_hclstrtok_single_lengths[cs])
+		if _klen > 0 {
+			_lower := int(_keys)
+			var _mid int
+			_upper := int(_keys + _klen - 1)
+			for {
+				if _upper < _lower {
+					break
+				}
+
+				_mid = _lower + ((_upper - _lower) >> 1)
+				switch {
+				case data[p] < _hclstrtok_trans_keys[_mid]:
+					_upper = _mid - 1
+				case data[p] > _hclstrtok_trans_keys[_mid]:
+					_lower = _mid + 1
+				default:
+					_trans += int(_mid - int(_keys))
+					goto _match
+				}
+			}
+			_keys += _klen
+			_trans += _klen
+		}
+
+		_klen = int(_hclstrtok_range_lengths[cs])
+		if _klen > 0 {
+			_lower := int(_keys)
+			var _mid int
+			_upper := int(_keys + (_klen << 1) - 2)
+			for {
+				if _upper < _lower {
+					break
+				}
+
+				_mid = _lower + (((_upper - _lower) >> 1) & ^1)
+				switch {
+				case data[p] < _hclstrtok_trans_keys[_mid]:
+					_upper = _mid - 2
+				case data[p] > _hclstrtok_trans_keys[_mid+1]:
+					_lower = _mid + 2
+				default:
+					_trans += int((_mid - int(_keys)) >> 1)
+					goto _match
+				}
+			}
+			_trans += _klen
+		}
+
+	_match:
+		_trans = int(_hclstrtok_indicies[_trans])
+		cs = int(_hclstrtok_trans_targs[_trans])
+
+		if _hclstrtok_trans_actions[_trans] == 0 {
+			goto _again
+		}
+
+		_acts = int(_hclstrtok_trans_actions[_trans])
+		_nacts = uint(_hclstrtok_actions[_acts])
+		_acts++
+		for ; _nacts > 0; _nacts-- {
+			_acts++
+			switch _hclstrtok_actions[_acts-1] {
+			case 0:
+				// line 40 "scan_string_lit.rl"
+
+				// If te is behind p then we've skipped over some literal
+				// characters which we must now return.
+				if te < p {
+					ret = append(ret, data[te:p])
+				}
+				ts = p
+
+			case 1:
+				// line 48 "scan_string_lit.rl"
+
+				te = p
+				ret = append(ret, data[ts:te])
+
+				// line 255 "scan_string_lit.go"
+			}
+		}
+
+	_again:
+		if cs == 0 {
+			goto _out
+		}
+		p++
+		if p != pe {
+			goto _resume
+		}
+	_test_eof:
+		{
+		}
+		if p == eof {
+			__acts := _hclstrtok_eof_actions[cs]
+			__nacts := uint(_hclstrtok_actions[__acts])
+			__acts++
+			for ; __nacts > 0; __nacts-- {
+				__acts++
+				switch _hclstrtok_actions[__acts-1] {
+				case 1:
+					// line 48 "scan_string_lit.rl"
+
+					te = p
+					ret = append(ret, data[ts:te])
+
+					// line 281 "scan_string_lit.go"
+				}
+			}
+		}
+
+	_out:
+		{
+		}
+	}
+
+	// line 89 "scan_string_lit.rl"
+
+	if te < p {
+		// Collect any leftover literal characters at the end of the input
+		ret = append(ret, data[te:p])
+	}
+
+	// If we fall out here without being in a final state then we've
+	// encountered something that the scanner can't match, which should
+	// be impossible (the scanner matches all bytes _somehow_) but we'll
+	// tolerate it and let the caller deal with it.
+	if cs < hclstrtok_first_final {
+		ret = append(ret, data[p:len(data)])
+	}
+
+	return ret
+}
--- a/hcl/hclsyntax/scan_string_lit.rl
+++ b/hcl/hclsyntax/scan_string_lit.rl
@ -0,0 +1,105 @@
+
+package hclsyntax
+
+// This file is generated from scan_string_lit.rl. DO NOT EDIT.
+%%{
+  # (except you are actually in scan_string_lit.rl here, so edit away!)
+
+  machine hclstrtok;
+  write data;
+}%%
+
+func scanStringLit(data []byte, quoted bool) [][]byte {
+    var ret [][]byte
+
+    %%{
+        include UnicodeDerived "unicode_derived.rl";
+
+        UTF8Cont = 0x80 .. 0xBF;
+        AnyUTF8 = (
+            0x00..0x7F |
+            0xC0..0xDF . UTF8Cont |
+            0xE0..0xEF . UTF8Cont . UTF8Cont |
+            0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
+        );
+        BadUTF8 = any - AnyUTF8;
+
+        Hex = ('0'..'9' | 'a'..'f' | 'A'..'F');
+
+        # Our goal with these patterns is to capture user intent as best as
+        # possible, even if the input is invalid. The caller will then verify
+        # whether each token is valid and generate suitable error messages
+        # if not.
+        UnicodeEscapeShort = "\\u" . Hex{0,4};
+        UnicodeEscapeLong = "\\U" . Hex{0,8};
+        UnicodeEscape = (UnicodeEscapeShort | UnicodeEscapeLong);
+        SimpleEscape = "\\" . (AnyUTF8 - ('U'|'u'))?;
+        TemplateEscape = ("$" . ("$" . ("{"?))?) | ("%" . ("%" . ("{"?))?);
+        Newline = ("\r\n" | "\r" | "\n");
+
+        action Begin {
+            // If te is behind p then we've skipped over some literal
+            // characters which we must now return.
+            if te < p {
+                ret = append(ret, data[te:p])
+            }
+            ts = p;
+        }
+        action End {
+            te = p;
+            ret = append(ret, data[ts:te]);
+        }
+
+        QuotedToken = (UnicodeEscape | SimpleEscape | TemplateEscape | Newline) >Begin %End;
+        UnquotedToken = (TemplateEscape | Newline) >Begin %End;
+        QuotedLiteral = (any - ("\\" | "$" | "%" | "\r" | "\n"));
+        UnquotedLiteral = (any - ("$" | "%" | "\r" | "\n"));
+
+        quoted := (QuotedToken | QuotedLiteral)**;
+        unquoted := (UnquotedToken | UnquotedLiteral)**;
+
+    }%%
+
+    // Ragel state
+	p := 0  // "Pointer" into data
+	pe := len(data) // End-of-data "pointer"
+    ts := 0
+    te := 0
+    eof := pe
+
+    var cs int // current state
+    switch {
+    case quoted:
+        cs = hclstrtok_en_quoted
+    default:
+        cs = hclstrtok_en_unquoted
+    }
+
+    // Make Go compiler happy
+    _ = ts
+    _ = eof
+
+    /*token := func () {
+        ret = append(ret, data[ts:te])
+    }*/
+
+    %%{
+        write init nocs;
+        write exec;
+    }%%
+
+    if te < p {
+        // Collect any leftover literal characters at the end of the input
+        ret = append(ret, data[te:p])
+    }
+
+    // If we fall out here without being in a final state then we've
+    // encountered something that the scanner can't match, which should
+    // be impossible (the scanner matches all bytes _somehow_) but we'll
+    // tolerate it and let the caller deal with it.
+    if cs < hclstrtok_first_final {
+        ret = append(ret, data[p:len(data)])
+    }
+
+    return ret
+}
--- a/hcl/hclsyntax/scan_string_lit_test.go
+++ b/hcl/hclsyntax/scan_string_lit_test.go
@ -0,0 +1,202 @@
+package hclsyntax
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/davecgh/go-spew/spew"
+)
+
+func TestScanStringLit(t *testing.T) {
+	tests := []struct {
+		Input        string
+		WantQuoted   []string
+		WantUnquoted []string
+	}{
+		{
+			``,
+			[]string{},
+			[]string{},
+		},
+		{
+			`hello`,
+			[]string{`hello`},
+			[]string{`hello`},
+		},
+		{
+			`hello world`,
+			[]string{`hello world`},
+			[]string{`hello world`},
+		},
+		{
+			`hello\nworld`,
+			[]string{`hello`, `\n`, `world`},
+			[]string{`hello\nworld`},
+		},
+		{
+			`hello\🥁world`,
+			[]string{`hello`, `\🥁`, `world`},
+			[]string{`hello\🥁world`},
+		},
+		{
+			`hello\uabcdworld`,
+			[]string{`hello`, `\uabcd`, `world`},
+			[]string{`hello\uabcdworld`},
+		},
+		{
+			`hello\uabcdabcdworld`,
+			[]string{`hello`, `\uabcd`, `abcdworld`},
+			[]string{`hello\uabcdabcdworld`},
+		},
+		{
+			`hello\uabcworld`,
+			[]string{`hello`, `\uabc`, `world`},
+			[]string{`hello\uabcworld`},
+		},
+		{
+			`hello\U01234567world`,
+			[]string{`hello`, `\U01234567`, `world`},
+			[]string{`hello\U01234567world`},
+		},
+		{
+			`hello\U012345670123world`,
+			[]string{`hello`, `\U01234567`, `0123world`},
+			[]string{`hello\U012345670123world`},
+		},
+		{
+			`hello\Uabcdworld`,
+			[]string{`hello`, `\Uabcd`, `world`},
+			[]string{`hello\Uabcdworld`},
+		},
+		{
+			`hello\Uabcworld`,
+			[]string{`hello`, `\Uabc`, `world`},
+			[]string{`hello\Uabcworld`},
+		},
+		{
+			`hello\uworld`,
+			[]string{`hello`, `\u`, `world`},
+			[]string{`hello\uworld`},
+		},
+		{
+			`hello\Uworld`,
+			[]string{`hello`, `\U`, `world`},
+			[]string{`hello\Uworld`},
+		},
+		{
+			`hello\u`,
+			[]string{`hello`, `\u`},
+			[]string{`hello\u`},
+		},
+		{
+			`hello\U`,
+			[]string{`hello`, `\U`},
+			[]string{`hello\U`},
+		},
+		{
+			`hello\`,
+			[]string{`hello`, `\`},
+			[]string{`hello\`},
+		},
+		{
+			`hello$${world}`,
+			[]string{`hello`, `$${`, `world}`},
+			[]string{`hello`, `$${`, `world}`},
+		},
+		{
+			`hello$$world`,
+			[]string{`hello`, `$$`, `world`},
+			[]string{`hello`, `$$`, `world`},
+		},
+		{
+			`hello$world`,
+			[]string{`hello`, `$`, `world`},
+			[]string{`hello`, `$`, `world`},
+		},
+		{
+			`hello$`,
+			[]string{`hello`, `$`},
+			[]string{`hello`, `$`},
+		},
+		{
+			`hello$${`,
+			[]string{`hello`, `$${`},
+			[]string{`hello`, `$${`},
+		},
+		{
+			`hello%%{world}`,
+			[]string{`hello`, `%%{`, `world}`},
+			[]string{`hello`, `%%{`, `world}`},
+		},
+		{
+			`hello%%world`,
+			[]string{`hello`, `%%`, `world`},
+			[]string{`hello`, `%%`, `world`},
+		},
+		{
+			`hello%world`,
+			[]string{`hello`, `%`, `world`},
+			[]string{`hello`, `%`, `world`},
+		},
+		{
+			`hello%`,
+			[]string{`hello`, `%`},
+			[]string{`hello`, `%`},
+		},
+		{
+			`hello%%{`,
+			[]string{`hello`, `%%{`},
+			[]string{`hello`, `%%{`},
+		},
+		{
+			`hello\${world}`,
+			[]string{`hello`, `\$`, `{world}`},
+			[]string{`hello\`, `$`, `{world}`},
+		},
+		{
+			`hello\%{world}`,
+			[]string{`hello`, `\%`, `{world}`},
+			[]string{`hello\`, `%`, `{world}`},
+		},
+		{
+			"hello\nworld",
+			[]string{`hello`, "\n", `world`},
+			[]string{`hello`, "\n", `world`},
+		},
+		{
+			"hello\rworld",
+			[]string{`hello`, "\r", `world`},
+			[]string{`hello`, "\r", `world`},
+		},
+		{
+			"hello\r\nworld",
+			[]string{`hello`, "\r\n", `world`},
+			[]string{`hello`, "\r\n", `world`},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.Input, func(t *testing.T) {
+			t.Run("quoted", func(t *testing.T) {
+				slices := scanStringLit([]byte(test.Input), true)
+				got := make([]string, len(slices))
+				for i, slice := range slices {
+					got[i] = string(slice)
+				}
+				if !reflect.DeepEqual(got, test.WantQuoted) {
+					t.Errorf("wrong result\ngot: %swant: %s", spew.Sdump(got), spew.Sdump(test.WantQuoted))
+				}
+			})
+			t.Run("unquoted", func(t *testing.T) {
+				slices := scanStringLit([]byte(test.Input), false)
+				got := make([]string, len(slices))
+				for i, slice := range slices {
+					got[i] = string(slice)
+				}
+				if !reflect.DeepEqual(got, test.WantUnquoted) {
+					t.Errorf("wrong result\ngot: %swant: %s", spew.Sdump(got), spew.Sdump(test.WantUnquoted))
+				}
+			})
+		})
+	}
+}