hclsyntax: \uxxxx and \Uxxxxxxxx escape sequences in string literals

These allow the inclusion of arbitrary Unicode code points (always encoded
as UTF-8) using a hexadecimal representation.

\u expects four hex digits and can thus represent only characters in the
Basic Multilingual Plane.

\U expects eight hex digits and can thus represent all Unicode characters,
at the cost of being extra-verbose.

Since our parser properly accounts for Unicode characters (including
combining sequences), it's recommended to include them literally (UTF-8
encoded) in source code. However, these escape sequences are useful for
explicitly representing non-printable characters that would otherwise be
invisible in source code, such as zero-width modifier characters.

This fixes #6.
This commit is contained in:
Martin Atkins 2018-01-27 10:20:22 -08:00
parent f0bf2b15ae
commit a1c55afeca
2 changed files with 339 additions and 3 deletions

View File

@ -4,6 +4,8 @@ import (
"bufio"
"bytes"
"fmt"
"strconv"
"unicode/utf8"
"github.com/apparentlymart/go-textseg/textseg"
"github.com/hashicorp/hcl2/hcl"
@ -1501,11 +1503,87 @@ Character:
if len(esc) > 0 {
switch esc[0] {
case '\\':
if len(esc) >= 2 {
switch esc[1] {
case 'u', 'U':
// Our new character must be an ASCII hex digit
_, err := strconv.ParseInt(string(ch), 16, 0)
if err != nil {
var detail string
switch esc[1] {
case 'u':
detail = "Escape sequence \\u must be followed by exactly four hexidecimal digits."
case 'U':
detail = "Escape sequence \\U must be followed by exactly eight hexidecimal digits."
}
diags = append(diags, &hcl.Diagnostic{
Severity: hcl.DiagError,
Summary: "Invalid escape sequence",
Detail: detail,
Subject: &hcl.Range{
Filename: tok.Range.Filename,
Start: hcl.Pos{
Line: pos.Line,
Column: pos.Column,
Byte: pos.Byte,
},
End: hcl.Pos{
Line: pos.Line,
Column: pos.Column + 1,
Byte: pos.Byte + len(ch),
},
},
})
ret = append(ret, esc...)
ret = append(ret, ch...)
esc = esc[:0]
continue Character
}
esc = append(esc, ch...)
var complete bool
switch esc[1] {
case 'u':
complete = (len(esc) == 6) // four digits plus our \u introducer
case 'U':
complete = (len(esc) == 10) // eight digits plus our \U introducer
}
if !complete {
// Keep accumulating more digits until the sequence is complete.
continue Character
}
digits := string(esc[2:])
valInt, err := strconv.ParseInt(digits, 16, 32)
if err != nil {
// Should never happen because we validated our digits
// as they arrived, above.
panic(err)
}
r := rune(valInt)
rl := utf8.RuneLen(r)
// Make room in our ret buffer for the extra characters
for i := 0; i < rl; i++ {
ret = append(ret, 0)
}
// Fill those extra characters with the canonical UTF-8
// representation of our rune.
utf8.EncodeRune(ret[len(ret)-rl:], r)
// ...and now finally we're finished escaping!
esc = esc[:0]
continue Character
}
}
if len(ch) == 1 {
switch ch[0] {
// TODO: numeric character escapes with \uXXXX
case 'n':
ret = append(ret, '\n')
esc = esc[:0]
@ -1526,6 +1604,11 @@ Character:
ret = append(ret, '\\')
esc = esc[:0]
continue Character
case 'u', 'U':
// For these, we'll continue working on them until
// we accumulate the expected number of digits.
esc = append(esc, ch...)
continue Character
}
}
@ -1622,7 +1705,7 @@ Character:
diags = append(diags, &hcl.Diagnostic{
Severity: hcl.DiagError,
Summary: "Invalid escape sequence",
Detail: fmt.Sprintf("The characters %q do not form a recognized escape sequence.", esc),
Detail: fmt.Sprintf("The characters %q do not form a complete escape sequence.", esc),
Subject: &hcl.Range{
Filename: tok.Range.Filename,
Start: hcl.Pos{

View File

@ -715,6 +715,259 @@ block "valid" {}
},
},
},
{
"a = \"\\u2022\"\n",
0,
&Body{
Attributes: Attributes{
"a": {
Name: "a",
Expr: &TemplateExpr{
Parts: []Expression{
&LiteralValueExpr{
Val: cty.StringVal("\u2022"),
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 6, Byte: 5},
End: hcl.Pos{Line: 1, Column: 12, Byte: 11},
},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 5, Byte: 4},
End: hcl.Pos{Line: 1, Column: 13, Byte: 12},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 13, Byte: 12},
},
NameRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 2, Byte: 1},
},
EqualsRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 3, Byte: 2},
End: hcl.Pos{Line: 1, Column: 4, Byte: 3},
},
},
},
Blocks: Blocks{},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 2, Column: 1, Byte: 13},
},
EndRange: hcl.Range{
Start: hcl.Pos{Line: 2, Column: 1, Byte: 13},
End: hcl.Pos{Line: 2, Column: 1, Byte: 13},
},
},
},
{
"a = \"\\U0001d11e\"\n",
0,
&Body{
Attributes: Attributes{
"a": {
Name: "a",
Expr: &TemplateExpr{
Parts: []Expression{
&LiteralValueExpr{
Val: cty.StringVal("\U0001d11e"),
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 6, Byte: 5},
End: hcl.Pos{Line: 1, Column: 16, Byte: 15},
},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 5, Byte: 4},
End: hcl.Pos{Line: 1, Column: 17, Byte: 16},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 17, Byte: 16},
},
NameRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 2, Byte: 1},
},
EqualsRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 3, Byte: 2},
End: hcl.Pos{Line: 1, Column: 4, Byte: 3},
},
},
},
Blocks: Blocks{},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 2, Column: 1, Byte: 17},
},
EndRange: hcl.Range{
Start: hcl.Pos{Line: 2, Column: 1, Byte: 17},
End: hcl.Pos{Line: 2, Column: 1, Byte: 17},
},
},
},
{
"a = \"\\u0001d11e\"\n",
0, // This is valid, but probably not what the user intended :(
&Body{
Attributes: Attributes{
"a": {
Name: "a",
Expr: &TemplateExpr{
Parts: []Expression{
&LiteralValueExpr{
// Only the first four digits were used for the
// escape sequence, so the remaining four just
// get echoed out literally.
Val: cty.StringVal("\u0001d11e"),
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 6, Byte: 5},
End: hcl.Pos{Line: 1, Column: 16, Byte: 15},
},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 5, Byte: 4},
End: hcl.Pos{Line: 1, Column: 17, Byte: 16},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 17, Byte: 16},
},
NameRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 2, Byte: 1},
},
EqualsRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 3, Byte: 2},
End: hcl.Pos{Line: 1, Column: 4, Byte: 3},
},
},
},
Blocks: Blocks{},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 2, Column: 1, Byte: 17},
},
EndRange: hcl.Range{
Start: hcl.Pos{Line: 2, Column: 1, Byte: 17},
End: hcl.Pos{Line: 2, Column: 1, Byte: 17},
},
},
},
{
"a = \"\\U2022\"\n",
1, // Invalid escape sequence, since we need eight hex digits for \U
&Body{
Attributes: Attributes{
"a": {
Name: "a",
Expr: &TemplateExpr{
Parts: []Expression{
&LiteralValueExpr{
Val: cty.StringVal("\\U2022"),
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 6, Byte: 5},
End: hcl.Pos{Line: 1, Column: 12, Byte: 11},
},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 5, Byte: 4},
End: hcl.Pos{Line: 1, Column: 13, Byte: 12},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 13, Byte: 12},
},
NameRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 2, Byte: 1},
},
EqualsRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 3, Byte: 2},
End: hcl.Pos{Line: 1, Column: 4, Byte: 3},
},
},
},
Blocks: Blocks{},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 2, Column: 1, Byte: 13},
},
EndRange: hcl.Range{
Start: hcl.Pos{Line: 2, Column: 1, Byte: 13},
End: hcl.Pos{Line: 2, Column: 1, Byte: 13},
},
},
},
{
"a = \"\\u20m2\"\n",
1, // Invalid escape sequence
&Body{
Attributes: Attributes{
"a": {
Name: "a",
Expr: &TemplateExpr{
Parts: []Expression{
&LiteralValueExpr{
Val: cty.StringVal("\\u20m2"),
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 6, Byte: 5},
End: hcl.Pos{Line: 1, Column: 12, Byte: 11},
},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 5, Byte: 4},
End: hcl.Pos{Line: 1, Column: 13, Byte: 12},
},
},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 13, Byte: 12},
},
NameRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 1, Column: 2, Byte: 1},
},
EqualsRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 3, Byte: 2},
End: hcl.Pos{Line: 1, Column: 4, Byte: 3},
},
},
},
Blocks: Blocks{},
SrcRange: hcl.Range{
Start: hcl.Pos{Line: 1, Column: 1, Byte: 0},
End: hcl.Pos{Line: 2, Column: 1, Byte: 13},
},
EndRange: hcl.Range{
Start: hcl.Pos{Line: 2, Column: 1, Byte: 13},
End: hcl.Pos{Line: 2, Column: 1, Byte: 13},
},
},
},
{
"a = foo.bar\n",
0,