hcl/hclsyntax: Fix up parsing of flush heredocs

Flush heredoc trimming was implemented a long time ago in the original
template parser, but it was missed when the template parser was rewritten
to use a two-stage parsing strategy.

It's implemented as a post-processing step on the result of the first
stage of parsing, which produces a flat sequence of literal strings,
interpolation markers, and control markers. The step runs before the
second stage, which matches opening and closing control markers to
produce an expression AST.
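
As a rough sketch, not part of this commit, here is an in-package test that exercises the new flushHeredocTemplateParts against a hand-built stage-one token stream. It assumes the templateLiteralToken and templateEndToken types shown in the diff below can be constructed directly; the literal values are invented for illustration.

package hclsyntax

import "testing"

func TestFlushHeredocTemplatePartsSketch(t *testing.T) {
	parts := &templateParts{
		Tokens: []templateToken{
			&templateLiteralToken{Val: "    Foo\n"},
			&templateLiteralToken{Val: "      Bar\n"},
			&templateLiteralToken{Val: "    Baz\n"},
			&templateEndToken{},
		},
	}
	flushHeredocTemplateParts(parts)

	// The smallest leading-space count across the lines is four, so four
	// characters should be trimmed from each line-leading literal.
	got := parts.Tokens[1].(*templateLiteralToken).Val
	if got != "  Bar\n" {
		t.Errorf("got %q; want %q", got, "  Bar\n")
	}
}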

It's important to do this at parse time rather than at eval time, since
it is the static layout of the source code that decides the indentation
level; an interpolation marker at the start of a line whose result happens
to begin with spaces therefore does not affect how much indentation is
trimmed.
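
For illustration, a minimal standalone program, not part of this change and assuming the hcl2-era import paths, that exercises the rule end to end: the ${bar} line is trimmed according to its two literal leading spaces, while the spaces inside the interpolated value pass through unchanged. The attribute name "greeting" is invented for the example.

package main

import (
	"fmt"

	"github.com/hashicorp/hcl2/hcl"
	"github.com/hashicorp/hcl2/hcl/hclsyntax"
	"github.com/zclconf/go-cty/cty"
)

func main() {
	src := `greeting = <<-EOT
  Foo
  ${bar}
  Baz
EOT
`
	f, diags := hclsyntax.ParseConfig([]byte(src), "example.hcl", hcl.Pos{Line: 1, Column: 1})
	if diags.HasErrors() {
		panic(diags.Error())
	}
	attr := f.Body.(*hclsyntax.Body).Attributes["greeting"]

	ctx := &hcl.EvalContext{
		Variables: map[string]cty.Value{
			// Leading spaces here are part of the value, not of the source
			// layout, so they don't change how much indentation is trimmed.
			"bar": cty.StringVal("    Bar"),
		},
	}
	val, _ := attr.Expr.Value(ctx)
	fmt.Printf("%q\n", val.AsString()) // expected: "Foo\n    Bar\nBaz\n"
}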
Martin Atkins 2018-12-13 17:22:41 -08:00
parent c33bbe4c25
commit e8dbb16dbc
7 changed files with 252 additions and 5 deletions


@@ -1163,6 +1163,62 @@ EOT
cty.TupleVal([]cty.Value{cty.StringVal(" Foo\n Bar\n Baz\n")}),
0,
},
{
`[
<<-EOT
Foo
Bar
Baz
EOT
]
`,
nil,
cty.TupleVal([]cty.Value{cty.StringVal("Foo\nBar\nBaz\n")}),
0,
},
{
`[
<<-EOT
Foo
Bar
Baz
EOT
]
`,
nil,
cty.TupleVal([]cty.Value{cty.StringVal("Foo\n Bar\n Baz\n")}),
0,
},
{
`[
<<-EOT
Foo
Bar
Baz
EOT
]
`,
nil,
cty.TupleVal([]cty.Value{cty.StringVal(" Foo\nBar\n Baz\n")}),
0,
},
{
`[
<<-EOT
Foo
${bar}
Baz
EOT
]
`,
&hcl.EvalContext{
Variables: map[string]cty.Value{
"bar": cty.StringVal(" Bar"), // Spaces in the interpolation result don't affect the outcome
},
},
cty.TupleVal([]cty.Value{cty.StringVal(" Foo\n Bar\n Baz\n")}),
0,
},
{
`unk["baz"]`,


@@ -860,7 +860,7 @@ func (p *parser) parseExpressionTerm() (Expression, hcl.Diagnostics) {
case TokenOQuote, TokenOHeredoc:
open := p.Read() // eat opening marker
closer := p.oppositeBracket(open.Type)
exprs, passthru, _, diags := p.parseTemplateInner(closer)
exprs, passthru, _, diags := p.parseTemplateInner(closer, tokenOpensFlushHeredoc(open))
closeRange := p.PrevRange()


@@ -2,6 +2,7 @@ package hclsyntax
import (
"fmt"
"github.com/apparentlymart/go-textseg/textseg"
"strings"
"unicode"
@@ -10,11 +11,11 @@ import (
)
func (p *parser) ParseTemplate() (Expression, hcl.Diagnostics) {
return p.parseTemplate(TokenEOF)
return p.parseTemplate(TokenEOF, false)
}
func (p *parser) parseTemplate(end TokenType) (Expression, hcl.Diagnostics) {
exprs, passthru, rng, diags := p.parseTemplateInner(end)
func (p *parser) parseTemplate(end TokenType, flushHeredoc bool) (Expression, hcl.Diagnostics) {
exprs, passthru, rng, diags := p.parseTemplateInner(end, flushHeredoc)
if passthru {
if len(exprs) != 1 {
@@ -32,8 +33,11 @@ func (p *parser) parseTemplate(end TokenType) (Expression, hcl.Diagnostics) {
}, diags
}
func (p *parser) parseTemplateInner(end TokenType) ([]Expression, bool, hcl.Range, hcl.Diagnostics) {
func (p *parser) parseTemplateInner(end TokenType, flushHeredoc bool) ([]Expression, bool, hcl.Range, hcl.Diagnostics) {
parts, diags := p.parseTemplateParts(end)
if flushHeredoc {
flushHeredocTemplateParts(parts) // Trim off leading spaces on lines per the flush heredoc spec
}
tp := templateParser{
Tokens: parts.Tokens,
SrcRange: parts.SrcRange,
@@ -649,6 +653,73 @@ Token:
return ret, diags
}
// flushHeredocTemplateParts modifies in-place the line-leading literal strings
// to apply the flush heredoc processing rule: find the line with the smallest
// number of whitespace characters as prefix and then trim that number of
// characters from all of the lines.
//
// This rule is applied to static tokens rather than to the rendered result,
// so interpolating a string with leading whitespace cannot affect the chosen
// prefix length.
func flushHeredocTemplateParts(parts *templateParts) {
	if len(parts.Tokens) == 0 {
		// Nothing to do
		return
	}

	const maxInt = int((^uint(0)) >> 1)

	minSpaces := maxInt
	newline := true
	var adjust []*templateLiteralToken
	for _, ttok := range parts.Tokens {
		if newline {
			newline = false
			var spaces int
			if lit, ok := ttok.(*templateLiteralToken); ok {
				orig := lit.Val
				trimmed := strings.TrimLeftFunc(orig, unicode.IsSpace)
				// If a token is entirely spaces and ends with a newline
				// then it's a "blank line" and thus not considered for
				// space-prefix-counting purposes.
				if len(trimmed) == 0 && strings.HasSuffix(orig, "\n") {
					spaces = maxInt
				} else {
					spaceBytes := len(lit.Val) - len(trimmed)
					spaces, _ = textseg.TokenCount([]byte(orig[:spaceBytes]), textseg.ScanGraphemeClusters)
					adjust = append(adjust, lit)
				}
			} else if _, ok := ttok.(*templateEndToken); ok {
				break // don't process the end token since it never has spaces before it
			}
			if spaces < minSpaces {
				minSpaces = spaces
			}
		}
		if lit, ok := ttok.(*templateLiteralToken); ok {
			if strings.HasSuffix(lit.Val, "\n") {
				newline = true // The following token, if any, begins a new line
			}
		}
	}

	for _, lit := range adjust {
		// Since we want to count space _characters_ rather than space _bytes_,
		// we can't just do a straightforward slice operation here and instead
		// need to hunt for the split point with a scanner.
		valBytes := []byte(lit.Val)
		spaceByteCount := 0
		for i := 0; i < minSpaces; i++ {
			adv, _, _ := textseg.ScanGraphemeClusters(valBytes, true)
			spaceByteCount += adv
			valBytes = valBytes[adv:]
		}
		lit.Val = lit.Val[spaceByteCount:]
		lit.SrcRange.Start.Column += minSpaces
		lit.SrcRange.Start.Byte += spaceByteCount
	}
}
type templateParts struct {
	Tokens   []templateToken
	SrcRange hcl.Range

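Aside, not part of the diff: the reason flushHeredocTemplateParts counts grapheme clusters rather than bytes is that a single space character can occupy several bytes of UTF-8, as in the unicode_spaces fixture further down. A quick standalone check of the textseg call it relies on:

package main

import (
	"fmt"

	"github.com/apparentlymart/go-textseg/textseg"
)

func main() {
	// Two U+2003 "em space" characters: six bytes of UTF-8, but only two
	// characters' worth of indentation to trim.
	prefix := "\u2003\u2003"
	chars, _ := textseg.TokenCount([]byte(prefix), textseg.ScanGraphemeClusters)
	fmt.Println(len(prefix), chars) // prints: 6 2
}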

@@ -1,6 +1,7 @@
package hclsyntax
import (
"bytes"
"fmt"
"github.com/apparentlymart/go-textseg/textseg"
@@ -161,6 +162,13 @@ type heredocInProgress struct {
StartOfLine bool
}
func tokenOpensFlushHeredoc(tok Token) bool {
if tok.Type != TokenOHeredoc {
return false
}
return bytes.HasPrefix(tok.Bytes, []byte{'<', '<', '-'})
}
// checkInvalidTokens does a simple pass across the given tokens and generates
// diagnostics for tokens that should _never_ appear in HCL source. This
// is intended to avoid the need for the parser to have special support

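Another small in-package sketch, not part of the commit, showing how the helper above distinguishes flush heredocs from normal ones purely by the <<- introducer bytes; it assumes the Token struct's exported Type and Bytes fields:

package hclsyntax

import "testing"

func TestTokenOpensFlushHeredocSketch(t *testing.T) {
	flush := Token{Type: TokenOHeredoc, Bytes: []byte("<<-EOT\n")}
	normal := Token{Type: TokenOHeredoc, Bytes: []byte("<<EOT\n")}
	if !tokenOpensFlushHeredoc(flush) || tokenOpensFlushHeredoc(normal) {
		t.Fatal("only the <<- introducer should be treated as a flush heredoc")
	}
}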

@@ -0,0 +1,73 @@
normal = {
basic = <<EOT
Foo
Bar
Baz
EOT
indented = <<EOT
Foo
Bar
Baz
EOT
indented_more = <<EOT
Foo
Bar
Baz
EOT
interp = <<EOT
Foo
${bar}
Baz
EOT
marker_at_suffix = <<EOT
NOT EOT
EOT
}
flush = {
basic = <<-EOT
Foo
Bar
Baz
EOT
indented = <<-EOT
Foo
Bar
Baz
EOT
indented_more = <<-EOT
Foo
Bar
Baz
EOT
indented_less = <<-EOT
Foo
Bar
Baz
EOT
interp = <<-EOT
Foo
${bar}
Baz
EOT
interp_indented_more = <<-EOT
Foo
${bar}
Baz
EOT
interp_indented_less = <<-EOT
Foo
${space_bar}
Baz
EOT
tabs = <<-EOT
Foo
Bar
Baz
EOT
unicode_spaces = <<-EOT
Foo (there's two "em spaces" before Foo there)
Bar
Baz
EOT
}


@@ -0,0 +1,14 @@
variables {
bar = "Bar"
space_bar = " Bar"
words = ["Foo", "Bar", "Baz"]
}
object {
attr "normal" {
type = map(string)
}
attr "flush" {
type = map(string)
}
}


@@ -0,0 +1,25 @@
result = {
normal = {
basic = "Foo\nBar\nBaz\n"
indented = " Foo\n Bar\n Baz\n"
indented_more = " Foo\n Bar\n Baz\n"
interp = " Foo\n Bar\n Baz\n"
marker_at_suffix = " NOT EOT\n"
}
flush = {
basic = "Foo\nBar\nBaz\n"
indented = "Foo\nBar\nBaz\n"
indented_more = "Foo\n Bar\nBaz\n"
indented_less = " Foo\nBar\n Baz\n"
interp = "Foo\nBar\nBaz\n"
interp_indented_more = "Foo\n Bar\nBaz\n"
interp_indented_less = " Foo\n Bar\n Baz\n"
tabs = "Foo\n Bar\n Baz\n"
unicode_spaces = "Foo (there's two \"em spaces\" before Foo there)\nBar\nBaz\n"
}
}
result_type = object({
normal = map(string)
flush = map(string)
})