zclwrite: "format" implementation

This tweaks the number of spaces before each token to create a consistent layout. It doesn't (yet?) attempt to add or remove line breaks or otherwise mess with the non-space characters in the code. The main goal here is to get things to line up. zcl syntax is simple enough that there's not much latitude for super-weird usage, and so if someone does manage to write something weird (asymmetric breaking of brackets between lines, etc) we won't try to fix it here.
2017-06-10 14:11:47 -07:00 · 2017-06-10 14:11:47 -07:00 · ac42b456f3
commit ac42b456f3
parent 040160f3f9
3 changed files with 628 additions and 4 deletions
--- a/zclwrite/format.go
+++ b/zclwrite/format.go
@ -4,10 +4,287 @@ import (
 	"github.com/zclconf/go-zcl/zcl/zclsyntax"
 )

+// placeholder token used when we don't have a token but we don't want
+// to pass a real "nil" and complicate things with nil pointer checks
+var nilToken = &Token{
+	Type:         zclsyntax.TokenNil,
+	Bytes:        []byte{},
+	SpacesBefore: 0,
+}
+
 // format rewrites tokens within the given sequence, in-place, to adjust the
 // whitespace around their content to achieve canonical formatting.
 func format(tokens Tokens) {
-	// currently does nothing
+	// Formatting is a multi-pass process. More details on the passes below,
+	// but this is the overview:
+	// - adjust the leading space on each line to create appropriate
+	//   indentation
+	// - adjust spaces between tokens in a single cell using a set of rules
+	// - adjust the leading space in the "assign" and "comment" cells on each
+	//   line to vertically align with neighboring lines.
+	// All of these steps operate in-place on the given tokens, so a caller
+	// may collect a flat sequence of all of the tokens underlying an AST
+	// and pass it here and we will then indirectly modify the AST itself.
+	// Formatting must change only whitespace. Specifically, that means
+	// changing the SpacesBefore attribute on a token while leaving the
+	// other token attributes unchanged.
+
+	lines := linesForFormat(tokens)
+	formatIndent(lines)
+	formatSpaces(lines)
+	formatCells(lines)
+}
+
+func formatIndent(lines []formatLine) {
+	// Our methodology for indents is to take the input one line at a time
+	// and count the bracketing delimiters on each line. If a line has a net
+	// increase in open brackets, we increase the indent level by one and
+	// remember how many new openers we had. If the line has a net _decrease_,
+	// we'll compare it to the most recent number of openers and decrease the
+	// dedent level by one each time we pass an indent level remembered
+	// earlier.
+	// The "indent stack" used here allows for us to recognize degenerate
+	// input where brackets are not symmetrical within lines and avoid
+	// pushing things too far left or right, creating confusion.
+
+	// We'll start our indent stack at a reasonable capacity to minimize the
+	// chance of us needing to grow it; 10 here means 10 levels of indent,
+	// which should be more than enough for reasonable zcl uses.
+	indents := make([]int, 0, 10)
+
+	for i := range lines {
+		// TODO: need to track when we're inside a multi-line template and
+		// suspend indentation processing.
+
+		line := &lines[i]
+		if len(line.lead) == 0 {
+			continue
+		}
+		if line.lead[0].Type == zclsyntax.TokenNewline {
+			// Never place spaces before a newline
+			line.lead[0].SpacesBefore = 0
+			continue
+		}
+
+		netBrackets := 0
+		for _, token := range line.lead {
+			netBrackets += tokenBracketChange(token)
+		}
+		for _, token := range line.assign {
+			netBrackets += tokenBracketChange(token)
+		}
+
+		switch {
+		case netBrackets > 0:
+			line.lead[0].SpacesBefore = 2 * len(indents)
+			indents = append(indents, netBrackets)
+		case netBrackets < 0:
+			closed := -netBrackets
+			for closed > 0 && len(indents) > 0 {
+				switch {
+
+				case closed > indents[len(indents)-1]:
+					closed -= indents[len(indents)-1]
+					indents = indents[:len(indents)-1]
+
+				case closed < indents[len(indents)-1]:
+					indents[len(indents)-1] -= closed
+					closed = 0
+
+				default:
+					indents = indents[:len(indents)-1]
+					closed = 0
+				}
+			}
+			line.lead[0].SpacesBefore = 2 * len(indents)
+		default:
+			line.lead[0].SpacesBefore = 2 * len(indents)
+		}
+	}
+}
+
+func formatSpaces(lines []formatLine) {
+	for _, line := range lines {
+		for i, token := range line.lead {
+			var before, after *Token
+			if i > 0 {
+				before = line.lead[i-1]
+			} else {
+				before = nilToken
+			}
+			if i < (len(line.lead) - 1) {
+				after = line.lead[i+1]
+			} else {
+				after = nilToken
+			}
+			if spaceAfterToken(token, before, after) {
+				after.SpacesBefore = 1
+			} else {
+				after.SpacesBefore = 0
+			}
+		}
+		for i, token := range line.assign {
+			if i == 0 {
+				// first token in "assign" always has one space before to
+				// separate the equals sign from what it's assigning.
+				token.SpacesBefore = 1
+			}
+
+			var before, after *Token
+			if i > 0 {
+				before = line.assign[i-1]
+			} else {
+				before = nilToken
+			}
+			if i < (len(line.assign) - 1) {
+				after = line.assign[i+1]
+			} else {
+				after = nilToken
+			}
+			if spaceAfterToken(token, before, after) {
+				after.SpacesBefore = 1
+			} else {
+				after.SpacesBefore = 0
+			}
+		}
+
+	}
+}
+
+func formatCells(lines []formatLine) {
+
+	chainStart := -1
+	maxColumns := 0
+
+	// We'll deal with the "assign" cell first, since moving that will
+	// also impact the "comment" cell.
+	closeAssignChain := func(i int) {
+		for _, chainLine := range lines[chainStart:i] {
+			columns := chainLine.lead.Columns()
+			spaces := (maxColumns - columns) + 1
+			chainLine.assign[0].SpacesBefore = spaces
+		}
+		chainStart = -1
+		maxColumns = 0
+	}
+	for i, line := range lines {
+		if line.assign == nil {
+			if chainStart != -1 {
+				closeAssignChain(i)
+			}
+		} else {
+			if chainStart == -1 {
+				chainStart = i
+			}
+			columns := line.lead.Columns()
+			if columns > maxColumns {
+				maxColumns = columns
+			}
+		}
+	}
+	if chainStart != -1 {
+		closeAssignChain(len(lines))
+	}
+
+	// Now we'll deal with the comments
+	closeCommentChain := func(i int) {
+		for _, chainLine := range lines[chainStart:i] {
+			columns := chainLine.lead.Columns() + chainLine.assign.Columns()
+			spaces := (maxColumns - columns) + 1
+			chainLine.comment[0].SpacesBefore = spaces
+		}
+		chainStart = -1
+		maxColumns = 0
+	}
+	for i, line := range lines {
+		if line.comment == nil {
+			if chainStart != -1 {
+				closeCommentChain(i)
+			}
+		} else {
+			if chainStart == -1 {
+				chainStart = i
+			}
+			columns := line.lead.Columns() + line.assign.Columns()
+			if columns > maxColumns {
+				maxColumns = columns
+			}
+		}
+	}
+	if chainStart != -1 {
+		closeCommentChain(len(lines))
+	}
+
+}
+
+// spaceAfterToken decides whether a particular subject token should have a
+// space after it when surrounded by the given before and after tokens.
+// "before" can be TokenNil, if the subject token is at the start of a sequence.
+func spaceAfterToken(subject, before, after *Token) bool {
+	switch {
+
+	case after.Type == zclsyntax.TokenNewline || after.Type == zclsyntax.TokenNil:
+		// Never add spaces before a newline
+		return false
+
+	case subject.Type == zclsyntax.TokenIdent && after.Type == zclsyntax.TokenOParen:
+		// Don't split a function name from open paren in a call
+		return false
+
+	case after.Type == zclsyntax.TokenComma:
+		// No space right before a comma in an argument list
+		return false
+
+	case subject.Type == zclsyntax.TokenQuotedLit || subject.Type == zclsyntax.TokenStringLit || subject.Type == zclsyntax.TokenOQuote || subject.Type == zclsyntax.TokenOHeredoc || after.Type == zclsyntax.TokenQuotedLit || after.Type == zclsyntax.TokenStringLit || after.Type == zclsyntax.TokenCQuote || after.Type == zclsyntax.TokenCHeredoc:
+		// No extra spaces within templates
+		return false
+
+	case subject.Type == zclsyntax.TokenMinus:
+		// Since a minus can either be subtraction or negation, and the latter
+		// should _not_ have a space after it, we need to use some heuristics
+		// to decide which case this is.
+		// We guess that we have a negation if the token before doesn't look
+		// like it could be the end of an expression.
+
+		switch before.Type {
+
+		case zclsyntax.TokenNil:
+			// Minus at the start of input must be a negation
+			return false
+
+		case zclsyntax.TokenOParen, zclsyntax.TokenOBrace, zclsyntax.TokenOBrack, zclsyntax.TokenEqual, zclsyntax.TokenColon, zclsyntax.TokenComma, zclsyntax.TokenQuestion:
+			// Minus immediately after an opening bracket or separator must be a negation.
+			return false
+
+		case zclsyntax.TokenPlus, zclsyntax.TokenStar, zclsyntax.TokenSlash, zclsyntax.TokenPercent, zclsyntax.TokenMinus:
+			// Minus immediately after another arithmetic operator must be negation.
+			return false
+
+		case zclsyntax.TokenEqualOp, zclsyntax.TokenNotEqual, zclsyntax.TokenGreaterThan, zclsyntax.TokenGreaterThanEq, zclsyntax.TokenLessThan, zclsyntax.TokenLessThanEq:
+			// Minus immediately after another comparison operator must be negation.
+			return false
+
+		case zclsyntax.TokenAnd, zclsyntax.TokenOr, zclsyntax.TokenBang:
+			// Minus immediately after logical operator doesn't make sense but probably intended as negation.
+			return false
+
+		default:
+			return true
+		}
+
+	case tokenBracketChange(subject) > 0:
+		// No spaces after open brackets
+		return false
+
+	case tokenBracketChange(after) < 0:
+		// No spaces before close brackets
+		return false
+
+	default:
+		// Most tokens are space-separated
+		return true
+
+	}
 }

 func linesForFormat(tokens Tokens) []formatLine {
@ -64,9 +341,21 @@ func linesForFormat(tokens Tokens) []formatLine {
 		}

 		for i, tok := range line.lead {
-			if tok.Type == zclsyntax.TokenEqual {
-				line.assign = line.lead[i:]
-				line.lead = line.lead[:i]
+			if i > 0 && tok.Type == zclsyntax.TokenEqual {
+				// We only move the tokens into "assign" if the RHS seems to
+				// be a whole expression, which we determine by counting
+				// brackets. If there's a net positive number of brackets
+				// then that suggests we're introducing a multi-line expression.
+				netBrackets := 0
+				for _, token := range line.lead[i:] {
+					netBrackets += tokenBracketChange(token)
+				}
+
+				if netBrackets == 0 {
+					line.assign = line.lead[i:]
+					line.lead = line.lead[:i]
+				}
+				break
 			}
 		}
 	}
@ -87,6 +376,17 @@ func tokenIsNewline(tok *Token) bool {
 	return false
 }

+func tokenBracketChange(tok *Token) int {
+	switch tok.Type {
+	case zclsyntax.TokenOBrace, zclsyntax.TokenOBrack, zclsyntax.TokenOParen, zclsyntax.TokenTemplateControl, zclsyntax.TokenTemplateInterp:
+		return 1
+	case zclsyntax.TokenCBrace, zclsyntax.TokenCBrack, zclsyntax.TokenCParen, zclsyntax.TokenTemplateSeqEnd:
+		return -1
+	default:
+		return 0
+	}
+}
+
 // formatLine represents a single line of source code for formatting purposes,
 // splitting its tokens into up to three "cells":
 //
--- a/zclwrite/format_test.go
+++ b/zclwrite/format_test.go
@ -6,9 +6,307 @@ import (

 	"reflect"

+	"github.com/davecgh/go-spew/spew"
 	"github.com/zclconf/go-zcl/zcl/zclsyntax"
 )

+func TestFormat(t *testing.T) {
+	tests := []struct {
+		input string
+		want  string
+	}{
+		{
+			``,
+			``,
+		},
+		{
+			`a=1`,
+			`a = 1`,
+		},
+		{
+			`( a+2 )`,
+			`(a + 2)`,
+		},
+		{
+			`( a-2 )`,
+			`(a - 2)`,
+		},
+		{
+			`( a+-2 )`,
+			`(a + -2)`,
+		},
+		{
+			`( a--2 )`,
+			`(a - -2)`,
+		},
+		{
+			`(-2+1)`,
+			`(-2 + 1)`,
+		},
+		{
+			`foo(1, -2,a-b, b,c)`,
+			`foo(1, -2, a - b, b, c)`,
+		},
+		{
+			`a="hello ${ name }"`,
+			`a = "hello ${name}"`,
+		},
+		{
+			`a="hello ${~ name ~}"`,
+			`a = "hello ${~name~}"`,
+		},
+		{
+			`b{}`,
+			`b {}`,
+		},
+		{
+			`
+"${
+hello
+}"
+`,
+			`
+"${
+  hello
+}"
+`,
+		},
+		{
+			`
+foo(
+1,
+- 2,
+a-b,
+b,
+c,
+)
+`,
+			`
+foo(
+  1,
+  -2,
+  a - b,
+  b,
+  c,
+)
+`,
+		},
+		{
+			`[ [ ] ]`,
+			`[[]]`,
+		},
+		{
+			`
+[
+[
+a
+]
+]
+`,
+			`
+[
+  [
+    a
+  ]
+]
+`,
+		},
+		{
+			`
+[[
+a
+]]
+`,
+			`
+[[
+  a
+]]
+`,
+		},
+		{
+			`
+[[
+[
+a
+]
+]]
+`,
+			`
+[[
+  [
+    a
+  ]
+]]
+`,
+		},
+		{
+			// degenerate case with asymmetrical brackets
+			`
+[[
+[
+a
+]]
+]
+`,
+			`
+[[
+  [
+    a
+  ]]
+]
+`,
+		},
+		{
+			`
+b {
+a = 1
+}
+`,
+			`
+b {
+  a = 1
+}
+`,
+		},
+		{
+			`
+a = 1
+bungle = 2
+`,
+			`
+a      = 1
+bungle = 2
+`,
+		},
+		{
+			`
+a = 1
+
+bungle = 2
+`,
+			`
+a = 1
+
+bungle = 2
+`,
+		},
+		{
+			`
+a = 1 # foo
+bungle = 2
+`,
+			`
+a      = 1 # foo
+bungle = 2
+`,
+		},
+		{
+			`
+a = 1 # foo
+bungle = "bonce" # baz
+`,
+			`
+a      = 1       # foo
+bungle = "bonce" # baz
+`,
+		},
+		{
+			`
+# here we go
+a = 1 # foo
+bungle = "bonce" # baz
+`,
+			`
+# here we go
+a      = 1       # foo
+bungle = "bonce" # baz
+`,
+		},
+		{
+			`
+foo {} # here we go
+a = 1 # foo
+bungle = "bonce" # baz
+`,
+			`
+foo {}           # here we go
+a      = 1       # foo
+bungle = "bonce" # baz
+`,
+		},
+		{
+			`
+a = 1 # foo
+bungle = "bonce" # baz
+zebra = "striped" # baz
+`,
+			`
+a      = 1         # foo
+bungle = "bonce"   # baz
+zebra  = "striped" # baz
+`,
+		},
+		{
+			`
+a = 1 # foo
+bungle = (
+    "bonce"
+) # baz
+zebra = "striped" # baz
+`,
+			`
+a = 1 # foo
+bungle = (
+  "bonce"
+)                 # baz
+zebra = "striped" # baz
+`,
+		},
+		{
+			`
+a="apple"# foo
+bungle=(# woo parens
+"bonce"
+)# baz
+zebra="striped"# baz
+`,
+			`
+a = "apple" # foo
+bungle = (  # woo parens
+  "bonce"
+)                 # baz
+zebra = "striped" # baz
+`,
+		},
+		{
+			`
+𝒜 = 1 # foo
+bungle = "🇬🇧" # baz
+zebra = "striped" # baz
+`,
+			`
+𝒜      = 1         # foo
+bungle = "🇬🇧"       # baz
+zebra  = "striped" # baz
+`,
+		},
+	}
+
+	for i, test := range tests {
+		t.Run(fmt.Sprintf("%02d", i), func(t *testing.T) {
+			tokens := lexConfig([]byte(test.input))
+			format(tokens)
+			t.Logf("tokens %s\n", spew.Sdump(tokens))
+			got := string(tokens.Bytes())
+
+			if got != test.want {
+				t.Errorf("wrong result\ninput:\n%s\ngot:\n%s\nwant:\n%s", test.input, got, test.want)
+			}
+		})
+	}
+
+}
+
 func TestLinesForFormat(t *testing.T) {
 	tests := []struct {
 		tokens Tokens
--- a/zclwrite/tokens.go
+++ b/zclwrite/tokens.go
@ -1,8 +1,10 @@
 package zclwrite

 import (
+	"bytes"
 	"io"

+	"github.com/apparentlymart/go-textseg/textseg"
 	"github.com/zclconf/go-zcl/zcl/zclsyntax"
 )

@ -36,6 +38,30 @@ type Token struct {
 // Tokens is a flat list of tokens.
 type Tokens []*Token

+func (ts Tokens) WriteTo(wr io.Writer) (int, error) {
+	seq := &TokenSeq{ts}
+	return seq.WriteTo(wr)
+}
+
+func (ts Tokens) Bytes() []byte {
+	buf := &bytes.Buffer{}
+	ts.WriteTo(buf)
+	return buf.Bytes()
+}
+
+// Columns returns the number of columns (grapheme clusters) the token sequence
+// occupies. The result is not meaningful if there are newline or single-line
+// comment tokens in the sequence.
+func (ts Tokens) Columns() int {
+	ret := 0
+	for _, token := range ts {
+		ret += token.SpacesBefore // spaces are always worth one column each
+		ct, _ := textseg.TokenCount(token.Bytes, textseg.ScanGraphemeClusters)
+		ret += ct
+	}
+	return ret
+}
+
 // TokenSeq combines zero or more TokenGens together to produce a flat sequence
 // of tokens from a tree of TokenGens.
 type TokenSeq []TokenGen