hcl/hclwrite/tokens.go
Martin Atkins fee90926da Use Unicode 12.0.0 grapheme cluster segmentation rules
HCL uses grapheme cluster segmentation to produce accurate "column"
indications in diagnostic messages and other human-oriented source
location information. Each new major version of Unicode introduces new
codepoints, some of which are defined to combine with other codepoints to
produce a single visible character (grapheme cluster).

We were previously using the rules from Unicode 9.0.0. This change
switches to using the segmentation rules from Unicode 12.0.0, which is
the latest version at the time of this commit and is also the version of
Unicode used for other purposes by the Go 1.14 runtime.

HCL does not use text segmentation results for any purpose that would
affect the meaning of decoded data extracted from HCL files, so this
change will only affect the human-oriented source positions generated for
files containing characters that were newly-introduced in Unicode 10, 11,
or 12. (Machine-oriented uses of source location information are based on
byte offsets and not affected by text segmentation.)
2020-03-09 09:16:33 -07:00

123 lines
3.3 KiB
Go

package hclwrite
import (
"bytes"
"io"
"github.com/apparentlymart/go-textseg/v12/textseg"
"github.com/hashicorp/hcl/v2"
"github.com/hashicorp/hcl/v2/hclsyntax"
)
// Token is a single sequence of bytes annotated with a type. It is similar
// in purpose to hclsyntax.Token, but discards the source position information
// since that is not useful in code generation.
type Token struct {
Type hclsyntax.TokenType
Bytes []byte
// We record the number of spaces before each token so that we can
// reproduce the exact layout of the original file when we're making
// surgical changes in-place. When _new_ code is created it will always
// be in the canonical style, but we preserve layout of existing code.
SpacesBefore int
}
// asHCLSyntax returns the receiver expressed as an incomplete hclsyntax.Token.
// A complete token is not possible since we don't have source location
// information here, and so this method is unexported so we can be sure it will
// only be used for internal purposes where we know the range isn't important.
//
// This is primarily intended to allow us to re-use certain functionality from
// hclsyntax rather than re-implementing it against our own token type here.
func (t *Token) asHCLSyntax() hclsyntax.Token {
return hclsyntax.Token{
Type: t.Type,
Bytes: t.Bytes,
Range: hcl.Range{
Filename: "<invalid>",
},
}
}
// Tokens is a flat list of tokens.
type Tokens []*Token
func (ts Tokens) Bytes() []byte {
buf := &bytes.Buffer{}
ts.WriteTo(buf)
return buf.Bytes()
}
func (ts Tokens) testValue() string {
return string(ts.Bytes())
}
// Columns returns the number of columns (grapheme clusters) the token sequence
// occupies. The result is not meaningful if there are newline or single-line
// comment tokens in the sequence.
func (ts Tokens) Columns() int {
ret := 0
for _, token := range ts {
ret += token.SpacesBefore // spaces are always worth one column each
ct, _ := textseg.TokenCount(token.Bytes, textseg.ScanGraphemeClusters)
ret += ct
}
return ret
}
// WriteTo takes an io.Writer and writes the bytes for each token to it,
// along with the spacing that separates each token. In other words, this
// allows serializing the tokens to a file or other such byte stream.
func (ts Tokens) WriteTo(wr io.Writer) (int64, error) {
// We know we're going to be writing a lot of small chunks of repeated
// space characters, so we'll prepare a buffer of these that we can
// easily pass to wr.Write without any further allocation.
spaces := make([]byte, 40)
for i := range spaces {
spaces[i] = ' '
}
var n int64
var err error
for _, token := range ts {
if err != nil {
return n, err
}
for spacesBefore := token.SpacesBefore; spacesBefore > 0; spacesBefore -= len(spaces) {
thisChunk := spacesBefore
if thisChunk > len(spaces) {
thisChunk = len(spaces)
}
var thisN int
thisN, err = wr.Write(spaces[:thisChunk])
n += int64(thisN)
if err != nil {
return n, err
}
}
var thisN int
thisN, err = wr.Write(token.Bytes)
n += int64(thisN)
}
return n, err
}
func (ts Tokens) walkChildNodes(w internalWalkFunc) {
// Unstructured tokens have no child nodes
}
func (ts Tokens) BuildTokens(to Tokens) Tokens {
return append(to, ts...)
}
func newIdentToken(name string) *Token {
return &Token{
Type: hclsyntax.TokenIdent,
Bytes: []byte(name),
}
}