hcl/hclwrite/tokens.go
Martin Atkins 2eaeb36cb3 Use Unicode 13 text segmentation rules
HCL uses a number of upstream libraries that implement algorithms defined
in Unicode. This commit is updating those libraries all to versions that
have Unicode 13 support.

The main implication of this for HCL directly is that when it returns
column numbers in source locations it will count characters using the
Unicode 13 definition of "character", which includes various new
multi-codeunit characters added in Unicode 13.

These new version dependencies will also make Unicode 13 support available
for other functionality that HCL callers might use, such as the stdlib
functions in upstream cty, even though HCL itself does not directly use
those.
2021-02-23 09:05:19 -08:00

123 lines
3.3 KiB
Go

package hclwrite
import (
"bytes"
"io"
"github.com/apparentlymart/go-textseg/v13/textseg"
"github.com/hashicorp/hcl/v2"
"github.com/hashicorp/hcl/v2/hclsyntax"
)
// Token is a single sequence of bytes annotated with a type. It is similar
// in purpose to hclsyntax.Token, but discards the source position information
// since that is not useful in code generation.
type Token struct {
Type hclsyntax.TokenType
Bytes []byte
// We record the number of spaces before each token so that we can
// reproduce the exact layout of the original file when we're making
// surgical changes in-place. When _new_ code is created it will always
// be in the canonical style, but we preserve layout of existing code.
SpacesBefore int
}
// asHCLSyntax returns the receiver expressed as an incomplete hclsyntax.Token.
// A complete token is not possible since we don't have source location
// information here, and so this method is unexported so we can be sure it will
// only be used for internal purposes where we know the range isn't important.
//
// This is primarily intended to allow us to re-use certain functionality from
// hclsyntax rather than re-implementing it against our own token type here.
func (t *Token) asHCLSyntax() hclsyntax.Token {
return hclsyntax.Token{
Type: t.Type,
Bytes: t.Bytes,
Range: hcl.Range{
Filename: "<invalid>",
},
}
}
// Tokens is a flat list of tokens.
type Tokens []*Token
func (ts Tokens) Bytes() []byte {
buf := &bytes.Buffer{}
ts.WriteTo(buf)
return buf.Bytes()
}
func (ts Tokens) testValue() string {
return string(ts.Bytes())
}
// Columns returns the number of columns (grapheme clusters) the token sequence
// occupies. The result is not meaningful if there are newline or single-line
// comment tokens in the sequence.
func (ts Tokens) Columns() int {
ret := 0
for _, token := range ts {
ret += token.SpacesBefore // spaces are always worth one column each
ct, _ := textseg.TokenCount(token.Bytes, textseg.ScanGraphemeClusters)
ret += ct
}
return ret
}
// WriteTo takes an io.Writer and writes the bytes for each token to it,
// along with the spacing that separates each token. In other words, this
// allows serializing the tokens to a file or other such byte stream.
func (ts Tokens) WriteTo(wr io.Writer) (int64, error) {
// We know we're going to be writing a lot of small chunks of repeated
// space characters, so we'll prepare a buffer of these that we can
// easily pass to wr.Write without any further allocation.
spaces := make([]byte, 40)
for i := range spaces {
spaces[i] = ' '
}
var n int64
var err error
for _, token := range ts {
if err != nil {
return n, err
}
for spacesBefore := token.SpacesBefore; spacesBefore > 0; spacesBefore -= len(spaces) {
thisChunk := spacesBefore
if thisChunk > len(spaces) {
thisChunk = len(spaces)
}
var thisN int
thisN, err = wr.Write(spaces[:thisChunk])
n += int64(thisN)
if err != nil {
return n, err
}
}
var thisN int
thisN, err = wr.Write(token.Bytes)
n += int64(thisN)
}
return n, err
}
func (ts Tokens) walkChildNodes(w internalWalkFunc) {
// Unstructured tokens have no child nodes
}
func (ts Tokens) BuildTokens(to Tokens) Tokens {
return append(to, ts...)
}
func newIdentToken(name string) *Token {
return &Token{
Type: hclsyntax.TokenIdent,
Bytes: []byte(name),
}
}