hcl/hclwrite/parser.go

596 lines
20 KiB
Go
Raw Normal View History

package hclwrite
import (
"fmt"
"sort"
"github.com/hashicorp/hcl2/hcl"
"github.com/hashicorp/hcl2/hcl/hclsyntax"
"github.com/zclconf/go-cty/cty"
)
// Our "parser" here is actually not doing any parsing of its own. Instead,
// it leans on the native parser in hclsyntax, and then uses the source ranges
// from the AST to partition the raw token sequence to match the raw tokens
// up to AST nodes.
//
// This strategy feels somewhat counter-intuitive, since most of the work the
// parser does is thrown away here, but this strategy is chosen because the
// normal parsing work done by hclsyntax is considered to be the "main case",
// while modifying and re-printing source is more of an edge case, used only
// in ancillary tools, and so it's good to keep all the main parsing logic
// with the main case but keep all of the extra complexity of token wrangling
// out of the main parser, which is already rather complex just serving the
// use-cases it already serves.
//
// If the parsing step produces any errors, the returned File is nil because
// we can't reliably extract tokens from the partial AST produced by an
// erroneous parse.
func parse(src []byte, filename string, start hcl.Pos) (*File, hcl.Diagnostics) {
file, diags := hclsyntax.ParseConfig(src, filename, start)
if diags.HasErrors() {
return nil, diags
}
// To do our work here, we use the "native" tokens (those from hclsyntax)
// to match against source ranges in the AST, but ultimately produce
// slices from our sequence of "writer" tokens, which contain only
// *relative* position information that is more appropriate for
// transformation/writing use-cases.
nativeTokens, diags := hclsyntax.LexConfig(src, filename, start)
if diags.HasErrors() {
// should never happen, since we would've caught these diags in
// the first call above.
return nil, diags
}
writerTokens := writerTokens(nativeTokens)
from := inputTokens{
nativeTokens: nativeTokens,
writerTokens: writerTokens,
}
before, root, after := parseBody(file.Body.(*hclsyntax.Body), from)
ret := &File{
inTree: newInTree(),
srcBytes: src,
body: root,
}
nodes := ret.inTree.children
nodes.Append(before.Tokens())
nodes.AppendNode(root)
nodes.Append(after.Tokens())
return ret, diags
}
type inputTokens struct {
nativeTokens hclsyntax.Tokens
writerTokens Tokens
}
func (it inputTokens) Partition(rng hcl.Range) (before, within, after inputTokens) {
start, end := partitionTokens(it.nativeTokens, rng)
before = it.Slice(0, start)
within = it.Slice(start, end)
after = it.Slice(end, len(it.nativeTokens))
return
}
func (it inputTokens) PartitionType(ty hclsyntax.TokenType) (before, within, after inputTokens) {
for i, t := range it.writerTokens {
if t.Type == ty {
return it.Slice(0, i), it.Slice(i, i+1), it.Slice(i+1, len(it.nativeTokens))
}
}
panic(fmt.Sprintf("didn't find any token of type %s", ty))
}
func (it inputTokens) PartitionTypeSingle(ty hclsyntax.TokenType) (before inputTokens, found *Token, after inputTokens) {
before, within, after := it.PartitionType(ty)
if within.Len() != 1 {
panic("PartitionType found more than one token")
}
return before, within.Tokens()[0], after
}
// PartitionIncludeComments is like Partition except the returned "within"
// range includes any lead and line comments associated with the range.
func (it inputTokens) PartitionIncludingComments(rng hcl.Range) (before, within, after inputTokens) {
start, end := partitionTokens(it.nativeTokens, rng)
start = partitionLeadCommentTokens(it.nativeTokens[:start])
_, afterNewline := partitionLineEndTokens(it.nativeTokens[end:])
end += afterNewline
before = it.Slice(0, start)
within = it.Slice(start, end)
after = it.Slice(end, len(it.nativeTokens))
return
}
// PartitionBlockItem is similar to PartitionIncludeComments but it returns
// the comments as separate token sequences so that they can be captured into
// AST attributes. It makes assumptions that apply only to block items, so
// should not be used for other constructs.
func (it inputTokens) PartitionBlockItem(rng hcl.Range) (before, leadComments, within, lineComments, newline, after inputTokens) {
before, within, after = it.Partition(rng)
before, leadComments = before.PartitionLeadComments()
lineComments, newline, after = after.PartitionLineEndTokens()
return
}
func (it inputTokens) PartitionLeadComments() (before, within inputTokens) {
start := partitionLeadCommentTokens(it.nativeTokens)
before = it.Slice(0, start)
within = it.Slice(start, len(it.nativeTokens))
return
}
func (it inputTokens) PartitionLineEndTokens() (comments, newline, after inputTokens) {
afterComments, afterNewline := partitionLineEndTokens(it.nativeTokens)
comments = it.Slice(0, afterComments)
newline = it.Slice(afterComments, afterNewline)
after = it.Slice(afterNewline, len(it.nativeTokens))
return
}
func (it inputTokens) Slice(start, end int) inputTokens {
// When we slice, we create a new slice with no additional capacity because
// we expect that these slices will be mutated in order to insert
// new code into the AST, and we want to ensure that a new underlying
// array gets allocated in that case, rather than writing into some
// following slice and corrupting it.
return inputTokens{
nativeTokens: it.nativeTokens[start:end:end],
writerTokens: it.writerTokens[start:end:end],
}
}
func (it inputTokens) Len() int {
return len(it.nativeTokens)
}
func (it inputTokens) Tokens() Tokens {
return it.writerTokens
}
func (it inputTokens) Types() []hclsyntax.TokenType {
ret := make([]hclsyntax.TokenType, len(it.nativeTokens))
for i, tok := range it.nativeTokens {
ret[i] = tok.Type
}
return ret
}
// parseBody locates the given body within the given input tokens and returns
// the resulting *Body object as well as the tokens that appeared before and
// after it.
func parseBody(nativeBody *hclsyntax.Body, from inputTokens) (inputTokens, *node, inputTokens) {
before, within, after := from.PartitionIncludingComments(nativeBody.SrcRange)
// The main AST doesn't retain the original source ordering of the
// body items, so we need to reconstruct that ordering by inspecting
// their source ranges.
nativeItems := make([]hclsyntax.Node, 0, len(nativeBody.Attributes)+len(nativeBody.Blocks))
for _, nativeAttr := range nativeBody.Attributes {
nativeItems = append(nativeItems, nativeAttr)
}
for _, nativeBlock := range nativeBody.Blocks {
nativeItems = append(nativeItems, nativeBlock)
}
sort.Sort(nativeNodeSorter{nativeItems})
body := &Body{
inTree: newInTree(),
indentLevel: 0, // TODO: deal with this
items: newNodeSet(),
}
remain := within
for _, nativeItem := range nativeItems {
beforeItem, item, afterItem := parseBodyItem(nativeItem, remain)
if beforeItem.Len() > 0 {
body.AppendUnstructuredTokens(beforeItem.Tokens())
}
body.appendItemNode(item)
remain = afterItem
}
if remain.Len() > 0 {
body.AppendUnstructuredTokens(remain.Tokens())
}
return before, newNode(body), after
}
func parseBodyItem(nativeItem hclsyntax.Node, from inputTokens) (inputTokens, *node, inputTokens) {
before, leadComments, within, lineComments, newline, after := from.PartitionBlockItem(nativeItem.Range())
var item *node
switch tItem := nativeItem.(type) {
case *hclsyntax.Attribute:
item = parseAttribute(tItem, within, leadComments, lineComments, newline)
case *hclsyntax.Block:
item = parseBlock(tItem, within, leadComments, lineComments, newline)
default:
// should never happen if caller is behaving
panic("unsupported native item type")
}
return before, item, after
}
func parseAttribute(nativeAttr *hclsyntax.Attribute, from, leadComments, lineComments, newline inputTokens) *node {
attr := &Attribute{
inTree: newInTree(),
}
children := attr.inTree.children
{
cn := newNode(newComments(leadComments.Tokens()))
attr.leadComments = cn
children.AppendNode(cn)
}
before, nameTokens, from := from.Partition(nativeAttr.NameRange)
{
children.AppendUnstructuredTokens(before.Tokens())
if nameTokens.Len() != 1 {
// Should never happen with valid input
panic("attribute name is not exactly one token")
}
token := nameTokens.Tokens()[0]
in := newNode(newIdentifier(token))
attr.name = in
children.AppendNode(in)
}
before, equalsTokens, from := from.Partition(nativeAttr.EqualsRange)
children.AppendUnstructuredTokens(before.Tokens())
children.AppendUnstructuredTokens(equalsTokens.Tokens())
before, exprTokens, from := from.Partition(nativeAttr.Expr.Range())
{
children.AppendUnstructuredTokens(before.Tokens())
exprNode := parseExpression(nativeAttr.Expr, exprTokens)
attr.expr = exprNode
children.AppendNode(exprNode)
}
{
cn := newNode(newComments(lineComments.Tokens()))
attr.lineComments = cn
children.AppendNode(cn)
}
children.AppendUnstructuredTokens(newline.Tokens())
// Collect any stragglers, though there shouldn't be any
children.AppendUnstructuredTokens(from.Tokens())
return newNode(attr)
}
func parseBlock(nativeBlock *hclsyntax.Block, from, leadComments, lineComments, newline inputTokens) *node {
block := &Block{
inTree: newInTree(),
labels: newNodeSet(),
}
children := block.inTree.children
{
cn := newNode(newComments(leadComments.Tokens()))
block.leadComments = cn
children.AppendNode(cn)
}
before, typeTokens, from := from.Partition(nativeBlock.TypeRange)
{
children.AppendUnstructuredTokens(before.Tokens())
if typeTokens.Len() != 1 {
// Should never happen with valid input
panic("block type name is not exactly one token")
}
token := typeTokens.Tokens()[0]
in := newNode(newIdentifier(token))
block.typeName = in
children.AppendNode(in)
}
for _, rng := range nativeBlock.LabelRanges {
var labelTokens inputTokens
before, labelTokens, from = from.Partition(rng)
children.AppendUnstructuredTokens(before.Tokens())
tokens := labelTokens.Tokens()
ln := newNode(newQuoted(tokens))
block.labels.Add(ln)
children.AppendNode(ln)
}
before, oBrace, from := from.Partition(nativeBlock.OpenBraceRange)
children.AppendUnstructuredTokens(before.Tokens())
children.AppendUnstructuredTokens(oBrace.Tokens())
// We go a bit out of order here: we go hunting for the closing brace
// so that we have a delimited body, but then we'll deal with the body
// before we actually append the closing brace and any straggling tokens
// that appear after it.
bodyTokens, cBrace, from := from.Partition(nativeBlock.CloseBraceRange)
before, body, after := parseBody(nativeBlock.Body, bodyTokens)
children.AppendUnstructuredTokens(before.Tokens())
block.body = body
children.AppendNode(body)
children.AppendUnstructuredTokens(after.Tokens())
children.AppendUnstructuredTokens(cBrace.Tokens())
// stragglers
children.AppendUnstructuredTokens(from.Tokens())
if lineComments.Len() > 0 {
// blocks don't actually have line comments, so we'll just treat
// them as extra stragglers
children.AppendUnstructuredTokens(lineComments.Tokens())
}
children.AppendUnstructuredTokens(newline.Tokens())
return newNode(block)
}
func parseExpression(nativeExpr hclsyntax.Expression, from inputTokens) *node {
expr := newExpression()
children := expr.inTree.children
nativeVars := nativeExpr.Variables()
for _, nativeTraversal := range nativeVars {
before, traversal, after := parseTraversal(nativeTraversal, from)
children.AppendUnstructuredTokens(before.Tokens())
children.AppendNode(traversal)
expr.absTraversals.Add(traversal)
from = after
}
// Attach any stragglers that don't belong to a traversal to the expression
// itself. In an expression with no traversals at all, this is just the
// entirety of "from".
children.AppendUnstructuredTokens(from.Tokens())
return newNode(expr)
}
func parseTraversal(nativeTraversal hcl.Traversal, from inputTokens) (before inputTokens, n *node, after inputTokens) {
traversal := newTraversal()
children := traversal.inTree.children
before, from, after = from.Partition(nativeTraversal.SourceRange())
stepAfter := from
for _, nativeStep := range nativeTraversal {
before, step, after := parseTraversalStep(nativeStep, stepAfter)
children.AppendUnstructuredTokens(before.Tokens())
children.AppendNode(step)
stepAfter = after
}
return before, newNode(traversal), after
}
func parseTraversalStep(nativeStep hcl.Traverser, from inputTokens) (before inputTokens, n *node, after inputTokens) {
var children *nodes
switch tNativeStep := nativeStep.(type) {
case hcl.TraverseRoot, hcl.TraverseAttr:
step := newTraverseName()
children = step.inTree.children
before, from, after = from.Partition(nativeStep.SourceRange())
inBefore, token, inAfter := from.PartitionTypeSingle(hclsyntax.TokenIdent)
name := newIdentifier(token)
children.AppendUnstructuredTokens(inBefore.Tokens())
step.name = children.Append(name)
children.AppendUnstructuredTokens(inAfter.Tokens())
return before, newNode(step), after
case hcl.TraverseIndex:
step := newTraverseIndex()
children = step.inTree.children
before, from, after = from.Partition(nativeStep.SourceRange())
var inBefore, oBrack, keyTokens, cBrack inputTokens
inBefore, oBrack, from = from.PartitionType(hclsyntax.TokenOBrack)
children.AppendUnstructuredTokens(inBefore.Tokens())
children.AppendUnstructuredTokens(oBrack.Tokens())
keyTokens, cBrack, from = from.PartitionType(hclsyntax.TokenCBrack)
keyVal := tNativeStep.Key
switch keyVal.Type() {
case cty.String:
key := newQuoted(keyTokens.Tokens())
step.key = children.Append(key)
case cty.Number:
valBefore, valToken, valAfter := keyTokens.PartitionTypeSingle(hclsyntax.TokenNumberLit)
children.AppendUnstructuredTokens(valBefore.Tokens())
key := newNumber(valToken)
step.key = children.Append(key)
children.AppendUnstructuredTokens(valAfter.Tokens())
}
children.AppendUnstructuredTokens(cBrack.Tokens())
children.AppendUnstructuredTokens(from.Tokens())
return before, newNode(step), after
default:
panic(fmt.Sprintf("unsupported traversal step type %T", nativeStep))
}
}
// writerTokens takes a sequence of tokens as produced by the main hclsyntax
// package and transforms it into an equivalent sequence of tokens using
// this package's own token model.
//
// The resulting list contains the same number of tokens and uses the same
// indices as the input, allowing the two sets of tokens to be correlated
// by index.
func writerTokens(nativeTokens hclsyntax.Tokens) Tokens {
// Ultimately we want a slice of token _pointers_, but since we can
// predict how much memory we're going to devote to tokens we'll allocate
// it all as a single flat buffer and thus give the GC less work to do.
tokBuf := make([]Token, len(nativeTokens))
var lastByteOffset int
for i, mainToken := range nativeTokens {
// Create a copy of the bytes so that we can mutate without
// corrupting the original token stream.
bytes := make([]byte, len(mainToken.Bytes))
copy(bytes, mainToken.Bytes)
tokBuf[i] = Token{
Type: mainToken.Type,
Bytes: bytes,
// We assume here that spaces are always ASCII spaces, since
// that's what the scanner also assumes, and thus the number
// of bytes skipped is also the number of space characters.
SpacesBefore: mainToken.Range.Start.Byte - lastByteOffset,
}
lastByteOffset = mainToken.Range.End.Byte
}
// Now make a slice of pointers into the previous slice.
ret := make(Tokens, len(tokBuf))
for i := range ret {
ret[i] = &tokBuf[i]
}
return ret
}
// partitionTokens takes a sequence of tokens and a hcl.Range and returns
// two indices within the token sequence that correspond with the range
// boundaries, such that the slice operator could be used to produce
// three token sequences for before, within, and after respectively:
//
// start, end := partitionTokens(toks, rng)
// before := toks[:start]
// within := toks[start:end]
// after := toks[end:]
//
// This works best when the range is aligned with token boundaries (e.g.
// because it was produced in terms of the scanner's result) but if that isn't
// true then it will make a best effort that may produce strange results at
// the boundaries.
//
// Native hclsyntax tokens are used here, because they contain the necessary
// absolute position information. However, since writerTokens produces a
// correlatable sequence of writer tokens, the resulting indices can be
// used also to index into its result, allowing the partitioning of writer
// tokens to be driven by the partitioning of native tokens.
//
// The tokens are assumed to be in source order and non-overlapping, which
// will be true if the token sequence from the scanner is used directly.
func partitionTokens(toks hclsyntax.Tokens, rng hcl.Range) (start, end int) {
// We us a linear search here because we assume tha in most cases our
// target range is close to the beginning of the sequence, and the seqences
// are generally small for most reasonable files anyway.
for i := 0; ; i++ {
if i >= len(toks) {
// No tokens for the given range at all!
return len(toks), len(toks)
}
if toks[i].Range.Start.Byte >= rng.Start.Byte {
start = i
break
}
}
for i := start; ; i++ {
if i >= len(toks) {
// The range "hangs off" the end of the token sequence
return start, len(toks)
}
if toks[i].Range.Start.Byte >= rng.End.Byte {
end = i // end marker is exclusive
break
}
}
return start, end
}
// partitionLeadCommentTokens takes a sequence of tokens that is assumed
// to immediately precede a construct that can have lead comment tokens,
// and returns the index into that sequence where the lead comments begin.
//
// Lead comments are defined as whole lines containing only comment tokens
// with no blank lines between. If no such lines are found, the returned
// index will be len(toks).
func partitionLeadCommentTokens(toks hclsyntax.Tokens) int {
// single-line comments (which is what we're interested in here)
// consume their trailing newline, so we can just walk backwards
// until we stop seeing comment tokens.
for i := len(toks) - 1; i >= 0; i-- {
if toks[i].Type != hclsyntax.TokenComment {
return i + 1
}
}
return 0
}
// partitionLineEndTokens takes a sequence of tokens that is assumed
// to immediately follow a construct that can have a line comment, and
// returns first the index where any line comments end and then second
// the index immediately after the trailing newline.
//
// Line comments are defined as comments that appear immediately after
// a construct on the same line where its significant tokens ended.
//
// Since single-line comment tokens (# and //) include the newline that
// terminates them, in the presence of these the two returned indices
// will be the same since the comment itself serves as the line end.
func partitionLineEndTokens(toks hclsyntax.Tokens) (afterComment, afterNewline int) {
for i := 0; i < len(toks); i++ {
tok := toks[i]
if tok.Type != hclsyntax.TokenComment {
switch tok.Type {
case hclsyntax.TokenNewline:
return i, i + 1
case hclsyntax.TokenEOF:
// Although this is valid, we mustn't include the EOF
// itself as our "newline" or else strange things will
// happen when we try to append new items.
return i, i
default:
// If we have well-formed input here then nothing else should be
// possible. This path should never happen, because we only try
// to extract tokens from the sequence if the parser succeeded,
// and it should catch this problem itself.
panic("malformed line trailers: expected only comments and newlines")
}
}
if len(tok.Bytes) > 0 && tok.Bytes[len(tok.Bytes)-1] == '\n' {
// Newline at the end of a single-line comment serves both as
// the end of comments *and* the end of the line.
return i + 1, i + 1
}
}
return len(toks), len(toks)
}
// lexConfig uses the hclsyntax scanner to get a token stream and then
// rewrites it into this package's token model.
//
// Any errors produced during scanning are ignored, so the results of this
// function should be used with care.
func lexConfig(src []byte) Tokens {
mainTokens, _ := hclsyntax.LexConfig(src, "", hcl.Pos{Byte: 0, Line: 1, Column: 1})
return writerTokens(mainTokens)
}