fee90926da
HCL uses grapheme cluster segmentation to produce accurate "column" indications in diagnostic messages and other human-oriented source location information. Each new major version of Unicode introduces new codepoints, some of which are defined to combine with other codepoints to produce a single visible character (grapheme cluster). We were previously using the rules from Unicode 9.0.0. This change switches to using the segmentation rules from Unicode 12.0.0, which is the latest version at the time of this commit and is also the version of Unicode used for other purposes by the Go 1.14 runtime. HCL does not use text segmentation results for any purpose that would affect the meaning of decoded data extracted from HCL files, so this change will only affect the human-oriented source positions generated for files containing characters that were newly-introduced in Unicode 10, 11, or 12. (Machine-oriented uses of source location information are based on byte offsets and not affected by text segmentation.)
153 lines
4.6 KiB
Go
package hcl
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
|
|
"github.com/apparentlymart/go-textseg/v12/textseg"
|
|
)
|
|
|
|
// RangeScanner is a helper that will scan over a buffer using a bufio.SplitFunc
// and visit a source range for each token matched.
//
// For example, this can be used with bufio.ScanLines to find the source range
// for each line in the file, skipping over the actual newline characters, which
// may be useful when printing source code snippets as part of diagnostic
// messages.
//
// The line and column information in the returned ranges is produced by
// counting newline characters and grapheme clusters respectively, which
// mimics the behavior we expect from a parser when producing ranges.
type RangeScanner struct {
	filename string          // reported as the Filename of each produced Range
	b        []byte          // the full buffer being scanned
	cb       bufio.SplitFunc // decides token boundaries within b

	pos Pos    // position of next byte to process in b
	cur Range  // latest range
	tok []byte // slice of b that is covered by cur
	err error  // error from last scan, if any
}
|
|
|
|
// NewRangeScanner creates a new RangeScanner for the given buffer, producing
|
|
// ranges for the given filename.
|
|
//
|
|
// Since ranges have grapheme-cluster granularity rather than byte granularity,
|
|
// the scanner will produce incorrect results if the given SplitFunc creates
|
|
// tokens between grapheme cluster boundaries. In particular, it is incorrect
|
|
// to use RangeScanner with bufio.ScanRunes because it will produce tokens
|
|
// around individual UTF-8 sequences, which will split any multi-sequence
|
|
// grapheme clusters.
|
|
func NewRangeScanner(b []byte, filename string, cb bufio.SplitFunc) *RangeScanner {
|
|
return NewRangeScannerFragment(b, filename, InitialPos, cb)
|
|
}
|
|
|
|
// NewRangeScannerFragment is like NewRangeScanner but the ranges it produces
|
|
// will be offset by the given starting position, which is appropriate for
|
|
// sub-slices of a file, whereas NewRangeScanner assumes it is scanning an
|
|
// entire file.
|
|
func NewRangeScannerFragment(b []byte, filename string, start Pos, cb bufio.SplitFunc) *RangeScanner {
|
|
return &RangeScanner{
|
|
filename: filename,
|
|
b: b,
|
|
cb: cb,
|
|
pos: start,
|
|
}
|
|
}
|
|
|
|
func (sc *RangeScanner) Scan() bool {
|
|
if sc.pos.Byte >= len(sc.b) || sc.err != nil {
|
|
// All done
|
|
return false
|
|
}
|
|
|
|
// Since we're operating on an in-memory buffer, we always pass the whole
|
|
// remainder of the buffer to our SplitFunc and set isEOF to let it know
|
|
// that it has the whole thing.
|
|
advance, token, err := sc.cb(sc.b[sc.pos.Byte:], true)
|
|
|
|
// Since we are setting isEOF to true this should never happen, but
|
|
// if it does we will just abort and assume the SplitFunc is misbehaving.
|
|
if advance == 0 && token == nil && err == nil {
|
|
return false
|
|
}
|
|
|
|
if err != nil {
|
|
sc.err = err
|
|
sc.cur = Range{
|
|
Filename: sc.filename,
|
|
Start: sc.pos,
|
|
End: sc.pos,
|
|
}
|
|
sc.tok = nil
|
|
return false
|
|
}
|
|
|
|
sc.tok = token
|
|
start := sc.pos
|
|
end := sc.pos
|
|
new := sc.pos
|
|
|
|
// adv is similar to token but it also includes any subsequent characters
|
|
// we're being asked to skip over by the SplitFunc.
|
|
// adv is a slice covering any additional bytes we are skipping over, based
|
|
// on what the SplitFunc told us to do with advance.
|
|
adv := sc.b[sc.pos.Byte : sc.pos.Byte+advance]
|
|
|
|
// We now need to scan over our token to count the grapheme clusters
|
|
// so we can correctly advance Column, and count the newlines so we
|
|
// can correctly advance Line.
|
|
advR := bytes.NewReader(adv)
|
|
gsc := bufio.NewScanner(advR)
|
|
advanced := 0
|
|
gsc.Split(textseg.ScanGraphemeClusters)
|
|
for gsc.Scan() {
|
|
gr := gsc.Bytes()
|
|
new.Byte += len(gr)
|
|
new.Column++
|
|
|
|
// We rely here on the fact that \r\n is considered a grapheme cluster
|
|
// and so we don't need to worry about miscounting additional lines
|
|
// on files with Windows-style line endings.
|
|
if len(gr) != 0 && (gr[0] == '\r' || gr[0] == '\n') {
|
|
new.Column = 1
|
|
new.Line++
|
|
}
|
|
|
|
if advanced < len(token) {
|
|
// If we've not yet found the end of our token then we'll
|
|
// also push our "end" marker along.
|
|
// (if advance > len(token) then we'll stop moving "end" early
|
|
// so that the caller only sees the range covered by token.)
|
|
end = new
|
|
}
|
|
advanced += len(gr)
|
|
}
|
|
|
|
sc.cur = Range{
|
|
Filename: sc.filename,
|
|
Start: start,
|
|
End: end,
|
|
}
|
|
sc.pos = new
|
|
return true
|
|
}
|
|
|
|
// Range returns a range that covers the latest token obtained after a call
|
|
// to Scan returns true.
|
|
func (sc *RangeScanner) Range() Range {
|
|
return sc.cur
|
|
}
|
|
|
|
// Bytes returns the slice of the input buffer that is covered by the range
|
|
// that would be returned by Range.
|
|
func (sc *RangeScanner) Bytes() []byte {
|
|
return sc.tok
|
|
}
|
|
|
|
// Err can be called after Scan returns false to determine if the latest read
|
|
// resulted in an error, and obtain that error if so.
|
|
func (sc *RangeScanner) Err() error {
|
|
return sc.err
|
|
}
|