hcl/pos_scanner.go
Martin Atkins 2eaeb36cb3 Use Unicode 13 text segmentation rules
HCL uses a number of upstream libraries that implement algorithms defined
in Unicode. This commit is updating those libraries all to versions that
have Unicode 13 support.

The main implication of this for HCL directly is that when it returns
column numbers in source locations it will count characters using the
Unicode 13 definition of "character", which includes various new
multi-codeunit characters added in Unicode 13.

These new version dependencies will also make Unicode 13 support available
for other functionality that HCL callers might use, such as the stdlib
functions in upstream cty, even though HCL itself does not directly use
those.
2021-02-23 09:05:19 -08:00

153 lines
4.6 KiB
Go

package hcl
import (
"bufio"
"bytes"
"github.com/apparentlymart/go-textseg/v13/textseg"
)
// RangeScanner is a helper that will scan over a buffer using a bufio.SplitFunc
// and visit a source range for each token matched.
//
// For example, this can be used with bufio.ScanLines to find the source range
// for each line in the file, skipping over the actual newline characters, which
// may be useful when printing source code snippets as part of diagnostic
// messages.
//
// The line and column information in the returned ranges is produced by
// counting newline characters and grapheme clusters respectively, which
// mimics the behavior we expect from a parser when producing ranges.
type RangeScanner struct {
filename string
b []byte
cb bufio.SplitFunc
pos Pos // position of next byte to process in b
cur Range // latest range
tok []byte // slice of b that is covered by cur
err error // error from last scan, if any
}
// NewRangeScanner creates a new RangeScanner for the given buffer, producing
// ranges for the given filename.
//
// Since ranges have grapheme-cluster granularity rather than byte granularity,
// the scanner will produce incorrect results if the given SplitFunc creates
// tokens between grapheme cluster boundaries. In particular, it is incorrect
// to use RangeScanner with bufio.ScanRunes because it will produce tokens
// around individual UTF-8 sequences, which will split any multi-sequence
// grapheme clusters.
func NewRangeScanner(b []byte, filename string, cb bufio.SplitFunc) *RangeScanner {
return NewRangeScannerFragment(b, filename, InitialPos, cb)
}
// NewRangeScannerFragment is like NewRangeScanner but the ranges it produces
// will be offset by the given starting position, which is appropriate for
// sub-slices of a file, whereas NewRangeScanner assumes it is scanning an
// entire file.
func NewRangeScannerFragment(b []byte, filename string, start Pos, cb bufio.SplitFunc) *RangeScanner {
return &RangeScanner{
filename: filename,
b: b,
cb: cb,
pos: start,
}
}
func (sc *RangeScanner) Scan() bool {
if sc.pos.Byte >= len(sc.b) || sc.err != nil {
// All done
return false
}
// Since we're operating on an in-memory buffer, we always pass the whole
// remainder of the buffer to our SplitFunc and set isEOF to let it know
// that it has the whole thing.
advance, token, err := sc.cb(sc.b[sc.pos.Byte:], true)
// Since we are setting isEOF to true this should never happen, but
// if it does we will just abort and assume the SplitFunc is misbehaving.
if advance == 0 && token == nil && err == nil {
return false
}
if err != nil {
sc.err = err
sc.cur = Range{
Filename: sc.filename,
Start: sc.pos,
End: sc.pos,
}
sc.tok = nil
return false
}
sc.tok = token
start := sc.pos
end := sc.pos
new := sc.pos
// adv is similar to token but it also includes any subsequent characters
// we're being asked to skip over by the SplitFunc.
// adv is a slice covering any additional bytes we are skipping over, based
// on what the SplitFunc told us to do with advance.
adv := sc.b[sc.pos.Byte : sc.pos.Byte+advance]
// We now need to scan over our token to count the grapheme clusters
// so we can correctly advance Column, and count the newlines so we
// can correctly advance Line.
advR := bytes.NewReader(adv)
gsc := bufio.NewScanner(advR)
advanced := 0
gsc.Split(textseg.ScanGraphemeClusters)
for gsc.Scan() {
gr := gsc.Bytes()
new.Byte += len(gr)
new.Column++
// We rely here on the fact that \r\n is considered a grapheme cluster
// and so we don't need to worry about miscounting additional lines
// on files with Windows-style line endings.
if len(gr) != 0 && (gr[0] == '\r' || gr[0] == '\n') {
new.Column = 1
new.Line++
}
if advanced < len(token) {
// If we've not yet found the end of our token then we'll
// also push our "end" marker along.
// (if advance > len(token) then we'll stop moving "end" early
// so that the caller only sees the range covered by token.)
end = new
}
advanced += len(gr)
}
sc.cur = Range{
Filename: sc.filename,
Start: start,
End: end,
}
sc.pos = new
return true
}
// Range returns a range that covers the latest token obtained after a call
// to Scan returns true.
func (sc *RangeScanner) Range() Range {
return sc.cur
}
// Bytes returns the slice of the input buffer that is covered by the range
// that would be returned by Range.
func (sc *RangeScanner) Bytes() []byte {
return sc.tok
}
// Err can be called after Scan returns false to determine if the latest read
// resulted in an error, and obtain that error if so.
func (sc *RangeScanner) Err() error {
return sc.err
}