hcl: RangeScanner helper

RangeScanner has an interface similar to bufio.Scanner for partitioning
a buffer into tokens, but it returns the hcl.Range of each token along
with that token so that the caller can see where the token fits in
relation to the entire source file.

The main intended use-case for this is to partition a source file into
lines for the purpose of printing a source code snippet in diagnostic
output. Having the source location information is important in that case
to recognize which lines belong to the subject and context of each
diagnostic.
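As a rough usage sketch (illustrative only, not part of the diff below; it assumes this
package is imported as hcl, that "bufio" and "fmt" are imported, and that src is a []byte
holding the source), iterating over the lines of a buffer looks like this:

    sc := hcl.NewRangeScanner(src, "example.hcl", bufio.ScanLines)
    for sc.Scan() {
        rng := sc.Range()
        fmt.Printf("line %d, bytes %d-%d: %q\n", rng.Start.Line, rng.Start.Byte, rng.End.Byte, sc.Bytes())
    }
    if err := sc.Err(); err != nil {
        // handle an error reported by the underlying SplitFunc
    }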
Martin Atkins, 2018-01-14 11:24:19 -08:00
commit d34d4686fb (parent 11e4972f13)
2 changed files with 341 additions and 0 deletions

hcl/pos_scanner.go (new file, 148 lines)
@@ -0,0 +1,148 @@
package hcl
import (
"bufio"
"bytes"
"github.com/apparentlymart/go-textseg/textseg"
)
// RangeScanner is a helper that will scan over a buffer using a bufio.SplitFunc
// and visit a source range for each token matched.
//
// For example, this can be used with bufio.ScanLines to find the source range
// for each line in the file, skipping over the actual newline characters, which
// may be useful when printing source code snippets as part of diagnostic
// messages.
//
// The line and column information in the returned ranges is produced by
// counting newline characters and grapheme clusters respectively, which
// mimics the behavior we expect from a parser when producing ranges.
type RangeScanner struct {
filename string
b []byte
cb bufio.SplitFunc
pos Pos // position of next byte to process in b
cur Range // latest range
tok []byte // slice of b that is covered by cur
err error // error from last scan, if any
}
// NewRangeScanner creates a new RangeScanner for the given buffer, producing ranges for the
// given filename.
//
// Since ranges have grapheme-cluster granularity rather than byte granularity,
// the scanner will produce incorrect results if the given SplitFunc creates
// tokens between grapheme cluster boundaries. In particular, it is incorrect
// to use RangeScanner with bufio.ScanRunes because it will produce tokens
// around individual UTF-8 sequences, which will split any multi-sequence
// grapheme clusters.
func NewRangeScanner(b []byte, filename string, cb bufio.SplitFunc) *RangeScanner {
return &RangeScanner{
filename: filename,
b: b,
cb: cb,
pos: Pos{
Byte: 0,
Line: 1,
Column: 1,
},
}
}
func (sc *RangeScanner) Scan() bool {
if sc.pos.Byte >= len(sc.b) || sc.err != nil {
// All done
return false
}
// Since we're operating on an in-memory buffer, we always pass the whole
// remainder of the buffer to our SplitFunc and set isEOF to let it know
// that it has the whole thing.
advance, token, err := sc.cb(sc.b[sc.pos.Byte:], true)
// Since we are setting isEOF to true this should never happen, but
// if it does we will just abort and assume the SplitFunc is misbehaving.
if advance == 0 && token == nil && err == nil {
return false
}
if err != nil {
sc.err = err
sc.cur = Range{
Filename: sc.filename,
Start: sc.pos,
End: sc.pos,
}
sc.tok = nil
return false
}
sc.tok = token
start := sc.pos
end := sc.pos
new := sc.pos
// adv is a slice of the buffer covering the token plus any subsequent bytes
// the SplitFunc asked us to skip over, per the advance value it returned.
adv := sc.b[sc.pos.Byte : sc.pos.Byte+advance]
// We now need to scan over our token to count the grapheme clusters
// so we can correctly advance Column, and count the newlines so we
// can correctly advance Line.
advR := bytes.NewReader(adv)
gsc := bufio.NewScanner(advR)
advanced := 0
gsc.Split(textseg.ScanGraphemeClusters)
for gsc.Scan() {
gr := gsc.Bytes()
new.Byte += len(gr)
new.Column++
// We rely here on the fact that \r\n is considered a grapheme cluster
// and so we don't need to worry about miscounting additional lines
// on files with Windows-style line endings.
if len(gr) != 0 && (gr[0] == '\r' || gr[0] == '\n') {
new.Column = 1
new.Line++
}
if advanced < len(token) {
// If we've not yet found the end of our token then we'll
// also push our "end" marker along.
// (if advance > len(token) then we'll stop moving "end" early
// so that the caller only sees the range covered by token.)
end = new
}
advanced += len(gr)
}
sc.cur = Range{
Filename: sc.filename,
Start: start,
End: end,
}
sc.pos = new
return true
}
// Range returns a range that covers the latest token obtained after a call
// to Scan returns true.
func (sc *RangeScanner) Range() Range {
return sc.cur
}
// Bytes returns the slice of the input buffer that is covered by the range
// that would be returned by Range.
func (sc *RangeScanner) Bytes() []byte {
return sc.tok
}
// Err can be called after Scan returns false to determine if the latest read
// resulted in an error, and obtain that error if so.
func (sc *RangeScanner) Err() error {
return sc.err
}
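To make the diagnostics use-case above concrete, here is a minimal sketch (not part of
this commit) of a helper that collects the lines overlapping a diagnostic's subject range,
written as if it lived in package hcl alongside the scanner; snippetLines is a hypothetical
name and the byte-offset overlap test is just one reasonable choice:

    // snippetLines gathers the source lines whose ranges overlap the given
    // subject range, e.g. to print them as a snippet under a diagnostic.
    // (Hypothetical helper for illustration; assumes "bufio" is imported.)
    func snippetLines(src []byte, filename string, subject Range) []string {
        sc := NewRangeScanner(src, filename, bufio.ScanLines)
        var lines []string
        for sc.Scan() {
            lineRng := sc.Range()
            // Keep any line whose byte extent overlaps the subject's byte extent.
            // A blank line has an empty range, so this simple test skips it.
            if lineRng.Start.Byte < subject.End.Byte && subject.Start.Byte < lineRng.End.Byte {
                lines = append(lines, string(sc.Bytes()))
            }
        }
        return lines
    }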

hcl/pos_scanner_test.go (new file, 193 lines)
@@ -0,0 +1,193 @@
package hcl
import (
"bufio"
"reflect"
"testing"
"github.com/davecgh/go-spew/spew"
)
func TestPosScanner(t *testing.T) {
tests := map[string]struct {
Input string
Want []Range
WantToks [][]byte
}{
"empty": {
"",
[]Range{},
[][]byte{},
},
"single line": {
"hello",
[]Range{
{
Start: Pos{Byte: 0, Line: 1, Column: 1},
End: Pos{Byte: 5, Line: 1, Column: 6},
},
},
[][]byte{
[]byte("hello"),
},
},
"single line with trailing UNIX newline": {
"hello\n",
[]Range{
{
Start: Pos{Byte: 0, Line: 1, Column: 1},
End: Pos{Byte: 5, Line: 1, Column: 6},
},
},
[][]byte{
[]byte("hello"),
},
},
"single line with trailing Windows newline": {
"hello\r\n",
[]Range{
{
Start: Pos{Byte: 0, Line: 1, Column: 1},
End: Pos{Byte: 5, Line: 1, Column: 6},
},
},
[][]byte{
[]byte("hello"),
},
},
"two lines with UNIX newline": {
"hello\nworld",
[]Range{
{
Start: Pos{Byte: 0, Line: 1, Column: 1},
End: Pos{Byte: 5, Line: 1, Column: 6},
},
{
Start: Pos{Byte: 6, Line: 2, Column: 1},
End: Pos{Byte: 11, Line: 2, Column: 6},
},
},
[][]byte{
[]byte("hello"),
[]byte("world"),
},
},
"two lines with Windows newline": {
"hello\r\nworld",
[]Range{
{
Start: Pos{Byte: 0, Line: 1, Column: 1},
End: Pos{Byte: 5, Line: 1, Column: 6},
},
{
Start: Pos{Byte: 7, Line: 2, Column: 1},
End: Pos{Byte: 12, Line: 2, Column: 6},
},
},
[][]byte{
[]byte("hello"),
[]byte("world"),
},
},
"blank line with UNIX newlines": {
"hello\n\nworld",
[]Range{
{
Start: Pos{Byte: 0, Line: 1, Column: 1},
End: Pos{Byte: 5, Line: 1, Column: 6},
},
{
Start: Pos{Byte: 6, Line: 2, Column: 1},
End: Pos{Byte: 6, Line: 2, Column: 1},
},
{
Start: Pos{Byte: 7, Line: 3, Column: 1},
End: Pos{Byte: 12, Line: 3, Column: 6},
},
},
[][]byte{
[]byte("hello"),
[]byte(""),
[]byte("world"),
},
},
"blank line with Windows newlines": {
"hello\r\n\r\nworld",
[]Range{
{
Start: Pos{Byte: 0, Line: 1, Column: 1},
End: Pos{Byte: 5, Line: 1, Column: 6},
},
{
Start: Pos{Byte: 7, Line: 2, Column: 1},
End: Pos{Byte: 7, Line: 2, Column: 1},
},
{
Start: Pos{Byte: 9, Line: 3, Column: 1},
End: Pos{Byte: 14, Line: 3, Column: 6},
},
},
[][]byte{
[]byte("hello"),
[]byte(""),
[]byte("world"),
},
},
"two lines with combiner and UNIX newline": {
"foo \U0001f469\U0001f3ff bar\nbaz",
[]Range{
{
Start: Pos{Byte: 0, Line: 1, Column: 1},
End: Pos{Byte: 16, Line: 1, Column: 10},
},
{
Start: Pos{Byte: 17, Line: 2, Column: 1},
End: Pos{Byte: 20, Line: 2, Column: 4},
},
},
[][]byte{
[]byte("foo \U0001f469\U0001f3ff bar"),
[]byte("baz"),
},
},
"two lines with combiner and Windows newline": {
"foo \U0001f469\U0001f3ff bar\r\nbaz",
[]Range{
{
Start: Pos{Byte: 0, Line: 1, Column: 1},
End: Pos{Byte: 16, Line: 1, Column: 10},
},
{
Start: Pos{Byte: 18, Line: 2, Column: 1},
End: Pos{Byte: 21, Line: 2, Column: 4},
},
},
[][]byte{
[]byte("foo \U0001f469\U0001f3ff bar"),
[]byte("baz"),
},
},
}
for name, test := range tests {
t.Run(name, func(t *testing.T) {
src := []byte(test.Input)
sc := NewRangeScanner(src, "", bufio.ScanLines)
got := make([]Range, 0)
gotToks := make([][]byte, 0)
for sc.Scan() {
got = append(got, sc.Range())
gotToks = append(gotToks, sc.Bytes())
}
if sc.Err() != nil {
t.Fatalf("unexpected error: %s", sc.Err())
}
if !reflect.DeepEqual(got, test.Want) {
t.Errorf("incorrect ranges\ngot: %swant: %s", spew.Sdump(got), spew.Sdump(test.Want))
}
if !reflect.DeepEqual(gotToks, test.WantToks) {
t.Errorf("incorrect tokens\ngot: %swant: %s", spew.Sdump(gotToks), spew.Sdump(test.WantToks))
}
})
}
}