json: count whole grapheme clusters as columns for position purposes

This means a reported column can be used to point at the right place in console output, without the marker being misaligned by multi-byte UTF-8 sequences or Unicode combining characters.
2017-05-27 17:10:51 -07:00 · 2017-05-27 17:10:51 -07:00 · 42b18cc7c6
commit 42b18cc7c6
parent 2cb0bd7e83
2 changed files with 79 additions and 7 deletions
--- a/zcl/json/scanner.go
+++ b/zcl/json/scanner.go
@@ -3,6 +3,7 @@ package json
 import (
 	"fmt"

+	"github.com/apparentlymart/go-textseg/textseg"
 	"github.com/apparentlymart/go-zcl/zcl"
 )

@@ -213,15 +214,14 @@ Byte:
 		case b < 32:
 			break Byte
 		default:
-			// TODO: Use Unicode Text Segmentation spec to advance
-			// Column only once per grapheme cluster, rather than once per
-			// byte.
-			// Consume one or more UTF-8 codepoints that together form
-			// a single grapheme cluster.
+			// Advance by one grapheme cluster, so that we consider each
+			// grapheme to be a "column".
+			// Ignoring error because this scanner cannot produce errors.
+			advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)

-			p.Pos.Byte++
+			p.Pos.Byte += advance
 			p.Pos.Column++
-			i++
+			i += advance

 			escaping = false
 		}
--- a/zcl/json/scanner_test.go
+++ b/zcl/json/scanner_test.go
@@ -738,6 +738,78 @@ func TestScan(t *testing.T) {
 				},
 			},
 		},
+		{
+			`"🇬🇧"`,
+			[]token{
+				{
+					Type:  tokenString,
+					Bytes: []byte(`"🇬🇧"`),
+					Range: zcl.Range{
+						Start: zcl.Pos{
+							Byte:   0,
+							Line:   1,
+							Column: 1,
+						},
+						End: zcl.Pos{
+							Byte:   10,
+							Line:   1,
+							Column: 4,
+						},
+					},
+				},
+				{
+					Type: tokenEOF,
+					Range: zcl.Range{
+						Start: zcl.Pos{
+							Byte:   10,
+							Line:   1,
+							Column: 4,
+						},
+						End: zcl.Pos{
+							Byte:   10,
+							Line:   1,
+							Column: 4,
+						},
+					},
+				},
+			},
+		},
+		{
+			`"á́́́́́́́"`,
+			[]token{
+				{
+					Type:  tokenString,
+					Bytes: []byte(`"á́́́́́́́"`),
+					Range: zcl.Range{
+						Start: zcl.Pos{
+							Byte:   0,
+							Line:   1,
+							Column: 1,
+						},
+						End: zcl.Pos{
+							Byte:   19,
+							Line:   1,
+							Column: 4,
+						},
+					},
+				},
+				{
+					Type: tokenEOF,
+					Range: zcl.Range{
+						Start: zcl.Pos{
+							Byte:   19,
+							Line:   1,
+							Column: 4,
+						},
+						End: zcl.Pos{
+							Byte:   19,
+							Line:   1,
+							Column: 4,
+						},
+					},
+				},
+			},
+		},
 	}

 	for _, test := range tests {