json: count whole grapheme clusters as columns for position purposes

This means we can actually point at a column in the console without it getting misaligned by multi-byte UTF-8 sequences and Unicode combining characters.
2017-05-27 17:10:51 -07:00 · 2017-05-27 17:10:51 -07:00 · 42b18cc7c6
commit 42b18cc7c6
parent 2cb0bd7e83
2 changed files with 79 additions and 7 deletions
--- a/zcl/json/scanner.go
+++ b/zcl/json/scanner.go
@@ -3,6 +3,7 @@ package json
 import (
 	"fmt"
+	"github.com/apparentlymart/go-textseg/textseg"
 	"github.com/apparentlymart/go-zcl/zcl"
 )
@@ -213,15 +214,14 @@ Byte:
 		case b < 32:
 			break Byte
 		default:
-			// TODO: Use Unicode Text Segmentation spec to advance
+			// Advance by one grapheme cluster, so that we consider each
-			// Column only once per grapheme cluster, rather than once per
+			// grapheme to be a "column".
-			// byte.
+			// Ignoring error because this scanner cannot produce errors.
-			// Consume one or more UTF-8 codepoints that together form
+			advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)
-			// a single grapheme cluster.
-			p.Pos.Byte++
+			p.Pos.Byte += advance
 			p.Pos.Column++
-			i++
+			i += advance
 			escaping = false
 		}
--- a/zcl/json/scanner_test.go
+++ b/zcl/json/scanner_test.go
@@ -738,6 +738,78 @@ func TestScan(t *testing.T) {
 				},
 			},
 		},
 		{
 			`"🇬🇧"`,
 			[]token{
 				{
 					Type:  tokenString,
 					Bytes: []byte(`"🇬🇧"`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   10,
 							Line:   1,
 							Column: 4,
 						},
 					},
 				},
 				{
 					Type: tokenEOF,
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   10,
 							Line:   1,
 							Column: 4,
 						},
 						End: zcl.Pos{
 							Byte:   10,
 							Line:   1,
 							Column: 4,
 						},
 					},
 				},
 			},
 		},
 		{
 			`"á́́́́́́́"`,
 			[]token{
 				{
 					Type:  tokenString,
 					Bytes: []byte(`"á́́́́́́́"`),
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   0,
 							Line:   1,
 							Column: 1,
 						},
 						End: zcl.Pos{
 							Byte:   19,
 							Line:   1,
 							Column: 4,
 						},
 					},
 				},
 				{
 					Type: tokenEOF,
 					Range: zcl.Range{
 						Start: zcl.Pos{
 							Byte:   19,
 							Line:   1,
 							Column: 4,
 						},
 						End: zcl.Pos{
 							Byte:   19,
 							Line:   1,
 							Column: 4,
 						},
 					},
 				},
 			},
 		},
 	}
 	for _, test := range tests {