json: count whole grapheme clusters as columns for position purposes

This means we can point at a column in the console without the marker
being misaligned by multi-byte UTF-8 sequences or Unicode combining
characters.
This commit is contained in:
Martin Atkins 2017-05-27 17:10:51 -07:00
parent 2cb0bd7e83
commit 42b18cc7c6
2 changed files with 79 additions and 7 deletions

View File

@ -3,6 +3,7 @@ package json
import (
"fmt"
"github.com/apparentlymart/go-textseg/textseg"
"github.com/apparentlymart/go-zcl/zcl"
)
@ -213,15 +214,14 @@ Byte:
case b < 32:
break Byte
default:
// TODO: Use Unicode Text Segmentation spec to advance
// Column only once per grapheme cluster, rather than once per
// byte.
// Consume one or more UTF-8 codepoints that together form
// a single grapheme cluster.
// Advance by one grapheme cluster, so that we consider each
// grapheme to be a "column".
// Ignoring error because this scanner cannot produce errors.
advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)
p.Pos.Byte++
p.Pos.Byte += advance
p.Pos.Column++
i++
i += advance
escaping = false
}

View File

@ -738,6 +738,78 @@ func TestScan(t *testing.T) {
},
},
},
{
`"🇬🇧"`,
[]token{
{
Type: tokenString,
Bytes: []byte(`"🇬🇧"`),
Range: zcl.Range{
Start: zcl.Pos{
Byte: 0,
Line: 1,
Column: 1,
},
End: zcl.Pos{
Byte: 10,
Line: 1,
Column: 4,
},
},
},
{
Type: tokenEOF,
Range: zcl.Range{
Start: zcl.Pos{
Byte: 10,
Line: 1,
Column: 4,
},
End: zcl.Pos{
Byte: 10,
Line: 1,
Column: 4,
},
},
},
},
},
{
`"á́́́́́́́"`,
[]token{
{
Type: tokenString,
Bytes: []byte(`"á́́́́́́́"`),
Range: zcl.Range{
Start: zcl.Pos{
Byte: 0,
Line: 1,
Column: 1,
},
End: zcl.Pos{
Byte: 19,
Line: 1,
Column: 4,
},
},
},
{
Type: tokenEOF,
Range: zcl.Range{
Start: zcl.Pos{
Byte: 19,
Line: 1,
Column: 4,
},
End: zcl.Pos{
Byte: 19,
Line: 1,
Column: 4,
},
},
},
},
},
}
for _, test := range tests {