json: count whole grapheme clusters as columns for position purposes

This means a column number shown in the console points at the intended
character, rather than being thrown off by multi-byte UTF-8 sequences
and Unicode combining characters.
This commit is contained in:
Martin Atkins 2017-05-27 17:10:51 -07:00
parent 2cb0bd7e83
commit 42b18cc7c6
2 changed files with 79 additions and 7 deletions

View File

@ -3,6 +3,7 @@ package json
import ( import (
"fmt" "fmt"
"github.com/apparentlymart/go-textseg/textseg"
"github.com/apparentlymart/go-zcl/zcl" "github.com/apparentlymart/go-zcl/zcl"
) )
@ -213,15 +214,14 @@ Byte:
case b < 32: case b < 32:
break Byte break Byte
default: default:
// TODO: Use Unicode Text Segmentation spec to advance // Advance by one grapheme cluster, so that we consider each
// Column only once per grapheme cluster, rather than once per // grapheme to be a "column".
// byte. // Ignoring error because this scanner cannot produce errors.
// Consume one or more UTF-8 codepoints that together form advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)
// a single grapheme cluster.
p.Pos.Byte++ p.Pos.Byte += advance
p.Pos.Column++ p.Pos.Column++
i++ i += advance
escaping = false escaping = false
} }

View File

@ -738,6 +738,78 @@ func TestScan(t *testing.T) {
}, },
}, },
}, },
{
`"🇬🇧"`,
[]token{
{
Type: tokenString,
Bytes: []byte(`"🇬🇧"`),
Range: zcl.Range{
Start: zcl.Pos{
Byte: 0,
Line: 1,
Column: 1,
},
End: zcl.Pos{
Byte: 10,
Line: 1,
Column: 4,
},
},
},
{
Type: tokenEOF,
Range: zcl.Range{
Start: zcl.Pos{
Byte: 10,
Line: 1,
Column: 4,
},
End: zcl.Pos{
Byte: 10,
Line: 1,
Column: 4,
},
},
},
},
},
{
`"á́́́́́́́"`,
[]token{
{
Type: tokenString,
Bytes: []byte(`"á́́́́́́́"`),
Range: zcl.Range{
Start: zcl.Pos{
Byte: 0,
Line: 1,
Column: 1,
},
End: zcl.Pos{
Byte: 19,
Line: 1,
Column: 4,
},
},
},
{
Type: tokenEOF,
Range: zcl.Range{
Start: zcl.Pos{
Byte: 19,
Line: 1,
Column: 4,
},
End: zcl.Pos{
Byte: 19,
Line: 1,
Column: 4,
},
},
},
},
},
} }
for _, test := range tests { for _, test := range tests {