zclsyntax: re-organize and simplify the scanner

This commit is contained in:
Martin Atkins 2017-05-28 08:38:13 -07:00
parent b8db08bf04
commit 187d7b8045
5 changed files with 255 additions and 106 deletions

View File

@ -11,73 +11,73 @@ import (
var _zcltok_actions []byte = []byte{
0, 1, 0, 1, 1, 1, 2, 1, 3,
1, 4, 1, 5, 1, 6, 1, 7,
1, 8,
}
var _zcltok_key_offsets []byte = []byte{
0, 0, 2, 4, 5, 15, 17, 19,
0, 2, 4, 16, 17, 18, 20, 22,
}
var _zcltok_trans_keys []byte = []byte{
128, 191, 128, 191, 32, 128, 191, 192,
223, 224, 239, 240, 247, 248, 255, 128,
191, 128, 191, 128, 191,
128, 191, 128, 191, 9, 32, 128, 191,
192, 223, 224, 239, 240, 247, 248, 255,
9, 32, 128, 191, 128, 191, 128, 191,
}
var _zcltok_single_lengths []byte = []byte{
0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 2, 1, 1, 0, 0, 0,
}
var _zcltok_range_lengths []byte = []byte{
0, 1, 1, 0, 5, 1, 1, 1,
1, 1, 5, 0, 0, 1, 1, 1,
}
var _zcltok_index_offsets []byte = []byte{
0, 0, 2, 4, 6, 12, 14, 16,
0, 2, 4, 12, 14, 16, 18, 20,
}
var _zcltok_trans_targs []byte = []byte{
4, 4, 1, 4, 3, 0, 4, 5,
6, 7, 4, 4, 4, 4, 1, 4,
2, 4, 4, 4, 4, 4, 4,
2, 2, 0, 2, 3, 4, 2, 5,
6, 7, 2, 2, 3, 2, 4, 2,
2, 2, 0, 2, 1, 2, 2, 2,
2, 2, 2, 2, 2,
}
var _zcltok_trans_actions []byte = []byte{
9, 15, 0, 15, 1, 0, 11, 0,
7, 7, 11, 9, 9, 13, 0, 13,
0, 13, 15, 15, 13, 13, 13,
7, 17, 0, 17, 0, 0, 9, 0,
5, 5, 9, 7, 0, 13, 0, 11,
7, 15, 0, 15, 0, 15, 17, 17,
13, 11, 15, 15, 15,
}
var _zcltok_to_state_actions []byte = []byte{
0, 0, 0, 3, 3, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0,
}
var _zcltok_from_state_actions []byte = []byte{
0, 0, 0, 0, 5, 0, 0, 0,
0, 0, 3, 0, 0, 0, 0, 0,
}
var _zcltok_eof_trans []byte = []byte{
0, 20, 20, 0, 0, 23, 23, 23,
24, 24, 0, 25, 26, 29, 29, 29,
}
const zcltok_start int = 3
const zcltok_first_final int = 3
const zcltok_error int = 0
const zcltok_start int = 2
const zcltok_first_final int = 2
const zcltok_error int = -1
const zcltok_en_token int = 4
const zcltok_en_main int = 3
const zcltok_en_main int = 2
// line 13 "scan_tokens.rl"
func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
offset := 0
f := &tokenAccum{
Filename: filename,
Bytes: data,
Start: start,
Pos: start,
}
// line 69 "scan_tokens.rl"
// line 47 "scan_tokens.rl"
// Ragel state
cs := 0 // Current State
@ -94,7 +94,11 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
_ = act
_ = eof
// line 104 "scan_tokens.go"
token := func(ty TokenType) {
f.emitToken(ty, ts, te)
}
// line 109 "scan_tokens.go"
{
cs = zcltok_start
ts = 0
@ -102,7 +106,7 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
act = 0
}
// line 112 "scan_tokens.go"
// line 117 "scan_tokens.go"
{
var _klen int
var _trans int
@ -112,9 +116,6 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
if p == pe {
goto _test_eof
}
if cs == 0 {
goto _out
}
_resume:
_acts = int(_zcltok_from_state_actions[cs])
_nacts = uint(_zcltok_actions[_acts])
@ -122,12 +123,12 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
for ; _nacts > 0; _nacts-- {
_acts++
switch _zcltok_actions[_acts-1] {
case 2:
case 1:
// line 1 "NONE"
ts = p
// line 136 "scan_tokens.go"
// line 138 "scan_tokens.go"
}
}
@ -197,48 +198,55 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
for ; _nacts > 0; _nacts-- {
_acts++
switch _zcltok_actions[_acts-1] {
case 0:
// line 25 "scan_tokens.rl"
offset = p
cs = 4
goto _again
case 3:
case 2:
// line 1 "NONE"
te = p + 1
case 4:
// line 30 "scan_tokens.rl"
case 3:
// line 43 "scan_tokens.rl"
te = p + 1
{
f.emitToken(TokenInvalid, offset, p+1)
token(TokenInvalid)
}
case 4:
// line 44 "scan_tokens.rl"
te = p + 1
{
token(TokenBadUTF8)
}
case 5:
// line 34 "scan_tokens.rl"
// line 41 "scan_tokens.rl"
te = p
p--
te = p + 1
{
f.emitToken(TokenBadUTF8, offset, p+1)
}
case 6:
// line 34 "scan_tokens.rl"
// line 42 "scan_tokens.rl"
te = p
p--
{
f.emitToken(TokenBadUTF8, offset, p+1)
token(TokenTabs)
}
case 7:
// line 34 "scan_tokens.rl"
// line 44 "scan_tokens.rl"
te = p
p--
{
token(TokenBadUTF8)
}
case 8:
// line 44 "scan_tokens.rl"
p = (te) - 1
{
f.emitToken(TokenBadUTF8, offset, p+1)
token(TokenBadUTF8)
}
// line 248 "scan_tokens.go"
// line 245 "scan_tokens.go"
}
}
@ -249,18 +257,15 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
for ; _nacts > 0; _nacts-- {
_acts++
switch _zcltok_actions[_acts-1] {
case 1:
case 0:
// line 1 "NONE"
ts = 0
// line 263 "scan_tokens.go"
// line 260 "scan_tokens.go"
}
}
if cs == 0 {
goto _out
}
p++
if p != pe {
goto _resume
@ -275,12 +280,9 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
}
}
_out:
{
}
}
// line 89 "scan_tokens.rl"
// line 71 "scan_tokens.rl"
// If we fall out here without being in a final state then we've
// encountered something that the scanner can't match, which we'll
@ -289,5 +291,9 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
f.emitToken(TokenInvalid, p, len(data))
}
// We always emit a synthetic EOF token at the end, since it gives the
// parser position information for an "unexpected EOF" diagnostic.
f.emitToken(TokenEOF, len(data), len(data))
return f.Tokens
}

View File

@ -13,31 +13,13 @@ import (
}%%
func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
offset := 0
f := &tokenAccum{
Filename: filename,
Bytes: data,
Start: start,
Pos: start,
}
%%{
action start {
offset = p
fgoto token;
}
action EmitInvalid {
f.emitToken(TokenInvalid, offset, p+1)
}
action EmitBadUTF8 {
f.emitToken(TokenBadUTF8, offset, p+1)
}
action EmitEOF {
f.emitToken(TokenEOF, offset, offset)
}
UTF8Cont = 0x80 .. 0xBF;
AnyUTF8 = (
@ -46,26 +28,22 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
0xE0..0xEF . UTF8Cont . UTF8Cont |
0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
);
AnyUTF8Tok = AnyUTF8 >start;
BrokenUTF8 = any - AnyUTF8;
EmptyTok = "";
# Tabs are not valid, but we accept them in the scanner and mark them
# as tokens so that we can produce diagnostics advising the user to
# use spaces instead.
TabTok = 0x09 >start;
Tabs = 0x09+;
token := |*
AnyUTF8 => EmitInvalid;
BrokenUTF8 => EmitBadUTF8;
EmptyTok => EmitEOF;
Spaces = ' '+;
main := |*
Spaces => {};
Tabs => { token(TokenTabs) };
AnyUTF8 => { token(TokenInvalid) };
BrokenUTF8 => { token(TokenBadUTF8) };
*|;
Spaces = ' '*;
main := Spaces @start;
}%%
// Ragel state
@ -83,6 +61,10 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
_ = act
_ = eof
token := func (ty TokenType) {
f.emitToken(ty, ts, te)
}
%%{
write init;
write exec;
@ -95,5 +77,9 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
f.emitToken(TokenInvalid, p, len(data))
}
// We always emit a synthetic EOF token at the end, since it gives the
// parser position information for an "unexpected EOF" diagnostic.
f.emitToken(TokenEOF, len(data), len(data))
return f.Tokens
}

View File

@ -0,0 +1,155 @@
package zclsyntax
import (
"reflect"
"testing"
"github.com/kylelemons/godebug/pretty"
"github.com/zclconf/go-zcl/zcl"
)
// TestScanTokens runs a table of raw inputs through scanTokens, starting at
// byte 0 / line 1 / column 1, and requires the returned token slice to match
// the expected slice exactly.
//
// The comparison uses reflect.DeepEqual, which treats a nil slice and an
// empty non-nil slice as different; the expected EOF tokens therefore spell
// out Bytes as []byte{} rather than leaving it nil.
func TestScanTokens(t *testing.T) {
	tests := []struct {
		input string
		want  []Token
	}{
		// Empty input: only the EOF token is expected.
		{
			``,
			[]Token{
				{
					Type:  TokenEOF,
					Bytes: []byte{},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 0, Line: 1, Column: 1},
						End:   zcl.Pos{Byte: 0, Line: 1, Column: 1},
					},
				},
			},
		},
		// A lone space: no token is expected for the space itself, only an
		// EOF token positioned after it.
		{
			` `,
			[]Token{
				{
					Type:  TokenEOF,
					Bytes: []byte{},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 1, Line: 1, Column: 2},
						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
					},
				},
			},
		},
		// A valid character that this test expects to be reported as
		// TokenInvalid.
		{
			`|`,
			[]Token{
				{
					Type:  TokenInvalid,
					Bytes: []byte(`|`),
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 0, Line: 1, Column: 1},
						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
					},
				},
				{
					Type:  TokenEOF,
					Bytes: []byte{},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 1, Line: 1, Column: 2},
						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
					},
				},
			},
		},
		{
			"\x80", // UTF-8 continuation without an introducer
			[]Token{
				{
					Type:  TokenBadUTF8,
					Bytes: []byte{0x80},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 0, Line: 1, Column: 1},
						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
					},
				},
				{
					Type:  TokenEOF,
					Bytes: []byte{},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 1, Line: 1, Column: 2},
						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
					},
				},
			},
		},
		{
			" \x80\x80", // a space, then two UTF-8 continuations without an introducer
			[]Token{
				{
					Type:  TokenBadUTF8,
					Bytes: []byte{0x80},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 1, Line: 1, Column: 2},
						End:   zcl.Pos{Byte: 2, Line: 1, Column: 3},
					},
				},
				{
					Type:  TokenBadUTF8,
					Bytes: []byte{0x80},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 2, Line: 1, Column: 3},
						End:   zcl.Pos{Byte: 3, Line: 1, Column: 4},
					},
				},
				{
					Type:  TokenEOF,
					Bytes: []byte{},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 3, Line: 1, Column: 4},
						End:   zcl.Pos{Byte: 3, Line: 1, Column: 4},
					},
				},
			},
		},
		// Consecutive tabs are expected to come back as one TokenTabs token
		// covering the whole run.
		{
			"\t\t",
			[]Token{
				{
					Type:  TokenTabs,
					Bytes: []byte{0x09, 0x09},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 0, Line: 1, Column: 1},
						End:   zcl.Pos{Byte: 2, Line: 1, Column: 3},
					},
				},
				{
					Type:  TokenEOF,
					Bytes: []byte{},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 2, Line: 1, Column: 3},
						End:   zcl.Pos{Byte: 2, Line: 1, Column: 3},
					},
				},
			},
		},
	}

	prettyConfig := &pretty.Config{
		Diffable:          true,
		IncludeUnexported: true,
		PrintStringers:    true,
	}

	for _, test := range tests {
		t.Run(test.input, func(t *testing.T) {
			got := scanTokens([]byte(test.input), "", zcl.Pos{Byte: 0, Line: 1, Column: 1})

			if !reflect.DeepEqual(got, test.want) {
				diff := prettyConfig.Compare(test.want, got)
				// The inputs are whitespace, empty, or invalid UTF-8, so
				// print them quoted (%q) to keep the failure readable.
				t.Errorf(
					"wrong result\ninput: %q\ndiff: %s",
					test.input, diff,
				)
			}
		})
	}
}

View File

@ -79,7 +79,7 @@ const (
TokenStarStar TokenType = '➚'
TokenBacktick TokenType = '`'
TokenSemicolon TokenType = ';'
TokenTab TokenType = '␉'
TokenTabs TokenType = '␉'
TokenInvalid TokenType = '<27>'
TokenBadUTF8 TokenType = '💩'
)
@ -91,21 +91,21 @@ func (t TokenType) GoString() string {
type tokenAccum struct {
Filename string
Bytes []byte
Start zcl.Pos
Pos zcl.Pos
Tokens []Token
}
func (f *tokenAccum) emitToken(ty TokenType, startOfs int, endOfs int) {
func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) {
// Walk through our buffer to figure out how much we need to adjust
// the start pos to get our end pos.
start := f.Start
start.Byte += startOfs
start.Column += startOfs // Safe because only ASCII spaces can be in the offset
start := f.Pos
start.Column += startOfs - f.Pos.Byte // Safe because only ASCII spaces can be in the offset
start.Byte = startOfs
end := start
end.Byte = f.Start.Byte + endOfs
b := f.Bytes
end.Byte = endOfs
b := f.Bytes[startOfs:endOfs]
for len(b) > 0 {
advance, seq, _ := textseg.ScanGraphemeClusters(b, true)
if len(seq) == 1 && seq[0] == '\n' {
@ -117,6 +117,8 @@ func (f *tokenAccum) emitToken(ty TokenType, startOfs int, endOfs int) {
b = b[advance:]
}
f.Pos = end
f.Tokens = append(f.Tokens, Token{
Type: ty,
Bytes: f.Bytes[startOfs:endOfs],

View File

@ -4,7 +4,7 @@ package zclsyntax
import "fmt"
const _TokenType_name = "TokenNewlineTokenBangTokenBitwiseAndTokenOParenTokenCParenTokenStarTokenPlusTokenMinusTokenDotTokenSlashTokenColonTokenSemicolonTokenLessThanTokenEqualTokenGreaterThanTokenQuestionTokenHeredocTokenIdentTokenNumberLitTokenStringLitTokenOBrackTokenCBrackTokenBitwiseXorTokenBacktickTokenOBraceTokenBitwiseOrTokenCBraceTokenBitwiseNotTokenOQuoteTokenCQuoteTokenTemplateControlTokenTemplateSeqEndTokenAndTokenOrTokenTemplateInterpTokenNotEqualTokenLessThanEqTokenGreaterThanEqTokenEOFTokenTabTokenStarStarTokenInvalidTokenBadUTF8"
const _TokenType_name = "TokenNewlineTokenBangTokenBitwiseAndTokenOParenTokenCParenTokenStarTokenPlusTokenMinusTokenDotTokenSlashTokenColonTokenSemicolonTokenLessThanTokenEqualTokenGreaterThanTokenQuestionTokenHeredocTokenIdentTokenNumberLitTokenStringLitTokenOBrackTokenCBrackTokenBitwiseXorTokenBacktickTokenOBraceTokenBitwiseOrTokenCBraceTokenBitwiseNotTokenOQuoteTokenCQuoteTokenTemplateControlTokenTemplateSeqEndTokenAndTokenOrTokenTemplateInterpTokenNotEqualTokenLessThanEqTokenGreaterThanEqTokenEOFTokenTabsTokenStarStarTokenInvalidTokenBadUTF8"
var _TokenType_map = map[TokenType]string{
10: _TokenType_name[0:12],
@ -46,10 +46,10 @@ var _TokenType_map = map[TokenType]string{
8804: _TokenType_name[439:454],
8805: _TokenType_name[454:472],
9220: _TokenType_name[472:480],
9225: _TokenType_name[480:488],
10138: _TokenType_name[488:501],
65533: _TokenType_name[501:513],
128169: _TokenType_name[513:525],
9225: _TokenType_name[480:489],
10138: _TokenType_name[489:502],
65533: _TokenType_name[502:514],
128169: _TokenType_name[514:526],
}
func (i TokenType) String() string {