zclsyntax: re-organize and simplify the scanner

This commit is contained in:
Martin Atkins 2017-05-28 08:38:13 -07:00
parent b8db08bf04
commit 187d7b8045
5 changed files with 255 additions and 106 deletions

View File

@ -11,73 +11,73 @@ import (
var _zcltok_actions []byte = []byte{ var _zcltok_actions []byte = []byte{
0, 1, 0, 1, 1, 1, 2, 1, 3, 0, 1, 0, 1, 1, 1, 2, 1, 3,
1, 4, 1, 5, 1, 6, 1, 7, 1, 4, 1, 5, 1, 6, 1, 7,
1, 8,
} }
var _zcltok_key_offsets []byte = []byte{ var _zcltok_key_offsets []byte = []byte{
0, 0, 2, 4, 5, 15, 17, 19, 0, 2, 4, 16, 17, 18, 20, 22,
} }
var _zcltok_trans_keys []byte = []byte{ var _zcltok_trans_keys []byte = []byte{
128, 191, 128, 191, 32, 128, 191, 192, 128, 191, 128, 191, 9, 32, 128, 191,
223, 224, 239, 240, 247, 248, 255, 128, 192, 223, 224, 239, 240, 247, 248, 255,
191, 128, 191, 128, 191, 9, 32, 128, 191, 128, 191, 128, 191,
} }
var _zcltok_single_lengths []byte = []byte{ var _zcltok_single_lengths []byte = []byte{
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0,
} }
var _zcltok_range_lengths []byte = []byte{ var _zcltok_range_lengths []byte = []byte{
0, 1, 1, 0, 5, 1, 1, 1, 1, 1, 5, 0, 0, 1, 1, 1,
} }
var _zcltok_index_offsets []byte = []byte{ var _zcltok_index_offsets []byte = []byte{
0, 0, 2, 4, 6, 12, 14, 16, 0, 2, 4, 12, 14, 16, 18, 20,
} }
var _zcltok_trans_targs []byte = []byte{ var _zcltok_trans_targs []byte = []byte{
4, 4, 1, 4, 3, 0, 4, 5, 2, 2, 0, 2, 3, 4, 2, 5,
6, 7, 4, 4, 4, 4, 1, 4, 6, 7, 2, 2, 3, 2, 4, 2,
2, 4, 4, 4, 4, 4, 4, 2, 2, 0, 2, 1, 2, 2, 2,
2, 2, 2, 2, 2,
} }
var _zcltok_trans_actions []byte = []byte{ var _zcltok_trans_actions []byte = []byte{
9, 15, 0, 15, 1, 0, 11, 0, 7, 17, 0, 17, 0, 0, 9, 0,
7, 7, 11, 9, 9, 13, 0, 13, 5, 5, 9, 7, 0, 13, 0, 11,
0, 13, 15, 15, 13, 13, 13, 7, 15, 0, 15, 0, 15, 17, 17,
13, 11, 15, 15, 15,
} }
var _zcltok_to_state_actions []byte = []byte{ var _zcltok_to_state_actions []byte = []byte{
0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
} }
var _zcltok_from_state_actions []byte = []byte{ var _zcltok_from_state_actions []byte = []byte{
0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0,
} }
var _zcltok_eof_trans []byte = []byte{ var _zcltok_eof_trans []byte = []byte{
0, 20, 20, 0, 0, 23, 23, 23, 24, 24, 0, 25, 26, 29, 29, 29,
} }
const zcltok_start int = 3 const zcltok_start int = 2
const zcltok_first_final int = 3 const zcltok_first_final int = 2
const zcltok_error int = 0 const zcltok_error int = -1
const zcltok_en_token int = 4 const zcltok_en_main int = 2
const zcltok_en_main int = 3
// line 13 "scan_tokens.rl" // line 13 "scan_tokens.rl"
func scanTokens(data []byte, filename string, start zcl.Pos) []Token { func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
offset := 0
f := &tokenAccum{ f := &tokenAccum{
Filename: filename, Filename: filename,
Bytes: data, Bytes: data,
Start: start, Pos: start,
} }
// line 69 "scan_tokens.rl" // line 47 "scan_tokens.rl"
// Ragel state // Ragel state
cs := 0 // Current State cs := 0 // Current State
@ -94,7 +94,11 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
_ = act _ = act
_ = eof _ = eof
// line 104 "scan_tokens.go" token := func(ty TokenType) {
f.emitToken(ty, ts, te)
}
// line 109 "scan_tokens.go"
{ {
cs = zcltok_start cs = zcltok_start
ts = 0 ts = 0
@ -102,7 +106,7 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
act = 0 act = 0
} }
// line 112 "scan_tokens.go" // line 117 "scan_tokens.go"
{ {
var _klen int var _klen int
var _trans int var _trans int
@ -112,9 +116,6 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
if p == pe { if p == pe {
goto _test_eof goto _test_eof
} }
if cs == 0 {
goto _out
}
_resume: _resume:
_acts = int(_zcltok_from_state_actions[cs]) _acts = int(_zcltok_from_state_actions[cs])
_nacts = uint(_zcltok_actions[_acts]) _nacts = uint(_zcltok_actions[_acts])
@ -122,12 +123,12 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
for ; _nacts > 0; _nacts-- { for ; _nacts > 0; _nacts-- {
_acts++ _acts++
switch _zcltok_actions[_acts-1] { switch _zcltok_actions[_acts-1] {
case 2: case 1:
// line 1 "NONE" // line 1 "NONE"
ts = p ts = p
// line 136 "scan_tokens.go" // line 138 "scan_tokens.go"
} }
} }
@ -197,48 +198,55 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
for ; _nacts > 0; _nacts-- { for ; _nacts > 0; _nacts-- {
_acts++ _acts++
switch _zcltok_actions[_acts-1] { switch _zcltok_actions[_acts-1] {
case 0: case 2:
// line 25 "scan_tokens.rl"
offset = p
cs = 4
goto _again
case 3:
// line 1 "NONE" // line 1 "NONE"
te = p + 1 te = p + 1
case 4: case 3:
// line 30 "scan_tokens.rl" // line 43 "scan_tokens.rl"
te = p + 1 te = p + 1
{ {
f.emitToken(TokenInvalid, offset, p+1) token(TokenInvalid)
}
case 4:
// line 44 "scan_tokens.rl"
te = p + 1
{
token(TokenBadUTF8)
} }
case 5: case 5:
// line 34 "scan_tokens.rl" // line 41 "scan_tokens.rl"
te = p
p--
te = p + 1
{
f.emitToken(TokenBadUTF8, offset, p+1)
}
case 6: case 6:
// line 34 "scan_tokens.rl" // line 42 "scan_tokens.rl"
te = p te = p
p-- p--
{ {
f.emitToken(TokenBadUTF8, offset, p+1) token(TokenTabs)
} }
case 7: case 7:
// line 34 "scan_tokens.rl" // line 44 "scan_tokens.rl"
te = p
p--
{
token(TokenBadUTF8)
}
case 8:
// line 44 "scan_tokens.rl"
p = (te) - 1 p = (te) - 1
{ {
f.emitToken(TokenBadUTF8, offset, p+1) token(TokenBadUTF8)
} }
// line 248 "scan_tokens.go" // line 245 "scan_tokens.go"
} }
} }
@ -249,18 +257,15 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
for ; _nacts > 0; _nacts-- { for ; _nacts > 0; _nacts-- {
_acts++ _acts++
switch _zcltok_actions[_acts-1] { switch _zcltok_actions[_acts-1] {
case 1: case 0:
// line 1 "NONE" // line 1 "NONE"
ts = 0 ts = 0
// line 263 "scan_tokens.go" // line 260 "scan_tokens.go"
} }
} }
if cs == 0 {
goto _out
}
p++ p++
if p != pe { if p != pe {
goto _resume goto _resume
@ -275,12 +280,9 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
} }
} }
_out:
{
}
} }
// line 89 "scan_tokens.rl" // line 71 "scan_tokens.rl"
// If we fall out here without being in a final state then we've // If we fall out here without being in a final state then we've
// encountered something that the scanner can't match, which we'll // encountered something that the scanner can't match, which we'll
@ -289,5 +291,9 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
f.emitToken(TokenInvalid, p, len(data)) f.emitToken(TokenInvalid, p, len(data))
} }
// We always emit a synthetic EOF token at the end, since it gives the
// parser position information for an "unexpected EOF" diagnostic.
f.emitToken(TokenEOF, len(data), len(data))
return f.Tokens return f.Tokens
} }

View File

@ -13,31 +13,13 @@ import (
}%% }%%
func scanTokens(data []byte, filename string, start zcl.Pos) []Token { func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
offset := 0
f := &tokenAccum{ f := &tokenAccum{
Filename: filename, Filename: filename,
Bytes: data, Bytes: data,
Start: start, Pos: start,
} }
%%{ %%{
action start {
offset = p
fgoto token;
}
action EmitInvalid {
f.emitToken(TokenInvalid, offset, p+1)
}
action EmitBadUTF8 {
f.emitToken(TokenBadUTF8, offset, p+1)
}
action EmitEOF {
f.emitToken(TokenEOF, offset, offset)
}
UTF8Cont = 0x80 .. 0xBF; UTF8Cont = 0x80 .. 0xBF;
AnyUTF8 = ( AnyUTF8 = (
@ -46,26 +28,22 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
0xE0..0xEF . UTF8Cont . UTF8Cont | 0xE0..0xEF . UTF8Cont . UTF8Cont |
0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont 0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
); );
AnyUTF8Tok = AnyUTF8 >start;
BrokenUTF8 = any - AnyUTF8; BrokenUTF8 = any - AnyUTF8;
EmptyTok = "";
# Tabs are not valid, but we accept them in the scanner and mark them # Tabs are not valid, but we accept them in the scanner and mark them
# as tokens so that we can produce diagnostics advising the user to # as tokens so that we can produce diagnostics advising the user to
# use spaces instead. # use spaces instead.
TabTok = 0x09 >start; Tabs = 0x09+;
token := |* Spaces = ' '+;
AnyUTF8 => EmitInvalid;
BrokenUTF8 => EmitBadUTF8; main := |*
EmptyTok => EmitEOF; Spaces => {};
Tabs => { token(TokenTabs) };
AnyUTF8 => { token(TokenInvalid) };
BrokenUTF8 => { token(TokenBadUTF8) };
*|; *|;
Spaces = ' '*;
main := Spaces @start;
}%% }%%
// Ragel state // Ragel state
@ -83,6 +61,10 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
_ = act _ = act
_ = eof _ = eof
token := func (ty TokenType) {
f.emitToken(ty, ts, te)
}
%%{ %%{
write init; write init;
write exec; write exec;
@ -95,5 +77,9 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
f.emitToken(TokenInvalid, p, len(data)) f.emitToken(TokenInvalid, p, len(data))
} }
// We always emit a synthetic EOF token at the end, since it gives the
// parser position information for an "unexpected EOF" diagnostic.
f.emitToken(TokenEOF, len(data), len(data))
return f.Tokens return f.Tokens
} }

View File

@ -0,0 +1,155 @@
package zclsyntax
import (
"reflect"
"testing"
"github.com/kylelemons/godebug/pretty"
"github.com/zclconf/go-zcl/zcl"
)
// TestScanTokens runs scanTokens over a table of small inputs and checks that
// the resulting token sequence (types, raw bytes and source ranges) matches
// the expectation exactly. Every expected sequence in the table ends with the
// synthetic TokenEOF token.
func TestScanTokens(t *testing.T) {
	tests := []struct {
		input string
		want  []Token
	}{
		{
			``, // empty input: only the EOF token is expected
			[]Token{
				{
					Type:  TokenEOF,
					Bytes: []byte{},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 0, Line: 1, Column: 1},
						End:   zcl.Pos{Byte: 0, Line: 1, Column: 1},
					},
				},
			},
		},
		{
			` `, // a lone space: no token of its own, EOF positioned after it
			[]Token{
				{
					Type:  TokenEOF,
					Bytes: []byte{},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 1, Line: 1, Column: 2},
						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
					},
				},
			},
		},
		{
			`|`, // valid UTF-8 that is expected to scan as TokenInvalid
			[]Token{
				{
					Type:  TokenInvalid,
					Bytes: []byte(`|`),
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 0, Line: 1, Column: 1},
						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
					},
				},
				{
					Type:  TokenEOF,
					Bytes: []byte{},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 1, Line: 1, Column: 2},
						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
					},
				},
			},
		},
		{
			"\x80", // UTF-8 continuation without an introducer
			[]Token{
				{
					Type:  TokenBadUTF8,
					Bytes: []byte{0x80},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 0, Line: 1, Column: 1},
						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
					},
				},
				{
					Type:  TokenEOF,
					Bytes: []byte{},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 1, Line: 1, Column: 2},
						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
					},
				},
			},
		},
		{
			" \x80\x80", // a leading space, then two stray UTF-8 continuation bytes
			[]Token{
				{
					Type:  TokenBadUTF8,
					Bytes: []byte{0x80},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 1, Line: 1, Column: 2},
						End:   zcl.Pos{Byte: 2, Line: 1, Column: 3},
					},
				},
				{
					Type:  TokenBadUTF8,
					Bytes: []byte{0x80},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 2, Line: 1, Column: 3},
						End:   zcl.Pos{Byte: 3, Line: 1, Column: 4},
					},
				},
				{
					Type:  TokenEOF,
					Bytes: []byte{},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 3, Line: 1, Column: 4},
						End:   zcl.Pos{Byte: 3, Line: 1, Column: 4},
					},
				},
			},
		},
		{
			"\t\t", // consecutive tabs are expected to form a single TokenTabs
			[]Token{
				{
					Type:  TokenTabs,
					Bytes: []byte{0x09, 0x09},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 0, Line: 1, Column: 1},
						End:   zcl.Pos{Byte: 2, Line: 1, Column: 3},
					},
				},
				{
					Type:  TokenEOF,
					Bytes: []byte{},
					Range: zcl.Range{
						Start: zcl.Pos{Byte: 2, Line: 1, Column: 3},
						End:   zcl.Pos{Byte: 2, Line: 1, Column: 3},
					},
				},
			},
		},
	}

	prettyConfig := &pretty.Config{
		Diffable:          true,
		IncludeUnexported: true,
		PrintStringers:    true,
	}

	for _, test := range tests {
		t.Run(test.input, func(t *testing.T) {
			got := scanTokens([]byte(test.input), "", zcl.Pos{Byte: 0, Line: 1, Column: 1})

			if !reflect.DeepEqual(got, test.want) {
				diff := prettyConfig.Compare(test.want, got)
				// %q rather than %s: the inputs include the empty string,
				// whitespace and invalid UTF-8, none of which are legible
				// when printed raw.
				t.Errorf(
					"wrong result\ninput: %q\ndiff: %s",
					test.input, diff,
				)
			}
		})
	}
}

View File

@ -79,7 +79,7 @@ const (
TokenStarStar TokenType = '➚' TokenStarStar TokenType = '➚'
TokenBacktick TokenType = '`' TokenBacktick TokenType = '`'
TokenSemicolon TokenType = ';' TokenSemicolon TokenType = ';'
TokenTab TokenType = '␉' TokenTabs TokenType = '␉'
TokenInvalid TokenType = '<27>' TokenInvalid TokenType = '<27>'
TokenBadUTF8 TokenType = '💩' TokenBadUTF8 TokenType = '💩'
) )
@ -91,21 +91,21 @@ func (t TokenType) GoString() string {
type tokenAccum struct { type tokenAccum struct {
Filename string Filename string
Bytes []byte Bytes []byte
Start zcl.Pos Pos zcl.Pos
Tokens []Token Tokens []Token
} }
func (f *tokenAccum) emitToken(ty TokenType, startOfs int, endOfs int) { func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) {
// Walk through our buffer to figure out how much we need to adjust // Walk through our buffer to figure out how much we need to adjust
// the start pos to get our end pos. // the start pos to get our end pos.
start := f.Start start := f.Pos
start.Byte += startOfs start.Column += startOfs - f.Pos.Byte // Safe because only ASCII spaces can be in the offset
start.Column += startOfs // Safe because only ASCII spaces can be in the offset start.Byte = startOfs
end := start end := start
end.Byte = f.Start.Byte + endOfs end.Byte = endOfs
b := f.Bytes b := f.Bytes[startOfs:endOfs]
for len(b) > 0 { for len(b) > 0 {
advance, seq, _ := textseg.ScanGraphemeClusters(b, true) advance, seq, _ := textseg.ScanGraphemeClusters(b, true)
if len(seq) == 1 && seq[0] == '\n' { if len(seq) == 1 && seq[0] == '\n' {
@ -117,6 +117,8 @@ func (f *tokenAccum) emitToken(ty TokenType, startOfs int, endOfs int) {
b = b[advance:] b = b[advance:]
} }
f.Pos = end
f.Tokens = append(f.Tokens, Token{ f.Tokens = append(f.Tokens, Token{
Type: ty, Type: ty,
Bytes: f.Bytes[startOfs:endOfs], Bytes: f.Bytes[startOfs:endOfs],

View File

@ -4,7 +4,7 @@ package zclsyntax
import "fmt" import "fmt"
const _TokenType_name = "TokenNewlineTokenBangTokenBitwiseAndTokenOParenTokenCParenTokenStarTokenPlusTokenMinusTokenDotTokenSlashTokenColonTokenSemicolonTokenLessThanTokenEqualTokenGreaterThanTokenQuestionTokenHeredocTokenIdentTokenNumberLitTokenStringLitTokenOBrackTokenCBrackTokenBitwiseXorTokenBacktickTokenOBraceTokenBitwiseOrTokenCBraceTokenBitwiseNotTokenOQuoteTokenCQuoteTokenTemplateControlTokenTemplateSeqEndTokenAndTokenOrTokenTemplateInterpTokenNotEqualTokenLessThanEqTokenGreaterThanEqTokenEOFTokenTabTokenStarStarTokenInvalidTokenBadUTF8" const _TokenType_name = "TokenNewlineTokenBangTokenBitwiseAndTokenOParenTokenCParenTokenStarTokenPlusTokenMinusTokenDotTokenSlashTokenColonTokenSemicolonTokenLessThanTokenEqualTokenGreaterThanTokenQuestionTokenHeredocTokenIdentTokenNumberLitTokenStringLitTokenOBrackTokenCBrackTokenBitwiseXorTokenBacktickTokenOBraceTokenBitwiseOrTokenCBraceTokenBitwiseNotTokenOQuoteTokenCQuoteTokenTemplateControlTokenTemplateSeqEndTokenAndTokenOrTokenTemplateInterpTokenNotEqualTokenLessThanEqTokenGreaterThanEqTokenEOFTokenTabsTokenStarStarTokenInvalidTokenBadUTF8"
var _TokenType_map = map[TokenType]string{ var _TokenType_map = map[TokenType]string{
10: _TokenType_name[0:12], 10: _TokenType_name[0:12],
@ -46,10 +46,10 @@ var _TokenType_map = map[TokenType]string{
8804: _TokenType_name[439:454], 8804: _TokenType_name[439:454],
8805: _TokenType_name[454:472], 8805: _TokenType_name[454:472],
9220: _TokenType_name[472:480], 9220: _TokenType_name[472:480],
9225: _TokenType_name[480:488], 9225: _TokenType_name[480:489],
10138: _TokenType_name[488:501], 10138: _TokenType_name[489:502],
65533: _TokenType_name[501:513], 65533: _TokenType_name[502:514],
128169: _TokenType_name[513:525], 128169: _TokenType_name[514:526],
} }
func (i TokenType) String() string { func (i TokenType) String() string {