diff --git a/zcl/zclsyntax/scan_tokens.go b/zcl/zclsyntax/scan_tokens.go index 21d34c7..e24accb 100644 --- a/zcl/zclsyntax/scan_tokens.go +++ b/zcl/zclsyntax/scan_tokens.go @@ -11,73 +11,73 @@ import ( var _zcltok_actions []byte = []byte{ 0, 1, 0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, + 1, 8, } var _zcltok_key_offsets []byte = []byte{ - 0, 0, 2, 4, 5, 15, 17, 19, + 0, 2, 4, 16, 17, 18, 20, 22, } var _zcltok_trans_keys []byte = []byte{ - 128, 191, 128, 191, 32, 128, 191, 192, - 223, 224, 239, 240, 247, 248, 255, 128, - 191, 128, 191, 128, 191, + 128, 191, 128, 191, 9, 32, 128, 191, + 192, 223, 224, 239, 240, 247, 248, 255, + 9, 32, 128, 191, 128, 191, 128, 191, } var _zcltok_single_lengths []byte = []byte{ - 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 2, 1, 1, 0, 0, 0, } var _zcltok_range_lengths []byte = []byte{ - 0, 1, 1, 0, 5, 1, 1, 1, + 1, 1, 5, 0, 0, 1, 1, 1, } var _zcltok_index_offsets []byte = []byte{ - 0, 0, 2, 4, 6, 12, 14, 16, + 0, 2, 4, 12, 14, 16, 18, 20, } var _zcltok_trans_targs []byte = []byte{ - 4, 4, 1, 4, 3, 0, 4, 5, - 6, 7, 4, 4, 4, 4, 1, 4, - 2, 4, 4, 4, 4, 4, 4, + 2, 2, 0, 2, 3, 4, 2, 5, + 6, 7, 2, 2, 3, 2, 4, 2, + 2, 2, 0, 2, 1, 2, 2, 2, + 2, 2, 2, 2, 2, } var _zcltok_trans_actions []byte = []byte{ - 9, 15, 0, 15, 1, 0, 11, 0, - 7, 7, 11, 9, 9, 13, 0, 13, - 0, 13, 15, 15, 13, 13, 13, + 7, 17, 0, 17, 0, 0, 9, 0, + 5, 5, 9, 7, 0, 13, 0, 11, + 7, 15, 0, 15, 0, 15, 17, 17, + 13, 11, 15, 15, 15, } var _zcltok_to_state_actions []byte = []byte{ - 0, 0, 0, 3, 3, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, } var _zcltok_from_state_actions []byte = []byte{ - 0, 0, 0, 0, 5, 0, 0, 0, + 0, 0, 3, 0, 0, 0, 0, 0, } var _zcltok_eof_trans []byte = []byte{ - 0, 20, 20, 0, 0, 23, 23, 23, + 24, 24, 0, 25, 26, 29, 29, 29, } -const zcltok_start int = 3 -const zcltok_first_final int = 3 -const zcltok_error int = 0 +const zcltok_start int = 2 +const zcltok_first_final int = 2 +const zcltok_error int = -1 -const zcltok_en_token int = 4 -const zcltok_en_main int = 3 +const zcltok_en_main int = 2 // line 13 "scan_tokens.rl" func scanTokens(data []byte, filename string, start zcl.Pos) []Token { - offset := 0 - f := &tokenAccum{ Filename: filename, Bytes: data, - Start: start, + Pos: start, } - // line 69 "scan_tokens.rl" + // line 47 "scan_tokens.rl" // Ragel state cs := 0 // Current State @@ -94,7 +94,11 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token { _ = act _ = eof - // line 104 "scan_tokens.go" + token := func(ty TokenType) { + f.emitToken(ty, ts, te) + } + + // line 109 "scan_tokens.go" { cs = zcltok_start ts = 0 @@ -102,7 +106,7 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token { act = 0 } - // line 112 "scan_tokens.go" + // line 117 "scan_tokens.go" { var _klen int var _trans int @@ -112,9 +116,6 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token { if p == pe { goto _test_eof } - if cs == 0 { - goto _out - } _resume: _acts = int(_zcltok_from_state_actions[cs]) _nacts = uint(_zcltok_actions[_acts]) @@ -122,12 +123,12 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token { for ; _nacts > 0; _nacts-- { _acts++ switch _zcltok_actions[_acts-1] { - case 2: + case 1: // line 1 "NONE" ts = p - // line 136 "scan_tokens.go" + // line 138 "scan_tokens.go" } } @@ -197,48 +198,55 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token { for ; _nacts > 0; _nacts-- { _acts++ switch _zcltok_actions[_acts-1] { - case 0: - // line 25 "scan_tokens.rl" - - offset = p - cs = 4 - goto _again - - case 3: + case 2: // line 1 "NONE" te = p + 1 - case 4: - // line 30 "scan_tokens.rl" + case 3: + // line 43 "scan_tokens.rl" te = p + 1 { - f.emitToken(TokenInvalid, offset, p+1) + token(TokenInvalid) + } + case 4: + // line 44 "scan_tokens.rl" + + te = p + 1 + { + token(TokenBadUTF8) } case 5: - // line 34 "scan_tokens.rl" + // line 41 "scan_tokens.rl" + + te = p + p-- - te = p + 1 - { - f.emitToken(TokenBadUTF8, offset, p+1) - } case 6: - // line 34 "scan_tokens.rl" + // line 42 "scan_tokens.rl" te = p p-- { - f.emitToken(TokenBadUTF8, offset, p+1) + token(TokenTabs) } case 7: - // line 34 "scan_tokens.rl" + // line 44 "scan_tokens.rl" + + te = p + p-- + { + token(TokenBadUTF8) + } + case 8: + // line 44 "scan_tokens.rl" p = (te) - 1 { - f.emitToken(TokenBadUTF8, offset, p+1) + token(TokenBadUTF8) } - // line 248 "scan_tokens.go" + // line 245 "scan_tokens.go" } } @@ -249,18 +257,15 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token { for ; _nacts > 0; _nacts-- { _acts++ switch _zcltok_actions[_acts-1] { - case 1: + case 0: // line 1 "NONE" ts = 0 - // line 263 "scan_tokens.go" + // line 260 "scan_tokens.go" } } - if cs == 0 { - goto _out - } p++ if p != pe { goto _resume @@ -275,12 +280,9 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token { } } - _out: - { - } } - // line 89 "scan_tokens.rl" + // line 71 "scan_tokens.rl" // If we fall out here without being in a final state then we've // encountered something that the scanner can't match, which we'll @@ -289,5 +291,9 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token { f.emitToken(TokenInvalid, p, len(data)) } + // We always emit a synthetic EOF token at the end, since it gives the + // parser position information for an "unexpected EOF" diagnostic. + f.emitToken(TokenEOF, len(data), len(data)) + return f.Tokens } diff --git a/zcl/zclsyntax/scan_tokens.rl b/zcl/zclsyntax/scan_tokens.rl index 656c8de..43f48fb 100644 --- a/zcl/zclsyntax/scan_tokens.rl +++ b/zcl/zclsyntax/scan_tokens.rl @@ -13,31 +13,13 @@ import ( }%% func scanTokens(data []byte, filename string, start zcl.Pos) []Token { - offset := 0 - f := &tokenAccum{ Filename: filename, Bytes: data, - Start: start, + Pos: start, } %%{ - action start { - offset = p - fgoto token; - } - - action EmitInvalid { - f.emitToken(TokenInvalid, offset, p+1) - } - - action EmitBadUTF8 { - f.emitToken(TokenBadUTF8, offset, p+1) - } - - action EmitEOF { - f.emitToken(TokenEOF, offset, offset) - } UTF8Cont = 0x80 .. 0xBF; AnyUTF8 = ( @@ -46,26 +28,22 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token { 0xE0..0xEF . UTF8Cont . UTF8Cont | 0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont ); - - AnyUTF8Tok = AnyUTF8 >start; BrokenUTF8 = any - AnyUTF8; - EmptyTok = ""; # Tabs are not valid, but we accept them in the scanner and mark them # as tokens so that we can produce diagnostics advising the user to # use spaces instead. - TabTok = 0x09 >start; + Tabs = 0x09+; - token := |* - AnyUTF8 => EmitInvalid; - BrokenUTF8 => EmitBadUTF8; - EmptyTok => EmitEOF; + Spaces = ' '+; + + main := |* + Spaces => {}; + Tabs => { token(TokenTabs) }; + AnyUTF8 => { token(TokenInvalid) }; + BrokenUTF8 => { token(TokenBadUTF8) }; *|; - Spaces = ' '*; - - main := Spaces @start; - }%% // Ragel state @@ -83,6 +61,10 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token { _ = act _ = eof + token := func (ty TokenType) { + f.emitToken(ty, ts, te) + } + %%{ write init; write exec; @@ -95,5 +77,9 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token { f.emitToken(TokenInvalid, p, len(data)) } + // We always emit a synthetic EOF token at the end, since it gives the + // parser position information for an "unexpected EOF" diagnostic. + f.emitToken(TokenEOF, len(data), len(data)) + return f.Tokens } diff --git a/zcl/zclsyntax/scan_tokens_test.go b/zcl/zclsyntax/scan_tokens_test.go new file mode 100644 index 0000000..72225f4 --- /dev/null +++ b/zcl/zclsyntax/scan_tokens_test.go @@ -0,0 +1,155 @@ +package zclsyntax + +import ( + "reflect" + "testing" + + "github.com/kylelemons/godebug/pretty" + "github.com/zclconf/go-zcl/zcl" +) + +func TestScanTokens(t *testing.T) { + tests := []struct { + input string + want []Token + }{ + { + ``, + []Token{ + { + Type: TokenEOF, + Bytes: []byte{}, + Range: zcl.Range{ + Start: zcl.Pos{Byte: 0, Line: 1, Column: 1}, + End: zcl.Pos{Byte: 0, Line: 1, Column: 1}, + }, + }, + }, + }, + { + ` `, + []Token{ + { + Type: TokenEOF, + Bytes: []byte{}, + Range: zcl.Range{ + Start: zcl.Pos{Byte: 1, Line: 1, Column: 2}, + End: zcl.Pos{Byte: 1, Line: 1, Column: 2}, + }, + }, + }, + }, + { + `|`, + []Token{ + { + Type: TokenInvalid, + Bytes: []byte(`|`), + Range: zcl.Range{ + Start: zcl.Pos{Byte: 0, Line: 1, Column: 1}, + End: zcl.Pos{Byte: 1, Line: 1, Column: 2}, + }, + }, + { + Type: TokenEOF, + Bytes: []byte{}, + Range: zcl.Range{ + Start: zcl.Pos{Byte: 1, Line: 1, Column: 2}, + End: zcl.Pos{Byte: 1, Line: 1, Column: 2}, + }, + }, + }, + }, + { + "\x80", // UTF-8 continuation without an introducer + []Token{ + { + Type: TokenBadUTF8, + Bytes: []byte{0x80}, + Range: zcl.Range{ + Start: zcl.Pos{Byte: 0, Line: 1, Column: 1}, + End: zcl.Pos{Byte: 1, Line: 1, Column: 2}, + }, + }, + { + Type: TokenEOF, + Bytes: []byte{}, + Range: zcl.Range{ + Start: zcl.Pos{Byte: 1, Line: 1, Column: 2}, + End: zcl.Pos{Byte: 1, Line: 1, Column: 2}, + }, + }, + }, + }, + { + " \x80\x80", // UTF-8 continuation without an introducer + []Token{ + { + Type: TokenBadUTF8, + Bytes: []byte{0x80}, + Range: zcl.Range{ + Start: zcl.Pos{Byte: 1, Line: 1, Column: 2}, + End: zcl.Pos{Byte: 2, Line: 1, Column: 3}, + }, + }, + { + Type: TokenBadUTF8, + Bytes: []byte{0x80}, + Range: zcl.Range{ + Start: zcl.Pos{Byte: 2, Line: 1, Column: 3}, + End: zcl.Pos{Byte: 3, Line: 1, Column: 4}, + }, + }, + { + Type: TokenEOF, + Bytes: []byte{}, + Range: zcl.Range{ + Start: zcl.Pos{Byte: 3, Line: 1, Column: 4}, + End: zcl.Pos{Byte: 3, Line: 1, Column: 4}, + }, + }, + }, + }, + { + "\t\t", + []Token{ + { + Type: TokenTabs, + Bytes: []byte{0x09, 0x09}, + Range: zcl.Range{ + Start: zcl.Pos{Byte: 0, Line: 1, Column: 1}, + End: zcl.Pos{Byte: 2, Line: 1, Column: 3}, + }, + }, + { + Type: TokenEOF, + Bytes: []byte{}, + Range: zcl.Range{ + Start: zcl.Pos{Byte: 2, Line: 1, Column: 3}, + End: zcl.Pos{Byte: 2, Line: 1, Column: 3}, + }, + }, + }, + }, + } + + prettyConfig := &pretty.Config{ + Diffable: true, + IncludeUnexported: true, + PrintStringers: true, + } + + for _, test := range tests { + t.Run(test.input, func(t *testing.T) { + got := scanTokens([]byte(test.input), "", zcl.Pos{Byte: 0, Line: 1, Column: 1}) + + if !reflect.DeepEqual(got, test.want) { + diff := prettyConfig.Compare(test.want, got) + t.Errorf( + "wrong result\ninput: %s\ndiff: %s", + test.input, diff, + ) + } + }) + } +} diff --git a/zcl/zclsyntax/token.go b/zcl/zclsyntax/token.go index 2496cc1..de64ea9 100644 --- a/zcl/zclsyntax/token.go +++ b/zcl/zclsyntax/token.go @@ -79,7 +79,7 @@ const ( TokenStarStar TokenType = '➚' TokenBacktick TokenType = '`' TokenSemicolon TokenType = ';' - TokenTab TokenType = '␉' + TokenTabs TokenType = '␉' TokenInvalid TokenType = '�' TokenBadUTF8 TokenType = '💩' ) @@ -91,21 +91,21 @@ func (t TokenType) GoString() string { type tokenAccum struct { Filename string Bytes []byte - Start zcl.Pos + Pos zcl.Pos Tokens []Token } -func (f *tokenAccum) emitToken(ty TokenType, startOfs int, endOfs int) { +func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) { // Walk through our buffer to figure out how much we need to adjust // the start pos to get our end pos. - start := f.Start - start.Byte += startOfs - start.Column += startOfs // Safe because only ASCII spaces can be in the offset + start := f.Pos + start.Column += startOfs - f.Pos.Byte // Safe because only ASCII spaces can be in the offset + start.Byte = startOfs end := start - end.Byte = f.Start.Byte + endOfs - b := f.Bytes + end.Byte = endOfs + b := f.Bytes[startOfs:endOfs] for len(b) > 0 { advance, seq, _ := textseg.ScanGraphemeClusters(b, true) if len(seq) == 1 && seq[0] == '\n' { @@ -117,6 +117,8 @@ func (f *tokenAccum) emitToken(ty TokenType, startOfs int, endOfs int) { b = b[advance:] } + f.Pos = end + f.Tokens = append(f.Tokens, Token{ Type: ty, Bytes: f.Bytes[startOfs:endOfs], diff --git a/zcl/zclsyntax/token_type_string.go b/zcl/zclsyntax/token_type_string.go index 6fc370c..995994c 100644 --- a/zcl/zclsyntax/token_type_string.go +++ b/zcl/zclsyntax/token_type_string.go @@ -4,7 +4,7 @@ package zclsyntax import "fmt" -const _TokenType_name = "TokenNewlineTokenBangTokenBitwiseAndTokenOParenTokenCParenTokenStarTokenPlusTokenMinusTokenDotTokenSlashTokenColonTokenSemicolonTokenLessThanTokenEqualTokenGreaterThanTokenQuestionTokenHeredocTokenIdentTokenNumberLitTokenStringLitTokenOBrackTokenCBrackTokenBitwiseXorTokenBacktickTokenOBraceTokenBitwiseOrTokenCBraceTokenBitwiseNotTokenOQuoteTokenCQuoteTokenTemplateControlTokenTemplateSeqEndTokenAndTokenOrTokenTemplateInterpTokenNotEqualTokenLessThanEqTokenGreaterThanEqTokenEOFTokenTabTokenStarStarTokenInvalidTokenBadUTF8" +const _TokenType_name = "TokenNewlineTokenBangTokenBitwiseAndTokenOParenTokenCParenTokenStarTokenPlusTokenMinusTokenDotTokenSlashTokenColonTokenSemicolonTokenLessThanTokenEqualTokenGreaterThanTokenQuestionTokenHeredocTokenIdentTokenNumberLitTokenStringLitTokenOBrackTokenCBrackTokenBitwiseXorTokenBacktickTokenOBraceTokenBitwiseOrTokenCBraceTokenBitwiseNotTokenOQuoteTokenCQuoteTokenTemplateControlTokenTemplateSeqEndTokenAndTokenOrTokenTemplateInterpTokenNotEqualTokenLessThanEqTokenGreaterThanEqTokenEOFTokenTabsTokenStarStarTokenInvalidTokenBadUTF8" var _TokenType_map = map[TokenType]string{ 10: _TokenType_name[0:12], @@ -46,10 +46,10 @@ var _TokenType_map = map[TokenType]string{ 8804: _TokenType_name[439:454], 8805: _TokenType_name[454:472], 9220: _TokenType_name[472:480], - 9225: _TokenType_name[480:488], - 10138: _TokenType_name[488:501], - 65533: _TokenType_name[501:513], - 128169: _TokenType_name[513:525], + 9225: _TokenType_name[480:489], + 10138: _TokenType_name[489:502], + 65533: _TokenType_name[502:514], + 128169: _TokenType_name[514:526], } func (i TokenType) String() string {