zclsyntax: re-organize and simplify the scanner

2017-05-28 08:38:13 -07:00 · 2017-05-28 08:38:13 -07:00 · 187d7b8045
commit 187d7b8045
parent b8db08bf04
5 changed files with 255 additions and 106 deletions
--- a/zcl/zclsyntax/scan_tokens.go
+++ b/zcl/zclsyntax/scan_tokens.go
@ -11,73 +11,73 @@ import (
 var _zcltok_actions []byte = []byte{
 	0, 1, 0, 1, 1, 1, 2, 1, 3,
 	1, 4, 1, 5, 1, 6, 1, 7,
 	1, 8,
 }
 var _zcltok_key_offsets []byte = []byte{
-	0, 0, 2, 4, 5, 15, 17, 19,
+	0, 2, 4, 16, 17, 18, 20, 22,
 }
 var _zcltok_trans_keys []byte = []byte{
-	128, 191, 128, 191, 32, 128, 191, 192,
+	128, 191, 128, 191, 9, 32, 128, 191,
-	223, 224, 239, 240, 247, 248, 255, 128,
+	192, 223, 224, 239, 240, 247, 248, 255,
-	191, 128, 191, 128, 191,
+	9, 32, 128, 191, 128, 191, 128, 191,
 }
 var _zcltok_single_lengths []byte = []byte{
-	0, 0, 0, 1, 0, 0, 0, 0,
+	0, 0, 2, 1, 1, 0, 0, 0,
 }
 var _zcltok_range_lengths []byte = []byte{
-	0, 1, 1, 0, 5, 1, 1, 1,
+	1, 1, 5, 0, 0, 1, 1, 1,
 }
 var _zcltok_index_offsets []byte = []byte{
-	0, 0, 2, 4, 6, 12, 14, 16,
+	0, 2, 4, 12, 14, 16, 18, 20,
 }
 var _zcltok_trans_targs []byte = []byte{
-	4, 4, 1, 4, 3, 0, 4, 5,
+	2, 2, 0, 2, 3, 4, 2, 5,
-	6, 7, 4, 4, 4, 4, 1, 4,
+	6, 7, 2, 2, 3, 2, 4, 2,
-	2, 4, 4, 4, 4, 4, 4,
+	2, 2, 0, 2, 1, 2, 2, 2,
 	2, 2, 2, 2, 2,
 }
 var _zcltok_trans_actions []byte = []byte{
-	9, 15, 0, 15, 1, 0, 11, 0,
+	7, 17, 0, 17, 0, 0, 9, 0,
-	7, 7, 11, 9, 9, 13, 0, 13,
+	5, 5, 9, 7, 0, 13, 0, 11,
-	0, 13, 15, 15, 13, 13, 13,
+	7, 15, 0, 15, 0, 15, 17, 17,
 	13, 11, 15, 15, 15,
 }
 var _zcltok_to_state_actions []byte = []byte{
-	0, 0, 0, 3, 3, 0, 0, 0,
+	0, 0, 1, 0, 0, 0, 0, 0,
 }
 var _zcltok_from_state_actions []byte = []byte{
-	0, 0, 0, 0, 5, 0, 0, 0,
+	0, 0, 3, 0, 0, 0, 0, 0,
 }
 var _zcltok_eof_trans []byte = []byte{
-	0, 20, 20, 0, 0, 23, 23, 23,
+	24, 24, 0, 25, 26, 29, 29, 29,
 }
-const zcltok_start int = 3
+const zcltok_start int = 2
-const zcltok_first_final int = 3
+const zcltok_first_final int = 2
-const zcltok_error int = 0
+const zcltok_error int = -1
-const zcltok_en_token int = 4
+const zcltok_en_main int = 2
 const zcltok_en_main int = 3
 // line 13 "scan_tokens.rl"
 func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
 	offset := 0
 	f := &tokenAccum{
 		Filename: filename,
 		Bytes:    data,
-		Start:    start,
+		Pos:      start,
 	}
-	// line 69 "scan_tokens.rl"
+	// line 47 "scan_tokens.rl"
 	// Ragel state
 	cs := 0         // Current State
@ -94,7 +94,11 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
 	_ = act
 	_ = eof
-	// line 104 "scan_tokens.go"
+	token := func(ty TokenType) {
 		f.emitToken(ty, ts, te)
 	}
 	// line 109 "scan_tokens.go"
 	{
 		cs = zcltok_start
 		ts = 0
@ -102,7 +106,7 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
 		act = 0
 	}
-	// line 112 "scan_tokens.go"
+	// line 117 "scan_tokens.go"
 	{
 		var _klen int
 		var _trans int
@ -112,9 +116,6 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
 		if p == pe {
 			goto _test_eof
 		}
 		if cs == 0 {
 			goto _out
 		}
 	_resume:
 		_acts = int(_zcltok_from_state_actions[cs])
 		_nacts = uint(_zcltok_actions[_acts])
@ -122,12 +123,12 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
 		for ; _nacts > 0; _nacts-- {
 			_acts++
 			switch _zcltok_actions[_acts-1] {
-			case 2:
+			case 1:
 				// line 1 "NONE"
 				ts = p
-				// line 136 "scan_tokens.go"
+				// line 138 "scan_tokens.go"
 			}
 		}
@ -197,48 +198,55 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
 		for ; _nacts > 0; _nacts-- {
 			_acts++
 			switch _zcltok_actions[_acts-1] {
-			case 0:
+			case 2:
 				// line 25 "scan_tokens.rl"
 				offset = p
 				cs = 4
 				goto _again
 			case 3:
 				// line 1 "NONE"
 				te = p + 1
-			case 4:
+			case 3:
-				// line 30 "scan_tokens.rl"
+				// line 43 "scan_tokens.rl"
 				te = p + 1
 				{
-					f.emitToken(TokenInvalid, offset, p+1)
+					token(TokenInvalid)
 				}
 			case 4:
 				// line 44 "scan_tokens.rl"
 				te = p + 1
 				{
 					token(TokenBadUTF8)
 				}
 			case 5:
-				// line 34 "scan_tokens.rl"
+				// line 41 "scan_tokens.rl"
 				te = p
 				p--
 				te = p + 1
 				{
 					f.emitToken(TokenBadUTF8, offset, p+1)
 				}
 			case 6:
-				// line 34 "scan_tokens.rl"
+				// line 42 "scan_tokens.rl"
 				te = p
 				p--
 				{
-					f.emitToken(TokenBadUTF8, offset, p+1)
+					token(TokenTabs)
 				}
 			case 7:
-				// line 34 "scan_tokens.rl"
+				// line 44 "scan_tokens.rl"
 				te = p
 				p--
 				{
 					token(TokenBadUTF8)
 				}
 			case 8:
 				// line 44 "scan_tokens.rl"
 				p = (te) - 1
 				{
-					f.emitToken(TokenBadUTF8, offset, p+1)
+					token(TokenBadUTF8)
 				}
-				// line 248 "scan_tokens.go"
+				// line 245 "scan_tokens.go"
 			}
 		}
@ -249,18 +257,15 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
 		for ; _nacts > 0; _nacts-- {
 			_acts++
 			switch _zcltok_actions[_acts-1] {
-			case 1:
+			case 0:
 				// line 1 "NONE"
 				ts = 0
-				// line 263 "scan_tokens.go"
+				// line 260 "scan_tokens.go"
 			}
 		}
 		if cs == 0 {
 			goto _out
 		}
 		p++
 		if p != pe {
 			goto _resume
@ -275,12 +280,9 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
 			}
 		}
 	_out:
 		{
 		}
 	}
-	// line 89 "scan_tokens.rl"
+	// line 71 "scan_tokens.rl"
 	// If we fall out here without being in a final state then we've
 	// encountered something that the scanner can't match, which we'll
@ -289,5 +291,9 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
 		f.emitToken(TokenInvalid, p, len(data))
 	}
 	// We always emit a synthetic EOF token at the end, since it gives the
 	// parser position information for an "unexpected EOF" diagnostic.
 	f.emitToken(TokenEOF, len(data), len(data))
 	return f.Tokens
 }
--- a/zcl/zclsyntax/scan_tokens.rl
+++ b/zcl/zclsyntax/scan_tokens.rl
@ -13,31 +13,13 @@ import (
 }%%
 func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
    offset := 0
    f := &tokenAccum{
        Filename: filename,
        Bytes:    data,
-        Start:    start,
+        Pos:      start,
    }
    %%{
        action start {
            offset = p
            fgoto token;
        }
        action EmitInvalid {
            f.emitToken(TokenInvalid, offset, p+1)
        }
        action EmitBadUTF8 {
            f.emitToken(TokenBadUTF8, offset, p+1)
        }
        action EmitEOF {
            f.emitToken(TokenEOF, offset, offset)
        }
        UTF8Cont = 0x80 .. 0xBF;
        AnyUTF8 = (
@ -46,26 +28,22 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
            0xE0..0xEF . UTF8Cont . UTF8Cont |
            0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
        );
        AnyUTF8Tok = AnyUTF8 >start;
        BrokenUTF8 = any - AnyUTF8;
        EmptyTok = "";
        # Tabs are not valid, but we accept them in the scanner and mark them
        # as tokens so that we can produce diagnostics advising the user to
        # use spaces instead.
-        TabTok = 0x09 >start;
+        Tabs = 0x09+;
-        token := |*
+        Spaces = ' '+;
-            AnyUTF8    => EmitInvalid;
+
-            BrokenUTF8 => EmitBadUTF8;
+        main := |*
-            EmptyTok   => EmitEOF;
+            Spaces     => {};
            Tabs       => { token(TokenTabs) };
            AnyUTF8    => { token(TokenInvalid) };
            BrokenUTF8 => { token(TokenBadUTF8) };
        *|;
        Spaces = ' '*;
        main := Spaces @start;
    }%%
    // Ragel state
@ -83,6 +61,10 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
    _ = act
    _ = eof
    token := func (ty TokenType) {
        f.emitToken(ty, ts, te)
    }
    %%{
        write init;
        write exec;
@ -95,5 +77,9 @@ func scanTokens(data []byte, filename string, start zcl.Pos) []Token {
        f.emitToken(TokenInvalid, p, len(data))
    }
    // We always emit a synthetic EOF token at the end, since it gives the
    // parser position information for an "unexpected EOF" diagnostic.
    f.emitToken(TokenEOF, len(data), len(data))
    return f.Tokens
 }
--- a/zcl/zclsyntax/scan_tokens_test.go
+++ b/zcl/zclsyntax/scan_tokens_test.go
@ -0,0 +1,155 @@
 package zclsyntax
 import (
 	"reflect"
 	"testing"
 	"github.com/kylelemons/godebug/pretty"
 	"github.com/zclconf/go-zcl/zcl"
 )
 // TestScanTokens is a table-driven test for scanTokens. Each case scans its
 // input starting at byte 0, line 1, column 1 and compares the complete token
 // slice (types, raw bytes and source ranges) against the expected tokens.
 func TestScanTokens(t *testing.T) {
 	tests := []struct {
 		input string
 		want  []Token
 	}{
 		// Empty input: only the synthetic EOF token is expected.
 		{
 			``,
 			[]Token{
 				{
 					Type:  TokenEOF,
 					Bytes: []byte{},
 					Range: zcl.Range{
 						Start: zcl.Pos{Byte: 0, Line: 1, Column: 1},
 						End:   zcl.Pos{Byte: 0, Line: 1, Column: 1},
 					},
 				},
 			},
 		},
 		// A lone space yields no token of its own; EOF is reported after it.
 		{
 			` `,
 			[]Token{
 				{
 					Type:  TokenEOF,
 					Bytes: []byte{},
 					Range: zcl.Range{
 						Start: zcl.Pos{Byte: 1, Line: 1, Column: 2},
 						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
 					},
 				},
 			},
 		},
 		// A well-formed character with no dedicated token is reported as
 		// TokenInvalid.
 		{
 			`|`,
 			[]Token{
 				{
 					Type:  TokenInvalid,
 					Bytes: []byte(`|`),
 					Range: zcl.Range{
 						Start: zcl.Pos{Byte: 0, Line: 1, Column: 1},
 						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
 					},
 				},
 				{
 					Type:  TokenEOF,
 					Bytes: []byte{},
 					Range: zcl.Range{
 						Start: zcl.Pos{Byte: 1, Line: 1, Column: 2},
 						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
 					},
 				},
 			},
 		},
 		{
 			"\x80", // UTF-8 continuation without an introducer
 			[]Token{
 				{
 					Type:  TokenBadUTF8,
 					Bytes: []byte{0x80},
 					Range: zcl.Range{
 						Start: zcl.Pos{Byte: 0, Line: 1, Column: 1},
 						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
 					},
 				},
 				{
 					Type:  TokenEOF,
 					Bytes: []byte{},
 					Range: zcl.Range{
 						Start: zcl.Pos{Byte: 1, Line: 1, Column: 2},
 						End:   zcl.Pos{Byte: 1, Line: 1, Column: 2},
 					},
 				},
 			},
 		},
 		{
 			" \x80\x80", // a space, then two stray UTF-8 continuation bytes; each byte is its own token
 			[]Token{
 				{
 					Type:  TokenBadUTF8,
 					Bytes: []byte{0x80},
 					Range: zcl.Range{
 						Start: zcl.Pos{Byte: 1, Line: 1, Column: 2},
 						End:   zcl.Pos{Byte: 2, Line: 1, Column: 3},
 					},
 				},
 				{
 					Type:  TokenBadUTF8,
 					Bytes: []byte{0x80},
 					Range: zcl.Range{
 						Start: zcl.Pos{Byte: 2, Line: 1, Column: 3},
 						End:   zcl.Pos{Byte: 3, Line: 1, Column: 4},
 					},
 				},
 				{
 					Type:  TokenEOF,
 					Bytes: []byte{},
 					Range: zcl.Range{
 						Start: zcl.Pos{Byte: 3, Line: 1, Column: 4},
 						End:   zcl.Pos{Byte: 3, Line: 1, Column: 4},
 					},
 				},
 			},
 		},
 		// Consecutive tabs are grouped into a single TokenTabs token.
 		{
 			"\t\t",
 			[]Token{
 				{
 					Type:  TokenTabs,
 					Bytes: []byte{0x09, 0x09},
 					Range: zcl.Range{
 						Start: zcl.Pos{Byte: 0, Line: 1, Column: 1},
 						End:   zcl.Pos{Byte: 2, Line: 1, Column: 3},
 					},
 				},
 				{
 					Type:  TokenEOF,
 					Bytes: []byte{},
 					Range: zcl.Range{
 						Start: zcl.Pos{Byte: 2, Line: 1, Column: 3},
 						End:   zcl.Pos{Byte: 2, Line: 1, Column: 3},
 					},
 				},
 			},
 		},
 	}
 	// Diffable output keeps a failing comparison readable line by line.
 	prettyConfig := &pretty.Config{
 		Diffable:          true,
 		IncludeUnexported: true,
 		PrintStringers:    true,
 	}
 	for _, test := range tests {
 		t.Run(test.input, func(t *testing.T) {
 			got := scanTokens([]byte(test.input), "", zcl.Pos{Byte: 0, Line: 1, Column: 1})
 			if !reflect.DeepEqual(got, test.want) {
 				diff := prettyConfig.Compare(test.want, got)
 				// %q rather than %s: inputs include the empty string,
 				// whitespace and invalid UTF-8, which would otherwise be
 				// invisible or garbled in the failure message.
 				t.Errorf(
 					"wrong result\ninput: %q\ndiff:  %s",
 					test.input, diff,
 				)
 			}
 		})
 	}
 }
--- a/zcl/zclsyntax/token.go
+++ b/zcl/zclsyntax/token.go
@ -79,7 +79,7 @@ const (
 	TokenStarStar   TokenType = '➚'
 	TokenBacktick   TokenType = '`'
 	TokenSemicolon  TokenType = ';'
-	TokenTab        TokenType = '␉'
+	TokenTabs       TokenType = '␉'
 	TokenInvalid    TokenType = '<27>'
 	TokenBadUTF8    TokenType = '💩'
 )
@ -91,21 +91,21 @@ func (t TokenType) GoString() string {
 // tokenAccum collects the tokens produced while scanning a single buffer.
 //
 // Bytes is the buffer that emitToken's start/end offsets index into, and
 // Tokens receives each emitted token in order. Pos is the position that
 // emitToken uses as the base for the next token's start; emitToken moves it
 // to the end of every token it emits. Filename is supplied by the caller of
 // scanTokens (NOTE(review): its use is not visible in this part of the
 // file; presumably it is recorded in token ranges; confirm).
 type tokenAccum struct {
 	Filename string
 	Bytes    []byte
-	Start    zcl.Pos
+	Pos      zcl.Pos
 	Tokens   []Token
 }
-func (f *tokenAccum) emitToken(ty TokenType, startOfs int, endOfs int) {
+func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) {
 	// Walk through our buffer to figure out how much we need to adjust
 	// the start pos to get our end pos.
-	start := f.Start
+	start := f.Pos
-	start.Byte += startOfs
+	start.Column += startOfs - f.Pos.Byte // Safe because only ASCII spaces can be in the offset
-	start.Column += startOfs // Safe because only ASCII spaces can be in the offset
+	start.Byte = startOfs
 	end := start
-	end.Byte = f.Start.Byte + endOfs
+	end.Byte = endOfs
-	b := f.Bytes
+	b := f.Bytes[startOfs:endOfs]
 	for len(b) > 0 {
 		advance, seq, _ := textseg.ScanGraphemeClusters(b, true)
 		if len(seq) == 1 && seq[0] == '\n' {
@ -117,6 +117,8 @@ func (f *tokenAccum) emitToken(ty TokenType, startOfs int, endOfs int) {
 		b = b[advance:]
 	}
 	f.Pos = end
 	f.Tokens = append(f.Tokens, Token{
 		Type:  ty,
 		Bytes: f.Bytes[startOfs:endOfs],
--- a/zcl/zclsyntax/token_type_string.go
+++ b/zcl/zclsyntax/token_type_string.go
@ -4,7 +4,7 @@ package zclsyntax
 import "fmt"
-const _TokenType_name = "TokenNewlineTokenBangTokenBitwiseAndTokenOParenTokenCParenTokenStarTokenPlusTokenMinusTokenDotTokenSlashTokenColonTokenSemicolonTokenLessThanTokenEqualTokenGreaterThanTokenQuestionTokenHeredocTokenIdentTokenNumberLitTokenStringLitTokenOBrackTokenCBrackTokenBitwiseXorTokenBacktickTokenOBraceTokenBitwiseOrTokenCBraceTokenBitwiseNotTokenOQuoteTokenCQuoteTokenTemplateControlTokenTemplateSeqEndTokenAndTokenOrTokenTemplateInterpTokenNotEqualTokenLessThanEqTokenGreaterThanEqTokenEOFTokenTabTokenStarStarTokenInvalidTokenBadUTF8"
+const _TokenType_name = "TokenNewlineTokenBangTokenBitwiseAndTokenOParenTokenCParenTokenStarTokenPlusTokenMinusTokenDotTokenSlashTokenColonTokenSemicolonTokenLessThanTokenEqualTokenGreaterThanTokenQuestionTokenHeredocTokenIdentTokenNumberLitTokenStringLitTokenOBrackTokenCBrackTokenBitwiseXorTokenBacktickTokenOBraceTokenBitwiseOrTokenCBraceTokenBitwiseNotTokenOQuoteTokenCQuoteTokenTemplateControlTokenTemplateSeqEndTokenAndTokenOrTokenTemplateInterpTokenNotEqualTokenLessThanEqTokenGreaterThanEqTokenEOFTokenTabsTokenStarStarTokenInvalidTokenBadUTF8"
 var _TokenType_map = map[TokenType]string{
 	10:     _TokenType_name[0:12],
@ -46,10 +46,10 @@ var _TokenType_map = map[TokenType]string{
 	8804:   _TokenType_name[439:454],
 	8805:   _TokenType_name[454:472],
 	9220:   _TokenType_name[472:480],
-	9225:   _TokenType_name[480:488],
+	9225:   _TokenType_name[480:489],
-	10138:  _TokenType_name[488:501],
+	10138:  _TokenType_name[489:502],
-	65533:  _TokenType_name[501:513],
+	65533:  _TokenType_name[502:514],
-	128169: _TokenType_name[513:525],
+	128169: _TokenType_name[514:526],
 }
 func (i TokenType) String() string {