zclsyntax: define the initial set of language tokens for the scanner
This commit is contained in:
parent 6bf26fc9cc
commit e65eafbe83
zcl/zclsyntax/token.go (new file, 119 lines)
@@ -0,0 +1,119 @@
package zclsyntax

import (
	"github.com/apparentlymart/go-textseg/textseg"
	"github.com/zclconf/go-zcl/zcl"
)

// Token represents a sequence of bytes from some zcl code that has been
// tagged with a type and its range within the source file.
type Token struct {
	Type  TokenType
	Bytes []byte
	Range zcl.Range
}

// TokenType is an enumeration used for the Type field on Token.
type TokenType rune

const (
	// Single-character tokens are represented by their own character, for
	// convenience in producing these within the scanner. However, the values
	// are otherwise arbitrary and just intended to be mnemonic for humans
	// who might see them in debug output.

	TokenOBrace TokenType = '{'
	TokenCBrace TokenType = '}'
	TokenOBrack TokenType = '['
	TokenCBrack TokenType = ']'
	TokenOParen TokenType = '('
	TokenCParen TokenType = ')'
	TokenOQuote TokenType = '«'
	TokenCQuote TokenType = '»'

	TokenDot   TokenType = '.'
	TokenStar  TokenType = '*'
	TokenSlash TokenType = '/'
	TokenPlus  TokenType = '+'
	TokenMinus TokenType = '-'

	TokenEqual         TokenType = '='
	TokenNotEqual      TokenType = '≠'
	TokenLessThan      TokenType = '<'
	TokenLessThanEq    TokenType = '≤'
	TokenGreaterThan   TokenType = '>'
	TokenGreaterThanEq TokenType = '≥'

	TokenAnd  TokenType = '∧'
	TokenOr   TokenType = '∨'
	TokenBang TokenType = '!'

	TokenQuestion TokenType = '?'
	TokenColon    TokenType = ':'

	TokenTemplateInterp  TokenType = '∫'
	TokenTemplateControl TokenType = 'λ'

	TokenStringLit TokenType = 'S'
	TokenHeredoc   TokenType = 'H'
	TokenNumberLit TokenType = 'N'
	TokenIdent     TokenType = 'I'

	TokenNewline TokenType = '\n'
	TokenEOF     TokenType = '␄'

	// The rest are not used in the language but recognized by the scanner so
	// we can generate good diagnostics in the parser when users try to write
	// things that might work in other languages they are familiar with, or
	// simply make incorrect assumptions about the zcl language.

	TokenBitwiseAnd TokenType = '&'
	TokenBitwiseOr  TokenType = '|'
	TokenBitwiseNot TokenType = '~'
	TokenBitwiseXor TokenType = '^'
	TokenStarStar   TokenType = '➚'
	TokenBacktick   TokenType = '`'
	TokenSemicolon  TokenType = ';'
	TokenTab        TokenType = '␉'
	TokenInvalid    TokenType = '�'
	TokenBadUTF8    TokenType = '💩'
)

type tokenFactory struct {
	Filename string
	Bytes    []byte
	Start    zcl.Pos
}

// makeToken constructs a Token of the given type from the bytes between
// startOfs and endOfs in the factory's buffer, returning it along with the
// remaining bytes that follow the token.
func (f tokenFactory) makeToken(ty TokenType, startOfs int, endOfs int) (Token, []byte) {
	// Walk through our buffer to figure out how much we need to adjust
	// the start pos to get our end pos.

	start := f.Start
	start.Byte += startOfs
	start.Column += startOfs // Safe because only ASCII spaces can be in the offset

	end := start
	end.Byte = f.Start.Byte + endOfs
	b := f.Bytes[startOfs:endOfs] // count lines/columns only over the token's own bytes
	for len(b) > 0 {
		advance, seq, _ := textseg.ScanGraphemeClusters(b, true)
		if len(seq) == 1 && seq[0] == '\n' {
			end.Line++
			end.Column = 1
		} else {
			end.Column++
		}
		b = b[advance:]
	}

	return Token{
		Type:  ty,
		Bytes: f.Bytes[startOfs:endOfs],
		Range: zcl.Range{
			Filename: f.Filename,
			Start:    start,
			End:      end,
		},
	}, f.Bytes[endOfs:]
}
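
Because each single-character token type is literally its own character, a TokenType value prints as a readable mnemonic. The short, self-contained sketch below is not part of the commit; the program and its output formatting are illustrative only, showing the "mnemonic for humans in debug output" idea from the comments above.

package main

import "fmt"

// TokenType mirrors the declaration in token.go: each value is a rune chosen
// to read as a mnemonic when printed.
type TokenType rune

const (
	TokenOBrace   TokenType = '{'
	TokenNotEqual TokenType = '≠'
	TokenEOF      TokenType = '␄'
)

func main() {
	// Printing the underlying rune makes scanner traces easy to read.
	for _, ty := range []TokenType{TokenOBrace, TokenNotEqual, TokenEOF} {
		fmt.Printf("token type %q (U+%04X)\n", rune(ty), rune(ty))
	}
}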
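
The tokenFactory is presumably consumed by a scanner that later commits add. As a rough usage sketch (a hypothetical caller named scanIdentExample, not part of this commit), a scanner that has matched an identifier in the first three bytes of its buffer could emit a token like this:

// scanIdentExample is a hypothetical helper within package zclsyntax,
// shown only to illustrate how makeToken is meant to be called.
func scanIdentExample(filename string, src []byte) (Token, []byte) {
	f := tokenFactory{
		Filename: filename,
		Bytes:    src,
		Start:    zcl.Pos{Line: 1, Column: 1, Byte: 0},
	}

	// Suppose the scanner matched an identifier occupying src[0:3].
	// makeToken slices out those bytes, derives the token's Range by
	// walking grapheme clusters from the start position, and returns
	// the rest of the buffer so scanning can continue after the token.
	tok, remain := f.makeToken(TokenIdent, 0, 3)
	return tok, remain
}

The returned remain slice aliases the original buffer, so the caller would presumably build a fresh tokenFactory with an updated Start position before scanning the next token.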