lexer: more robust implementation
This commit is contained in:
parent
97fb05dd4a
commit
32ad59fcd7
@ -16,14 +16,15 @@ type Scanner struct {
|
||||
src *bytes.Buffer
|
||||
srcBytes []byte
|
||||
|
||||
ch rune // current character
|
||||
lastCharLen int // length of last character in bytes
|
||||
pos Position
|
||||
// ch rune // current character
|
||||
lastCharLen int // length of last character in bytes
|
||||
|
||||
// Token text buffer
|
||||
tokBuf bytes.Buffer
|
||||
tokPos int // token text tail position (srcBuf index); valid if >= 0
|
||||
tokEnd int // token text tail end (srcBuf index)
|
||||
currPos Position // current position
|
||||
prevPos Position // previous position
|
||||
|
||||
tokBuf bytes.Buffer // token text buffer
|
||||
tokPos int // token text tail position (srcBuf index); valid if >= 0
|
||||
tokEnd int // token text tail end (srcBuf index)
|
||||
}
|
||||
|
||||
// NewLexer returns a new instance of Lexer. Even though src is an io.Reader,
|
||||
@ -44,23 +45,41 @@ func NewLexer(src io.Reader) (*Scanner, error) {
|
||||
// next reads the next rune from the bufferred reader. Returns the rune(0) if
|
||||
// an error occurs (or io.EOF is returned).
|
||||
func (s *Scanner) next() rune {
|
||||
var err error
|
||||
var size int
|
||||
s.ch, size, err = s.src.ReadRune()
|
||||
ch, size, err := s.src.ReadRune()
|
||||
if err != nil {
|
||||
return eof
|
||||
}
|
||||
|
||||
s.lastCharLen = size
|
||||
s.pos.Offset += size
|
||||
s.pos.Column += size
|
||||
// remember last position
|
||||
s.prevPos = s.currPos
|
||||
|
||||
if s.ch == '\n' {
|
||||
s.pos.Line++
|
||||
s.pos.Column = 0
|
||||
s.lastCharLen = size
|
||||
s.currPos.Offset += size
|
||||
s.currPos.Column += size
|
||||
|
||||
if ch == '\n' {
|
||||
s.currPos.Line++
|
||||
s.currPos.Column = 0
|
||||
}
|
||||
|
||||
return s.ch
|
||||
return ch
|
||||
}
|
||||
|
||||
func (s *Scanner) unread() {
|
||||
if err := s.src.UnreadRune(); err != nil {
|
||||
panic(err) // this is user fault, we should catch it
|
||||
}
|
||||
s.currPos = s.prevPos // put back last position
|
||||
}
|
||||
|
||||
func (s *Scanner) peek() rune {
|
||||
peek, _, err := s.src.ReadRune()
|
||||
if err != nil {
|
||||
return eof
|
||||
}
|
||||
|
||||
s.src.UnreadRune()
|
||||
return peek
|
||||
}
|
||||
|
||||
// Scan scans the next token and returns the token and it's literal string.
|
||||
@ -74,16 +93,19 @@ func (s *Scanner) Scan() (tok Token, lit string) {
|
||||
|
||||
// start the token position
|
||||
s.tokBuf.Reset()
|
||||
s.tokPos = s.pos.Offset - s.lastCharLen
|
||||
s.tokPos = s.currPos.Offset - s.lastCharLen
|
||||
|
||||
// identifier
|
||||
if isLetter(ch) {
|
||||
tok = IDENT
|
||||
s.scanIdentifier()
|
||||
lit = s.scanIdentifier()
|
||||
if lit == "true" || lit == "false" {
|
||||
tok = BOOL
|
||||
}
|
||||
}
|
||||
|
||||
if isDigit(ch) {
|
||||
// scan for number
|
||||
// scanDigits()
|
||||
// TODO(arslan)
|
||||
}
|
||||
|
||||
switch ch {
|
||||
@ -92,10 +114,9 @@ func (s *Scanner) Scan() (tok Token, lit string) {
|
||||
case '"':
|
||||
tok = STRING
|
||||
s.scanString()
|
||||
s.next() // move forward so we finalize the string
|
||||
}
|
||||
|
||||
s.tokEnd = s.pos.Offset - s.lastCharLen
|
||||
s.tokEnd = s.currPos.Offset
|
||||
|
||||
return tok, s.TokenLiteral()
|
||||
}
|
||||
@ -120,10 +141,16 @@ func (s *Scanner) scanString() {
|
||||
return
|
||||
}
|
||||
|
||||
func (s *Scanner) scanIdentifier() {
|
||||
for isLetter(s.ch) || isDigit(s.ch) {
|
||||
s.next()
|
||||
func (s *Scanner) scanIdentifier() string {
|
||||
offs := s.currPos.Offset - s.lastCharLen
|
||||
ch := s.next()
|
||||
for isLetter(ch) || isDigit(ch) {
|
||||
ch = s.next()
|
||||
}
|
||||
s.unread() // we got identifier, put back latest char
|
||||
|
||||
// return string(s.srcBytes[offs:(s.currPos.Offset - s.lastCharLen)])
|
||||
return string(s.srcBytes[offs:s.currPos.Offset])
|
||||
}
|
||||
|
||||
// TokenLiteral returns the literal string corresponding to the most recently
|
||||
|
@ -13,8 +13,38 @@ type token struct {
|
||||
text string
|
||||
}
|
||||
|
||||
func TestBool(t *testing.T) {
|
||||
var tokenList = []token{
|
||||
{BOOL, "true"},
|
||||
{BOOL, "false"},
|
||||
}
|
||||
|
||||
// create artifical source code
|
||||
buf := new(bytes.Buffer)
|
||||
for _, ident := range tokenList {
|
||||
fmt.Fprintf(buf, " \t%s\n", ident.text)
|
||||
}
|
||||
|
||||
l, err := NewLexer(buf)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
for _, ident := range tokenList {
|
||||
tok, lit := l.Scan()
|
||||
if tok != ident.tok {
|
||||
t.Errorf("tok = %s want %s for %s\n", tok, ident.tok, ident.text)
|
||||
}
|
||||
|
||||
if lit != ident.text {
|
||||
t.Errorf("text = %s want %s", lit, ident.text)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func TestIdent(t *testing.T) {
|
||||
var identList = []token{
|
||||
var tokenList = []token{
|
||||
{IDENT, "a"},
|
||||
{IDENT, "a0"},
|
||||
{IDENT, "foobar"},
|
||||
@ -35,7 +65,7 @@ func TestIdent(t *testing.T) {
|
||||
|
||||
// create artifical source code
|
||||
buf := new(bytes.Buffer)
|
||||
for _, ident := range identList {
|
||||
for _, ident := range tokenList {
|
||||
fmt.Fprintf(buf, " \t%s\n", ident.text)
|
||||
}
|
||||
|
||||
@ -44,7 +74,7 @@ func TestIdent(t *testing.T) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
for _, ident := range identList {
|
||||
for _, ident := range tokenList {
|
||||
tok, lit := l.Scan()
|
||||
if tok != ident.tok {
|
||||
t.Errorf("tok = %s want %s for %s\n", tok, ident.tok, ident.text)
|
||||
@ -58,7 +88,7 @@ func TestIdent(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestString(t *testing.T) {
|
||||
var identList = []token{
|
||||
var tokenList = []token{
|
||||
{STRING, `" "`},
|
||||
{STRING, `"a"`},
|
||||
{STRING, `"本"`},
|
||||
@ -83,7 +113,7 @@ func TestString(t *testing.T) {
|
||||
|
||||
// create artifical source code
|
||||
buf := new(bytes.Buffer)
|
||||
for _, ident := range identList {
|
||||
for _, ident := range tokenList {
|
||||
fmt.Fprintf(buf, " \t%s\n", ident.text)
|
||||
}
|
||||
|
||||
@ -92,7 +122,7 @@ func TestString(t *testing.T) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
for _, ident := range identList {
|
||||
for _, ident := range tokenList {
|
||||
tok, lit := l.Scan()
|
||||
if tok != ident.tok {
|
||||
t.Errorf("tok = %s want %s for %s\n", tok, ident.tok, ident.text)
|
||||
|
Loading…
x
Reference in New Issue
Block a user