lexer: more robust implementation

This commit is contained in:
Fatih Arslan 2015-10-04 01:29:13 +03:00
parent 97fb05dd4a
commit 32ad59fcd7
2 changed files with 89 additions and 32 deletions

View File

@ -16,14 +16,15 @@ type Scanner struct {
src *bytes.Buffer src *bytes.Buffer
srcBytes []byte srcBytes []byte
ch rune // current character // ch rune // current character
lastCharLen int // length of last character in bytes lastCharLen int // length of last character in bytes
pos Position
// Token text buffer currPos Position // current position
tokBuf bytes.Buffer prevPos Position // previous position
tokPos int // token text tail position (srcBuf index); valid if >= 0
tokEnd int // token text tail end (srcBuf index) tokBuf bytes.Buffer // token text buffer
tokPos int // token text tail position (srcBuf index); valid if >= 0
tokEnd int // token text tail end (srcBuf index)
} }
// NewLexer returns a new instance of Lexer. Even though src is an io.Reader, // NewLexer returns a new instance of Lexer. Even though src is an io.Reader,
@ -44,23 +45,41 @@ func NewLexer(src io.Reader) (*Scanner, error) {
// next reads the next rune from the buffered reader. Returns the rune(0) if // next reads the next rune from the buffered reader. Returns the rune(0) if
// an error occurs (or io.EOF is returned). // an error occurs (or io.EOF is returned).
func (s *Scanner) next() rune { func (s *Scanner) next() rune {
var err error ch, size, err := s.src.ReadRune()
var size int
s.ch, size, err = s.src.ReadRune()
if err != nil { if err != nil {
return eof return eof
} }
s.lastCharLen = size // remember last position
s.pos.Offset += size s.prevPos = s.currPos
s.pos.Column += size
if s.ch == '\n' { s.lastCharLen = size
s.pos.Line++ s.currPos.Offset += size
s.pos.Column = 0 s.currPos.Column += size
if ch == '\n' {
s.currPos.Line++
s.currPos.Column = 0
} }
return s.ch return ch
}
// unread pushes the most recently read rune back onto the source buffer and
// resets currPos to the value held in prevPos. It panics if the source
// buffer reports an error for the unread.
func (s *Scanner) unread() {
	err := s.src.UnreadRune()
	if err != nil {
		// this is user fault, we should catch it
		panic(err)
	}

	// put back last position
	s.currPos = s.prevPos
}
func (s *Scanner) peek() rune {
peek, _, err := s.src.ReadRune()
if err != nil {
return eof
}
s.src.UnreadRune()
return peek
} }
// Scan scans the next token and returns the token and its literal string. // Scan scans the next token and returns the token and its literal string.
@ -74,16 +93,19 @@ func (s *Scanner) Scan() (tok Token, lit string) {
// start the token position // start the token position
s.tokBuf.Reset() s.tokBuf.Reset()
s.tokPos = s.pos.Offset - s.lastCharLen s.tokPos = s.currPos.Offset - s.lastCharLen
// identifier
if isLetter(ch) { if isLetter(ch) {
tok = IDENT tok = IDENT
s.scanIdentifier() lit = s.scanIdentifier()
if lit == "true" || lit == "false" {
tok = BOOL
}
} }
if isDigit(ch) { if isDigit(ch) {
// scan for number // scanDigits()
// TODO(arslan)
} }
switch ch { switch ch {
@ -92,10 +114,9 @@ func (s *Scanner) Scan() (tok Token, lit string) {
case '"': case '"':
tok = STRING tok = STRING
s.scanString() s.scanString()
s.next() // move forward so we finalize the string
} }
s.tokEnd = s.pos.Offset - s.lastCharLen s.tokEnd = s.currPos.Offset
return tok, s.TokenLiteral() return tok, s.TokenLiteral()
} }
@ -120,10 +141,16 @@ func (s *Scanner) scanString() {
return return
} }
func (s *Scanner) scanIdentifier() { func (s *Scanner) scanIdentifier() string {
for isLetter(s.ch) || isDigit(s.ch) { offs := s.currPos.Offset - s.lastCharLen
s.next() ch := s.next()
for isLetter(ch) || isDigit(ch) {
ch = s.next()
} }
s.unread() // we got identifier, put back latest char
// return string(s.srcBytes[offs:(s.currPos.Offset - s.lastCharLen)])
return string(s.srcBytes[offs:s.currPos.Offset])
} }
// TokenLiteral returns the literal string corresponding to the most recently // TokenLiteral returns the literal string corresponding to the most recently

View File

@ -13,8 +13,38 @@ type token struct {
text string text string
} }
// TestBool feeds the literals "true" and "false" to the lexer and checks that
// each one is scanned as a BOOL token whose literal text matches the input.
func TestBool(t *testing.T) {
	expected := []token{
		{BOOL, "true"},
		{BOOL, "false"},
	}

	// create artificial source code: one literal per line, each preceded by
	// a space and a tab
	var src bytes.Buffer
	for i := range expected {
		fmt.Fprintf(&src, " \t%s\n", expected[i].text)
	}

	lex, err := NewLexer(&src)
	if err != nil {
		t.Fatal(err)
	}

	// scan once per expected entry and compare token kind and literal
	for _, want := range expected {
		gotTok, gotLit := lex.Scan()

		if gotTok != want.tok {
			t.Errorf("tok = %s want %s for %s\n", gotTok, want.tok, want.text)
		}

		if gotLit != want.text {
			t.Errorf("text = %s want %s", gotLit, want.text)
		}
	}
}
func TestIdent(t *testing.T) { func TestIdent(t *testing.T) {
var identList = []token{ var tokenList = []token{
{IDENT, "a"}, {IDENT, "a"},
{IDENT, "a0"}, {IDENT, "a0"},
{IDENT, "foobar"}, {IDENT, "foobar"},
@ -35,7 +65,7 @@ func TestIdent(t *testing.T) {
// create artificial source code // create artificial source code
buf := new(bytes.Buffer) buf := new(bytes.Buffer)
for _, ident := range identList { for _, ident := range tokenList {
fmt.Fprintf(buf, " \t%s\n", ident.text) fmt.Fprintf(buf, " \t%s\n", ident.text)
} }
@ -44,7 +74,7 @@ func TestIdent(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
for _, ident := range identList { for _, ident := range tokenList {
tok, lit := l.Scan() tok, lit := l.Scan()
if tok != ident.tok { if tok != ident.tok {
t.Errorf("tok = %s want %s for %s\n", tok, ident.tok, ident.text) t.Errorf("tok = %s want %s for %s\n", tok, ident.tok, ident.text)
@ -58,7 +88,7 @@ func TestIdent(t *testing.T) {
} }
func TestString(t *testing.T) { func TestString(t *testing.T) {
var identList = []token{ var tokenList = []token{
{STRING, `" "`}, {STRING, `" "`},
{STRING, `"a"`}, {STRING, `"a"`},
{STRING, `"本"`}, {STRING, `"本"`},
@ -83,7 +113,7 @@ func TestString(t *testing.T) {
// create artificial source code // create artificial source code
buf := new(bytes.Buffer) buf := new(bytes.Buffer)
for _, ident := range identList { for _, ident := range tokenList {
fmt.Fprintf(buf, " \t%s\n", ident.text) fmt.Fprintf(buf, " \t%s\n", ident.text)
} }
@ -92,7 +122,7 @@ func TestString(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
for _, ident := range identList { for _, ident := range tokenList {
tok, lit := l.Scan() tok, lit := l.Scan()
if tok != ident.tok { if tok != ident.tok {
t.Errorf("tok = %s want %s for %s\n", tok, ident.tok, ident.text) t.Errorf("tok = %s want %s for %s\n", tok, ident.tok, ident.text)