lexer: more robust implementation

This commit is contained in:
Fatih Arslan 2015-10-04 01:29:13 +03:00
parent 97fb05dd4a
commit 32ad59fcd7
2 changed files with 89 additions and 32 deletions

View File

@ -16,14 +16,15 @@ type Scanner struct {
src *bytes.Buffer
srcBytes []byte
ch rune // current character
lastCharLen int // length of last character in bytes
pos Position
// ch rune // current character
lastCharLen int // length of last character in bytes
// Token text buffer
tokBuf bytes.Buffer
tokPos int // token text tail position (srcBuf index); valid if >= 0
tokEnd int // token text tail end (srcBuf index)
currPos Position // current position
prevPos Position // previous position
tokBuf bytes.Buffer // token text buffer
tokPos int // token text tail position (srcBuf index); valid if >= 0
tokEnd int // token text tail end (srcBuf index)
}
// NewLexer returns a new instance of Lexer. Even though src is an io.Reader,
@ -44,23 +45,41 @@ func NewLexer(src io.Reader) (*Scanner, error) {
// next reads the next rune from the buffered reader. It returns rune(0) if
// an error occurs (including io.EOF).
func (s *Scanner) next() rune {
var err error
var size int
s.ch, size, err = s.src.ReadRune()
ch, size, err := s.src.ReadRune()
if err != nil {
return eof
}
s.lastCharLen = size
s.pos.Offset += size
s.pos.Column += size
// remember last position
s.prevPos = s.currPos
if s.ch == '\n' {
s.pos.Line++
s.pos.Column = 0
s.lastCharLen = size
s.currPos.Offset += size
s.currPos.Column += size
if ch == '\n' {
s.currPos.Line++
s.currPos.Column = 0
}
return s.ch
return ch
}
// unread pushes the most recently read rune back onto the source buffer and
// rewinds the current position to the one saved before that read.
//
// It panics if the underlying buffer refuses to unread.
func (s *Scanner) unread() {
	err := s.src.UnreadRune()
	if err != nil {
		// this is user fault, we should catch it
		panic(err)
	}

	// put back last position
	s.currPos = s.prevPos
}
// peek returns the next rune without consuming it, or eof if no rune can be
// read.
//
// The rune is decoded through a temporary reader over the unread bytes of
// s.src rather than with a ReadRune/UnreadRune round trip on s.src itself.
// After a round trip the buffer no longer accepts another UnreadRune, so
// calling unread() right after peek() would panic. Reading through a separate
// reader leaves both s.src and the scanner positions untouched.
func (s *Scanner) peek() rune {
	ch, _, err := bytes.NewReader(s.src.Bytes()).ReadRune()
	if err != nil {
		return eof
	}
	return ch
}
// Scan scans the next token and returns the token and its literal string.
@ -74,16 +93,19 @@ func (s *Scanner) Scan() (tok Token, lit string) {
// start the token position
s.tokBuf.Reset()
s.tokPos = s.pos.Offset - s.lastCharLen
s.tokPos = s.currPos.Offset - s.lastCharLen
// identifier
if isLetter(ch) {
tok = IDENT
s.scanIdentifier()
lit = s.scanIdentifier()
if lit == "true" || lit == "false" {
tok = BOOL
}
}
if isDigit(ch) {
// scan for number
// scanDigits()
// TODO(arslan)
}
switch ch {
@ -92,10 +114,9 @@ func (s *Scanner) Scan() (tok Token, lit string) {
case '"':
tok = STRING
s.scanString()
s.next() // move forward so we finalize the string
}
s.tokEnd = s.pos.Offset - s.lastCharLen
s.tokEnd = s.currPos.Offset
return tok, s.TokenLiteral()
}
@ -120,10 +141,16 @@ func (s *Scanner) scanString() {
return
}
func (s *Scanner) scanIdentifier() {
for isLetter(s.ch) || isDigit(s.ch) {
s.next()
// scanIdentifier consumes the remaining letters and digits of an identifier
// and returns its text, sliced out of s.srcBytes.
//
// The start offset is taken as the current offset minus the length of the last
// character read, so the first character of the identifier is expected to have
// been read already by the caller.
func (s *Scanner) scanIdentifier() string {
	offs := s.currPos.Offset - s.lastCharLen

	for {
		before := s.currPos.Offset
		ch := s.next()
		if isLetter(ch) || isDigit(ch) {
			continue
		}

		// Put the terminating character back, but only if one was actually
		// read. When next fails (end of input) it leaves the position as is,
		// and unreading after a failed read would make unread panic.
		if s.currPos.Offset != before {
			s.unread()
		}
		break
	}

	return string(s.srcBytes[offs:s.currPos.Offset])
}
// TokenLiteral returns the literal string corresponding to the most recently

View File

@ -13,8 +13,38 @@ type token struct {
text string
}
// TestBool checks that Scan reports the words "true" and "false" as BOOL
// tokens and returns their text unchanged as the literal.
func TestBool(t *testing.T) {
	cases := []token{
		{BOOL, "true"},
		{BOOL, "false"},
	}

	// Build an artificial source: one literal per line, each preceded by a
	// space and a tab.
	var src bytes.Buffer
	for _, c := range cases {
		fmt.Fprintf(&src, " \t%s\n", c.text)
	}

	l, err := NewLexer(&src)
	if err != nil {
		t.Fatal(err)
	}

	// Tokens must come back in the order they were written.
	for i := range cases {
		want := cases[i]
		tok, lit := l.Scan()
		if tok != want.tok {
			t.Errorf("tok = %s want %s for %s\n", tok, want.tok, want.text)
		}
		if lit != want.text {
			t.Errorf("text = %s want %s", lit, want.text)
		}
	}
}
func TestIdent(t *testing.T) {
var identList = []token{
var tokenList = []token{
{IDENT, "a"},
{IDENT, "a0"},
{IDENT, "foobar"},
@ -35,7 +65,7 @@ func TestIdent(t *testing.T) {
// create artificial source code
buf := new(bytes.Buffer)
for _, ident := range identList {
for _, ident := range tokenList {
fmt.Fprintf(buf, " \t%s\n", ident.text)
}
@ -44,7 +74,7 @@ func TestIdent(t *testing.T) {
t.Fatal(err)
}
for _, ident := range identList {
for _, ident := range tokenList {
tok, lit := l.Scan()
if tok != ident.tok {
t.Errorf("tok = %s want %s for %s\n", tok, ident.tok, ident.text)
@ -58,7 +88,7 @@ func TestIdent(t *testing.T) {
}
func TestString(t *testing.T) {
var identList = []token{
var tokenList = []token{
{STRING, `" "`},
{STRING, `"a"`},
{STRING, `"本"`},
@ -83,7 +113,7 @@ func TestString(t *testing.T) {
// create artificial source code
buf := new(bytes.Buffer)
for _, ident := range identList {
for _, ident := range tokenList {
fmt.Fprintf(buf, " \t%s\n", ident.text)
}
@ -92,7 +122,7 @@ func TestString(t *testing.T) {
t.Fatal(err)
}
for _, ident := range identList {
for _, ident := range tokenList {
tok, lit := l.Scan()
if tok != ident.tok {
t.Errorf("tok = %s want %s for %s\n", tok, ident.tok, ident.text)