lexer: implement positions
This commit is contained in:
parent
4b72853d41
commit
cc7a91eeab
@ -1,9 +1,9 @@
|
|||||||
package parser
|
package parser
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
|
||||||
"bytes"
|
"bytes"
|
||||||
"io"
|
"io"
|
||||||
|
"io/ioutil"
|
||||||
"unicode"
|
"unicode"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -12,40 +12,56 @@ const eof = rune(0)
|
|||||||
|
|
||||||
// Lexer defines a lexical scanner
|
// Lexer defines a lexical scanner
|
||||||
type Scanner struct {
|
type Scanner struct {
|
||||||
src *bufio.Reader // input
|
src *bytes.Buffer
|
||||||
|
srcBytes []byte
|
||||||
|
|
||||||
ch rune // current character
|
ch rune // current character
|
||||||
|
lastCharLen int // length of last character in bytes
|
||||||
|
pos Position
|
||||||
|
|
||||||
|
// Token text buffer
|
||||||
|
tokBuf bytes.Buffer
|
||||||
|
tokPos int // token text tail position (srcBuf index); valid if >= 0
|
||||||
|
tokEnd int // token text tail end (srcBuf index)
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewLexer returns a new instance of Lexer.
|
// NewLexer returns a new instance of Lexer. Even though src is an io.Reader,
|
||||||
func NewLexer(src io.Reader) *Scanner {
|
// we fully consume the content.
|
||||||
return &Scanner{
|
func NewLexer(src io.Reader) (*Scanner, error) {
|
||||||
src: bufio.NewReader(src),
|
buf, err := ioutil.ReadAll(src)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
b := bytes.NewBuffer(buf)
|
||||||
|
return &Scanner{
|
||||||
|
src: b,
|
||||||
|
srcBytes: b.Bytes(),
|
||||||
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// next reads the next rune from the bufferred reader. Returns the rune(0) if
|
// next reads the next rune from the bufferred reader. Returns the rune(0) if
|
||||||
// an error occurs (or io.EOF is returned).
|
// an error occurs (or io.EOF is returned).
|
||||||
func (s *Scanner) next() rune {
|
func (s *Scanner) next() rune {
|
||||||
var err error
|
var err error
|
||||||
s.ch, _, err = s.src.ReadRune()
|
var size int
|
||||||
|
s.ch, size, err = s.src.ReadRune()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return eof
|
return eof
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.lastCharLen = size
|
||||||
|
s.pos.Offset += size
|
||||||
|
s.pos.Column += size
|
||||||
|
|
||||||
|
if s.ch == '\n' {
|
||||||
|
s.pos.Line++
|
||||||
|
s.pos.Column = 0
|
||||||
|
}
|
||||||
|
|
||||||
return s.ch
|
return s.ch
|
||||||
}
|
}
|
||||||
|
|
||||||
// unread places the previously read rune back on the reader.
|
|
||||||
func (s *Scanner) unread() { _ = s.src.UnreadRune() }
|
|
||||||
|
|
||||||
func (s *Scanner) peek() rune {
|
|
||||||
prev := s.ch
|
|
||||||
peekCh := s.next()
|
|
||||||
s.unread()
|
|
||||||
s.ch = prev
|
|
||||||
return peekCh
|
|
||||||
}
|
|
||||||
|
|
||||||
// Scan scans the next token and returns the token and it's literal string.
|
// Scan scans the next token and returns the token and it's literal string.
|
||||||
func (s *Scanner) Scan() (tok Token, lit string) {
|
func (s *Scanner) Scan() (tok Token, lit string) {
|
||||||
ch := s.next()
|
ch := s.next()
|
||||||
@ -55,29 +71,49 @@ func (s *Scanner) Scan() (tok Token, lit string) {
|
|||||||
ch = s.next()
|
ch = s.next()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// start the token position
|
||||||
|
s.tokBuf.Reset()
|
||||||
|
s.tokPos = s.pos.Offset - s.lastCharLen
|
||||||
|
|
||||||
// identifier
|
// identifier
|
||||||
if isLetter(ch) {
|
if isLetter(ch) {
|
||||||
return s.scanIdentifier()
|
s.scanIdentifier()
|
||||||
|
tok = IDENT
|
||||||
|
}
|
||||||
|
|
||||||
|
if isDigit(ch) {
|
||||||
|
// scan for number
|
||||||
}
|
}
|
||||||
|
|
||||||
switch ch {
|
switch ch {
|
||||||
case eof:
|
case eof:
|
||||||
return EOF, ""
|
tok = EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0, ""
|
s.tokEnd = s.pos.Offset - s.lastCharLen
|
||||||
|
|
||||||
|
return tok, s.TokenLiteral()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scanner) scanIdentifier() (Token, string) {
|
func (s *Scanner) scanIdentifier() {
|
||||||
// Create a buffer and read the current character into it.
|
|
||||||
var buf bytes.Buffer
|
|
||||||
|
|
||||||
for isLetter(s.ch) || isDigit(s.ch) {
|
for isLetter(s.ch) || isDigit(s.ch) {
|
||||||
buf.WriteRune(s.ch)
|
|
||||||
s.next()
|
s.next()
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return IDENT, buf.String()
|
// TokenLiteral returns the literal string corresponding to the most recently
|
||||||
|
// scanned token.
|
||||||
|
func (s *Scanner) TokenLiteral() string {
|
||||||
|
if s.tokPos < 0 {
|
||||||
|
// no token text
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// part of the token text was saved in tokBuf: save the rest in
|
||||||
|
// tokBuf as well and return its content
|
||||||
|
s.tokBuf.Write(s.srcBytes[s.tokPos:s.tokEnd])
|
||||||
|
s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
|
||||||
|
return s.tokBuf.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pos returns the position of the character immediately after the character or
|
// Pos returns the position of the character immediately after the character or
|
||||||
|
@ -37,7 +37,10 @@ func TestIdent(t *testing.T) {
|
|||||||
fmt.Fprintf(buf, " \t%s\n", ident.text)
|
fmt.Fprintf(buf, " \t%s\n", ident.text)
|
||||||
}
|
}
|
||||||
|
|
||||||
l := NewLexer(buf)
|
l, err := NewLexer(buf)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
for _, ident := range identList {
|
for _, ident := range identList {
|
||||||
tok, lit := l.Scan()
|
tok, lit := l.Scan()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user