2015-10-03 14:08:09 +00:00
|
|
|
package parser
|
|
|
|
|
|
|
|
import (
|
2015-10-03 16:45:57 +00:00
|
|
|
"bytes"
|
2015-10-03 14:08:09 +00:00
|
|
|
"io"
|
2015-10-03 20:50:50 +00:00
|
|
|
"io/ioutil"
|
2015-10-03 21:20:26 +00:00
|
|
|
"log"
|
2015-10-03 14:08:09 +00:00
|
|
|
"unicode"
|
|
|
|
)
|
|
|
|
|
|
|
|
// eof is the sentinel rune reported once the input has been exhausted.
const eof rune = 0
|
|
|
|
|
|
|
|
// Lexer defines a lexical scanner
|
2015-10-03 18:25:21 +00:00
|
|
|
type Scanner struct {
|
2015-10-03 20:50:50 +00:00
|
|
|
src *bytes.Buffer
|
|
|
|
srcBytes []byte
|
|
|
|
|
|
|
|
ch rune // current character
|
|
|
|
lastCharLen int // length of last character in bytes
|
|
|
|
pos Position
|
|
|
|
|
|
|
|
// Token text buffer
|
|
|
|
tokBuf bytes.Buffer
|
|
|
|
tokPos int // token text tail position (srcBuf index); valid if >= 0
|
|
|
|
tokEnd int // token text tail end (srcBuf index)
|
2015-10-03 14:08:09 +00:00
|
|
|
}
|
|
|
|
|
2015-10-03 20:50:50 +00:00
|
|
|
// NewLexer returns a new instance of Lexer. Even though src is an io.Reader,
|
|
|
|
// we fully consume the content.
|
|
|
|
func NewLexer(src io.Reader) (*Scanner, error) {
|
|
|
|
buf, err := ioutil.ReadAll(src)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
2015-10-03 14:08:09 +00:00
|
|
|
}
|
2015-10-03 20:50:50 +00:00
|
|
|
|
|
|
|
b := bytes.NewBuffer(buf)
|
|
|
|
return &Scanner{
|
|
|
|
src: b,
|
|
|
|
srcBytes: b.Bytes(),
|
|
|
|
}, nil
|
2015-10-03 14:08:09 +00:00
|
|
|
}
|
|
|
|
|
2015-10-03 17:32:27 +00:00
|
|
|
// next reads the next rune from the bufferred reader. Returns the rune(0) if
|
2015-10-03 14:08:09 +00:00
|
|
|
// an error occurs (or io.EOF is returned).
|
2015-10-03 18:25:21 +00:00
|
|
|
func (s *Scanner) next() rune {
|
2015-10-03 16:45:57 +00:00
|
|
|
var err error
|
2015-10-03 20:50:50 +00:00
|
|
|
var size int
|
|
|
|
s.ch, size, err = s.src.ReadRune()
|
2015-10-03 14:08:09 +00:00
|
|
|
if err != nil {
|
|
|
|
return eof
|
|
|
|
}
|
2015-10-03 17:32:27 +00:00
|
|
|
|
2015-10-03 20:50:50 +00:00
|
|
|
s.lastCharLen = size
|
|
|
|
s.pos.Offset += size
|
|
|
|
s.pos.Column += size
|
2015-10-03 14:08:09 +00:00
|
|
|
|
2015-10-03 20:50:50 +00:00
|
|
|
if s.ch == '\n' {
|
|
|
|
s.pos.Line++
|
|
|
|
s.pos.Column = 0
|
|
|
|
}
|
2015-10-03 14:08:09 +00:00
|
|
|
|
2015-10-03 20:50:50 +00:00
|
|
|
return s.ch
|
2015-10-03 17:33:51 +00:00
|
|
|
}
|
|
|
|
|
2015-10-03 14:08:09 +00:00
|
|
|
// Scan scans the next token and returns the token and it's literal string.
|
2015-10-03 18:25:21 +00:00
|
|
|
func (s *Scanner) Scan() (tok Token, lit string) {
|
|
|
|
ch := s.next()
|
2015-10-03 14:08:09 +00:00
|
|
|
|
2015-10-03 16:45:57 +00:00
|
|
|
// skip white space
|
|
|
|
for isWhitespace(ch) {
|
2015-10-03 18:25:21 +00:00
|
|
|
ch = s.next()
|
2015-10-03 14:08:09 +00:00
|
|
|
}
|
|
|
|
|
2015-10-03 20:50:50 +00:00
|
|
|
// start the token position
|
|
|
|
s.tokBuf.Reset()
|
|
|
|
s.tokPos = s.pos.Offset - s.lastCharLen
|
|
|
|
|
2015-10-03 16:45:57 +00:00
|
|
|
// identifier
|
|
|
|
if isLetter(ch) {
|
2015-10-03 20:50:50 +00:00
|
|
|
tok = IDENT
|
2015-10-03 21:20:26 +00:00
|
|
|
s.scanIdentifier()
|
2015-10-03 20:50:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if isDigit(ch) {
|
|
|
|
// scan for number
|
2015-10-03 16:45:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
switch ch {
|
|
|
|
case eof:
|
2015-10-03 20:50:50 +00:00
|
|
|
tok = EOF
|
2015-10-03 21:20:26 +00:00
|
|
|
case '"':
|
|
|
|
tok = STRING
|
|
|
|
s.scanString()
|
|
|
|
s.next() // move forward so we finalize the string
|
2015-10-03 16:45:57 +00:00
|
|
|
}
|
|
|
|
|
2015-10-03 20:50:50 +00:00
|
|
|
s.tokEnd = s.pos.Offset - s.lastCharLen
|
2015-10-03 14:08:09 +00:00
|
|
|
|
2015-10-03 20:50:50 +00:00
|
|
|
return tok, s.TokenLiteral()
|
|
|
|
}
|
2015-10-03 16:45:57 +00:00
|
|
|
|
2015-10-03 21:20:26 +00:00
|
|
|
func (s *Scanner) scanString() {
|
|
|
|
// '"' opening already consumed
|
|
|
|
ch := s.next() // read character after quote
|
|
|
|
for ch != '"' {
|
|
|
|
if ch == '\n' || ch < 0 {
|
|
|
|
log.Println("[ERROR] literal not terminated")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
if ch == '\\' {
|
|
|
|
// scanEscape
|
|
|
|
return
|
|
|
|
} else {
|
|
|
|
ch = s.next()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2015-10-03 20:50:50 +00:00
|
|
|
func (s *Scanner) scanIdentifier() {
|
2015-10-03 18:25:21 +00:00
|
|
|
for isLetter(s.ch) || isDigit(s.ch) {
|
|
|
|
s.next()
|
2015-10-03 18:06:30 +00:00
|
|
|
}
|
2015-10-03 20:50:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// TokenLiteral returns the literal string corresponding to the most recently
|
|
|
|
// scanned token.
|
|
|
|
func (s *Scanner) TokenLiteral() string {
|
|
|
|
if s.tokPos < 0 {
|
|
|
|
// no token text
|
|
|
|
return ""
|
|
|
|
}
|
2015-10-03 16:45:57 +00:00
|
|
|
|
2015-10-03 20:50:50 +00:00
|
|
|
// part of the token text was saved in tokBuf: save the rest in
|
|
|
|
// tokBuf as well and return its content
|
|
|
|
s.tokBuf.Write(s.srcBytes[s.tokPos:s.tokEnd])
|
|
|
|
s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
|
|
|
|
return s.tokBuf.String()
|
2015-10-03 14:08:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Pos returns the position of the character immediately after the character or
|
|
|
|
// token returned by the last call to Next or Scan.
|
2015-10-03 18:25:21 +00:00
|
|
|
func (s *Scanner) Pos() Position {
|
2015-10-03 14:08:09 +00:00
|
|
|
return Position{}
|
|
|
|
}
|
|
|
|
|
|
|
|
// isSpace reports whether r is a horizontal blank: a space or a tab.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	}
	return false
}
|
|
|
|
|
|
|
|
// isEndOfLine reports whether r terminates a line, i.e. is a carriage return
// or a line feed.
func isEndOfLine(r rune) bool {
	switch r {
	case '\r', '\n':
		return true
	}
	return false
}
|
|
|
|
|
|
|
|
// isLetter reports whether ch may appear in an identifier as a letter: an
// ASCII letter, an underscore, or a non-ASCII rune that Unicode classifies as
// a letter.
func isLetter(ch rune) bool {
	switch {
	case 'a' <= ch && ch <= 'z', 'A' <= ch && ch <= 'Z':
		return true
	case ch == '_':
		return true
	case ch < 0x80:
		// Any other ASCII character is not a letter.
		return false
	}
	return unicode.IsLetter(ch)
}
|
|
|
|
|
|
|
|
// isDigit reports whether ch is a decimal digit: ASCII '0' through '9', or a
// non-ASCII rune that Unicode classifies as a digit.
func isDigit(ch rune) bool {
	if ch < 0x80 {
		return ch >= '0' && ch <= '9'
	}
	return unicode.IsDigit(ch)
}
|
|
|
|
|
|
|
|
// isWhitespace reports whether ch is a space, tab, newline or carriage
// return.
func isWhitespace(ch rune) bool {
	switch ch {
	case ' ', '\t', '\n', '\r':
		return true
	}
	return false
}
|