hcl/scanner/scanner.go

409 lines
8.4 KiB
Go
Raw Normal View History

2015-10-04 17:16:43 +00:00
package scanner
2015-10-03 14:08:09 +00:00
import (
"bytes"
2015-10-04 19:01:10 +00:00
"fmt"
2015-10-03 14:08:09 +00:00
"io"
2015-10-03 20:50:50 +00:00
"io/ioutil"
2015-10-04 19:01:10 +00:00
"os"
2015-10-03 14:08:09 +00:00
"unicode"
2015-10-04 17:19:39 +00:00
"github.com/fatih/hcl/token"
2015-10-03 14:08:09 +00:00
)
// eof is the sentinel rune the scanner hands back when no further rune can be
// read from the input.
const eof rune = 0
2015-10-04 17:22:37 +00:00
// Scanner defines a lexical scanner
2015-10-03 18:25:21 +00:00
type Scanner struct {
2015-10-03 20:50:50 +00:00
src *bytes.Buffer
srcBytes []byte
2015-10-03 22:29:13 +00:00
lastCharLen int // length of last character in bytes
currPos Position // current position
prevPos Position // previous position
tokBuf bytes.Buffer // token text buffer
tokPos int // token text tail position (srcBuf index); valid if >= 0
tokEnd int // token text tail end (srcBuf index)
2015-10-04 19:01:10 +00:00
// Error is called for each error encountered. If no Error
// function is set, the error is reported to os.Stderr.
Error func(pos Position, msg string)
// ErrorCount is incremented by one for each error encountered.
ErrorCount int
2015-10-03 14:08:09 +00:00
}
2015-10-04 17:22:37 +00:00
// NewScanner returns a new instance of Lexer. Even though src is an io.Reader,
2015-10-03 20:50:50 +00:00
// we fully consume the content.
2015-10-04 17:22:37 +00:00
func NewScanner(src io.Reader) (*Scanner, error) {
2015-10-03 20:50:50 +00:00
buf, err := ioutil.ReadAll(src)
if err != nil {
return nil, err
2015-10-03 14:08:09 +00:00
}
2015-10-03 20:50:50 +00:00
b := bytes.NewBuffer(buf)
return &Scanner{
src: b,
srcBytes: b.Bytes(),
}, nil
2015-10-03 14:08:09 +00:00
}
2015-10-03 17:32:27 +00:00
// next reads the next rune from the bufferred reader. Returns the rune(0) if
2015-10-03 14:08:09 +00:00
// an error occurs (or io.EOF is returned).
2015-10-03 18:25:21 +00:00
func (s *Scanner) next() rune {
2015-10-03 22:29:13 +00:00
ch, size, err := s.src.ReadRune()
2015-10-03 14:08:09 +00:00
if err != nil {
return eof
}
2015-10-03 17:32:27 +00:00
2015-10-03 22:29:13 +00:00
// remember last position
s.prevPos = s.currPos
2015-10-03 20:50:50 +00:00
s.lastCharLen = size
2015-10-03 22:29:13 +00:00
s.currPos.Offset += size
s.currPos.Column += size
if ch == '\n' {
s.currPos.Line++
s.currPos.Column = 0
}
return ch
}
// unread puts the rune most recently returned by next back into the source
// and restores the position that was current before that rune was read.
//
// The underlying buffer only allows a rune to be unread directly after a
// successful rune read. That is not the case once next has run into the end
// of the input, and every token that ends the input triggers exactly this
// sequence. In that situation nothing was consumed and the position was not
// advanced, so the call is a no-op instead of a panic.
func (s *Scanner) unread() {
	if err := s.src.UnreadRune(); err != nil {
		// nothing was read, hence nothing to put back or to roll back
		return
	}
	s.currPos = s.prevPos // put back last position
}
2015-10-03 14:08:09 +00:00
2015-10-03 22:29:13 +00:00
func (s *Scanner) peek() rune {
peek, _, err := s.src.ReadRune()
if err != nil {
return eof
2015-10-03 20:50:50 +00:00
}
2015-10-03 14:08:09 +00:00
2015-10-03 22:29:13 +00:00
s.src.UnreadRune()
return peek
2015-10-03 17:33:51 +00:00
}
2015-10-03 22:35:29 +00:00
// Scan scans the next token and returns the token.
2015-10-04 17:19:39 +00:00
func (s *Scanner) Scan() (tok token.Token) {
2015-10-03 18:25:21 +00:00
ch := s.next()
2015-10-03 14:08:09 +00:00
// skip white space
for isWhitespace(ch) {
2015-10-03 18:25:21 +00:00
ch = s.next()
2015-10-03 14:08:09 +00:00
}
2015-10-03 20:50:50 +00:00
// start the token position
s.tokBuf.Reset()
2015-10-03 22:29:13 +00:00
s.tokPos = s.currPos.Offset - s.lastCharLen
2015-10-03 20:50:50 +00:00
switch {
case isLetter(ch):
2015-10-04 17:19:39 +00:00
tok = token.IDENT
2015-10-03 22:35:29 +00:00
lit := s.scanIdentifier()
2015-10-03 22:29:13 +00:00
if lit == "true" || lit == "false" {
2015-10-04 17:19:39 +00:00
tok = token.BOOL
2015-10-03 22:29:13 +00:00
}
2015-10-04 20:21:34 +00:00
case isDecimal(ch):
tok = s.scanNumber(ch)
default:
switch ch {
case eof:
tok = token.EOF
case '"':
tok = token.STRING
s.scanString()
case '#':
tok = token.COMMENT
s.scanComment(ch)
case '.':
2015-10-05 10:12:48 +00:00
ch = s.peek()
if isDecimal(ch) {
tok = token.FLOAT
ch = s.scanMantissa(ch)
ch = s.scanExponent(ch)
2015-10-05 10:12:48 +00:00
} else {
tok = token.PERIOD
}
2015-10-05 10:12:48 +00:00
case '[':
tok = token.LBRACK
case ']':
tok = token.RBRACK
case '{':
tok = token.LBRACE
case '}':
tok = token.RBRACE
case ',':
tok = token.COMMA
case '=':
tok = token.ASSIGN
case '+':
tok = token.ADD
case '-':
tok = token.SUB
}
}
2015-10-03 22:29:13 +00:00
s.tokEnd = s.currPos.Offset
2015-10-03 22:35:29 +00:00
return tok
2015-10-03 20:50:50 +00:00
}
// scanComment consumes the remainder of a line comment. ch is the rune that
// opened the comment; only '#' is recognised and any other value makes the
// call a no-op. The newline that ends the comment is not part of it and is put
// back, so the following scan sees it.
func (s *Scanner) scanComment(ch rune) {
	if ch != '#' {
		return
	}

	ch = s.next()
	// eof is rune(0), so it has to be tested for explicitly: a sign check
	// on ch never becomes false and would spin forever on a comment that
	// ends the input without a trailing newline.
	for ch != '\n' && ch != eof {
		ch = s.next()
	}

	// At the end of the input no rune was consumed, so there is nothing
	// to put back.
	if ch != eof {
		s.unread()
	}
}
2015-10-04 20:21:34 +00:00
// scanNumber scans a HCL number definition starting with the given rune
func (s *Scanner) scanNumber(ch rune) token.Token {
if ch == '0' {
2015-10-05 08:56:11 +00:00
// check for hexadecimal, octal or float
ch = s.next()
if ch == 'x' || ch == 'X' {
2015-10-05 08:56:11 +00:00
// hexadecimal
ch = s.next()
2015-10-05 08:56:11 +00:00
found := false
for isHexadecimal(ch) {
ch = s.next()
found = true
}
s.unread()
if !found {
s.err("illegal hexadecimal number")
}
return token.NUMBER
}
2015-10-05 08:56:11 +00:00
// now it's either something like: 0421(octal) or 0.1231(float)
illegalOctal := false
2015-10-05 09:26:22 +00:00
for isDecimal(ch) {
2015-10-05 08:56:11 +00:00
ch = s.next()
if ch == '8' || ch == '9' {
2015-10-05 09:26:22 +00:00
// this is just a possibility. For example 0159 is illegal, but
2015-10-05 09:59:55 +00:00
// 0159.23 is valid. So we mark a possible illegal octal. If
// the next character is not a period, we'll print the error.
2015-10-05 08:56:11 +00:00
illegalOctal = true
2015-10-05 09:26:22 +00:00
2015-10-05 08:56:11 +00:00
}
2015-10-05 09:59:55 +00:00
2015-10-05 08:56:11 +00:00
}
s.unread()
if ch == '.' || ch == 'e' || ch == 'E' {
2015-10-05 09:59:55 +00:00
ch = s.next()
2015-10-05 09:26:22 +00:00
ch = s.scanFraction(ch)
ch = s.scanExponent(ch)
2015-10-05 08:56:11 +00:00
return token.FLOAT
}
if illegalOctal {
s.err("illegal octal number")
}
return token.NUMBER
2015-10-04 20:21:34 +00:00
}
2015-10-05 09:26:22 +00:00
ch = s.scanMantissa(ch)
if ch == '.' || ch == 'e' || ch == 'E' {
2015-10-05 09:59:55 +00:00
ch = s.next() // seek forward
2015-10-05 09:26:22 +00:00
ch = s.scanFraction(ch)
ch = s.scanExponent(ch)
return token.FLOAT
}
2015-10-04 20:21:34 +00:00
return token.NUMBER
}
2015-10-05 09:59:55 +00:00
// scanMantissa consumes a run of decimal digits, ch being the first rune to
// look at. It returns the first rune that is not a decimal digit, which lets
// callers decide whether a fraction or an exponent follows. If at least one
// digit was seen, that terminating rune is put back into the input; if ch is
// not a digit, nothing is read and ch is returned unchanged.
func (s *Scanner) scanMantissa(ch rune) rune {
	if !isDecimal(ch) {
		return ch
	}

	for isDecimal(ch) {
		ch = s.next()
	}
	s.unread()
	return ch
}
2015-10-05 09:26:22 +00:00
func (s *Scanner) scanFraction(ch rune) rune {
if ch == '.' {
2015-10-05 10:03:46 +00:00
ch = s.peek() // we peek just to see if we can move forward
ch = s.scanMantissa(ch)
2015-10-05 09:26:22 +00:00
}
return ch
}
// scanExponent scans the exponent part of a float. ch is expected to be the
// 'e' or 'E' marker; for any other rune the call does nothing and returns ch.
// After the marker an optional sign is read, followed by the exponent digits;
// the result is whatever scanMantissa returns for those digits.
func (s *Scanner) scanExponent(ch rune) rune {
	if ch != 'e' && ch != 'E' {
		return ch
	}

	ch = s.next()
	switch ch {
	case '-', '+':
		// skip the sign
		ch = s.next()
	}
	return s.scanMantissa(ch)
}
2015-10-04 19:17:59 +00:00
// scanString scans a quoted string
2015-10-03 21:20:26 +00:00
func (s *Scanner) scanString() {
2015-10-04 19:01:10 +00:00
for {
// '"' opening already consumed
// read character after quote
ch := s.next()
if ch == '\n' || ch < 0 || ch == eof {
s.err("literal not terminated")
2015-10-03 21:20:26 +00:00
return
}
2015-10-04 19:01:10 +00:00
if ch == '"' {
break
}
2015-10-03 21:20:26 +00:00
if ch == '\\' {
2015-10-04 19:01:10 +00:00
s.scanEscape()
2015-10-03 21:20:26 +00:00
}
}
return
}
2015-10-04 19:01:10 +00:00
// scanEscape scans an escape sequence; the introducing backslash has already
// been consumed. Single character escapes are accepted as they are, numeric
// escapes delegate to scanDigits, and anything else is reported as "illegal
// char escape". It returns the escape character for the simple and the
// illegal cases and the result of scanDigits for numeric escapes.
//
// See http://en.cppreference.com/w/cpp/language/escape
func (s *Scanner) scanEscape() rune {
	esc := s.next() // the rune right after the backslash

	var base, width int
	switch esc {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
		// simple escape, nothing more to read
		return esc
	case '0', '1', '2', '3', '4', '5', '6', '7':
		// octal notation; the rune just read is already the first digit
		return s.scanDigits(esc, 8, 3)
	case 'x':
		// hexadecimal notation
		base, width = 16, 2
	case 'u':
		// universal character name, short form
		base, width = 16, 4
	case 'U':
		// universal character name, long form
		base, width = 16, 8
	default:
		s.err("illegal char escape")
		return esc
	}

	// for the letter-prefixed forms the digits start with the next rune
	return s.scanDigits(s.next(), base, width)
}
// scanDigits scans up to n digits of the given base, ch being the first
// candidate digit, which has already been consumed. An octal escape such as
// \101 is handled by scanDigits(ch, 8, 3). Finding fewer than n digits is
// reported as "illegal char escape". The rune read last is put back into the
// input and also returned.
func (s *Scanner) scanDigits(ch rune, base, n int) rune {
	left := n
	for ; left > 0 && digitVal(ch) < base; left-- {
		ch = s.next()
	}

	if left > 0 {
		s.err("illegal char escape")
	}

	// the rune read last is not part of the digits, hand it back
	s.unread()
	return ch
}
// scanIdentifier scans an identifier and returns the literal string
2015-10-03 22:29:13 +00:00
func (s *Scanner) scanIdentifier() string {
offs := s.currPos.Offset - s.lastCharLen
ch := s.next()
for isLetter(ch) || isDigit(ch) {
ch = s.next()
2015-10-03 18:06:30 +00:00
}
2015-10-03 22:29:13 +00:00
s.unread() // we got identifier, put back latest char
// return string(s.srcBytes[offs:(s.currPos.Offset - s.lastCharLen)])
return string(s.srcBytes[offs:s.currPos.Offset])
2015-10-03 20:50:50 +00:00
}
2015-10-03 22:35:29 +00:00
// TokenText returns the literal string corresponding to the most recently
2015-10-03 20:50:50 +00:00
// scanned token.
2015-10-03 22:35:29 +00:00
func (s *Scanner) TokenText() string {
2015-10-03 20:50:50 +00:00
if s.tokPos < 0 {
// no token text
return ""
}
2015-10-03 20:50:50 +00:00
// part of the token text was saved in tokBuf: save the rest in
// tokBuf as well and return its content
s.tokBuf.Write(s.srcBytes[s.tokPos:s.tokEnd])
s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
return s.tokBuf.String()
2015-10-03 14:08:09 +00:00
}
// Pos returns the position of the character immediately after the character or
2015-10-03 22:32:45 +00:00
// token returned by the last call to Scan.
2015-10-03 18:25:21 +00:00
func (s *Scanner) Pos() Position {
2015-10-03 22:32:45 +00:00
return s.currPos
2015-10-03 14:08:09 +00:00
}
2015-10-04 19:01:10 +00:00
// err records an error at the current position. It increments ErrorCount and
// passes the message to the Error callback when one is installed; without a
// callback the message is printed to os.Stderr, prefixed with the position.
func (s *Scanner) err(msg string) {
	s.ErrorCount++

	if s.Error == nil {
		fmt.Fprintf(os.Stderr, "%s: %s\n", s.currPos, msg)
		return
	}
	s.Error(s.currPos, msg)
}
2015-10-03 14:08:09 +00:00
// isLetter reports whether ch may appear in an identifier as a letter: an
// ASCII letter, an underscore, or a non-ASCII rune that Unicode classifies as
// a letter.
func isLetter(ch rune) bool {
	switch {
	case 'a' <= ch && ch <= 'z', 'A' <= ch && ch <= 'Z', ch == '_':
		return true
	case ch >= 0x80:
		return unicode.IsLetter(ch)
	}
	return false
}
// isDigit reports whether ch is a digit: an ASCII digit, or a non-ASCII rune
// that Unicode classifies as a digit.
func isDigit(ch rune) bool {
	if ch < 0x80 {
		return '0' <= ch && ch <= '9'
	}
	return unicode.IsDigit(ch)
}
2015-10-05 08:56:11 +00:00
// isOctal reports whether ch is an ASCII octal digit, '0' through '7'.
func isOctal(ch rune) bool {
	switch ch {
	case '0', '1', '2', '3', '4', '5', '6', '7':
		return true
	}
	return false
}
2015-10-04 20:21:34 +00:00
// isDecimal reports whether ch is an ASCII decimal digit, '0' through '9'.
func isDecimal(ch rune) bool {
	return !(ch < '0' || ch > '9')
}
// isHexadecimal reports whether ch is an ASCII hexadecimal digit: '0' through
// '9', 'a' through 'f' or 'A' through 'F'.
func isHexadecimal(ch rune) bool {
	switch {
	case '0' <= ch && ch <= '9':
		return true
	case 'a' <= ch && ch <= 'f':
		return true
	case 'A' <= ch && ch <= 'F':
		return true
	}
	return false
}
2015-10-03 14:08:09 +00:00
// isWhitespace reports whether ch is a space, a tab, a newline or a carriage
// return.
func isWhitespace(ch rune) bool {
	switch ch {
	case ' ', '\t', '\n', '\r':
		return true
	}
	return false
}
2015-10-04 19:01:10 +00:00
// digitVal returns the numeric value of the hexadecimal digit ch, 0 through
// 15. For a rune that is not a hexadecimal digit it returns 16, which is
// larger than any legal digit value in the bases the scanner uses.
func digitVal(ch rune) int {
	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch-'A') + 10
	}
	return 16
}