hcl/scanner/scanner.go

package scanner

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"unicode"

	"github.com/fatih/hcl/token"
)

// eof represents a marker rune for the end of the reader.
const eof = rune(0)

// Scanner defines a lexical scanner
type Scanner struct {
	src      *bytes.Buffer
	srcBytes []byte

	lastCharLen int // length of last character in bytes

	currPos Position // current position
	prevPos Position // previous position

	tokBuf bytes.Buffer // token text buffer
	tokPos int          // token text tail position (srcBuf index); valid if >= 0
	tokEnd int          // token text tail end (srcBuf index)

	// Error is called for each error encountered. If no Error
	// function is set, the error is reported to os.Stderr.
	Error func(pos Position, msg string)

	// ErrorCount is incremented by one for each error encountered.
	ErrorCount int
}

// NewScanner returns a new instance of Lexer. Even though src is an io.Reader,
// we fully consume the content.
func NewScanner(src io.Reader) (*Scanner, error) {
	buf, err := ioutil.ReadAll(src)
	if err != nil {
		return nil, err
	}

	b := bytes.NewBuffer(buf)
	return &Scanner{
		src:      b,
		srcBytes: b.Bytes(),
	}, nil
}

// next reads the next rune from the bufferred reader. Returns the rune(0) if
// an error occurs (or io.EOF is returned).
func (s *Scanner) next() rune {
	ch, size, err := s.src.ReadRune()
	if err != nil {
		return eof
	}

	// remember last position
	s.prevPos = s.currPos

	s.lastCharLen = size
	s.currPos.Offset += size
	s.currPos.Column += size

	if ch == '\n' {
		s.currPos.Line++
		s.currPos.Column = 0
	}

	return ch
}

func (s *Scanner) unread() {
	if err := s.src.UnreadRune(); err != nil {
		panic(err) // this is user fault, we should catch it
	}
	s.currPos = s.prevPos // put back last position
}

func (s *Scanner) peek() rune {
	peek, _, err := s.src.ReadRune()
	if err != nil {
		return eof
	}

	s.src.UnreadRune()
	return peek
}

// Scan scans the next token and returns the token.
func (s *Scanner) Scan() (tok token.Token) {
	ch := s.next()

	// skip white space
	for isWhitespace(ch) {
		ch = s.next()
	}

	// start the token position
	s.tokBuf.Reset()
	s.tokPos = s.currPos.Offset - s.lastCharLen

	switch {
	case isLetter(ch):
		tok = token.IDENT
		lit := s.scanIdentifier()
		if lit == "true" || lit == "false" {
			tok = token.BOOL
		}

	case isDigit(ch):
		s.scanNumber()
		// TODO(arslan)
	default:
		switch ch {
		case eof:
			tok = token.EOF
		case '"':
			tok = token.STRING
			s.scanString()
		}
	}

	s.tokEnd = s.currPos.Offset
	return tok
}

func (s *Scanner) scanNumber() {
}

// scanString scans a quoted string
func (s *Scanner) scanString() {
	for {
		// '"' opening already consumed
		// read character after quote
		ch := s.next()

		if ch == '\n' || ch < 0 || ch == eof {
			s.err("literal not terminated")
			return
		}

		if ch == '"' {
			break
		}

		if ch == '\\' {
			s.scanEscape()
		}
	}

	return
}

// scanEscape scans an escape sequence
func (s *Scanner) scanEscape() rune {
	// http://en.cppreference.com/w/cpp/language/escape
	ch := s.next() // read character after '/'
	switch ch {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
		// nothing to do
	case '0', '1', '2', '3', '4', '5', '6', '7':
		// octal notation
		ch = s.scanDigits(ch, 8, 3)
	case 'x':
		// hexademical notation
		ch = s.scanDigits(s.next(), 16, 2)
	case 'u':
		// universal character name
		ch = s.scanDigits(s.next(), 16, 4)
	case 'U':
		// universal character name
		ch = s.scanDigits(s.next(), 16, 8)
	default:
		s.err("illegal char escape")
	}
	return ch
}

// scanDigits scans a rune with the given base for n times. For example an
// octan notation \184 would yield in scanDigits(ch, 8, 3)
func (s *Scanner) scanDigits(ch rune, base, n int) rune {
	for n > 0 && digitVal(ch) < base {
		ch = s.next()
		n--
	}
	if n > 0 {
		s.err("illegal char escape")
	}

	// we scanned all digits, put the last non digit char back
	s.unread()
	return ch
}

// scanIdentifier scans an identifier and returns the literal string
func (s *Scanner) scanIdentifier() string {
	offs := s.currPos.Offset - s.lastCharLen
	ch := s.next()
	for isLetter(ch) || isDigit(ch) {
		ch = s.next()
	}
	s.unread() // we got identifier, put back latest char

	// return string(s.srcBytes[offs:(s.currPos.Offset - s.lastCharLen)])
	return string(s.srcBytes[offs:s.currPos.Offset])
}

// TokenText returns the literal string corresponding to the most recently
// scanned token.
func (s *Scanner) TokenText() string {
	if s.tokPos < 0 {
		// no token text
		return ""
	}

	// part of the token text was saved in tokBuf: save the rest in
	// tokBuf as well and return its content
	s.tokBuf.Write(s.srcBytes[s.tokPos:s.tokEnd])
	s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
	return s.tokBuf.String()
}

// Pos returns the position of the character immediately after the character or
// token returned by the last call to Scan.
func (s *Scanner) Pos() Position {
	return s.currPos
}

func (s *Scanner) err(msg string) {
	s.ErrorCount++
	if s.Error != nil {
		s.Error(s.currPos, msg)
		return
	}

	fmt.Fprintf(os.Stderr, "%s: %s\n", s.currPos, msg)
}

func isLetter(ch rune) bool {
	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
}

func isDigit(ch rune) bool {
	return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
}

// isWhitespace returns true if the rune is a space, tab, newline or carriage return
func isWhitespace(ch rune) bool {
	return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
}

func digitVal(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= ch && ch <= 'f':
		return int(ch - 'a' + 10)
	case 'A' <= ch && ch <= 'F':
		return int(ch - 'A' + 10)
	}
	return 16 // larger than any legal digit val
}