hcl/scanner/scanner.go

package scanner

import (
	"bytes"
	"io"
	"io/ioutil"
	"log"
	"unicode"

	"github.com/fatih/hcl/token"
)

// eof is the sentinel rune that next and peek return when no further rune can
// be read from the source. Because it is rune(0), a literal NUL byte in the
// input produces the same value as end of input.
const eof = rune(0)

// Scanner is a lexical scanner over a source that NewScanner has read fully
// into memory. It tracks the read position and the byte range of the most
// recently scanned token.
type Scanner struct {
	// src is the read cursor over the input; srcBytes is the complete input,
	// indexed by byte offset when slicing out token and identifier text.
	src      *bytes.Buffer
	srcBytes []byte

	lastCharLen int // length in bytes of the last rune read by next

	currPos Position // position just past the last rune read
	prevPos Position // position before the last rune read; restored by unread

	tokBuf bytes.Buffer // token text already collected for TokenText
	tokPos int          // start of the token text still to be copied (srcBytes index); valid if >= 0
	tokEnd int          // end of the token text, exclusive (srcBytes index)
}

// NewScanner creates a Scanner for src. The reader is drained completely up
// front, so the whole input is held in memory; any read error is returned
// and no Scanner is created.
func NewScanner(src io.Reader) (*Scanner, error) {
	content, err := ioutil.ReadAll(src)
	if err != nil {
		return nil, err
	}

	s := &Scanner{src: bytes.NewBuffer(content)}
	// Snapshot the unread bytes before anything is consumed: this is the
	// full input, used later for slicing token text by offset.
	s.srcBytes = s.src.Bytes()
	return s, nil
}

// next reads the next rune from the buffered source and advances the current
// position past it. It returns eof (rune(0)) when the input is exhausted or
// the read fails; in that case the position is left unchanged.
func (s *Scanner) next() rune {
	ch, size, err := s.src.ReadRune()
	if err != nil {
		// Nothing was consumed. Clear lastCharLen so that callers computing
		// "currPos.Offset - lastCharLen" (the start of the rune just read)
		// do not step back over the previously consumed character.
		s.lastCharLen = 0
		return eof
	}

	// remember last position so unread can restore it
	s.prevPos = s.currPos

	// Offset and Column both advance by the encoded size of the rune, i.e.
	// they count bytes rather than characters.
	s.lastCharLen = size
	s.currPos.Offset += size
	s.currPos.Column += size

	if ch == '\n' {
		s.currPos.Line++
		s.currPos.Column = 0
	}

	return ch
}

// unread pushes the most recently read rune back onto the source and rewinds
// the current position to where it was before that rune. It panics if the
// underlying buffer refuses the unread, which signals a misuse by the caller.
func (s *Scanner) unread() {
	err := s.src.UnreadRune()
	if err != nil {
		panic(err) // this is user fault, we should catch it
	}

	// rewind to the position saved by next
	s.currPos = s.prevPos
}

// peek returns the next rune without consuming it, or eof when no rune can be
// read. The scanner's position fields are not modified.
func (s *Scanner) peek() rune {
	ch, _, err := s.src.ReadRune()
	if err == nil {
		// The unread directly follows a successful ReadRune; its error
		// result is deliberately ignored, as before.
		_ = s.src.UnreadRune()
		return ch
	}

	return eof
}

// Scan skips any leading whitespace, scans the next token and returns its
// kind. The byte range of the token is recorded so that TokenText can return
// its text. Characters that match no rule yield the zero token.Token value.
func (s *Scanner) Scan() (tok token.Token) {
	var ch rune
	for ch = s.next(); isWhitespace(ch); ch = s.next() {
		// skip white space
	}

	// the token starts at the rune that was just read
	s.tokBuf.Reset()
	s.tokPos = s.currPos.Offset - s.lastCharLen

	// The cases are mutually exclusive, so their order does not matter.
	switch {
	case ch == eof:
		tok = token.EOF
	case ch == '"':
		tok = token.STRING
		s.scanString()
	case isLetter(ch):
		tok = token.IDENT
		switch s.scanIdentifier() {
		case "true", "false":
			tok = token.BOOL
		}
	case isDigit(ch):
		// scanDigits()
		// TODO(arslan)
	}

	s.tokEnd = s.currPos.Offset
	return tok
}

// scanString consumes the remainder of a double-quoted string literal, up to
// and including the closing quote. The opening '"' must already have been
// consumed by the caller.
//
// A newline or the end of the input before the closing quote is logged as an
// unterminated literal and scanning stops there. A backslash escapes the rune
// that follows it, so an escaped quote does not close the literal; the escape
// sequence itself is not validated.
func (s *Scanner) scanString() {
	for {
		ch := s.next()

		escaped := ch == '\\'
		if escaped {
			// look at the escaped rune instead of the backslash
			ch = s.next()
		}

		switch {
		case ch == '\n' || ch <= 0:
			// next reports the end of the input as rune(0), so "<= 0" is what
			// stops the loop there; a plain "< 0" check would spin forever on
			// an unterminated literal at the end of the input.
			log.Println("[ERROR] literal not terminated")
			return
		case ch == '"' && !escaped:
			return
		}
	}
}

// scanIdentifier consumes the rest of an identifier whose first rune has
// already been read by next, and returns the complete identifier text. An
// identifier continues for as long as the following rune is a letter or digit.
func (s *Scanner) scanIdentifier() string {
	// start of the identifier: the rune read just before this call
	offs := s.currPos.Offset - s.lastCharLen

	// Look ahead instead of reading one rune too many and unreading it:
	// when the identifier is the last thing in the input there is nothing to
	// unread, and unread would panic.
	for ch := s.peek(); isLetter(ch) || isDigit(ch); ch = s.peek() {
		s.next()
	}

	return string(s.srcBytes[offs:s.currPos.Offset])
}

// TokenText returns the text of the token produced by the most recent call to
// Scan. Calling it repeatedly for the same token returns the same string.
func (s *Scanner) TokenText() string {
	if s.tokPos < 0 {
		// no token text
		return ""
	}

	// Move the not-yet-copied part of the token into tokBuf, then advance
	// tokPos to tokEnd so that a repeated call appends nothing more.
	pending := s.srcBytes[s.tokPos:s.tokEnd]
	s.tokBuf.Write(pending)
	s.tokPos = s.tokEnd

	return s.tokBuf.String()
}

// Pos returns the scanner's current position, that is the position
// immediately after the last rune consumed. After a call to Scan this is the
// position following the character or token that Scan returned. Offset and
// Column advance in bytes, and Column restarts at 0 after each newline.
func (s *Scanner) Pos() Position {
	return s.currPos
}

// isLetter reports whether ch may appear in an identifier as a letter: an
// ASCII letter, an underscore, or any non-ASCII rune that Unicode classifies
// as a letter.
func isLetter(ch rune) bool {
	switch {
	case ch == '_':
		return true
	case ch >= 'a' && ch <= 'z', ch >= 'A' && ch <= 'Z':
		return true
	case ch >= 0x80:
		// outside ASCII, defer to the Unicode tables
		return unicode.IsLetter(ch)
	}
	return false
}

// isDigit reports whether ch is a decimal digit: ASCII '0' through '9', or
// any non-ASCII rune that Unicode classifies as a digit.
func isDigit(ch rune) bool {
	if ch >= '0' && ch <= '9' {
		return true
	}
	if ch < 0x80 {
		// every other ASCII rune is not a digit
		return false
	}
	return unicode.IsDigit(ch)
}

// isWhitespace reports whether ch is one of the characters the scanner skips
// between tokens: space, tab, newline or carriage return.
func isWhitespace(ch rune) bool {
	switch ch {
	case ' ', '\t', '\n', '\r':
		return true
	default:
		return false
	}
}
hcl: split up package for more control 2015-10-04 17:16:43 +00:00			`package scanner`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00
			`import (`
lexer: various changes, trying text/scanner 2015-10-03 16:45:57 +00:00			`"bytes"`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00			`"io"`
lexer: implement positions 2015-10-03 20:50:50 +00:00			`"io/ioutil"`
lexer: scan strings 2015-10-03 21:20:26 +00:00			`"log"`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00			`"unicode"`
scanner: use new hcl/token package 2015-10-04 17:19:39 +00:00
			`"github.com/fatih/hcl/token"`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00			`)`

			`// eof represents a marker rune for the end of the reader.`
			`const eof = rune(0)`

scanner: small fixes 2015-10-04 17:22:37 +00:00			`// Scanner defines a lexical scanner`
lexer: scanner is more Go idiomatic 2015-10-03 18:25:21 +00:00			`type Scanner struct {`
lexer: implement positions 2015-10-03 20:50:50 +00:00			`src *bytes.Buffer`
			`srcBytes []byte`

lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`lastCharLen int // length of last character in bytes`

			`currPos Position // current position`
			`prevPos Position // previous position`

			`tokBuf bytes.Buffer // token text buffer`
			`tokPos int // token text tail position (srcBuf index); valid if >= 0`
			`tokEnd int // token text tail end (srcBuf index)`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00			`}`

scanner: small fixes 2015-10-04 17:22:37 +00:00			`// NewScanner returns a new instance of Lexer. Even though src is an io.Reader,`
lexer: implement positions 2015-10-03 20:50:50 +00:00			`// we fully consume the content.`
scanner: small fixes 2015-10-04 17:22:37 +00:00			`func NewScanner(src io.Reader) (*Scanner, error) {`
lexer: implement positions 2015-10-03 20:50:50 +00:00			`buf, err := ioutil.ReadAll(src)`
			`if err != nil {`
			`return nil, err`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00			`}`
lexer: implement positions 2015-10-03 20:50:50 +00:00
			`b := bytes.NewBuffer(buf)`
			`return &Scanner{`
			`src: b,`
			`srcBytes: b.Bytes(),`
			`}, nil`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00			`}`

lexer: back to the roots 2015-10-03 17:32:27 +00:00			`// next reads the next rune from the bufferred reader. Returns the rune(0) if`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00			`// an error occurs (or io.EOF is returned).`
lexer: scanner is more Go idiomatic 2015-10-03 18:25:21 +00:00			`func (s *Scanner) next() rune {`
lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`ch, size, err := s.src.ReadRune()`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00			`if err != nil {`
			`return eof`
			`}`
lexer: back to the roots 2015-10-03 17:32:27 +00:00
lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`// remember last position`
			`s.prevPos = s.currPos`

lexer: implement positions 2015-10-03 20:50:50 +00:00			`s.lastCharLen = size`
lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`s.currPos.Offset += size`
			`s.currPos.Column += size`

			`if ch == '\n' {`
			`s.currPos.Line++`
			`s.currPos.Column = 0`
			`}`

			`return ch`
			`}`

			`func (s *Scanner) unread() {`
			`if err := s.src.UnreadRune(); err != nil {`
			`panic(err) // this is user fault, we should catch it`
			`}`
			`s.currPos = s.prevPos // put back last position`
			`}`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00
lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`func (s *Scanner) peek() rune {`
			`peek, _, err := s.src.ReadRune()`
			`if err != nil {`
			`return eof`
lexer: implement positions 2015-10-03 20:50:50 +00:00			`}`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00
lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`s.src.UnreadRune()`
			`return peek`
lexer: add peek() method 2015-10-03 17:33:51 +00:00			`}`

parser: more idiomatic call 2015-10-03 22:35:29 +00:00			`// Scan scans the next token and returns the token.`
scanner: use new hcl/token package 2015-10-04 17:19:39 +00:00			`func (s *Scanner) Scan() (tok token.Token) {`
lexer: scanner is more Go idiomatic 2015-10-03 18:25:21 +00:00			`ch := s.next()`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00
lexer: various changes, trying text/scanner 2015-10-03 16:45:57 +00:00			`// skip white space`
			`for isWhitespace(ch) {`
lexer: scanner is more Go idiomatic 2015-10-03 18:25:21 +00:00			`ch = s.next()`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00			`}`

lexer: implement positions 2015-10-03 20:50:50 +00:00			`// start the token position`
			`s.tokBuf.Reset()`
lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`s.tokPos = s.currPos.Offset - s.lastCharLen`
lexer: implement positions 2015-10-03 20:50:50 +00:00
lexer: various changes, trying text/scanner 2015-10-03 16:45:57 +00:00			`if isLetter(ch) {`
scanner: use new hcl/token package 2015-10-04 17:19:39 +00:00			`tok = token.IDENT`
parser: more idiomatic call 2015-10-03 22:35:29 +00:00			`lit := s.scanIdentifier()`
lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`if lit == "true" \|\| lit == "false" {`
scanner: use new hcl/token package 2015-10-04 17:19:39 +00:00			`tok = token.BOOL`
lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`}`
lexer: implement positions 2015-10-03 20:50:50 +00:00			`}`

			`if isDigit(ch) {`
lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`// scanDigits()`
			`// TODO(arslan)`
lexer: various changes, trying text/scanner 2015-10-03 16:45:57 +00:00			`}`

			`switch ch {`
			`case eof:`
scanner: use new hcl/token package 2015-10-04 17:19:39 +00:00			`tok = token.EOF`
lexer: scan strings 2015-10-03 21:20:26 +00:00			`case '"':`
scanner: use new hcl/token package 2015-10-04 17:19:39 +00:00			`tok = token.STRING`
lexer: scan strings 2015-10-03 21:20:26 +00:00			`s.scanString()`
lexer: various changes, trying text/scanner 2015-10-03 16:45:57 +00:00			`}`

lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`s.tokEnd = s.currPos.Offset`
parser: more idiomatic call 2015-10-03 22:35:29 +00:00			`return tok`
lexer: implement positions 2015-10-03 20:50:50 +00:00			`}`
lexer: various changes, trying text/scanner 2015-10-03 16:45:57 +00:00
lexer: scan strings 2015-10-03 21:20:26 +00:00			`func (s *Scanner) scanString() {`
			`// '"' opening already consumed`
			`ch := s.next() // read character after quote`
			`for ch != '"' {`
			`if ch == '\n' \|\| ch < 0 {`
			`log.Println("[ERROR] literal not terminated")`
			`return`
			`}`

			`if ch == '\\' {`
			`// scanEscape`
			`return`
			`} else {`
			`ch = s.next()`
			`}`
			`}`

			`return`
			`}`

lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`func (s *Scanner) scanIdentifier() string {`
			`offs := s.currPos.Offset - s.lastCharLen`
			`ch := s.next()`
			`for isLetter(ch) \|\| isDigit(ch) {`
			`ch = s.next()`
parser: add scanning ident test 2015-10-03 18:06:30 +00:00			`}`
lexer: more robust implementation 2015-10-03 22:29:13 +00:00			`s.unread() // we got identifier, put back latest char`

			`// return string(s.srcBytes[offs:(s.currPos.Offset - s.lastCharLen)])`
			`return string(s.srcBytes[offs:s.currPos.Offset])`
lexer: implement positions 2015-10-03 20:50:50 +00:00			`}`

parser: more idiomatic call 2015-10-03 22:35:29 +00:00			`// TokenText returns the literal string corresponding to the most recently`
lexer: implement positions 2015-10-03 20:50:50 +00:00			`// scanned token.`
parser: more idiomatic call 2015-10-03 22:35:29 +00:00			`func (s *Scanner) TokenText() string {`
lexer: implement positions 2015-10-03 20:50:50 +00:00			`if s.tokPos < 0 {`
			`// no token text`
			`return ""`
			`}`
lexer: various changes, trying text/scanner 2015-10-03 16:45:57 +00:00
lexer: implement positions 2015-10-03 20:50:50 +00:00			`// part of the token text was saved in tokBuf: save the rest in`
			`// tokBuf as well and return its content`
			`s.tokBuf.Write(s.srcBytes[s.tokPos:s.tokEnd])`
			`s.tokPos = s.tokEnd // ensure idempotency of TokenText() call`
			`return s.tokBuf.String()`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00			`}`

			`// Pos returns the position of the character immediately after the character or`
lexer: fix Position() call 2015-10-03 22:32:45 +00:00			`// token returned by the last call to Scan.`
lexer: scanner is more Go idiomatic 2015-10-03 18:25:21 +00:00			`func (s *Scanner) Pos() Position {`
lexer: fix Position() call 2015-10-03 22:32:45 +00:00			`return s.currPos`
parser: initial lexer next method 2015-10-03 14:08:09 +00:00			`}`

			`func isLetter(ch rune) bool {`
			`return 'a' <= ch && ch <= 'z' \|\| 'A' <= ch && ch <= 'Z' \|\| ch == '_' \|\| ch >= 0x80 && unicode.IsLetter(ch)`
			`}`

			`func isDigit(ch rune) bool {`
			`return '0' <= ch && ch <= '9' \|\| ch >= 0x80 && unicode.IsDigit(ch)`
			`}`

			`// isWhitespace returns true if the rune is a space, tab, newline or carriage return`
			`func isWhitespace(ch rune) bool {`
			`return ch == ' ' \|\| ch == '\t' \|\| ch == '\n' \|\| ch == '\r'`
			`}`