scanner: implement string scanning

This commit is contained in:
Fatih Arslan 2015-10-04 22:01:10 +03:00
parent 94bd4afe4d
commit 1f011b4e82
2 changed files with 102 additions and 27 deletions

View File

@ -2,9 +2,10 @@ package scanner
import ( import (
"bytes" "bytes"
"fmt"
"io" "io"
"io/ioutil" "io/ioutil"
"log" "os"
"unicode" "unicode"
"github.com/fatih/hcl/token" "github.com/fatih/hcl/token"
@ -26,6 +27,13 @@ type Scanner struct {
tokBuf bytes.Buffer // token text buffer tokBuf bytes.Buffer // token text buffer
tokPos int // token text tail position (srcBuf index); valid if >= 0 tokPos int // token text tail position (srcBuf index); valid if >= 0
tokEnd int // token text tail end (srcBuf index) tokEnd int // token text tail end (srcBuf index)
// Error is called for each error encountered. If no Error
// function is set, the error is reported to os.Stderr.
Error func(pos Position, msg string)
// ErrorCount is incremented by one for each error encountered.
ErrorCount int
} }
// NewScanner returns a new instance of Lexer. Even though src is an io.Reader, // NewScanner returns a new instance of Lexer. Even though src is an io.Reader,
@ -122,25 +130,70 @@ func (s *Scanner) Scan() (tok token.Token) {
} }
func (s *Scanner) scanString() { func (s *Scanner) scanString() {
// '"' opening already consumed for {
ch := s.next() // read character after quote // '"' opening already consumed
for ch != '"' { // read character after quote
if ch == '\n' || ch < 0 { ch := s.next()
log.Println("[ERROR] literal not terminated")
if ch == '\n' || ch < 0 || ch == eof {
s.err("literal not terminated")
return return
} }
if ch == '"' {
break
}
if ch == '\\' { if ch == '\\' {
// scanEscape s.scanEscape()
return
} else {
ch = s.next()
} }
} }
return return
} }
// scanEscape consumes the remainder of an escape sequence. The caller has
// already read the introducing backslash; this method reads the character
// that follows it and dispatches on it:
//
//   - simple one-character escapes (a b f n r t v \ ") need no further input
//     and are returned unchanged;
//   - an octal digit starts an octal escape of up to 3 digits;
//   - x, u and U start hexadecimal escapes of 2, 4 and 8 digits.
//
// Any other character is reported through s.err as an illegal escape and
// returned as-is. For numeric escapes the result is whatever scanDigits
// returns.
func (s *Scanner) scanEscape() rune {
	// http://en.cppreference.com/w/cpp/language/escape
	esc := s.next() // character immediately after the backslash

	switch {
	case esc == 'a', esc == 'b', esc == 'f', esc == 'n', esc == 'r',
		esc == 't', esc == 'v', esc == '\\', esc == '"':
		// single-character escape: nothing more to consume
		return esc
	case '0' <= esc && esc <= '7':
		// octal notation; the digit just read is the first of the run
		return s.scanDigits(esc, 8, 3)
	case esc == 'x':
		// hexadecimal notation
		return s.scanDigits(s.next(), 16, 2)
	case esc == 'u':
		// universal character name (short form)
		return s.scanDigits(s.next(), 16, 4)
	case esc == 'U':
		// universal character name (long form)
		return s.scanDigits(s.next(), 16, 8)
	}

	s.err("illegal char escape")
	return esc
}
// scanDigits consumes a run of digits in the given base, starting with ch
// (which the caller has already read), and stops after n digits or at the
// first character that is not a valid digit for that base. For example, the
// octal escape \101 results in a call scanDigits('1', 8, 3).
//
// If fewer than n digits were found, an error is reported through s.err.
// In every case s.unread is called afterwards to put back the character
// that followed the digits, and that character is returned.
func (s *Scanner) scanDigits(ch rune, base, n int) rune {
	remaining := n
	for ; remaining > 0; remaining-- {
		if digitVal(ch) >= base {
			// not a digit in this base: the run ends here
			break
		}
		ch = s.next()
	}

	if remaining > 0 {
		// the run ended before the required number of digits was seen
		s.err("illegal char escape")
	}

	// ch now holds the character read after the digits; hand it back
	s.unread()
	return ch
}
// scanIdentifier scans an identifier and returns the literal string
func (s *Scanner) scanIdentifier() string { func (s *Scanner) scanIdentifier() string {
offs := s.currPos.Offset - s.lastCharLen offs := s.currPos.Offset - s.lastCharLen
ch := s.next() ch := s.next()
@ -174,6 +227,16 @@ func (s *Scanner) Pos() Position {
return s.currPos return s.currPos
} }
// err records a scanning error at the current position. ErrorCount is
// always incremented. If an Error callback is set it receives the position
// and message; otherwise the error is printed to os.Stderr.
func (s *Scanner) err(msg string) {
	s.ErrorCount++

	if s.Error == nil {
		// no handler installed: fall back to standard error
		fmt.Fprintf(os.Stderr, "%s: %s\n", s.currPos, msg)
		return
	}

	s.Error(s.currPos, msg)
}
func isLetter(ch rune) bool { func isLetter(ch rune) bool {
return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch) return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
} }
@ -186,3 +249,15 @@ func isDigit(ch rune) bool {
func isWhitespace(ch rune) bool { func isWhitespace(ch rune) bool {
return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
} }
// digitVal returns the numeric value of ch interpreted as a hexadecimal
// digit ('0'-'9', 'a'-'f' or 'A'-'F'). For any other rune it returns 16,
// which is larger than every legal digit value, so callers can validate a
// digit against a base with a single `digitVal(ch) < base` comparison.
func digitVal(ch rune) int {
	const notADigit = 16 // larger than any legal digit val

	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch - 'a' + 10)
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch - 'A' + 10)
	}
	return notADigit
}

View File

@ -94,23 +94,23 @@ func TestString(t *testing.T) {
{token.STRING, `" "`}, {token.STRING, `" "`},
{token.STRING, `"a"`}, {token.STRING, `"a"`},
{token.STRING, `"本"`}, {token.STRING, `"本"`},
// {STRING, `"\a"`}, {token.STRING, `"\a"`},
// {STRING, `"\b"`}, {token.STRING, `"\b"`},
// {STRING, `"\f"`}, {token.STRING, `"\f"`},
// {STRING, `"\n"`}, {token.STRING, `"\n"`},
// {STRING, `"\r"`}, {token.STRING, `"\r"`},
// {STRING, `"\t"`}, {token.STRING, `"\t"`},
// {STRING, `"\v"`}, {token.STRING, `"\v"`},
// {STRING, `"\""`}, {token.STRING, `"\""`},
// {STRING, `"\000"`}, {token.STRING, `"\000"`},
// {STRING, `"\777"`}, {token.STRING, `"\777"`},
// {STRING, `"\x00"`}, {token.STRING, `"\x00"`},
// {STRING, `"\xff"`}, {token.STRING, `"\xff"`},
// {STRING, `"\u0000"`}, {token.STRING, `"\u0000"`},
// {STRING, `"\ufA16"`}, {token.STRING, `"\ufA16"`},
// {STRING, `"\U00000000"`}, {token.STRING, `"\U00000000"`},
// {STRING, `"\U0000ffAB"`}, {token.STRING, `"\U0000ffAB"`},
// {STRING, `"` + f100 + `"`}, {token.STRING, `"` + f100 + `"`},
} }
// create artificial source code // create artificial source code