mirror of
https://github.com/odin-lang/Odin.git
synced 2025-12-30 09:54:45 +00:00
666 lines
16 KiB
Odin
666 lines
16 KiB
Odin
// package text/scanner provides a scanner and tokenizer for UTF-8-encoded text.
|
|
// It takes a string providing the source, which then can be tokenized through
|
|
// repeated calls to the scan procedure.
|
|
// For compatibility with existing tooling and languages, the NUL character is not allowed.
|
|
// If an UTF-8 encoded byte order mark (BOM) is the first character in the source, it will be discarded.
|
|
//
|
|
// By default, a Scanner skips white space and Odin comments and recognizes all literals defined by the Odin programming language specification.
|
|
// A Scanner may be customized to recognize only a subset of those literals and to recognize different identifiers and white space characters.
|
|
package text_scanner
|
|
|
|
import "base:runtime"
|
|
import "core:fmt"
|
|
import "core:strings"
|
|
import "core:unicode"
|
|
import "core:unicode/utf8"
|
|
|
|
// Position represents a source position
|
|
// A position is valid if line > 0
|
|
Position :: struct {
|
|
filename: string, // filename, if present
|
|
offset: int, // byte offset, starting @ 0
|
|
line: int, // line number, starting @ 1
|
|
column: int, // column number, starting @ 1 (character count per line)
|
|
}
|
|
|
|
// position_is_valid reports where the position is valid
|
|
@(require_results)
|
|
position_is_valid :: proc(pos: Position) -> bool {
|
|
return pos.line > 0
|
|
}
|
|
|
|
@(require_results)
|
|
position_to_string :: proc(pos: Position, allocator := context.temp_allocator) -> string {
|
|
s := pos.filename
|
|
if s == "" {
|
|
s = "<input>"
|
|
}
|
|
|
|
context.allocator = allocator
|
|
if position_is_valid(pos) {
|
|
return fmt.aprintf("%s(%d:%d)", s, pos.line, pos.column)
|
|
} else {
|
|
return strings.clone(s)
|
|
}
|
|
}
|
|
|
|
EOF :: -1
|
|
Ident :: -2
|
|
Int :: -3
|
|
Float :: -4
|
|
Char :: -5
|
|
String :: -6
|
|
Raw_String :: -7
|
|
Comment :: -8
|
|
|
|
Scan_Flag :: enum u32 {
|
|
Scan_Idents,
|
|
Scan_Ints,
|
|
Scan_C_Int_Prefixes,
|
|
Scan_Floats, // Includes integers and hexadecimal floats
|
|
Scan_Chars,
|
|
Scan_Strings,
|
|
Scan_Raw_Strings,
|
|
Scan_Comments,
|
|
Skip_Comments, // if set with .Scan_Comments, comments become white space
|
|
}
|
|
Scan_Flags :: distinct bit_set[Scan_Flag; u32]
|
|
|
|
Odin_Like_Tokens :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments}
|
|
C_Like_Tokens :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_C_Int_Prefixes, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments}
|
|
|
|
// Only allows for ASCII whitespace
|
|
Whitespace :: distinct bit_set['\x00'..<utf8.RUNE_SELF; u128]
|
|
|
|
// Odin_Whitespace is the default value for the Scanner's whitespace field
|
|
Odin_Whitespace :: Whitespace{'\t', '\n', '\r', ' '}
|
|
C_Whitespace :: Whitespace{'\t', '\n', '\r', '\v', '\f', ' '}
|
|
|
|
|
|
// Scanner allows for the reading of Unicode characters and tokens from a string
|
|
Scanner :: struct {
|
|
src: string,
|
|
|
|
src_pos: int,
|
|
src_end: int,
|
|
|
|
tok_pos: int,
|
|
tok_end: int,
|
|
|
|
ch: rune,
|
|
|
|
line: int,
|
|
column: int,
|
|
prev_line_len: int,
|
|
prev_char_len: int,
|
|
|
|
// error is called for each error encountered
|
|
// If no error procedure is set, the error is reported to os.stderr
|
|
error: proc(s: ^Scanner, msg: string),
|
|
|
|
// error_count is incremented by one for each error encountered
|
|
error_count: int,
|
|
|
|
// flags controls which tokens are recognized
|
|
// e.g. to recognize integers, set the .Scan_Ints flag
|
|
// This field may be changed by the user at any time during scanning
|
|
flags: Scan_Flags,
|
|
|
|
// The whitespace field controls which characters are recognized as white space
|
|
// This field may be changed by the user at any time during scanning
|
|
whitespace: Whitespace,
|
|
|
|
// is_ident_rune is a predicate controlling the characters accepted as the ith rune in an identifier
|
|
// The valid characters must not conflict with the set of white space characters
|
|
// If is_ident_rune is not set, regular Odin-like identifiers are accepted
|
|
// This field may be changed by the user at any time during scanning
|
|
is_ident_rune: proc(ch: rune, i: int) -> bool,
|
|
|
|
// Start position of most recently scanned token (set by scan(s))
|
|
// Call init or next invalidates the position
|
|
pos: Position,
|
|
}
|
|
|
|
// init initializes a scanner with a new source and returns itself.
|
|
// error_count is set to 0, flags is set to Odin_Like_Tokens, whitespace is set to Odin_Whitespace
|
|
init :: proc(s: ^Scanner, src: string, filename := "") -> ^Scanner {
|
|
s^ = {}
|
|
|
|
s.error_count = 0
|
|
s.src = src
|
|
s.pos.filename = filename
|
|
|
|
s.tok_pos = -1
|
|
|
|
s.ch = -2 // no char read yet, not an EOF
|
|
|
|
s.line = 1
|
|
|
|
s.flags = Odin_Like_Tokens
|
|
s.whitespace = Odin_Whitespace
|
|
|
|
return s
|
|
}
|
|
|
|
|
|
@(private, require_results)
|
|
advance :: proc(s: ^Scanner) -> rune {
|
|
if s.src_pos >= len(s.src) {
|
|
s.prev_char_len = 0
|
|
return EOF
|
|
}
|
|
ch, width := rune(s.src[s.src_pos]), 1
|
|
|
|
if ch >= utf8.RUNE_SELF {
|
|
ch, width = utf8.decode_rune_in_string(s.src[s.src_pos:])
|
|
if ch == utf8.RUNE_ERROR && width == 1 {
|
|
s.src_pos += width
|
|
s.prev_char_len = width
|
|
s.column += 1
|
|
error(s, "invalid UTF-8 encoding")
|
|
return ch
|
|
}
|
|
}
|
|
|
|
s.src_pos += width
|
|
s.prev_char_len = width
|
|
s.column += 1
|
|
|
|
switch ch {
|
|
case 0:
|
|
error(s, "invalid character NUL")
|
|
case '\n':
|
|
s.line += 1
|
|
s.prev_line_len = s.column
|
|
s.column = 0
|
|
}
|
|
|
|
return ch
|
|
}
|
|
|
|
// next reads and returns the next Unicode character. It returns EOF at the end of the source.
|
|
// next does not update the Scanner's pos field. Use 'position(s)' to get the current position
|
|
next :: proc(s: ^Scanner) -> rune {
|
|
s.tok_pos = -1
|
|
s.pos.line = 0
|
|
ch := peek(s)
|
|
if ch != EOF {
|
|
s.ch = advance(s)
|
|
}
|
|
return ch
|
|
}
|
|
|
|
// peek returns the next Unicode character in the source without advancing the scanner
|
|
// It returns EOF if the scanner's position is at least the last character of the source
|
|
// if n > 0, it call next n times and return the nth Unicode character and then restore the Scanner's state
|
|
@(require_results)
|
|
peek :: proc(s: ^Scanner, n := 0) -> (ch: rune) {
|
|
if s.ch == -2 {
|
|
s.ch = advance(s)
|
|
if s.ch == '\ufeff' { // Ignore BOM
|
|
s.ch = advance(s)
|
|
}
|
|
}
|
|
ch = s.ch
|
|
if n > 0 {
|
|
prev_s := s^
|
|
for _ in 0..<n {
|
|
next(s)
|
|
}
|
|
ch = s.ch
|
|
s^ = prev_s
|
|
}
|
|
return ch
|
|
}
|
|
// peek returns the next token in the source
|
|
// It returns EOF if the scanner's position is at least the last character of the source
|
|
// if n > 0, it call next n times and return the nth token and then restore the Scanner's state
|
|
@(require_results)
|
|
peek_token :: proc(s: ^Scanner, n := 0) -> (tok: rune) {
|
|
assert(n >= 0)
|
|
prev_s := s^
|
|
for _ in 0..<n {
|
|
tok = scan(s)
|
|
}
|
|
tok = scan(s)
|
|
s^ = prev_s
|
|
return
|
|
}
|
|
|
|
error :: proc(s: ^Scanner, msg: string) {
|
|
s.error_count += 1
|
|
if s.error != nil {
|
|
s.error(s, msg)
|
|
return
|
|
}
|
|
p := s.pos
|
|
if !position_is_valid(p) {
|
|
p = position(s)
|
|
}
|
|
|
|
s := p.filename
|
|
if s == "" {
|
|
s = "<input>"
|
|
}
|
|
|
|
if position_is_valid(p) {
|
|
fmt.eprintf("%s(%d:%d): %s\n", s, p.line, p.column, msg)
|
|
} else {
|
|
fmt.eprintf("%s: %s\n", s, msg)
|
|
}
|
|
}
|
|
|
|
errorf :: proc(s: ^Scanner, format: string, args: ..any) {
|
|
error(s, fmt.tprintf(format, ..args))
|
|
}
|
|
|
|
@(private, require_results)
|
|
is_ident_rune :: proc(s: ^Scanner, ch: rune, i: int) -> bool {
|
|
if s.is_ident_rune != nil {
|
|
return s.is_ident_rune(ch, i)
|
|
}
|
|
return ch == '_' || unicode.is_letter(ch) || unicode.is_digit(ch) && i > 0
|
|
}
|
|
|
|
@(private, require_results)
|
|
scan_identifier :: proc(s: ^Scanner) -> rune {
|
|
ch := advance(s)
|
|
for i := 1; is_ident_rune(s, ch, i); i += 1 {
|
|
ch = advance(s)
|
|
}
|
|
return ch
|
|
}
|
|
|
|
@(private, require_results) lower :: proc(ch: rune) -> rune { return ('a' - 'A') | ch }
|
|
@(private, require_results) is_decimal :: proc(ch: rune) -> bool { return '0' <= ch && ch <= '9' }
|
|
@(private, require_results) is_hex :: proc(ch: rune) -> bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
|
|
|
|
|
|
|
|
@(private, require_results)
|
|
scan_number :: proc(s: ^Scanner, ch: rune, seen_dot: bool) -> (rune, rune) {
|
|
lit_name :: proc(prefix: rune) -> string {
|
|
switch prefix {
|
|
case 'b': return "binary literal"
|
|
case 'o': return "octal literal"
|
|
case 'z': return "dozenal literal"
|
|
case 'x': return "hexadecimal literal"
|
|
case 'h': return "hexadecimal literal"
|
|
}
|
|
return "decimal literal"
|
|
}
|
|
|
|
digits :: proc(s: ^Scanner, ch0: rune, base: int, invalid: ^rune) -> (ch: rune, digsep: int) {
|
|
ch = ch0
|
|
if base <= 10 {
|
|
max := rune('0' + base)
|
|
for is_decimal(ch) || ch == '_' {
|
|
ds := 1
|
|
if ch == '_' {
|
|
ds = 2
|
|
} else if ch >= max && invalid^ == 0 {
|
|
invalid^ = ch
|
|
}
|
|
digsep |= ds
|
|
ch = advance(s)
|
|
}
|
|
} else {
|
|
for is_hex(ch) || ch == '_' {
|
|
ds := 1
|
|
if ch == '_' {
|
|
ds = 2
|
|
}
|
|
digsep |= ds
|
|
ch = advance(s)
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
ch, seen_dot := ch, seen_dot
|
|
|
|
base := 10
|
|
prefix := rune(0)
|
|
digsep := 0
|
|
invalid := rune(0)
|
|
|
|
tok: rune
|
|
ds: int
|
|
|
|
if !seen_dot {
|
|
tok = Int
|
|
if ch == '0' {
|
|
ch = advance(s)
|
|
|
|
p := lower(ch)
|
|
if .Scan_C_Int_Prefixes in s.flags {
|
|
switch p {
|
|
case 'b':
|
|
ch = advance(s)
|
|
base, prefix = 2, 'b'
|
|
case 'x':
|
|
ch = advance(s)
|
|
base, prefix = 16, 'x'
|
|
case:
|
|
base, prefix = 8, 'o'
|
|
digsep = 1 // Leading zero
|
|
}
|
|
} else {
|
|
switch p {
|
|
case 'b':
|
|
ch = advance(s)
|
|
base, prefix = 2, 'b'
|
|
case 'o':
|
|
ch = advance(s)
|
|
base, prefix = 8, 'o'
|
|
case 'd':
|
|
ch = advance(s)
|
|
base, prefix = 10, 'd'
|
|
case 'z':
|
|
ch = advance(s)
|
|
base, prefix = 12, 'z'
|
|
case 'h':
|
|
tok = Float
|
|
ch = advance(s)
|
|
base, prefix = 16, 'h'
|
|
case 'x':
|
|
ch = advance(s)
|
|
base, prefix = 16, 'x'
|
|
case:
|
|
digsep = 1 // Leading zero
|
|
}
|
|
}
|
|
}
|
|
|
|
ch, ds = digits(s, ch, base, &invalid)
|
|
digsep |= ds
|
|
if ch == '.' && .Scan_Floats in s.flags {
|
|
ch = advance(s)
|
|
seen_dot = true
|
|
}
|
|
}
|
|
|
|
if seen_dot {
|
|
tok = Float
|
|
if prefix != 0 && prefix != 'x' {
|
|
errorf(s, "invalid radix point in %s", lit_name(prefix))
|
|
}
|
|
ch, ds = digits(s, ch, base, &invalid)
|
|
digsep |= ds
|
|
}
|
|
|
|
if digsep&1 == 0 {
|
|
errorf(s, "%s has no digits", lit_name(prefix))
|
|
}
|
|
|
|
if e := lower(ch); (e == 'e' || e == 'p') && .Scan_Floats in s.flags {
|
|
switch {
|
|
case e == 'e' && prefix != 0:
|
|
errorf(s, "%q exponent requires decimal mantissa", ch)
|
|
case e == 'p' && prefix != 'x':
|
|
errorf(s, "%q exponent requires hexadecimal mantissa", ch)
|
|
}
|
|
ch = advance(s)
|
|
tok = Float
|
|
if ch == '+' || ch == '-' {
|
|
ch = advance(s)
|
|
}
|
|
ch, ds = digits(s, ch, 10, nil)
|
|
digsep |= ds
|
|
if ds&1 == 0 {
|
|
error(s, "exponent has no digits")
|
|
}
|
|
} else if prefix == 'x' && tok == Float {
|
|
error(s, "hexadecimal mantissa requires a 'p' exponent")
|
|
}
|
|
|
|
if tok == Int && invalid != 0 {
|
|
errorf(s, "invalid digit %q in %s", invalid, lit_name(prefix))
|
|
}
|
|
|
|
if digsep&2 != 0 {
|
|
s.tok_end = s.src_pos - s.prev_char_len
|
|
}
|
|
return tok, ch
|
|
}
|
|
|
|
@(private, require_results)
|
|
scan_string :: proc(s: ^Scanner, quote: rune) -> (n: int) {
|
|
digit_val :: proc(ch: rune) -> int {
|
|
switch v := lower(ch); v {
|
|
case '0'..='9': return int(v - '0')
|
|
case 'a'..='z': return int(v - 'a')
|
|
}
|
|
return 16
|
|
}
|
|
|
|
scan_digits :: proc(s: ^Scanner, ch: rune, base, n: int) -> rune {
|
|
ch, n := ch, n
|
|
for n > 0 && digit_val(ch) < base {
|
|
ch = advance(s)
|
|
n -= 1
|
|
}
|
|
if n > 0 {
|
|
error(s, "invalid char escape")
|
|
}
|
|
return ch
|
|
}
|
|
|
|
ch := advance(s)
|
|
for ch != quote {
|
|
if ch == '\n' || ch < 0 {
|
|
error(s, "literal not terminated")
|
|
return
|
|
}
|
|
if ch == '\\' {
|
|
ch = advance(s)
|
|
switch ch {
|
|
case quote, 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v', '\\':
|
|
ch = advance(s)
|
|
case '0'..='7': ch = scan_digits(s, advance(s), 8, 3)
|
|
case 'x': ch = scan_digits(s, advance(s), 16, 2)
|
|
case 'u': ch = scan_digits(s, advance(s), 16, 4)
|
|
case 'U': ch = scan_digits(s, advance(s), 16, 8)
|
|
case:
|
|
error(s, "invalid char escape")
|
|
}
|
|
} else {
|
|
ch = advance(s)
|
|
}
|
|
n += 1
|
|
}
|
|
return
|
|
}
|
|
|
|
@(private)
|
|
scan_raw_string :: proc(s: ^Scanner) {
|
|
ch := advance(s)
|
|
for ch != '`' {
|
|
if ch < 0 {
|
|
error(s, "literal not terminated")
|
|
return
|
|
}
|
|
ch = advance(s)
|
|
}
|
|
}
|
|
|
|
@(private)
|
|
scan_char :: proc(s: ^Scanner) {
|
|
if scan_string(s, '\'') != 1 {
|
|
error(s, "invalid char literal")
|
|
}
|
|
}
|
|
|
|
@(private, require_results)
|
|
scan_comment :: proc(s: ^Scanner, ch: rune) -> rune {
|
|
ch := ch
|
|
if ch == '/' { // line comment
|
|
ch = advance(s)
|
|
for ch != '\n' && ch >= 0 {
|
|
ch = advance(s)
|
|
}
|
|
return ch
|
|
}
|
|
|
|
// block /**/ comment
|
|
ch = advance(s)
|
|
for {
|
|
if ch < 0 {
|
|
error(s, "comment not terminated")
|
|
break
|
|
}
|
|
ch0 := ch
|
|
ch = advance(s)
|
|
if ch0 == '*' && ch == '/' {
|
|
return advance(s)
|
|
}
|
|
}
|
|
return ch
|
|
}
|
|
|
|
// scan reads the next token or Unicode character from source and returns it
|
|
// It only recognizes tokens for which the respective flag that is set
|
|
// It returns EOF at the end of the source
|
|
// It reports Scanner errors by calling s.error, if not nil; otherwise it will print the error message to os.stderr
|
|
scan :: proc(s: ^Scanner) -> (tok: rune) {
|
|
ch := peek(s)
|
|
if ch == EOF {
|
|
return ch
|
|
}
|
|
|
|
// reset position
|
|
s.tok_pos = -1
|
|
s.pos.line = 0
|
|
|
|
redo: for {
|
|
for ch < utf8.RUNE_SELF && (ch in s.whitespace) {
|
|
ch = advance(s)
|
|
}
|
|
|
|
s.tok_pos = s.src_pos - s.prev_char_len
|
|
s.pos.offset = s.tok_pos
|
|
|
|
if s.column > 0 {
|
|
s.pos.line = s.line
|
|
s.pos.column = s.column
|
|
} else {
|
|
// previous character was newline
|
|
s.pos.line = s.line - 1
|
|
s.pos.column = s.prev_line_len
|
|
}
|
|
|
|
tok = ch
|
|
if is_ident_rune(s, ch, 0) {
|
|
if .Scan_Idents in s.flags {
|
|
tok = Ident
|
|
ch = scan_identifier(s)
|
|
} else {
|
|
ch = advance(s)
|
|
}
|
|
|
|
} else if is_decimal(ch) {
|
|
if .Scan_Ints in s.flags || .Scan_Floats in s.flags {
|
|
tok, ch = scan_number(s, ch, false)
|
|
} else {
|
|
ch = advance(s)
|
|
}
|
|
} else {
|
|
switch ch {
|
|
case EOF:
|
|
break
|
|
case '"':
|
|
if .Scan_Strings in s.flags {
|
|
_ = scan_string(s, '"')
|
|
tok = String
|
|
}
|
|
ch = advance(s)
|
|
case '\'':
|
|
if .Scan_Chars in s.flags {
|
|
_ = scan_string(s, '\'')
|
|
tok = Char
|
|
}
|
|
ch = advance(s)
|
|
case '`':
|
|
if .Scan_Raw_Strings in s.flags {
|
|
scan_raw_string(s)
|
|
tok = Raw_String
|
|
}
|
|
ch = advance(s)
|
|
case '.':
|
|
ch = advance(s)
|
|
if is_decimal(ch) && .Scan_Floats in s.flags {
|
|
tok, ch = scan_number(s, ch, true)
|
|
}
|
|
case '/':
|
|
ch = advance(s)
|
|
if (ch == '/' || ch == '*') && .Scan_Comments in s.flags {
|
|
if .Skip_Comments in s.flags {
|
|
s.tok_pos = -1
|
|
ch = scan_comment(s, ch)
|
|
continue redo
|
|
}
|
|
ch = scan_comment(s, ch)
|
|
tok = Comment
|
|
}
|
|
case:
|
|
ch = advance(s)
|
|
}
|
|
}
|
|
|
|
break redo
|
|
}
|
|
|
|
s.tok_end = s.src_pos - s.prev_char_len
|
|
|
|
s.ch = ch
|
|
return tok
|
|
}
|
|
|
|
// position returns the position of the character immediately after the character or token returns by the previous call to next or scan
|
|
// Use the Scanner's position field for the most recently scanned token position
|
|
@(require_results)
|
|
position :: proc(s: ^Scanner) -> Position {
|
|
pos: Position
|
|
pos.filename = s.pos.filename
|
|
pos.offset = s.src_pos - s.prev_char_len
|
|
switch {
|
|
case s.column > 0:
|
|
pos.line = s.line
|
|
pos.column = s.column
|
|
case s.prev_line_len > 0:
|
|
pos.line = s.line-1
|
|
pos.column = s.prev_line_len
|
|
case:
|
|
pos.line = 1
|
|
pos.column = 1
|
|
}
|
|
return pos
|
|
}
|
|
|
|
// token_text returns the string of the most recently scanned token
|
|
@(require_results)
|
|
token_text :: proc(s: ^Scanner) -> string {
|
|
if s.tok_pos < 0 {
|
|
return ""
|
|
}
|
|
return string(s.src[s.tok_pos:s.tok_end])
|
|
}
|
|
|
|
// token_string returns a printable string for a token or Unicode character
|
|
// By default, it uses the context.temp_allocator to produce the string
|
|
@(require_results)
|
|
token_string :: proc(tok: rune, allocator: runtime.Allocator) -> string {
|
|
context.allocator = allocator
|
|
switch tok {
|
|
case EOF: return strings.clone("EOF")
|
|
case Ident: return strings.clone("Ident")
|
|
case Int: return strings.clone("Int")
|
|
case Float: return strings.clone("Float")
|
|
case Char: return strings.clone("Char")
|
|
case String: return strings.clone("String")
|
|
case Raw_String: return strings.clone("Raw_String")
|
|
case Comment: return strings.clone("Comment")
|
|
}
|
|
return fmt.aprintf("%q", tok)
|
|
}
|