mirror of
https://github.com/nim-lang/Nim.git
synced 2025-12-30 18:02:05 +00:00
1037 lines
29 KiB
ObjectPascal
1037 lines
29 KiB
ObjectPascal
//
|
|
//
|
|
// The Nimrod Compiler
|
|
// (c) Copyright 2009 Andreas Rumpf
|
|
//
|
|
// See the file "copying.txt", included in this
|
|
// distribution, for details about the copyright.
|
|
//
|
|
unit scanner;
|
|
|
|
// This scanner is handwritten for efficiency. I used an elegant buffering
|
|
// scheme which I have not seen anywhere else:
|
|
// We guarantee that a whole line is in the buffer. Thus only when scanning
|
|
// the \n or \r character we have to check wether we need to read in the next
|
|
// chunk. (\n or \r already need special handling for incrementing the line
|
|
// counter; choosing both \n and \r allows the scanner to properly read Unix,
|
|
// DOS or Macintosh text files, even when it is not the native format.
|
|
|
|
interface
|
|
|
|
{$include 'config.inc'}
|
|
|
|
uses
|
|
charsets, nsystem, sysutils, nhashes, options, msgs, strutils, platform,
|
|
idents, lexbase, llstream, wordrecg;
|
|
|
|
const
|
|
MaxLineLength = 80; // lines longer than this lead to a warning
|
|
|
|
numChars: TCharSet = ['0'..'9','a'..'z','A'..'Z'];
|
|
SymChars: TCharSet = ['a'..'z', 'A'..'Z', '0'..'9', #128..#255];
|
|
SymStartChars: TCharSet = ['a'..'z', 'A'..'Z', #128..#255];
|
|
OpChars: TCharSet = ['+', '-', '*', '/', '\', '<', '>', '!', '?', '^', '.',
|
|
'|', '=', '%', '&', '$', '@', '~', #128..#255];
|
|
|
|
type
|
|
TTokType = (tkInvalid, tkEof, // order is important here!
|
|
tkSymbol,
|
|
// keywords:
|
|
//[[[cog
|
|
//from string import split, capitalize
|
|
//keywords = split(open("data/keywords.txt").read())
|
|
//idents = ""
|
|
//strings = ""
|
|
//i = 1
|
|
//for k in keywords:
|
|
// idents = idents + "tk" + capitalize(k) + ", "
|
|
// strings = strings + "'" + k + "', "
|
|
// if i % 4 == 0:
|
|
// idents = idents + "\n"
|
|
// strings = strings + "\n"
|
|
// i = i + 1
|
|
//cog.out(idents)
|
|
//]]]
|
|
tkAddr, tkAnd, tkAs, tkAsm,
|
|
tkBind, tkBlock, tkBreak, tkCase,
|
|
tkCast, tkConst, tkContinue, tkConverter,
|
|
tkDiscard, tkDistinct, tkDiv, tkElif,
|
|
tkElse, tkEnd, tkEnum, tkExcept,
|
|
tkFinally, tkFor, tkFrom, tkGeneric,
|
|
tkIf, tkImplies, tkImport, tkIn,
|
|
tkInclude, tkIs, tkIsnot, tkIterator,
|
|
tkLambda, tkMacro, tkMethod, tkMod,
|
|
tkNil, tkNot, tkNotin, tkObject,
|
|
tkOf, tkOr, tkOut, tkProc,
|
|
tkPtr, tkRaise, tkRef, tkReturn,
|
|
tkShl, tkShr, tkTemplate, tkTry,
|
|
tkTuple, tkType, tkVar, tkWhen,
|
|
tkWhile, tkWith, tkWithout, tkXor,
|
|
tkYield,
|
|
//[[[end]]]
|
|
tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit,
|
|
tkFloatLit, tkFloat32Lit, tkFloat64Lit,
|
|
tkStrLit, tkRStrLit, tkTripleStrLit, tkCallRStrLit, tkCallTripleStrLit,
|
|
tkCharLit, tkParLe, tkParRi, tkBracketLe, tkBracketRi, tkCurlyLe, tkCurlyRi,
|
|
tkBracketDotLe, tkBracketDotRi, // [. and .]
|
|
tkCurlyDotLe, tkCurlyDotRi, // {. and .}
|
|
tkParDotLe, tkParDotRi, // (. and .)
|
|
tkComma, tkSemiColon, tkColon,
|
|
tkEquals, tkDot, tkDotDot, tkHat, tkOpr,
|
|
tkComment, tkAccent, tkInd, tkSad, tkDed,
|
|
// pseudo token types used by the source renderers:
|
|
tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr
|
|
);
|
|
TTokTypes = set of TTokType;
|
|
const
|
|
tokKeywordLow = succ(tkSymbol);
|
|
tokKeywordHigh = pred(tkIntLit);
|
|
tokOperators: TTokTypes = {@set}[tkOpr, tkSymbol, tkBracketLe, tkBracketRi,
|
|
tkIn, tkIs, tkIsNot, tkEquals, tkDot, tkHat, tkNot, tkAnd, tkOr, tkXor,
|
|
tkShl, tkShr, tkDiv, tkMod, tkNotIn];
|
|
|
|
TokTypeToStr: array [TTokType] of string = (
|
|
'tkInvalid', '[EOF]',
|
|
'tkSymbol',
|
|
//[[[cog
|
|
//cog.out(strings)
|
|
//]]]
|
|
'addr', 'and', 'as', 'asm',
|
|
'bind', 'block', 'break', 'case',
|
|
'cast', 'const', 'continue', 'converter',
|
|
'discard', 'distinct', 'div', 'elif',
|
|
'else', 'end', 'enum', 'except',
|
|
'finally', 'for', 'from', 'generic',
|
|
'if', 'implies', 'import', 'in',
|
|
'include', 'is', 'isnot', 'iterator',
|
|
'lambda', 'macro', 'method', 'mod',
|
|
'nil', 'not', 'notin', 'object',
|
|
'of', 'or', 'out', 'proc',
|
|
'ptr', 'raise', 'ref', 'return',
|
|
'shl', 'shr', 'template', 'try',
|
|
'tuple', 'type', 'var', 'when',
|
|
'while', 'with', 'without', 'xor',
|
|
'yield',
|
|
//[[[end]]]
|
|
'tkIntLit', 'tkInt8Lit', 'tkInt16Lit', 'tkInt32Lit', 'tkInt64Lit',
|
|
'tkFloatLit', 'tkFloat32Lit', 'tkFloat64Lit',
|
|
'tkStrLit', 'tkRStrLit', 'tkTripleStrLit',
|
|
'tkCallRStrLit', 'tkCallTripleStrLit',
|
|
'tkCharLit',
|
|
'('+'', ')'+'', '['+'', ']'+'', '{'+'', '}'+'',
|
|
'[.', '.]', '{.', '.}', '(.', '.)', ','+'', ';'+'', ':'+'',
|
|
'='+'', '.'+'', '..', '^'+'', 'tkOpr',
|
|
'tkComment', '`'+'', '[new indentation]', '[same indentation]',
|
|
'[dedentation]',
|
|
'tkSpaces', 'tkInfixOpr', 'tkPrefixOpr', 'tkPostfixOpr'
|
|
);
|
|
|
|
type
|
|
TNumericalBase = (base10, // base10 is listed as the first element,
|
|
// so that it is the correct default value
|
|
base2,
|
|
base8,
|
|
base16);
|
|
PToken = ^TToken;
|
|
TToken = object // a Nimrod token
|
|
tokType: TTokType; // the type of the token
|
|
indent: int; // the indentation; only valid if tokType = tkIndent
|
|
ident: PIdent; // the parsed identifier
|
|
iNumber: BiggestInt; // the parsed integer literal
|
|
fNumber: BiggestFloat; // the parsed floating point literal
|
|
base: TNumericalBase; // the numerical base; only valid for int
|
|
// or float literals
|
|
literal: string; // the parsed (string) literal; and
|
|
// documentation comments are here too
|
|
next: PToken; // next token; can be used for arbitrary look-ahead
|
|
end;
|
|
|
|
PLexer = ^TLexer;
|
|
TLexer = object(TBaseLexer)
|
|
filename: string;
|
|
indentStack: array of int; // the indentation stack
|
|
dedent: int; // counter for DED token generation
|
|
indentAhead: int; // if > 0 an indendation has already been read
|
|
// this is needed because scanning comments
|
|
// needs so much look-ahead
|
|
end;
|
|
|
|
var
|
|
gLinesCompiled: int; // all lines that have been compiled
|
|
|
|
procedure pushInd(var L: TLexer; indent: int);
|
|
procedure popInd(var L: TLexer);
|
|
|
|
function isKeyword(kind: TTokType): boolean;
|
|
|
|
procedure openLexer(out lex: TLexer; const filename: string;
|
|
inputstream: PLLStream);
|
|
|
|
procedure rawGetTok(var L: TLexer; var tok: TToken);
|
|
// reads in the next token into tok and skips it
|
|
|
|
function getColumn(const L: TLexer): int;
|
|
|
|
function getLineInfo(const L: TLexer): TLineInfo;
|
|
|
|
procedure closeLexer(var lex: TLexer);
|
|
|
|
procedure PrintTok(tok: PToken);
|
|
function tokToStr(tok: PToken): string;
|
|
|
|
// auxiliary functions:
|
|
procedure lexMessage(const L: TLexer; const msg: TMsgKind;
|
|
const arg: string = '');
|
|
|
|
// the Pascal scanner uses this too:
|
|
procedure fillToken(var L: TToken);
|
|
|
|
implementation
|
|
|
|
function isKeyword(kind: TTokType): boolean;
|
|
begin
|
|
result := (kind >= tokKeywordLow) and (kind <= tokKeywordHigh)
|
|
end;
|
|
|
|
procedure pushInd(var L: TLexer; indent: int);
|
|
var
|
|
len: int;
|
|
begin
|
|
len := length(L.indentStack);
|
|
setLength(L.indentStack, len+1);
|
|
if (indent > L.indentStack[len-1]) then
|
|
L.indentstack[len] := indent
|
|
else
|
|
InternalError('pushInd');
|
|
//writeln('push indent ', indent);
|
|
end;
|
|
|
|
procedure popInd(var L: TLexer);
|
|
var
|
|
len: int;
|
|
begin
|
|
len := length(L.indentStack);
|
|
setLength(L.indentStack, len-1);
|
|
end;
|
|
|
|
function findIdent(const L: TLexer; indent: int): boolean;
|
|
var
|
|
i: int;
|
|
begin
|
|
for i := length(L.indentStack)-1 downto 0 do
|
|
if L.indentStack[i] = indent then begin result := true; exit end;
|
|
result := false
|
|
end;
|
|
|
|
function tokToStr(tok: PToken): string;
|
|
begin
|
|
case tok.tokType of
|
|
tkIntLit..tkInt64Lit:
|
|
result := toString(tok.iNumber);
|
|
tkFloatLit..tkFloat64Lit:
|
|
result := toStringF(tok.fNumber);
|
|
tkInvalid, tkStrLit..tkCharLit, tkComment:
|
|
result := tok.literal;
|
|
tkParLe..tkColon, tkEof, tkInd, tkSad, tkDed, tkAccent:
|
|
result := tokTypeToStr[tok.tokType];
|
|
else if (tok.ident <> nil) then
|
|
result := tok.ident.s
|
|
else begin
|
|
InternalError('tokToStr');
|
|
result := ''
|
|
end
|
|
end
|
|
end;
|
|
|
|
procedure PrintTok(tok: PToken);
|
|
begin
|
|
write(output, TokTypeToStr[tok.tokType]);
|
|
write(output, ' '+'');
|
|
writeln(output, tokToStr(tok))
|
|
end;
|
|
|
|
// ----------------------------------------------------------------------------
|
|
|
|
var
|
|
dummyIdent: PIdent;
|
|
|
|
procedure fillToken(var L: TToken);
|
|
begin
|
|
L.TokType := tkInvalid;
|
|
L.iNumber := 0;
|
|
L.Indent := 0;
|
|
L.literal := '';
|
|
L.fNumber := 0.0;
|
|
L.base := base10;
|
|
L.ident := dummyIdent; // this prevents many bugs!
|
|
end;
|
|
|
|
procedure openLexer(out lex: TLexer; const filename: string;
|
|
inputstream: PLLStream);
|
|
begin
|
|
{@ignore}
|
|
FillChar(lex, sizeof(lex), 0);
|
|
{@emit}
|
|
openBaseLexer(lex, inputstream);
|
|
{@ignore}
|
|
setLength(lex.indentStack, 1);
|
|
lex.indentStack[0] := 0;
|
|
{@emit lex.indentStack := @[0]; }
|
|
lex.filename := filename;
|
|
lex.indentAhead := -1;
|
|
end;
|
|
|
|
procedure closeLexer(var lex: TLexer);
|
|
begin
|
|
inc(gLinesCompiled, lex.LineNumber);
|
|
closeBaseLexer(lex);
|
|
end;
|
|
|
|
function getColumn(const L: TLexer): int;
|
|
begin
|
|
result := getColNumber(L, L.bufPos)
|
|
end;
|
|
|
|
function getLineInfo(const L: TLexer): TLineInfo;
|
|
begin
|
|
result := newLineInfo(L.filename, L.linenumber, getColNumber(L, L.bufpos))
|
|
end;
|
|
|
|
procedure lexMessage(const L: TLexer; const msg: TMsgKind;
|
|
const arg: string = '');
|
|
begin
|
|
msgs.liMessage(getLineInfo(L), msg, arg)
|
|
end;
|
|
|
|
procedure lexMessagePos(var L: TLexer; const msg: TMsgKind; pos: int;
|
|
const arg: string = '');
|
|
var
|
|
info: TLineInfo;
|
|
begin
|
|
info := newLineInfo(L.filename, L.linenumber, pos - L.lineStart);
|
|
msgs.liMessage(info, msg, arg);
|
|
end;
|
|
|
|
// ----------------------------------------------------------------------------
|
|
|
|
procedure matchUnderscoreChars(var L: TLexer; var tok: TToken;
|
|
const chars: TCharSet);
|
|
// matches ([chars]_)*
|
|
var
|
|
pos: int;
|
|
buf: PChar;
|
|
begin
|
|
pos := L.bufpos; // use registers for pos, buf
|
|
buf := L.buf;
|
|
repeat
|
|
if buf[pos] in chars then begin
|
|
addChar(tok.literal, buf[pos]);
|
|
Inc(pos)
|
|
end
|
|
else break;
|
|
if buf[pos] = '_' then begin
|
|
addChar(tok.literal, '_');
|
|
Inc(pos);
|
|
end;
|
|
until false;
|
|
L.bufPos := pos;
|
|
end;
|
|
|
|
function matchTwoChars(const L: TLexer; first: Char;
|
|
const second: TCharSet): Boolean;
|
|
begin
|
|
result := (L.buf[L.bufpos] = first) and (L.buf[L.bufpos+1] in Second);
|
|
end;
|
|
|
|
function isFloatLiteral(const s: string): boolean;
|
|
var
|
|
i: int;
|
|
begin
|
|
for i := strStart to length(s)+strStart-1 do
|
|
if s[i] in ['.','e','E'] then begin
|
|
result := true; exit
|
|
end;
|
|
result := false
|
|
end;
|
|
|
|
function GetNumber(var L: TLexer): TToken;
|
|
var
|
|
pos, endpos: int;
|
|
xi: biggestInt;
|
|
begin
|
|
// get the base:
|
|
result.tokType := tkIntLit; // int literal until we know better
|
|
result.literal := '';
|
|
result.base := base10; // BUGFIX
|
|
pos := L.bufpos;
|
|
// make sure the literal is correct for error messages:
|
|
matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9']);
|
|
if (L.buf[L.bufpos] = '.') and (L.buf[L.bufpos+1] in ['0'..'9']) then begin
|
|
addChar(result.literal, '.');
|
|
inc(L.bufpos);
|
|
//matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9'])
|
|
matchUnderscoreChars(L, result, ['0'..'9']);
|
|
if L.buf[L.bufpos] in ['e', 'E'] then begin
|
|
addChar(result.literal, 'e');
|
|
inc(L.bufpos);
|
|
if L.buf[L.bufpos] in ['+', '-'] then begin
|
|
addChar(result.literal, L.buf[L.bufpos]);
|
|
inc(L.bufpos);
|
|
end;
|
|
matchUnderscoreChars(L, result, ['0'..'9']);
|
|
end
|
|
end;
|
|
endpos := L.bufpos;
|
|
if L.buf[endpos] = '''' then begin
|
|
//matchUnderscoreChars(L, result, ['''', 'f', 'F', 'i', 'I', '0'..'9']);
|
|
inc(endpos);
|
|
L.bufpos := pos; // restore position
|
|
case L.buf[endpos] of
|
|
'f', 'F': begin
|
|
inc(endpos);
|
|
if (L.buf[endpos] = '6') and (L.buf[endpos+1] = '4') then begin
|
|
result.tokType := tkFloat64Lit;
|
|
inc(endpos, 2);
|
|
end
|
|
else if (L.buf[endpos] = '3') and (L.buf[endpos+1] = '2') then begin
|
|
result.tokType := tkFloat32Lit;
|
|
inc(endpos, 2);
|
|
end
|
|
else lexMessage(L, errInvalidNumber, result.literal);
|
|
end;
|
|
'i', 'I': begin
|
|
inc(endpos);
|
|
if (L.buf[endpos] = '6') and (L.buf[endpos+1] = '4') then begin
|
|
result.tokType := tkInt64Lit;
|
|
inc(endpos, 2);
|
|
end
|
|
else if (L.buf[endpos] = '3') and (L.buf[endpos+1] = '2') then begin
|
|
result.tokType := tkInt32Lit;
|
|
inc(endpos, 2);
|
|
end
|
|
else if (L.buf[endpos] = '1') and (L.buf[endpos+1] = '6') then begin
|
|
result.tokType := tkInt16Lit;
|
|
inc(endpos, 2);
|
|
end
|
|
else if (L.buf[endpos] = '8') then begin
|
|
result.tokType := tkInt8Lit;
|
|
inc(endpos);
|
|
end
|
|
else lexMessage(L, errInvalidNumber, result.literal);
|
|
end;
|
|
else lexMessage(L, errInvalidNumber, result.literal);
|
|
end
|
|
end
|
|
else
|
|
L.bufpos := pos; // restore position
|
|
|
|
try
|
|
if (L.buf[pos] = '0') and (L.buf[pos+1] in ['x','X','b','B','o','O','c','C'])
|
|
then begin
|
|
inc(pos, 2);
|
|
xi := 0;
|
|
// it may be a base prefix
|
|
case L.buf[pos-1] of
|
|
'b', 'B': begin
|
|
result.base := base2;
|
|
while true do begin
|
|
case L.buf[pos] of
|
|
'A'..'Z', 'a'..'z', '2'..'9', '.': begin
|
|
lexMessage(L, errInvalidNumber, result.literal);
|
|
inc(pos)
|
|
end;
|
|
'_': inc(pos);
|
|
'0', '1': begin
|
|
xi := shlu(xi, 1) or (ord(L.buf[pos]) - ord('0'));
|
|
inc(pos);
|
|
end;
|
|
else break;
|
|
end
|
|
end
|
|
end;
|
|
'o', 'c', 'C': begin
|
|
result.base := base8;
|
|
while true do begin
|
|
case L.buf[pos] of
|
|
'A'..'Z', 'a'..'z', '8'..'9', '.': begin
|
|
lexMessage(L, errInvalidNumber, result.literal);
|
|
inc(pos)
|
|
end;
|
|
'_': inc(pos);
|
|
'0'..'7': begin
|
|
xi := shlu(xi, 3) or (ord(L.buf[pos]) - ord('0'));
|
|
inc(pos);
|
|
end;
|
|
else break;
|
|
end
|
|
end
|
|
end;
|
|
'O': lexMessage(L, errInvalidNumber, result.literal);
|
|
'x', 'X': begin
|
|
result.base := base16;
|
|
while true do begin
|
|
case L.buf[pos] of
|
|
'G'..'Z', 'g'..'z', '.': begin
|
|
lexMessage(L, errInvalidNumber, result.literal);
|
|
inc(pos);
|
|
end;
|
|
'_': inc(pos);
|
|
'0'..'9': begin
|
|
xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('0'));
|
|
inc(pos);
|
|
end;
|
|
'a'..'f': begin
|
|
xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10);
|
|
inc(pos);
|
|
end;
|
|
'A'..'F': begin
|
|
xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10);
|
|
inc(pos);
|
|
end;
|
|
else break;
|
|
end
|
|
end
|
|
end;
|
|
else InternalError(getLineInfo(L), 'getNumber');
|
|
end;
|
|
// now look at the optional type suffix:
|
|
case result.tokType of
|
|
tkIntLit, tkInt64Lit:
|
|
result.iNumber := xi;
|
|
tkInt8Lit:
|
|
result.iNumber := biggestInt(int8(toU8(int(xi))));
|
|
tkInt16Lit:
|
|
result.iNumber := biggestInt(toU16(int(xi)));
|
|
tkInt32Lit:
|
|
result.iNumber := biggestInt(toU32(xi));
|
|
tkFloat32Lit:
|
|
result.fNumber := ({@cast}PFloat32(addr(xi)))^;
|
|
// note: this code is endian neutral!
|
|
// XXX: Test this on big endian machine!
|
|
tkFloat64Lit:
|
|
result.fNumber := ({@cast}PFloat64(addr(xi)))^;
|
|
else InternalError(getLineInfo(L), 'getNumber');
|
|
end
|
|
end
|
|
else if isFloatLiteral(result.literal)
|
|
or (result.tokType = tkFloat32Lit)
|
|
or (result.tokType = tkFloat64Lit) then begin
|
|
result.fnumber := parseFloat(result.literal);
|
|
if result.tokType = tkIntLit then result.tokType := tkFloatLit;
|
|
end
|
|
else begin
|
|
result.iNumber := ParseBiggestInt(result.literal);
|
|
if (result.iNumber < low(int32)) or (result.iNumber > high(int32)) then
|
|
begin
|
|
if result.tokType = tkIntLit then result.tokType := tkInt64Lit
|
|
else if result.tokType <> tkInt64Lit then
|
|
lexMessage(L, errInvalidNumber, result.literal);
|
|
end
|
|
end;
|
|
except
|
|
on EInvalidValue do
|
|
lexMessage(L, errInvalidNumber, result.literal);
|
|
{@ignore}
|
|
on sysutils.EIntOverflow do
|
|
lexMessage(L, errNumberOutOfRange, result.literal);
|
|
{@emit}
|
|
on EOverflow do
|
|
lexMessage(L, errNumberOutOfRange, result.literal);
|
|
on EOutOfRange do
|
|
lexMessage(L, errNumberOutOfRange, result.literal);
|
|
end;
|
|
L.bufpos := endpos;
|
|
end;
|
|
|
|
procedure handleHexChar(var L: TLexer; var xi: int);
|
|
begin
|
|
case L.buf[L.bufpos] of
|
|
'0'..'9': begin
|
|
xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0'));
|
|
inc(L.bufpos);
|
|
end;
|
|
'a'..'f': begin
|
|
xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('a') + 10);
|
|
inc(L.bufpos);
|
|
end;
|
|
'A'..'F': begin
|
|
xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10);
|
|
inc(L.bufpos);
|
|
end;
|
|
else begin end // do nothing
|
|
end
|
|
end;
|
|
|
|
procedure handleDecChars(var L: TLexer; var xi: int);
|
|
begin
|
|
while L.buf[L.bufpos] in ['0'..'9'] do begin
|
|
xi := (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0'));
|
|
inc(L.bufpos);
|
|
end;
|
|
end;
|
|
|
|
procedure getEscapedChar(var L: TLexer; var tok: TToken);
|
|
var
|
|
xi: int;
|
|
begin
|
|
inc(L.bufpos); // skip '\'
|
|
case L.buf[L.bufpos] of
|
|
'n', 'N': begin
|
|
if tok.toktype = tkCharLit then
|
|
lexMessage(L, errNnotAllowedInCharacter);
|
|
tok.literal := tok.literal +{&} tnl;
|
|
Inc(L.bufpos);
|
|
end;
|
|
'r', 'R', 'c', 'C': begin addChar(tok.literal, CR); Inc(L.bufpos); end;
|
|
'l', 'L': begin addChar(tok.literal, LF); Inc(L.bufpos); end;
|
|
'f', 'F': begin addChar(tok.literal, FF); inc(L.bufpos); end;
|
|
'e', 'E': begin addChar(tok.literal, ESC); Inc(L.bufpos); end;
|
|
'a', 'A': begin addChar(tok.literal, BEL); Inc(L.bufpos); end;
|
|
'b', 'B': begin addChar(tok.literal, BACKSPACE); Inc(L.bufpos); end;
|
|
'v', 'V': begin addChar(tok.literal, VT); Inc(L.bufpos); end;
|
|
't', 'T': begin addChar(tok.literal, Tabulator); Inc(L.bufpos); end;
|
|
'''', '"': begin addChar(tok.literal, L.buf[L.bufpos]); Inc(L.bufpos); end;
|
|
'\': begin addChar(tok.literal, '\'); Inc(L.bufpos) end;
|
|
'x', 'X': begin
|
|
inc(L.bufpos);
|
|
xi := 0;
|
|
handleHexChar(L, xi);
|
|
handleHexChar(L, xi);
|
|
addChar(tok.literal, Chr(xi));
|
|
end;
|
|
'0'..'9': begin
|
|
if matchTwoChars(L, '0', ['0'..'9']) then
|
|
// this warning will make it easier for newcomers:
|
|
lexMessage(L, warnOctalEscape);
|
|
xi := 0;
|
|
handleDecChars(L, xi);
|
|
if (xi <= 255) then
|
|
addChar(tok.literal, Chr(xi))
|
|
else
|
|
lexMessage(L, errInvalidCharacterConstant)
|
|
end
|
|
else lexMessage(L, errInvalidCharacterConstant)
|
|
end
|
|
end;
|
|
|
|
function HandleCRLF(var L: TLexer; pos: int): int;
|
|
begin
|
|
case L.buf[pos] of
|
|
CR: begin
|
|
if getColNumber(L, pos) > MaxLineLength then
|
|
lexMessagePos(L, hintLineTooLong, pos);
|
|
result := lexbase.HandleCR(L, pos)
|
|
end;
|
|
LF: begin
|
|
if getColNumber(L, pos) > MaxLineLength then
|
|
lexMessagePos(L, hintLineTooLong, pos);
|
|
result := lexbase.HandleLF(L, pos)
|
|
end;
|
|
else result := pos
|
|
end
|
|
end;
|
|
|
|
procedure getString(var L: TLexer; var tok: TToken; rawMode: Boolean);
|
|
var
|
|
line, line2, pos: int;
|
|
c: Char;
|
|
buf: PChar;
|
|
begin
|
|
pos := L.bufPos + 1; // skip "
|
|
buf := L.buf; // put `buf` in a register
|
|
line := L.linenumber; // save linenumber for better error message
|
|
if (buf[pos] = '"') and (buf[pos+1] = '"') then begin
|
|
tok.tokType := tkTripleStrLit;
|
|
// long string literal:
|
|
inc(pos, 2); // skip ""
|
|
// skip leading newline:
|
|
pos := HandleCRLF(L, pos);
|
|
buf := L.buf;
|
|
repeat
|
|
case buf[pos] of
|
|
'"': begin
|
|
if (buf[pos+1] = '"') and (buf[pos+2] = '"') then
|
|
break;
|
|
addChar(tok.literal, '"');
|
|
Inc(pos)
|
|
end;
|
|
CR, LF: begin
|
|
pos := HandleCRLF(L, pos);
|
|
buf := L.buf;
|
|
tok.literal := tok.literal +{&} tnl;
|
|
end;
|
|
lexbase.EndOfFile: begin
|
|
line2 := L.linenumber;
|
|
L.LineNumber := line;
|
|
lexMessagePos(L, errClosingTripleQuoteExpected, L.lineStart);
|
|
L.LineNumber := line2;
|
|
break
|
|
end
|
|
else begin
|
|
addChar(tok.literal, buf[pos]);
|
|
Inc(pos)
|
|
end
|
|
end
|
|
until false;
|
|
L.bufpos := pos + 3 // skip the three """
|
|
end
|
|
else begin // ordinary string literal
|
|
if rawMode then tok.tokType := tkRStrLit
|
|
else tok.tokType := tkStrLit;
|
|
repeat
|
|
c := buf[pos];
|
|
if c = '"' then begin
|
|
inc(pos); // skip '"'
|
|
break
|
|
end;
|
|
if c in [CR, LF, lexbase.EndOfFile] then begin
|
|
lexMessage(L, errClosingQuoteExpected);
|
|
break
|
|
end;
|
|
if (c = '\') and not rawMode then begin
|
|
L.bufPos := pos;
|
|
getEscapedChar(L, tok);
|
|
pos := L.bufPos;
|
|
end
|
|
else begin
|
|
addChar(tok.literal, c);
|
|
Inc(pos)
|
|
end
|
|
until false;
|
|
L.bufpos := pos;
|
|
end
|
|
end;
|
|
|
|
procedure getCharacter(var L: TLexer; var tok: TToken);
|
|
var
|
|
c: Char;
|
|
begin
|
|
Inc(L.bufpos); // skip '
|
|
c := L.buf[L.bufpos];
|
|
case c of
|
|
#0..Pred(' '), '''': lexMessage(L, errInvalidCharacterConstant);
|
|
'\': getEscapedChar(L, tok);
|
|
else begin
|
|
tok.literal := c + '';
|
|
Inc(L.bufpos);
|
|
end
|
|
end;
|
|
if L.buf[L.bufpos] <> '''' then lexMessage(L, errMissingFinalQuote);
|
|
inc(L.bufpos); // skip '
|
|
end;
|
|
|
|
{@ignore}
|
|
{$ifopt Q+} {$define Q_on} {$Q-} {$endif}
|
|
{$ifopt R+} {$define R_on} {$R-} {$endif}
|
|
{@emit}
|
|
procedure getSymbol(var L: TLexer; var tok: TToken);
|
|
var
|
|
pos: int;
|
|
c: Char;
|
|
buf: pchar;
|
|
h: THash; // hashing algorithm inlined
|
|
begin
|
|
h := 0;
|
|
pos := L.bufpos;
|
|
buf := L.buf;
|
|
while true do begin
|
|
c := buf[pos];
|
|
case c of
|
|
'a'..'z', '0'..'9', #128..#255: begin
|
|
h := h +{%} Ord(c);
|
|
h := h +{%} h shl 10;
|
|
h := h xor (h shr 6)
|
|
end;
|
|
'A'..'Z': begin
|
|
c := chr(ord(c) + (ord('a')-ord('A'))); // toLower()
|
|
h := h +{%} Ord(c);
|
|
h := h +{%} h shl 10;
|
|
h := h xor (h shr 6)
|
|
end;
|
|
'_': begin end;
|
|
else break
|
|
end;
|
|
Inc(pos)
|
|
end;
|
|
h := h +{%} h shl 3;
|
|
h := h xor (h shr 11);
|
|
h := h +{%} h shl 15;
|
|
tok.ident := getIdent(addr(L.buf[L.bufpos]), pos-L.bufpos, h);
|
|
L.bufpos := pos;
|
|
if (tok.ident.id < ord(tokKeywordLow)-ord(tkSymbol)) or
|
|
(tok.ident.id > ord(tokKeywordHigh)-ord(tkSymbol)) then
|
|
tok.tokType := tkSymbol
|
|
else
|
|
tok.tokType := TTokType(tok.ident.id+ord(tkSymbol));
|
|
if buf[pos] = '"' then begin
|
|
getString(L, tok, true);
|
|
if tok.tokType = tkRStrLit then tok.tokType := tkCallRStrLit
|
|
else tok.tokType := tkCallTripleStrLit
|
|
end
|
|
end;
|
|
|
|
procedure getOperator(var L: TLexer; var tok: TToken);
|
|
var
|
|
pos: int;
|
|
c: Char;
|
|
buf: pchar;
|
|
h: THash; // hashing algorithm inlined
|
|
begin
|
|
pos := L.bufpos;
|
|
buf := L.buf;
|
|
h := 0;
|
|
while true do begin
|
|
c := buf[pos];
|
|
if c in OpChars then begin
|
|
h := h +{%} Ord(c);
|
|
h := h +{%} h shl 10;
|
|
h := h xor (h shr 6)
|
|
end
|
|
else break;
|
|
Inc(pos)
|
|
end;
|
|
h := h +{%} h shl 3;
|
|
h := h xor (h shr 11);
|
|
h := h +{%} h shl 15;
|
|
tok.ident := getIdent(addr(L.buf[L.bufpos]), pos-L.bufpos, h);
|
|
if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh) then
|
|
tok.tokType := tkOpr
|
|
else
|
|
tok.tokType := TTokType(tok.ident.id - oprLow + ord(tkColon));
|
|
L.bufpos := pos
|
|
end;
|
|
{@ignore}
|
|
{$ifdef Q_on} {$undef Q_on} {$Q+} {$endif}
|
|
{$ifdef R_on} {$undef R_on} {$R+} {$endif}
|
|
{@emit}
|
|
|
|
procedure handleIndentation(var L: TLexer; var tok: TToken; indent: int);
|
|
var
|
|
i: int;
|
|
begin
|
|
tok.indent := indent;
|
|
i := high(L.indentStack);
|
|
if indent > L.indentStack[i] then
|
|
tok.tokType := tkInd
|
|
else if indent = L.indentStack[i] then
|
|
tok.tokType := tkSad
|
|
else begin
|
|
// check we have the indentation somewhere in the stack:
|
|
while (i >= 0) and (indent <> L.indentStack[i]) do begin
|
|
dec(i);
|
|
inc(L.dedent);
|
|
end;
|
|
dec(L.dedent);
|
|
tok.tokType := tkDed;
|
|
if i < 0 then begin
|
|
tok.tokType := tkSad; // for the parser it is better as SAD
|
|
lexMessage(L, errInvalidIndentation);
|
|
end
|
|
end
|
|
end;
|
|
|
|
procedure scanComment(var L: TLexer; var tok: TToken);
|
|
var
|
|
buf: PChar;
|
|
pos, col: int;
|
|
indent: int;
|
|
begin
|
|
pos := L.bufpos;
|
|
buf := L.buf;
|
|
// a comment ends if the next line does not start with the # on the same
|
|
// column after only whitespace
|
|
tok.tokType := tkComment;
|
|
col := getColNumber(L, pos);
|
|
while true do begin
|
|
while not (buf[pos] in [CR, LF, lexbase.EndOfFile]) do begin
|
|
addChar(tok.literal, buf[pos]); inc(pos);
|
|
end;
|
|
pos := handleCRLF(L, pos);
|
|
buf := L.buf;
|
|
indent := 0;
|
|
while buf[pos] = ' ' do begin inc(pos); inc(indent) end;
|
|
if (buf[pos] = '#') and (col = indent) then begin
|
|
tok.literal := tok.literal +{&} nl;
|
|
end
|
|
else begin
|
|
if buf[pos] > ' ' then begin
|
|
L.indentAhead := indent;
|
|
inc(L.dedent)
|
|
end;
|
|
break
|
|
end
|
|
end;
|
|
L.bufpos := pos;
|
|
end;
|
|
|
|
procedure skip(var L: TLexer; var tok: TToken);
|
|
var
|
|
buf: PChar;
|
|
indent, pos: int;
|
|
begin
|
|
pos := L.bufpos;
|
|
buf := L.buf;
|
|
repeat
|
|
case buf[pos] of
|
|
' ': Inc(pos);
|
|
Tabulator: begin
|
|
lexMessagePos(L, errTabulatorsAreNotAllowed, pos);
|
|
inc(pos); // BUGFIX
|
|
end;
|
|
// newline is special:
|
|
CR, LF: begin
|
|
pos := HandleCRLF(L, pos);
|
|
buf := L.buf;
|
|
indent := 0;
|
|
while buf[pos] = ' ' do begin
|
|
Inc(pos); Inc(indent)
|
|
end;
|
|
if (buf[pos] > ' ') then begin
|
|
handleIndentation(L, tok, indent);
|
|
break;
|
|
end
|
|
end;
|
|
else break // EndOfFile also leaves the loop
|
|
end
|
|
until false;
|
|
L.bufpos := pos;
|
|
end;
|
|
|
|
procedure rawGetTok(var L: TLexer; var tok: TToken);
|
|
var
|
|
c: Char;
|
|
begin
|
|
fillToken(tok);
|
|
if L.dedent > 0 then begin
|
|
dec(L.dedent);
|
|
if L.indentAhead >= 0 then begin
|
|
handleIndentation(L, tok, L.indentAhead);
|
|
L.indentAhead := -1;
|
|
end
|
|
else
|
|
tok.tokType := tkDed;
|
|
exit;
|
|
end;
|
|
// Skip whitespace, comments:
|
|
skip(L, tok); // skip
|
|
// got an documentation comment or tkIndent, return that:
|
|
if tok.toktype <> tkInvalid then exit;
|
|
|
|
c := L.buf[L.bufpos];
|
|
if c in SymStartChars - ['r', 'R', 'l'] then // common case first
|
|
getSymbol(L, tok)
|
|
else if c in ['0'..'9'] then
|
|
tok := getNumber(L)
|
|
else begin
|
|
case c of
|
|
'#': scanComment(L, tok);
|
|
':': begin
|
|
tok.tokType := tkColon;
|
|
inc(L.bufpos);
|
|
end;
|
|
',': begin
|
|
tok.toktype := tkComma;
|
|
Inc(L.bufpos)
|
|
end;
|
|
'l': begin
|
|
// if we parsed exactly one character and its a small L (l), this
|
|
// is treated as a warning because it may be confused with the number 1
|
|
if not (L.buf[L.bufpos+1] in (SymChars+['_'])) then
|
|
lexMessage(L, warnSmallLshouldNotBeUsed);
|
|
getSymbol(L, tok);
|
|
end;
|
|
'r', 'R': begin
|
|
if L.buf[L.bufPos+1] = '"' then begin
|
|
Inc(L.bufPos);
|
|
getString(L, tok, true);
|
|
end
|
|
else getSymbol(L, tok);
|
|
end;
|
|
'(': begin
|
|
Inc(L.bufpos);
|
|
if (L.buf[L.bufPos] = '.')
|
|
and (L.buf[L.bufPos+1] <> '.') then begin
|
|
tok.toktype := tkParDotLe;
|
|
Inc(L.bufpos);
|
|
end
|
|
else
|
|
tok.toktype := tkParLe;
|
|
end;
|
|
')': begin
|
|
tok.toktype := tkParRi;
|
|
Inc(L.bufpos)
|
|
end;
|
|
'[': begin
|
|
Inc(L.bufpos);
|
|
if (L.buf[L.bufPos] = '.')
|
|
and (L.buf[L.bufPos+1] <> '.') then begin
|
|
tok.toktype := tkBracketDotLe;
|
|
Inc(L.bufpos);
|
|
end
|
|
else
|
|
tok.toktype := tkBracketLe;
|
|
end;
|
|
']': begin
|
|
tok.toktype := tkBracketRi;
|
|
Inc(L.bufpos)
|
|
end;
|
|
'.': begin
|
|
if L.buf[L.bufPos+1] = ']' then begin
|
|
tok.tokType := tkBracketDotRi;
|
|
Inc(L.bufpos, 2);
|
|
end
|
|
else if L.buf[L.bufPos+1] = '}' then begin
|
|
tok.tokType := tkCurlyDotRi;
|
|
Inc(L.bufpos, 2);
|
|
end
|
|
else if L.buf[L.bufPos+1] = ')' then begin
|
|
tok.tokType := tkParDotRi;
|
|
Inc(L.bufpos, 2);
|
|
end
|
|
else
|
|
getOperator(L, tok)
|
|
end;
|
|
'{': begin
|
|
Inc(L.bufpos);
|
|
if (L.buf[L.bufPos] = '.')
|
|
and (L.buf[L.bufPos+1] <> '.') then begin
|
|
tok.toktype := tkCurlyDotLe;
|
|
Inc(L.bufpos);
|
|
end
|
|
else
|
|
tok.toktype := tkCurlyLe;
|
|
end;
|
|
'}': begin
|
|
tok.toktype := tkCurlyRi;
|
|
Inc(L.bufpos)
|
|
end;
|
|
';': begin
|
|
tok.toktype := tkSemiColon;
|
|
Inc(L.bufpos)
|
|
end;
|
|
'`': begin
|
|
tok.tokType := tkAccent;
|
|
Inc(L.bufpos);
|
|
end;
|
|
'"': getString(L, tok, false);
|
|
'''': begin
|
|
getCharacter(L, tok);
|
|
tok.tokType := tkCharLit;
|
|
end;
|
|
lexbase.EndOfFile: tok.toktype := tkEof;
|
|
else if c in OpChars then
|
|
getOperator(L, tok)
|
|
else begin
|
|
tok.literal := c + '';
|
|
tok.tokType := tkInvalid;
|
|
lexMessage(L, errInvalidToken, c +{&} ' (\' +{&} toString(ord(c)) + ')');
|
|
Inc(L.bufpos);
|
|
end
|
|
end
|
|
end
|
|
end;
|
|
|
|
initialization
|
|
dummyIdent := getIdent('');
|
|
end.
|