mirror of
https://github.com/nim-lang/Nim.git
synced 2025-12-30 18:02:05 +00:00
233 lines
6.0 KiB
ObjectPascal
233 lines
6.0 KiB
ObjectPascal
//
|
|
//
|
|
// The Nimrod Compiler
|
|
// (c) Copyright 2008 Andreas Rumpf
|
|
//
|
|
// See the file "copying.txt", included in this
|
|
// distribution, for details about the copyright.
|
|
//
|
|
unit lexbase;
|
|
|
|
// Base Object of a lexer with efficient buffer handling. In fact
|
|
// I believe that this is the most efficient method of buffer
|
|
// handling that exists! Only at line endings checks are necessary
|
|
// if the buffer needs refilling.
|
|
|
|
interface
|
|
|
|
uses
|
|
nsystem, llstream, charsets, strutils;
|
|
|
|
{@emit
|
|
const
|
|
Lrz = ' ';
|
|
Apo = '''';
|
|
Tabulator = #9;
|
|
ESC = #27;
|
|
CR = #13;
|
|
FF = #12;
|
|
LF = #10;
|
|
BEL = #7;
|
|
BACKSPACE = #8;
|
|
VT = #11;
|
|
}
|
|
|
|
const
|
|
EndOfFile = #0; // end of file marker
|
|
{ A little picture makes everything clear :-)
|
|
buf:
|
|
"Example Text\n ha!" bufLen = 17
|
|
^pos = 0 ^ sentinel = 12
|
|
}
|
|
NewLines = {@set}[CR, LF];
|
|
|
|
type
|
|
TBaseLexer = object(NObject)
|
|
bufpos: int;
|
|
buf: PChar;
|
|
bufLen: int; // length of buffer in characters
|
|
stream: PLLStream; // we read from this stream
|
|
LineNumber: int; // the current line number
|
|
// private data:
|
|
sentinel: int;
|
|
lineStart: int; // index of last line start in buffer
|
|
end;
|
|
|
|
procedure openBaseLexer(out L: TBaseLexer;
|
|
inputstream: PLLStream;
|
|
bufLen: int = 8192);
|
|
// 8K is a reasonable buffer size
|
|
|
|
procedure closeBaseLexer(var L: TBaseLexer);
|
|
|
|
function getCurrentLine(const L: TBaseLexer; marker: boolean = true): string;
|
|
function getColNumber(const L: TBaseLexer; pos: int): int;
|
|
|
|
function HandleCR(var L: TBaseLexer; pos: int): int;
|
|
// Call this if you scanned over CR in the buffer; it returns the
|
|
// position to continue the scanning from. `pos` must be the position
|
|
// of the CR.
|
|
|
|
function HandleLF(var L: TBaseLexer; pos: int): int;
|
|
// Call this if you scanned over LF in the buffer; it returns the the
|
|
// position to continue the scanning from. `pos` must be the position
|
|
// of the LF.
|
|
|
|
implementation
|
|
|
|
const
|
|
chrSize = sizeof(char);
|
|
|
|
procedure closeBaseLexer(var L: TBaseLexer);
|
|
begin
|
|
dealloc(L.buf);
|
|
LLStreamClose(L.stream);
|
|
end;
|
|
|
|
{@ignore}
|
|
{$ifdef false}
|
|
procedure printBuffer(const L: TBaseLexer);
|
|
var
|
|
i: int;
|
|
begin
|
|
writeln('____________________________________');
|
|
writeln('sentinel: ', L.sentinel);
|
|
writeln('bufLen: ', L.bufLen);
|
|
writeln('buf: ');
|
|
for i := 0 to L.bufLen-1 do write(L.buf[i]);
|
|
writeln(NL + '____________________________________');
|
|
end;
|
|
{$endif}
|
|
{@emit}
|
|
|
|
procedure FillBuffer(var L: TBaseLexer);
|
|
var
|
|
charsRead, toCopy, s: int; // all are in characters,
|
|
// not bytes (in case this
|
|
// is not the same)
|
|
oldBufLen: int;
|
|
begin
|
|
// we know here that pos == L.sentinel, but not if this proc
|
|
// is called the first time by initBaseLexer()
|
|
assert(L.sentinel < L.bufLen);
|
|
toCopy := L.BufLen - L.sentinel - 1;
|
|
assert(toCopy >= 0);
|
|
if toCopy > 0 then
|
|
MoveMem(L.buf, addr(L.buf[L.sentinel+1]), toCopy * chrSize);
|
|
// "moveMem" handles overlapping regions
|
|
charsRead := LLStreamRead(L.stream, addr(L.buf[toCopy]),
|
|
(L.sentinel+1) * chrSize) div chrSize;
|
|
s := toCopy + charsRead;
|
|
if charsRead < L.sentinel+1 then begin
|
|
L.buf[s] := EndOfFile; // set end marker
|
|
L.sentinel := s
|
|
end
|
|
else begin
|
|
// compute sentinel:
|
|
dec(s); // BUGFIX (valgrind)
|
|
while true do begin
|
|
assert(s < L.bufLen);
|
|
while (s >= 0) and not (L.buf[s] in NewLines) do Dec(s);
|
|
if s >= 0 then begin
|
|
// we found an appropriate character for a sentinel:
|
|
L.sentinel := s;
|
|
break
|
|
end
|
|
else begin
|
|
// rather than to give up here because the line is too long,
|
|
// double the buffer's size and try again:
|
|
oldBufLen := L.BufLen;
|
|
L.bufLen := L.BufLen * 2;
|
|
L.buf := {@cast}PChar(realloc(L.buf, L.bufLen*chrSize));
|
|
assert(L.bufLen - oldBuflen = oldBufLen);
|
|
charsRead := LLStreamRead(L.stream, addr(L.buf[oldBufLen]),
|
|
oldBufLen*chrSize) div chrSize;
|
|
if charsRead < oldBufLen then begin
|
|
L.buf[oldBufLen+charsRead] := EndOfFile;
|
|
L.sentinel := oldBufLen+charsRead;
|
|
break
|
|
end;
|
|
s := L.bufLen - 1
|
|
end
|
|
end
|
|
end
|
|
end;
|
|
|
|
function fillBaseLexer(var L: TBaseLexer; pos: int): int;
|
|
begin
|
|
assert(pos <= L.sentinel);
|
|
if pos < L.sentinel then begin
|
|
result := pos+1; // nothing to do
|
|
end
|
|
else begin
|
|
fillBuffer(L);
|
|
L.bufpos := 0; // XXX: is this really correct?
|
|
result := 0;
|
|
end;
|
|
L.lineStart := result;
|
|
end;
|
|
|
|
function HandleCR(var L: TBaseLexer; pos: int): int;
|
|
begin
|
|
assert(L.buf[pos] = CR);
|
|
inc(L.linenumber);
|
|
result := fillBaseLexer(L, pos);
|
|
if L.buf[result] = LF then begin
|
|
result := fillBaseLexer(L, result);
|
|
end;
|
|
//L.lastNL := result-1; // BUGFIX: was: result;
|
|
end;
|
|
|
|
function HandleLF(var L: TBaseLexer; pos: int): int;
|
|
begin
|
|
assert(L.buf[pos] = LF);
|
|
inc(L.linenumber);
|
|
result := fillBaseLexer(L, pos);
|
|
//L.lastNL := result-1; // BUGFIX: was: result;
|
|
end;
|
|
|
|
procedure skip_UTF_8_BOM(var L: TBaseLexer);
|
|
begin
|
|
if (L.buf[0] = #239) and (L.buf[1] = #187) and (L.buf[2] = #191) then begin
|
|
inc(L.bufpos, 3);
|
|
inc(L.lineStart, 3)
|
|
end
|
|
end;
|
|
|
|
procedure openBaseLexer(out L: TBaseLexer; inputstream: PLLStream;
|
|
bufLen: int = 8192);
|
|
begin
|
|
assert(bufLen > 0);
|
|
L.bufpos := 0;
|
|
L.bufLen := bufLen;
|
|
L.buf := {@cast}PChar(alloc(bufLen * chrSize));
|
|
L.sentinel := bufLen-1;
|
|
L.lineStart := 0;
|
|
L.linenumber := 1; // lines start at 1
|
|
L.stream := inputstream;
|
|
fillBuffer(L);
|
|
skip_UTF_8_BOM(L);
|
|
end;
|
|
|
|
function getColNumber(const L: TBaseLexer; pos: int): int;
|
|
begin
|
|
result := abs(pos - L.lineStart);
|
|
end;
|
|
|
|
function getCurrentLine(const L: TBaseLexer; marker: boolean = true): string;
|
|
var
|
|
i: int;
|
|
begin
|
|
result := '';
|
|
i := L.lineStart;
|
|
while not (L.buf[i] in [CR, LF, EndOfFile]) do begin
|
|
addChar(result, L.buf[i]);
|
|
inc(i)
|
|
end;
|
|
result := result +{&} NL;
|
|
if marker then
|
|
result := result +{&} RepeatChar(getColNumber(L, L.bufpos)) +{&} '^' +{&} NL
|
|
end;
|
|
|
|
end.
|