From 1102b8ac6e643c8f8428dd7db0994d26b0c65ea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arne=20D=C3=B6ring?= Date: Thu, 28 Feb 2019 22:57:57 +0100 Subject: [PATCH] StringStream and parseJson, parseCfg, parseSql et al for the vm (#10746) --- compiler/ccgmerge.nim | 16 ++--- compiler/lexer.nim | 123 ++++++++++++++------------------ compiler/nimlexbase.nim | 32 ++++----- compiler/vmgen.nim | 2 + lib/packages/docutils/rst.nim | 17 +++-- lib/pure/lexbase.nim | 51 +++++--------- lib/pure/parsecfg.nim | 24 +++---- lib/pure/parsecsv.nim | 15 ++-- lib/pure/parsejson.nim | 76 +++++++++----------- lib/pure/parsesql.nim | 101 ++++++++++++-------------- lib/pure/parsexml.nim | 129 +++++++++++++--------------------- lib/pure/pegs.nim | 43 +++++------- lib/pure/streams.nim | 33 ++++++++- nimsuggest/sexp.nim | 55 +++++++-------- tests/stdlib/tjsonmacro.nim | 40 +++++++++++ 15 files changed, 354 insertions(+), 403 deletions(-) diff --git a/compiler/ccgmerge.nim b/compiler/ccgmerge.nim index ccb5a7635d..56b17440e2 100644 --- a/compiler/ccgmerge.nim +++ b/compiler/ccgmerge.nim @@ -145,38 +145,34 @@ proc atEndMark(buf: cstring, pos: int): bool = proc readVerbatimSection(L: var TBaseLexer): Rope = var pos = L.bufpos - var buf = L.buf var r = newStringOfCap(30_000) while true: - case buf[pos] + case L.buf[pos] of CR: pos = nimlexbase.handleCR(L, pos) - buf = L.buf r.add('\L') of LF: pos = nimlexbase.handleLF(L, pos) - buf = L.buf r.add('\L') of '\0': doAssert(false, "ccgmerge: expected: " & NimMergeEndMark) break else: - if atEndMark(buf, pos): + if atEndMark(L.buf, pos): inc pos, NimMergeEndMark.len break - r.add(buf[pos]) + r.add(L.buf[pos]) inc pos L.bufpos = pos result = r.rope proc readKey(L: var TBaseLexer, result: var string) = var pos = L.bufpos - var buf = L.buf setLen(result, 0) - while buf[pos] in IdentChars: - result.add(buf[pos]) + while L.buf[pos] in IdentChars: + result.add(L.buf[pos]) inc pos - if buf[pos] != ':': doAssert(false, "ccgmerge: ':' expected") + if L.buf[pos] != ':': doAssert(false, "ccgmerge: ':' expected") L.bufpos = pos + 1 # skip ':' proc newFakeType(id: int): PType = diff --git a/compiler/lexer.nim b/compiler/lexer.nim index 5eaa4c09f1..0dd6245b02 100644 --- a/compiler/lexer.nim +++ b/compiler/lexer.nim @@ -318,17 +318,16 @@ template eatChar(L: var TLexer, t: var TToken) = proc getNumber(L: var TLexer, result: var TToken) = proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: set[char]): Natural = var pos = L.bufpos # use registers for pos, buf - var buf = L.buf result = 0 while true: - if buf[pos] in chars: - add(tok.literal, buf[pos]) + if L.buf[pos] in chars: + add(tok.literal, L.buf[pos]) inc(pos) inc(result) else: break - if buf[pos] == '_': - if buf[pos+1] notin chars: + if L.buf[pos] == '_': + if L.buf[pos+1] notin chars: lexMessage(L, errGenerated, "only single underscores may occur in a token and token may not " & "end with an underscore: e.g. '1__1' and '1_' are invalid") @@ -339,9 +338,8 @@ proc getNumber(L: var TLexer, result: var TToken) = proc matchChars(L: var TLexer, tok: var TToken, chars: set[char]) = var pos = L.bufpos # use registers for pos, buf - var buf = L.buf - while buf[pos] in chars: - add(tok.literal, buf[pos]) + while L.buf[pos] in chars: + add(tok.literal, L.buf[pos]) inc(pos) L.bufpos = pos @@ -800,25 +798,23 @@ type proc getString(L: var TLexer, tok: var TToken, mode: StringMode) = var pos = L.bufpos - var buf = L.buf # put `buf` in a register var line = L.lineNumber # save linenumber for better error message tokenBegin(tok, pos - ord(mode == raw)) inc pos # skip " - if buf[pos] == '\"' and buf[pos+1] == '\"': + if L.buf[pos] == '\"' and L.buf[pos+1] == '\"': tok.tokType = tkTripleStrLit # long string literal: inc(pos, 2) # skip "" # skip leading newline: - if buf[pos] in {' ', '\t'}: + if L.buf[pos] in {' ', '\t'}: var newpos = pos+1 - while buf[newpos] in {' ', '\t'}: inc newpos - if buf[newpos] in {CR, LF}: pos = newpos + while L.buf[newpos] in {' ', '\t'}: inc newpos + if L.buf[newpos] in {CR, LF}: pos = newpos pos = handleCRLF(L, pos) - buf = L.buf while true: - case buf[pos] + case L.buf[pos] of '\"': - if buf[pos+1] == '\"' and buf[pos+2] == '\"' and - buf[pos+3] != '\"': + if L.buf[pos+1] == '\"' and L.buf[pos+2] == '\"' and + L.buf[pos+3] != '\"': tokenEndIgnore(tok, pos+2) L.bufpos = pos + 3 # skip the three """ break @@ -827,7 +823,6 @@ proc getString(L: var TLexer, tok: var TToken, mode: StringMode) = of CR, LF: tokenEndIgnore(tok, pos) pos = handleCRLF(L, pos) - buf = L.buf add(tok.literal, "\n") of nimlexbase.EndOfFile: tokenEndIgnore(tok, pos) @@ -838,16 +833,16 @@ proc getString(L: var TLexer, tok: var TToken, mode: StringMode) = L.bufpos = pos break else: - add(tok.literal, buf[pos]) + add(tok.literal, L.buf[pos]) inc(pos) else: # ordinary string literal if mode != normal: tok.tokType = tkRStrLit else: tok.tokType = tkStrLit while true: - var c = buf[pos] + var c = L.buf[pos] if c == '\"': - if mode != normal and buf[pos+1] == '\"': + if mode != normal and L.buf[pos+1] == '\"': inc(pos, 2) add(tok.literal, '"') else: @@ -885,10 +880,9 @@ proc getCharacter(L: var TLexer, tok: var TToken) = proc getSymbol(L: var TLexer, tok: var TToken) = var h: Hash = 0 var pos = L.bufpos - var buf = L.buf tokenBegin(tok, pos) while true: - var c = buf[pos] + var c = L.buf[pos] case c of 'a'..'z', '0'..'9', '\x80'..'\xFF': h = h !& ord(c) @@ -898,7 +892,7 @@ proc getSymbol(L: var TLexer, tok: var TToken) = h = h !& ord(c) inc(pos) of '_': - if buf[pos+1] notin SymChars: + if L.buf[pos+1] notin SymChars: lexMessage(L, errGenerated, "invalid token: trailing underscore") break inc(pos) @@ -923,11 +917,10 @@ proc endOperator(L: var TLexer, tok: var TToken, pos: int, proc getOperator(L: var TLexer, tok: var TToken) = var pos = L.bufpos - var buf = L.buf tokenBegin(tok, pos) var h: Hash = 0 while true: - var c = buf[pos] + var c = L.buf[pos] if c notin OpChars: break h = h !& ord(c) inc(pos) @@ -936,10 +929,10 @@ proc getOperator(L: var TLexer, tok: var TToken) = # advance pos but don't store it in L.bufpos so the next token (which might # be an operator too) gets the preceding spaces: tok.strongSpaceB = 0 - while buf[pos] == ' ': + while L.buf[pos] == ' ': inc pos inc tok.strongSpaceB - if buf[pos] in {CR, LF, nimlexbase.EndOfFile}: + if L.buf[pos] in {CR, LF, nimlexbase.EndOfFile}: tok.strongSpaceB = -1 proc getPrecedence*(tok: TToken, strongSpaces: bool): int = @@ -980,9 +973,8 @@ proc getPrecedence*(tok: TToken, strongSpaces: bool): int = proc newlineFollows*(L: TLexer): bool = var pos = L.bufpos - var buf = L.buf while true: - case buf[pos] + case L.buf[pos] of ' ', '\t': inc(pos) of CR, LF: @@ -990,49 +982,47 @@ proc newlineFollows*(L: TLexer): bool = break of '#': inc(pos) - if buf[pos] == '#': inc(pos) - if buf[pos] != '[': return true + if L.buf[pos] == '#': inc(pos) + if L.buf[pos] != '[': return true else: break proc skipMultiLineComment(L: var TLexer; tok: var TToken; start: int; isDoc: bool) = var pos = start - var buf = L.buf var toStrip = 0 tokenBegin(tok, pos) # detect the amount of indentation: if isDoc: toStrip = getColNumber(L, pos) - while buf[pos] == ' ': inc pos - if buf[pos] in {CR, LF}: + while L.buf[pos] == ' ': inc pos + if L.buf[pos] in {CR, LF}: pos = handleCRLF(L, pos) - buf = L.buf toStrip = 0 - while buf[pos] == ' ': + while L.buf[pos] == ' ': inc pos inc toStrip var nesting = 0 while true: - case buf[pos] + case L.buf[pos] of '#': if isDoc: - if buf[pos+1] == '#' and buf[pos+2] == '[': + if L.buf[pos+1] == '#' and L.buf[pos+2] == '[': inc nesting tok.literal.add '#' - elif buf[pos+1] == '[': + elif L.buf[pos+1] == '[': inc nesting inc pos of ']': if isDoc: - if buf[pos+1] == '#' and buf[pos+2] == '#': + if L.buf[pos+1] == '#' and L.buf[pos+2] == '#': if nesting == 0: tokenEndIgnore(tok, pos+2) inc(pos, 3) break dec nesting tok.literal.add ']' - elif buf[pos+1] == '#': + elif L.buf[pos+1] == '#': if nesting == 0: tokenEndIgnore(tok, pos+1) inc(pos, 2) @@ -1042,14 +1032,13 @@ proc skipMultiLineComment(L: var TLexer; tok: var TToken; start: int; of CR, LF: tokenEndIgnore(tok, pos) pos = handleCRLF(L, pos) - buf = L.buf # strip leading whitespace: when defined(nimpretty): tok.literal.add "\L" if isDoc: when not defined(nimpretty): tok.literal.add "\n" inc tok.iNumber var c = toStrip - while buf[pos] == ' ' and c > 0: + while L.buf[pos] == ' ' and c > 0: inc pos dec c of nimlexbase.EndOfFile: @@ -1057,7 +1046,7 @@ proc skipMultiLineComment(L: var TLexer; tok: var TToken; start: int; lexMessagePos(L, errGenerated, pos, "end of multiline comment expected") break else: - if isDoc or defined(nimpretty): tok.literal.add buf[pos] + if isDoc or defined(nimpretty): tok.literal.add L.buf[pos] inc(pos) L.bufpos = pos when defined(nimpretty): @@ -1065,49 +1054,47 @@ proc skipMultiLineComment(L: var TLexer; tok: var TToken; start: int; proc scanComment(L: var TLexer, tok: var TToken) = var pos = L.bufpos - var buf = L.buf tok.tokType = tkComment # iNumber contains the number of '\n' in the token tok.iNumber = 0 - assert buf[pos+1] == '#' + assert L.buf[pos+1] == '#' when defined(nimpretty): tok.commentOffsetA = L.offsetBase + pos - 1 - if buf[pos+2] == '[': + if L.buf[pos+2] == '[': skipMultiLineComment(L, tok, pos+3, true) return tokenBegin(tok, pos) inc(pos, 2) var toStrip = 0 - while buf[pos] == ' ': + while L.buf[pos] == ' ': inc pos inc toStrip while true: var lastBackslash = -1 - while buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: - if buf[pos] == '\\': lastBackslash = pos+1 - add(tok.literal, buf[pos]) + while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: + if L.buf[pos] == '\\': lastBackslash = pos+1 + add(tok.literal, L.buf[pos]) inc(pos) tokenEndIgnore(tok, pos) pos = handleCRLF(L, pos) - buf = L.buf var indent = 0 - while buf[pos] == ' ': + while L.buf[pos] == ' ': inc(pos) inc(indent) - if buf[pos] == '#' and buf[pos+1] == '#': + if L.buf[pos] == '#' and L.buf[pos+1] == '#': tok.literal.add "\n" inc(pos, 2) var c = toStrip - while buf[pos] == ' ' and c > 0: + while L.buf[pos] == ' ' and c > 0: inc pos dec c inc tok.iNumber else: - if buf[pos] > ' ': + if L.buf[pos] > ' ': L.indentAhead = indent tokenEndIgnore(tok, pos) break @@ -1117,7 +1104,6 @@ proc scanComment(L: var TLexer, tok: var TToken) = proc skip(L: var TLexer, tok: var TToken) = var pos = L.bufpos - var buf = L.buf tokenBegin(tok, pos) tok.strongSpaceA = 0 when defined(nimpretty): @@ -1127,7 +1113,7 @@ proc skip(L: var TLexer, tok: var TToken) = tok.commentOffsetB = tok.commentOffsetA tok.line = -1 while true: - case buf[pos] + case L.buf[pos] of ' ': inc(pos) inc(tok.strongSpaceA) @@ -1137,13 +1123,12 @@ proc skip(L: var TLexer, tok: var TToken) = of CR, LF: tokenEndPrevious(tok, pos) pos = handleCRLF(L, pos) - buf = L.buf var indent = 0 while true: - if buf[pos] == ' ': + if L.buf[pos] == ' ': inc(pos) inc(indent) - elif buf[pos] == '#' and buf[pos+1] == '[': + elif L.buf[pos] == '#' and L.buf[pos+1] == '[': when defined(nimpretty): hasComment = true if tok.line < 0: @@ -1151,32 +1136,30 @@ proc skip(L: var TLexer, tok: var TToken) = commentIndent = indent skipMultiLineComment(L, tok, pos+2, false) pos = L.bufpos - buf = L.buf else: break tok.strongSpaceA = 0 when defined(nimpretty): - if buf[pos] == '#' and tok.line < 0: commentIndent = indent - if buf[pos] > ' ' and (buf[pos] != '#' or buf[pos+1] == '#'): + if L.buf[pos] == '#' and tok.line < 0: commentIndent = indent + if L.buf[pos] > ' ' and (L.buf[pos] != '#' or L.buf[pos+1] == '#'): tok.indent = indent L.currLineIndent = indent break of '#': # do not skip documentation comment: - if buf[pos+1] == '#': break + if L.buf[pos+1] == '#': break when defined(nimpretty): hasComment = true if tok.line < 0: tok.line = L.lineNumber - if buf[pos+1] == '[': + if L.buf[pos+1] == '[': skipMultiLineComment(L, tok, pos+2, false) pos = L.bufpos - buf = L.buf else: tokenBegin(tok, pos) - while buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: - when defined(nimpretty): tok.literal.add buf[pos] + while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: + when defined(nimpretty): tok.literal.add L.buf[pos] inc(pos) tokenEndIgnore(tok, pos+1) when defined(nimpretty): diff --git a/compiler/nimlexbase.nim b/compiler/nimlexbase.nim index 2e7416645e..214147a2b1 100644 --- a/compiler/nimlexbase.nim +++ b/compiler/nimlexbase.nim @@ -39,8 +39,7 @@ const type TBaseLexer* = object of RootObj bufpos*: int - buf*: cstring - bufLen*: int # length of buffer in characters + buf*: string stream*: PLLStream # we read from this stream lineNumber*: int # the current line number # private data: @@ -65,11 +64,7 @@ proc handleLF*(L: var TBaseLexer, pos: int): int # of the LF. # implementation -const - chrSize = sizeof(char) - proc closeBaseLexer(L: var TBaseLexer) = - dealloc(L.buf) llStreamClose(L.stream) proc fillBuffer(L: var TBaseLexer) = @@ -80,14 +75,13 @@ proc fillBuffer(L: var TBaseLexer) = oldBufLen: int # we know here that pos == L.sentinel, but not if this proc # is called the first time by initBaseLexer() - assert(L.sentinel < L.bufLen) - toCopy = L.bufLen - L.sentinel - 1 + assert(L.sentinel < L.buf.len) + toCopy = L.buf.len - L.sentinel - 1 assert(toCopy >= 0) if toCopy > 0: - moveMem(L.buf, addr(L.buf[L.sentinel + 1]), toCopy * chrSize) + moveMem(addr L.buf[0], addr L.buf[L.sentinel + 1], toCopy) # "moveMem" handles overlapping regions - charsRead = llStreamRead(L.stream, addr(L.buf[toCopy]), - (L.sentinel + 1) * chrSize) div chrSize + charsRead = llStreamRead(L.stream, addr L.buf[toCopy], L.sentinel + 1) s = toCopy + charsRead if charsRead < L.sentinel + 1: L.buf[s] = EndOfFile # set end marker @@ -96,7 +90,7 @@ proc fillBuffer(L: var TBaseLexer) = # compute sentinel: dec(s) # BUGFIX (valgrind) while true: - assert(s < L.bufLen) + assert(s < L.buf.len) while (s >= 0) and not (L.buf[s] in NewLines): dec(s) if s >= 0: # we found an appropriate character for a sentinel: @@ -105,17 +99,16 @@ proc fillBuffer(L: var TBaseLexer) = else: # rather than to give up here because the line is too long, # double the buffer's size and try again: - oldBufLen = L.bufLen - L.bufLen = L.bufLen * 2 - L.buf = cast[cstring](realloc(L.buf, L.bufLen * chrSize)) - assert(L.bufLen - oldBufLen == oldBufLen) + oldBufLen = L.buf.len + L.buf.setLen(L.buf.len * 2) + assert(L.buf.len - oldBufLen == oldBufLen) charsRead = llStreamRead(L.stream, addr(L.buf[oldBufLen]), - oldBufLen * chrSize) div chrSize + oldBufLen) if charsRead < oldBufLen: L.buf[oldBufLen + charsRead] = EndOfFile L.sentinel = oldBufLen + charsRead break - s = L.bufLen - 1 + s = L.buf.len - 1 proc fillBaseLexer(L: var TBaseLexer, pos: int): int = assert(pos <= L.sentinel) @@ -149,8 +142,7 @@ proc openBaseLexer(L: var TBaseLexer, inputstream: PLLStream, bufLen = 8192) = assert(bufLen > 0) L.bufpos = 0 L.offsetBase = 0 - L.bufLen = bufLen - L.buf = cast[cstring](alloc(bufLen * chrSize)) + L.buf = newString(bufLen) L.sentinel = bufLen - 1 L.lineStart = 0 L.lineNumber = 1 # lines start at 1 diff --git a/compiler/vmgen.nim b/compiler/vmgen.nim index ab2ac87072..092c25a46e 100644 --- a/compiler/vmgen.nim +++ b/compiler/vmgen.nim @@ -2039,6 +2039,8 @@ proc gen(c: PCtx; n: PNode; dest: var TDest; flags: TGenFlags = {}) = genConv(c, n, n.sons[1], dest) of nkObjDownConv: genConv(c, n, n.sons[0], dest) + of nkObjUpConv: + genConv(c, n, n.sons[0], dest) of nkVarSection, nkLetSection: unused(c, n, dest) genVarSection(c, n) diff --git a/lib/packages/docutils/rst.nim b/lib/packages/docutils/rst.nim index 615119135d..0b077b1f18 100644 --- a/lib/packages/docutils/rst.nim +++ b/lib/packages/docutils/rst.nim @@ -155,18 +155,17 @@ proc getAdornment(L: var Lexer, tok: var Token) = proc getIndentAux(L: var Lexer, start: int): int = var pos = start - var buf = L.buf # skip the newline (but include it in the token!) - if buf[pos] == '\x0D': - if buf[pos + 1] == '\x0A': inc(pos, 2) + if L.buf[pos] == '\x0D': + if L.buf[pos + 1] == '\x0A': inc(pos, 2) else: inc(pos) - elif buf[pos] == '\x0A': + elif L.buf[pos] == '\x0A': inc(pos) if L.skipPounds: - if buf[pos] == '#': inc(pos) - if buf[pos] == '#': inc(pos) + if L.buf[pos] == '#': inc(pos) + if L.buf[pos] == '#': inc(pos) while true: - case buf[pos] + case L.buf[pos] of ' ', '\x0B', '\x0C': inc(pos) inc(result) @@ -175,9 +174,9 @@ proc getIndentAux(L: var Lexer, start: int): int = result = result - (result mod 8) + 8 else: break # EndOfFile also leaves the loop - if buf[pos] == '\0': + if L.buf[pos] == '\0': result = 0 - elif (buf[pos] == '\x0A') or (buf[pos] == '\x0D'): + elif (L.buf[pos] == '\x0A') or (L.buf[pos] == '\x0D'): # look at the next line for proper indentation: result = getIndentAux(L, pos) L.bufpos = pos # no need to set back buf diff --git a/lib/pure/lexbase.nim b/lib/pure/lexbase.nim index e38acd5ef0..11ec45a372 100644 --- a/lib/pure/lexbase.nim +++ b/lib/pure/lexbase.nim @@ -28,11 +28,7 @@ type BaseLexer* = object of RootObj ## the base lexer. Inherit your lexer from ## this object. bufpos*: int ## the current position within the buffer - when defined(js): ## the buffer itself - buf*: string - else: - buf*: cstring - bufLen*: int ## length of buffer in characters + buf*: string ## the buffer itself input: Stream ## the input stream lineNumber*: int ## the current line number sentinel: int @@ -40,13 +36,8 @@ type offsetBase*: int # use ``offsetBase + bufpos`` to get the offset refillChars: set[char] -const - chrSize = sizeof(char) - proc close*(L: var BaseLexer) = ## closes the base lexer. This closes `L`'s associated stream too. - when not defined(js): - dealloc(L.buf) close(L.input) proc fillBuffer(L: var BaseLexer) = @@ -57,17 +48,21 @@ proc fillBuffer(L: var BaseLexer) = oldBufLen: int # we know here that pos == L.sentinel, but not if this proc # is called the first time by initBaseLexer() - assert(L.sentinel < L.bufLen) - toCopy = L.bufLen - L.sentinel - 1 + assert(L.sentinel + 1 <= L.buf.len) + toCopy = L.buf.len - (L.sentinel + 1) assert(toCopy >= 0) if toCopy > 0: when defined(js): - for i in 0 ..< toCopy: L.buf[i] = L.buf[L.sentinel + 1 + i] + for i in 0 ..< toCopy: + L.buf[i] = L.buf[L.sentinel + 1 + i] else: - # "moveMem" handles overlapping regions - moveMem(L.buf, addr L.buf[L.sentinel + 1], toCopy * chrSize) - charsRead = readData(L.input, addr(L.buf[toCopy]), - (L.sentinel + 1) * chrSize) div chrSize + when nimvm: + for i in 0 ..< toCopy: + L.buf[i] = L.buf[L.sentinel + 1 + i] + else: + # "moveMem" handles overlapping regions + moveMem(addr L.buf[0], addr L.buf[L.sentinel + 1], toCopy) + charsRead = L.input.readDataStr(L.buf, toCopy ..< toCopy + L.sentinel + 1) s = toCopy + charsRead if charsRead < L.sentinel + 1: L.buf[s] = EndOfFile # set end marker @@ -76,7 +71,7 @@ proc fillBuffer(L: var BaseLexer) = # compute sentinel: dec(s) # BUGFIX (valgrind) while true: - assert(s < L.bufLen) + assert(s < L.buf.len) while s >= 0 and L.buf[s] notin L.refillChars: dec(s) if s >= 0: # we found an appropriate character for a sentinel: @@ -85,20 +80,14 @@ proc fillBuffer(L: var BaseLexer) = else: # rather than to give up here because the line is too long, # double the buffer's size and try again: - oldBufLen = L.bufLen - L.bufLen = L.bufLen * 2 - when defined(js): - L.buf.setLen(L.bufLen) - else: - L.buf = cast[cstring](realloc(L.buf, L.bufLen * chrSize)) - assert(L.bufLen - oldBufLen == oldBufLen) - charsRead = readData(L.input, addr(L.buf[oldBufLen]), - oldBufLen * chrSize) div chrSize + oldBufLen = L.buf.len + L.buf.setLen(L.buf.len * 2) + charsRead = readDataStr(L.input, L.buf, oldBufLen ..< L.buf.len) if charsRead < oldBufLen: L.buf[oldBufLen + charsRead] = EndOfFile L.sentinel = oldBufLen + charsRead break - s = L.bufLen - 1 + s = L.buf.len - 1 proc fillBaseLexer(L: var BaseLexer, pos: int): int = assert(pos <= L.sentinel) @@ -148,12 +137,8 @@ proc open*(L: var BaseLexer, input: Stream, bufLen: int = 8192; L.input = input L.bufpos = 0 L.offsetBase = 0 - L.bufLen = bufLen L.refillChars = refillChars - when defined(js): - L.buf = newString(bufLen) - else: - L.buf = cast[cstring](alloc(bufLen * chrSize)) + L.buf = newString(bufLen) L.sentinel = bufLen - 1 L.lineStart = 0 L.lineNumber = 1 # lines start at 1 diff --git a/lib/pure/parsecfg.nim b/lib/pure/parsecfg.nim index 106d59017e..d043cd3215 100644 --- a/lib/pure/parsecfg.nim +++ b/lib/pure/parsecfg.nim @@ -261,35 +261,32 @@ proc handleCRLF(c: var CfgParser, pos: int): int = proc getString(c: var CfgParser, tok: var Token, rawMode: bool) = var pos = c.bufpos + 1 # skip " - var buf = c.buf # put `buf` in a register tok.kind = tkSymbol - if (buf[pos] == '"') and (buf[pos + 1] == '"'): + if (c.buf[pos] == '"') and (c.buf[pos + 1] == '"'): # long string literal: inc(pos, 2) # skip "" # skip leading newline: pos = handleCRLF(c, pos) - buf = c.buf while true: - case buf[pos] + case c.buf[pos] of '"': - if (buf[pos + 1] == '"') and (buf[pos + 2] == '"'): break + if (c.buf[pos + 1] == '"') and (c.buf[pos + 2] == '"'): break add(tok.literal, '"') inc(pos) of '\c', '\L': pos = handleCRLF(c, pos) - buf = c.buf add(tok.literal, "\n") of lexbase.EndOfFile: tok.kind = tkInvalid break else: - add(tok.literal, buf[pos]) + add(tok.literal, c.buf[pos]) inc(pos) c.bufpos = pos + 3 # skip the three """ else: # ordinary string literal while true: - var ch = buf[pos] + var ch = c.buf[pos] if ch == '"': inc(pos) # skip '"' break @@ -307,26 +304,23 @@ proc getString(c: var CfgParser, tok: var Token, rawMode: bool) = proc getSymbol(c: var CfgParser, tok: var Token) = var pos = c.bufpos - var buf = c.buf while true: - add(tok.literal, buf[pos]) + add(tok.literal, c.buf[pos]) inc(pos) - if not (buf[pos] in SymChars): break + if not (c.buf[pos] in SymChars): break c.bufpos = pos tok.kind = tkSymbol proc skip(c: var CfgParser) = var pos = c.bufpos - var buf = c.buf while true: - case buf[pos] + case c.buf[pos] of ' ', '\t': inc(pos) of '#', ';': - while not (buf[pos] in {'\c', '\L', lexbase.EndOfFile}): inc(pos) + while not (c.buf[pos] in {'\c', '\L', lexbase.EndOfFile}): inc(pos) of '\c', '\L': pos = handleCRLF(c, pos) - buf = c.buf else: break # EndOfFile also leaves the loop c.bufpos = pos diff --git a/lib/pure/parsecsv.nim b/lib/pure/parsecsv.nim index e0c4f38a4f..402e3ad318 100644 --- a/lib/pure/parsecsv.nim +++ b/lib/pure/parsecsv.nim @@ -156,44 +156,41 @@ proc open*(my: var CsvParser, filename: string, proc parseField(my: var CsvParser, a: var string) = var pos = my.bufpos - var buf = my.buf if my.skipWhite: - while buf[pos] in {' ', '\t'}: inc(pos) + while my.buf[pos] in {' ', '\t'}: inc(pos) setLen(a, 0) # reuse memory - if buf[pos] == my.quote and my.quote != '\0': + if my.buf[pos] == my.quote and my.quote != '\0': inc(pos) while true: - let c = buf[pos] + let c = my.buf[pos] if c == '\0': my.bufpos = pos # can continue after exception? error(my, pos, my.quote & " expected") break elif c == my.quote: - if my.esc == '\0' and buf[pos+1] == my.quote: + if my.esc == '\0' and my.buf[pos+1] == my.quote: add(a, my.quote) inc(pos, 2) else: inc(pos) break elif c == my.esc: - add(a, buf[pos+1]) + add(a, my.buf[pos+1]) inc(pos, 2) else: case c of '\c': pos = handleCR(my, pos) - buf = my.buf add(a, "\n") of '\l': pos = handleLF(my, pos) - buf = my.buf add(a, "\n") else: add(a, c) inc(pos) else: while true: - let c = buf[pos] + let c = my.buf[pos] if c == my.sep: break if c in {'\c', '\l', '\0'}: break add(a, c) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index 9c53af6a6c..abf2854dd4 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -182,11 +182,10 @@ proc parseEscapedUTF16*(buf: cstring, pos: var int): int = proc parseString(my: var JsonParser): TokKind = result = tkString var pos = my.bufpos + 1 - var buf = my.buf if my.rawStringLiterals: add(my.a, '"') while true: - case buf[pos] + case my.buf[pos] of '\0': my.err = errQuoteExpected result = tkError @@ -199,9 +198,9 @@ proc parseString(my: var JsonParser): TokKind = of '\\': if my.rawStringLiterals: add(my.a, '\\') - case buf[pos+1] + case my.buf[pos+1] of '\\', '"', '\'', '/': - add(my.a, buf[pos+1]) + add(my.a, my.buf[pos+1]) inc(pos, 2) of 'b': add(my.a, '\b') @@ -223,17 +222,17 @@ proc parseString(my: var JsonParser): TokKind = add(my.a, 'u') inc(pos, 2) var pos2 = pos - var r = parseEscapedUTF16(buf, pos) + var r = parseEscapedUTF16(my.buf, pos) if r < 0: my.err = errInvalidToken break # Deal with surrogates if (r and 0xfc00) == 0xd800: - if buf[pos] != '\\' or buf[pos+1] != 'u': + if my.buf[pos] != '\\' or my.buf[pos+1] != 'u': my.err = errInvalidToken break inc(pos, 2) - var s = parseEscapedUTF16(buf, pos) + var s = parseEscapedUTF16(my.buf, pos) if (s and 0xfc00) == 0xdc00 and s > 0: r = 0x10000 + (((r - 0xd800) shl 10) or (s - 0xdc00)) else: @@ -242,8 +241,8 @@ proc parseString(my: var JsonParser): TokKind = if my.rawStringLiterals: let length = pos - pos2 for i in 1 .. length: - if buf[pos2] in {'0'..'9', 'A'..'F', 'a'..'f'}: - add(my.a, buf[pos2]) + if my.buf[pos2] in {'0'..'9', 'A'..'F', 'a'..'f'}: + add(my.a, my.buf[pos2]) inc pos2 else: break @@ -251,61 +250,54 @@ proc parseString(my: var JsonParser): TokKind = add(my.a, toUTF8(Rune(r))) else: # don't bother with the error - add(my.a, buf[pos]) + add(my.a, my.buf[pos]) inc(pos) of '\c': pos = lexbase.handleCR(my, pos) - buf = my.buf add(my.a, '\c') of '\L': pos = lexbase.handleLF(my, pos) - buf = my.buf add(my.a, '\L') else: - add(my.a, buf[pos]) + add(my.a, my.buf[pos]) inc(pos) my.bufpos = pos # store back proc skip(my: var JsonParser) = var pos = my.bufpos - var buf = my.buf while true: - case buf[pos] + case my.buf[pos] of '/': - if buf[pos+1] == '/': + if my.buf[pos+1] == '/': # skip line comment: inc(pos, 2) while true: - case buf[pos] + case my.buf[pos] of '\0': break of '\c': pos = lexbase.handleCR(my, pos) - buf = my.buf break of '\L': pos = lexbase.handleLF(my, pos) - buf = my.buf break else: inc(pos) - elif buf[pos+1] == '*': + elif my.buf[pos+1] == '*': # skip long comment: inc(pos, 2) while true: - case buf[pos] + case my.buf[pos] of '\0': my.err = errEOC_Expected break of '\c': pos = lexbase.handleCR(my, pos) - buf = my.buf of '\L': pos = lexbase.handleLF(my, pos) - buf = my.buf of '*': inc(pos) - if buf[pos] == '/': + if my.buf[pos] == '/': inc(pos) break else: @@ -316,51 +308,47 @@ proc skip(my: var JsonParser) = inc(pos) of '\c': pos = lexbase.handleCR(my, pos) - buf = my.buf of '\L': pos = lexbase.handleLF(my, pos) - buf = my.buf else: break my.bufpos = pos proc parseNumber(my: var JsonParser) = var pos = my.bufpos - var buf = my.buf - if buf[pos] == '-': + if my.buf[pos] == '-': add(my.a, '-') inc(pos) - if buf[pos] == '.': + if my.buf[pos] == '.': add(my.a, "0.") inc(pos) else: - while buf[pos] in Digits: - add(my.a, buf[pos]) + while my.buf[pos] in Digits: + add(my.a, my.buf[pos]) inc(pos) - if buf[pos] == '.': + if my.buf[pos] == '.': add(my.a, '.') inc(pos) # digits after the dot: - while buf[pos] in Digits: - add(my.a, buf[pos]) + while my.buf[pos] in Digits: + add(my.a, my.buf[pos]) inc(pos) - if buf[pos] in {'E', 'e'}: - add(my.a, buf[pos]) + if my.buf[pos] in {'E', 'e'}: + add(my.a, my.buf[pos]) inc(pos) - if buf[pos] in {'+', '-'}: - add(my.a, buf[pos]) + if my.buf[pos] in {'+', '-'}: + add(my.a, my.buf[pos]) inc(pos) - while buf[pos] in Digits: - add(my.a, buf[pos]) + while my.buf[pos] in Digits: + add(my.a, my.buf[pos]) inc(pos) my.bufpos = pos proc parseName(my: var JsonParser) = var pos = my.bufpos - var buf = my.buf - if buf[pos] in IdentStartChars: - while buf[pos] in IdentChars: - add(my.a, buf[pos]) + if my.buf[pos] in IdentStartChars: + while my.buf[pos] in IdentChars: + add(my.a, my.buf[pos]) inc(pos) my.bufpos = pos diff --git a/lib/pure/parsesql.nim b/lib/pure/parsesql.nim index f0961829bc..abe712e7f2 100644 --- a/lib/pure/parsesql.nim +++ b/lib/pure/parsesql.nim @@ -148,35 +148,33 @@ proc handleCRLF(c: var SqlLexer, pos: int): int = proc skip(c: var SqlLexer) = var pos = c.bufpos - var buf = c.buf var nested = 0 while true: - case buf[pos] + case c.buf[pos] of ' ', '\t': inc(pos) of '-': - if buf[pos+1] == '-': - while not (buf[pos] in {'\c', '\L', lexbase.EndOfFile}): inc(pos) + if c.buf[pos+1] == '-': + while not (c.buf[pos] in {'\c', '\L', lexbase.EndOfFile}): inc(pos) else: break of '/': - if buf[pos+1] == '*': + if c.buf[pos+1] == '*': inc(pos,2) while true: - case buf[pos] + case c.buf[pos] of '\0': break of '\c', '\L': pos = handleCRLF(c, pos) - buf = c.buf of '*': - if buf[pos+1] == '/': + if c.buf[pos+1] == '/': inc(pos, 2) if nested <= 0: break dec(nested) else: inc(pos) of '/': - if buf[pos+1] == '*': + if c.buf[pos+1] == '*': inc(pos, 2) inc(nested) else: @@ -185,21 +183,19 @@ proc skip(c: var SqlLexer) = else: break of '\c', '\L': pos = handleCRLF(c, pos) - buf = c.buf else: break # EndOfFile also leaves the loop c.bufpos = pos proc getString(c: var SqlLexer, tok: var Token, kind: TokKind) = var pos = c.bufpos + 1 - var buf = c.buf tok.kind = kind block parseLoop: while true: while true: - var ch = buf[pos] + var ch = c.buf[pos] if ch == '\'': - if buf[pos+1] == '\'': + if c.buf[pos+1] == '\'': inc(pos, 2) add(tok.literal, '\'') else: @@ -221,30 +217,27 @@ proc getString(c: var SqlLexer, tok: var Token, kind: TokKind) = if c.lineNumber > line: # a new line whitespace has been parsed, so we check if the string # continues after the whitespace: - buf = c.buf # may have been reallocated pos = c.bufpos - if buf[pos] == '\'': inc(pos) + if c.buf[pos] == '\'': inc(pos) else: break parseLoop else: break parseLoop c.bufpos = pos proc getDollarString(c: var SqlLexer, tok: var Token) = var pos = c.bufpos + 1 - var buf = c.buf tok.kind = tkDollarQuotedConstant var tag = "$" - while buf[pos] in IdentChars: - add(tag, buf[pos]) + while c.buf[pos] in IdentChars: + add(tag, c.buf[pos]) inc(pos) - if buf[pos] == '$': inc(pos) + if c.buf[pos] == '$': inc(pos) else: tok.kind = tkInvalid return while true: - case buf[pos] + case c.buf[pos] of '\c', '\L': pos = handleCRLF(c, pos) - buf = c.buf add(tok.literal, "\L") of '\0': tok.kind = tkInvalid @@ -252,37 +245,35 @@ proc getDollarString(c: var SqlLexer, tok: var Token) = of '$': inc(pos) var tag2 = "$" - while buf[pos] in IdentChars: - add(tag2, buf[pos]) + while c.buf[pos] in IdentChars: + add(tag2, c.buf[pos]) inc(pos) - if buf[pos] == '$': inc(pos) + if c.buf[pos] == '$': inc(pos) if tag2 == tag: break add(tok.literal, tag2) add(tok.literal, '$') else: - add(tok.literal, buf[pos]) + add(tok.literal, c.buf[pos]) inc(pos) c.bufpos = pos proc getSymbol(c: var SqlLexer, tok: var Token) = var pos = c.bufpos - var buf = c.buf while true: - add(tok.literal, buf[pos]) + add(tok.literal, c.buf[pos]) inc(pos) - if buf[pos] notin {'a'..'z','A'..'Z','0'..'9','_','$', '\128'..'\255'}: + if c.buf[pos] notin {'a'..'z','A'..'Z','0'..'9','_','$', '\128'..'\255'}: break c.bufpos = pos tok.kind = tkIdentifier proc getQuotedIdentifier(c: var SqlLexer, tok: var Token, quote='\"') = var pos = c.bufpos + 1 - var buf = c.buf tok.kind = tkQuotedIdentifier while true: - var ch = buf[pos] + var ch = c.buf[pos] if ch == quote: - if buf[pos+1] == quote: + if c.buf[pos+1] == quote: inc(pos, 2) add(tok.literal, quote) else: @@ -298,11 +289,10 @@ proc getQuotedIdentifier(c: var SqlLexer, tok: var Token, quote='\"') = proc getBitHexString(c: var SqlLexer, tok: var Token, validChars: set[char]) = var pos = c.bufpos + 1 - var buf = c.buf block parseLoop: while true: while true: - var ch = buf[pos] + var ch = c.buf[pos] if ch in validChars: add(tok.literal, ch) inc(pos) @@ -318,9 +308,8 @@ proc getBitHexString(c: var SqlLexer, tok: var Token, validChars: set[char]) = if c.lineNumber > line: # a new line whitespace has been parsed, so we check if the string # continues after the whitespace: - buf = c.buf # may have been reallocated pos = c.bufpos - if buf[pos] == '\'': inc(pos) + if c.buf[pos] == '\'': inc(pos) else: break parseLoop else: break parseLoop c.bufpos = pos @@ -328,29 +317,28 @@ proc getBitHexString(c: var SqlLexer, tok: var Token, validChars: set[char]) = proc getNumeric(c: var SqlLexer, tok: var Token) = tok.kind = tkInteger var pos = c.bufpos - var buf = c.buf - while buf[pos] in Digits: - add(tok.literal, buf[pos]) + while c.buf[pos] in Digits: + add(tok.literal, c.buf[pos]) inc(pos) - if buf[pos] == '.': + if c.buf[pos] == '.': tok.kind = tkNumeric - add(tok.literal, buf[pos]) + add(tok.literal, c.buf[pos]) inc(pos) - while buf[pos] in Digits: - add(tok.literal, buf[pos]) + while c.buf[pos] in Digits: + add(tok.literal, c.buf[pos]) inc(pos) - if buf[pos] in {'E', 'e'}: + if c.buf[pos] in {'E', 'e'}: tok.kind = tkNumeric - add(tok.literal, buf[pos]) + add(tok.literal, c.buf[pos]) inc(pos) - if buf[pos] == '+': + if c.buf[pos] == '+': inc(pos) - elif buf[pos] == '-': - add(tok.literal, buf[pos]) + elif c.buf[pos] == '-': + add(tok.literal, c.buf[pos]) inc(pos) - if buf[pos] in Digits: - while buf[pos] in Digits: - add(tok.literal, buf[pos]) + if c.buf[pos] in Digits: + while c.buf[pos] in Digits: + add(tok.literal, c.buf[pos]) inc(pos) else: tok.kind = tkInvalid @@ -361,24 +349,23 @@ proc getOperator(c: var SqlLexer, tok: var Token) = '^', '&', '|', '`', '?'} tok.kind = tkOperator var pos = c.bufpos - var buf = c.buf var trailingPlusMinus = false while true: - case buf[pos] + case c.buf[pos] of '-': - if buf[pos] == '-': break - if not trailingPlusMinus and buf[pos+1] notin operators and + if c.buf[pos] == '-': break + if not trailingPlusMinus and c.buf[pos+1] notin operators and tok.literal.len > 0: break of '/': - if buf[pos] == '*': break + if c.buf[pos] == '*': break of '~', '!', '@', '#', '%', '^', '&', '|', '`', '?': trailingPlusMinus = true of '+': - if not trailingPlusMinus and buf[pos+1] notin operators and + if not trailingPlusMinus and c.buf[pos+1] notin operators and tok.literal.len > 0: break of '*', '<', '>', '=': discard else: break - add(tok.literal, buf[pos]) + add(tok.literal, c.buf[pos]) inc(pos) c.bufpos = pos diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim index 953c5cdde2..3b77f9c628 100644 --- a/lib/pure/parsexml.nim +++ b/lib/pure/parsexml.nim @@ -345,11 +345,10 @@ proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} = proc parseCDATA(my: var XmlParser) = var pos = my.bufpos + len("': + if my.buf[pos+1] == ']' and my.buf[pos+2] == '>': inc(pos, 3) break add(my.a, ']') @@ -359,29 +358,25 @@ proc parseCDATA(my: var XmlParser) = break of '\c': pos = lexbase.handleCR(my, pos) - buf = my.buf add(my.a, '\L') of '\L': pos = lexbase.handleLF(my, pos) - buf = my.buf add(my.a, '\L') of '/': pos = lexbase.handleRefillChar(my, pos) - buf = my.buf add(my.a, '/') else: - add(my.a, buf[pos]) + add(my.a, my.buf[pos]) inc(pos) my.bufpos = pos # store back my.kind = xmlCData proc parseComment(my: var XmlParser) = var pos = my.bufpos + len("