mirror of
https://github.com/nim-lang/Nim.git
synced 2026-04-19 14:00:35 +00:00
fixes #2429
This commit is contained in:
@@ -34,37 +34,15 @@ type
|
||||
lineNumber*: int ## the current line number
|
||||
sentinel: int
|
||||
lineStart: int # index of last line start in buffer
|
||||
fileOpened: bool
|
||||
refillChars: set[char]
|
||||
|
||||
{.deprecated: [TBaseLexer: BaseLexer].}
|
||||
|
||||
proc open*(L: var BaseLexer, input: Stream, bufLen: int = 8192)
|
||||
## inits the BaseLexer with a stream to read from
|
||||
|
||||
proc close*(L: var BaseLexer)
|
||||
## closes the base lexer. This closes `L`'s associated stream too.
|
||||
|
||||
proc getCurrentLine*(L: BaseLexer, marker: bool = true): string
|
||||
## retrieves the current line.
|
||||
|
||||
proc getColNumber*(L: BaseLexer, pos: int): int
|
||||
## retrieves the current column.
|
||||
|
||||
proc handleCR*(L: var BaseLexer, pos: int): int
|
||||
## Call this if you scanned over '\c' in the buffer; it returns the
|
||||
## position to continue the scanning from. `pos` must be the position
|
||||
## of the '\c'.
|
||||
proc handleLF*(L: var BaseLexer, pos: int): int
|
||||
## Call this if you scanned over '\L' in the buffer; it returns the
|
||||
## position to continue the scanning from. `pos` must be the position
|
||||
## of the '\L'.
|
||||
|
||||
# implementation
|
||||
|
||||
const
|
||||
chrSize = sizeof(char)
|
||||
|
||||
proc close(L: var BaseLexer) =
|
||||
proc close*(L: var BaseLexer) =
|
||||
## closes the base lexer. This closes `L`'s associated stream too.
|
||||
dealloc(L.buf)
|
||||
close(L.input)
|
||||
|
||||
@@ -80,7 +58,7 @@ proc fillBuffer(L: var BaseLexer) =
|
||||
toCopy = L.bufLen - L.sentinel - 1
|
||||
assert(toCopy >= 0)
|
||||
if toCopy > 0:
|
||||
moveMem(L.buf, addr(L.buf[L.sentinel + 1]), toCopy * chrSize)
|
||||
moveMem(L.buf, addr(L.buf[L.sentinel + 1]), toCopy * chrSize)
|
||||
# "moveMem" handles overlapping regions
|
||||
charsRead = readData(L.input, addr(L.buf[toCopy]),
|
||||
(L.sentinel + 1) * chrSize) div chrSize
|
||||
@@ -93,7 +71,7 @@ proc fillBuffer(L: var BaseLexer) =
|
||||
dec(s) # BUGFIX (valgrind)
|
||||
while true:
|
||||
assert(s < L.bufLen)
|
||||
while (s >= 0) and not (L.buf[s] in NewLines): dec(s)
|
||||
while s >= 0 and L.buf[s] notin L.refillChars: dec(s)
|
||||
if s >= 0:
|
||||
# we found an appropriate character for a sentinel:
|
||||
L.sentinel = s
|
||||
@@ -121,31 +99,46 @@ proc fillBaseLexer(L: var BaseLexer, pos: int): int =
|
||||
fillBuffer(L)
|
||||
L.bufpos = 0 # XXX: is this really correct?
|
||||
result = 0
|
||||
L.lineStart = result
|
||||
|
||||
proc handleCR(L: var BaseLexer, pos: int): int =
|
||||
proc handleCR*(L: var BaseLexer, pos: int): int =
|
||||
## Call this if you scanned over '\c' in the buffer; it returns the
|
||||
## position to continue the scanning from. `pos` must be the position
|
||||
## of the '\c'.
|
||||
assert(L.buf[pos] == '\c')
|
||||
inc(L.lineNumber)
|
||||
result = fillBaseLexer(L, pos)
|
||||
if L.buf[result] == '\L':
|
||||
result = fillBaseLexer(L, result)
|
||||
L.lineStart = result
|
||||
|
||||
proc handleLF(L: var BaseLexer, pos: int): int =
|
||||
proc handleLF*(L: var BaseLexer, pos: int): int =
|
||||
## Call this if you scanned over '\L' in the buffer; it returns the
|
||||
## position to continue the scanning from. `pos` must be the position
|
||||
## of the '\L'.
|
||||
assert(L.buf[pos] == '\L')
|
||||
inc(L.lineNumber)
|
||||
result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;
|
||||
L.lineStart = result
|
||||
|
||||
proc handleRefillChar*(L: var BaseLexer, pos: int): int =
|
||||
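## Call this if you scanned over a character from the `refillChars` set
|
||||
## passed to `open`; it returns the position to continue the scanning
|
||||
## from. `pos` must be the position of that character. Unlike `handleCR`
|
||||
## and `handleLF`, this does not increment the line number.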
|
||||
assert(L.buf[pos] in L.refillChars)
|
||||
result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;
|
||||
|
||||
proc skipUtf8Bom(L: var BaseLexer) =
|
||||
if (L.buf[0] == '\xEF') and (L.buf[1] == '\xBB') and (L.buf[2] == '\xBF'):
|
||||
inc(L.bufpos, 3)
|
||||
inc(L.lineStart, 3)
|
||||
|
||||
proc open(L: var BaseLexer, input: Stream, bufLen: int = 8192) =
|
||||
proc open*(L: var BaseLexer, input: Stream, bufLen: int = 8192;
|
||||
refillChars: set[char] = NewLines) =
|
||||
## inits the BaseLexer with a stream to read from.
|
||||
assert(bufLen > 0)
|
||||
assert(input != nil)
|
||||
L.input = input
|
||||
L.bufpos = 0
|
||||
L.bufLen = bufLen
|
||||
L.refillChars = refillChars
|
||||
L.buf = cast[cstring](alloc(bufLen * chrSize))
|
||||
L.sentinel = bufLen - 1
|
||||
L.lineStart = 0
|
||||
@@ -153,10 +146,12 @@ proc open(L: var BaseLexer, input: Stream, bufLen: int = 8192) =
|
||||
fillBuffer(L)
|
||||
skipUtf8Bom(L)
|
||||
|
||||
proc getColNumber(L: BaseLexer, pos: int): int =
|
||||
proc getColNumber*(L: BaseLexer, pos: int): int =
|
||||
## retrieves the current column.
|
||||
result = abs(pos - L.lineStart)
|
||||
|
||||
proc getCurrentLine(L: BaseLexer, marker: bool = true): string =
|
||||
proc getCurrentLine*(L: BaseLexer, marker: bool = true): string =
|
||||
## retrieves the current line.
|
||||
var i: int
|
||||
result = ""
|
||||
i = L.lineStart
|
||||
@@ -166,4 +161,3 @@ proc getCurrentLine(L: BaseLexer, marker: bool = true): string =
|
||||
add(result, "\n")
|
||||
if marker:
|
||||
add(result, spaces(getColNumber(L, L.bufpos)) & "^\n")
|
||||
|
||||
|
||||
@@ -8,19 +8,19 @@
|
||||
#
|
||||
|
||||
## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
|
||||
## parser.
|
||||
## parser.
|
||||
## The only encoding that is supported is UTF-8. The parser has been designed
|
||||
## to be somewhat error correcting, so that even most "wild HTML" found on the
|
||||
## to be somewhat error correcting, so that even most "wild HTML" found on the
|
||||
## web can be parsed with it. **Note:** This parser does not check that each
|
||||
## ``<tag>`` has a corresponding ``</tag>``! These checks have to be
|
||||
## implemented by the client code for various reasons:
|
||||
## ``<tag>`` has a corresponding ``</tag>``! These checks have to be
|
||||
## implemented by the client code for various reasons:
|
||||
##
|
||||
## * Old HTML contains tags that have no end tag: ``<br>`` for example.
|
||||
## * HTML tags are case insensitive, XML tags are case sensitive. Since this
|
||||
## library can parse both, only the client knows which comparison is to be
|
||||
## used.
|
||||
## * Thus the checks would have been very difficult to implement properly with
|
||||
## little benefit, especially since they are simple to implement in the
|
||||
## little benefit, especially since they are simple to implement in the
|
||||
## client. The client should use the `errorMsgExpected` proc to generate
|
||||
## a nice error message that fits the other error messages this library
|
||||
## creates.
|
||||
@@ -29,7 +29,7 @@
|
||||
## Example 1: Retrieve HTML title
|
||||
## ==============================
|
||||
##
|
||||
## The file ``examples/htmltitle.nim`` demonstrates how to use the
|
||||
## The file ``examples/htmltitle.nim`` demonstrates how to use the
|
||||
## XML parser to accomplish a simple task: To determine the title of an HTML
|
||||
## document.
|
||||
##
|
||||
@@ -40,22 +40,22 @@
|
||||
## Example 2: Retrieve all HTML links
|
||||
## ==================================
|
||||
##
|
||||
## The file ``examples/htmlrefs.nim`` demonstrates how to use the
|
||||
## XML parser to accomplish another simple task: To determine all the links
|
||||
## The file ``examples/htmlrefs.nim`` demonstrates how to use the
|
||||
## XML parser to accomplish another simple task: To determine all the links
|
||||
## an HTML document contains.
|
||||
##
|
||||
## .. code-block:: nim
|
||||
## :file: examples/htmlrefs.nim
|
||||
##
|
||||
|
||||
import
|
||||
import
|
||||
hashes, strutils, lexbase, streams, unicode
|
||||
|
||||
# the parser treats ``<br />`` as ``<br></br>``
|
||||
|
||||
# xmlElementCloseEnd, ## ``/>``
|
||||
# xmlElementCloseEnd, ## ``/>``
|
||||
|
||||
type
|
||||
type
|
||||
XmlEventKind* = enum ## enumeration of all events that may occur when parsing
|
||||
xmlError, ## an error occurred during parsing
|
||||
xmlEof, ## end of file reached
|
||||
@@ -65,13 +65,13 @@ type
|
||||
xmlPI, ## processing instruction (``<?name something ?>``)
|
||||
xmlElementStart, ## ``<elem>``
|
||||
xmlElementEnd, ## ``</elem>``
|
||||
xmlElementOpen, ## ``<elem
|
||||
xmlElementOpen, ## ``<elem
|
||||
xmlAttribute, ## ``key = "value"`` pair
|
||||
xmlElementClose, ## ``>``
|
||||
xmlElementClose, ## ``>``
|
||||
xmlCData, ## ``<![CDATA[`` ... data ... ``]]>``
|
||||
xmlEntity, ## &entity;
|
||||
xmlSpecial ## ``<! ... data ... >``
|
||||
|
||||
|
||||
XmlErrorKind* = enum ## enumeration that lists all errors that can occur
|
||||
errNone, ## no error
|
||||
errEndOfCDataExpected, ## ``]]>`` expected
|
||||
@@ -82,8 +82,8 @@ type
|
||||
errEqExpected, ## ``=`` expected
|
||||
errQuoteExpected, ## ``"`` or ``'`` expected
|
||||
errEndOfCommentExpected ## ``-->`` expected
|
||||
|
||||
ParserState = enum
|
||||
|
||||
ParserState = enum
|
||||
stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError
|
||||
|
||||
XmlParseOption* = enum ## options for the XML parser
|
||||
@@ -121,8 +121,8 @@ proc open*(my: var XmlParser, input: Stream, filename: string,
|
||||
## the `options` parameter: If `options` contains ``reportWhitespace``
|
||||
## a whitespace token is reported as an ``xmlWhitespace`` event.
|
||||
## If `options` contains ``reportComments`` a comment token is reported as an
|
||||
## ``xmlComment`` event.
|
||||
lexbase.open(my, input)
|
||||
## ``xmlComment`` event.
|
||||
lexbase.open(my, input, 8192, {'\c', '\L', '/'})
|
||||
my.filename = filename
|
||||
my.state = stateStart
|
||||
my.kind = xmlError
|
||||
@@ -130,24 +130,24 @@ proc open*(my: var XmlParser, input: Stream, filename: string,
|
||||
my.b = ""
|
||||
my.c = nil
|
||||
my.options = options
|
||||
|
||||
proc close*(my: var XmlParser) {.inline.} =
|
||||
|
||||
proc close*(my: var XmlParser) {.inline.} =
|
||||
## closes the parser `my` and its associated input stream.
|
||||
lexbase.close(my)
|
||||
|
||||
proc kind*(my: XmlParser): XmlEventKind {.inline.} =
|
||||
proc kind*(my: XmlParser): XmlEventKind {.inline.} =
|
||||
## returns the current event type for the XML parser
|
||||
return my.kind
|
||||
|
||||
template charData*(my: XmlParser): string =
|
||||
## returns the character data for the events: ``xmlCharData``,
|
||||
## returns the character data for the events: ``xmlCharData``,
|
||||
## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
|
||||
assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
|
||||
assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
|
||||
xmlSpecial})
|
||||
my.a
|
||||
|
||||
template elementName*(my: XmlParser): string =
|
||||
## returns the element name for the events: ``xmlElementStart``,
|
||||
## returns the element name for the events: ``xmlElementStart``,
|
||||
## ``xmlElementEnd``, ``xmlElementOpen``
|
||||
assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
|
||||
my.a
|
||||
@@ -156,12 +156,12 @@ template entityName*(my: XmlParser): string =
|
||||
## returns the entity name for the event: ``xmlEntity``
|
||||
assert(my.kind == xmlEntity)
|
||||
my.a
|
||||
|
||||
|
||||
template attrKey*(my: XmlParser): string =
|
||||
## returns the attribute key for the event ``xmlAttribute``
|
||||
assert(my.kind == xmlAttribute)
|
||||
my.a
|
||||
|
||||
|
||||
template attrValue*(my: XmlParser): string =
|
||||
## returns the attribute value for the event ``xmlAttribute``
|
||||
assert(my.kind == xmlAttribute)
|
||||
@@ -187,110 +187,118 @@ proc rawData2*(my: XmlParser): string {.inline.} =
|
||||
## This is only used for speed hacks.
|
||||
shallowCopy(result, my.b)
|
||||
|
||||
proc getColumn*(my: XmlParser): int {.inline.} =
|
||||
proc getColumn*(my: XmlParser): int {.inline.} =
|
||||
## get the current column the parser has arrived at.
|
||||
result = getColNumber(my, my.bufpos)
|
||||
|
||||
proc getLine*(my: XmlParser): int {.inline.} =
|
||||
proc getLine*(my: XmlParser): int {.inline.} =
|
||||
## get the current line the parser has arrived at.
|
||||
result = my.lineNumber
|
||||
|
||||
proc getFilename*(my: XmlParser): string {.inline.} =
|
||||
proc getFilename*(my: XmlParser): string {.inline.} =
|
||||
## get the filename of the file that the parser processes.
|
||||
result = my.filename
|
||||
|
||||
proc errorMsg*(my: XmlParser): string =
|
||||
|
||||
proc errorMsg*(my: XmlParser): string =
|
||||
## returns a helpful error message for the event ``xmlError``
|
||||
assert(my.kind == xmlError)
|
||||
result = "$1($2, $3) Error: $4" % [
|
||||
my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]]
|
||||
|
||||
proc errorMsgExpected*(my: XmlParser, tag: string): string =
|
||||
proc errorMsgExpected*(my: XmlParser, tag: string): string =
|
||||
## returns an error message "<tag> expected" in the same format as the
|
||||
## other error messages
|
||||
## other error messages
|
||||
result = "$1($2, $3) Error: $4" % [
|
||||
my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag]
|
||||
|
||||
proc errorMsg*(my: XmlParser, msg: string): string =
|
||||
proc errorMsg*(my: XmlParser, msg: string): string =
|
||||
## returns an error message with text `msg` in the same format as the
|
||||
## other error messages
|
||||
## other error messages
|
||||
result = "$1($2, $3) Error: $4" % [
|
||||
my.filename, $getLine(my), $getColumn(my), msg]
|
||||
|
||||
proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} =
|
||||
|
||||
proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} =
|
||||
my.err = kind
|
||||
my.state = stateError
|
||||
|
||||
proc parseCDATA(my: var XmlParser) =
|
||||
proc parseCDATA(my: var XmlParser) =
|
||||
var pos = my.bufpos + len("<![CDATA[")
|
||||
var buf = my.buf
|
||||
while true:
|
||||
case buf[pos]
|
||||
case buf[pos]
|
||||
of ']':
|
||||
if buf[pos+1] == ']' and buf[pos+2] == '>':
|
||||
inc(pos, 3)
|
||||
break
|
||||
add(my.a, ']')
|
||||
inc(pos)
|
||||
of '\0':
|
||||
of '\0':
|
||||
markError(my, errEndOfCDataExpected)
|
||||
break
|
||||
of '\c':
|
||||
of '\c':
|
||||
pos = lexbase.handleCR(my, pos)
|
||||
buf = my.buf
|
||||
add(my.a, '\L')
|
||||
of '\L':
|
||||
of '\L':
|
||||
pos = lexbase.handleLF(my, pos)
|
||||
buf = my.buf
|
||||
add(my.a, '\L')
|
||||
of '/':
|
||||
pos = lexbase.handleRefillChar(my, pos)
|
||||
buf = my.buf
|
||||
add(my.a, '/')
|
||||
else:
|
||||
add(my.a, buf[pos])
|
||||
inc(pos)
|
||||
inc(pos)
|
||||
my.bufpos = pos # store back
|
||||
my.kind = xmlCData
|
||||
|
||||
proc parseComment(my: var XmlParser) =
|
||||
proc parseComment(my: var XmlParser) =
|
||||
var pos = my.bufpos + len("<!--")
|
||||
var buf = my.buf
|
||||
while true:
|
||||
case buf[pos]
|
||||
case buf[pos]
|
||||
of '-':
|
||||
if buf[pos+1] == '-' and buf[pos+2] == '>':
|
||||
inc(pos, 3)
|
||||
break
|
||||
if my.options.contains(reportComments): add(my.a, '-')
|
||||
inc(pos)
|
||||
of '\0':
|
||||
of '\0':
|
||||
markError(my, errEndOfCommentExpected)
|
||||
break
|
||||
of '\c':
|
||||
of '\c':
|
||||
pos = lexbase.handleCR(my, pos)
|
||||
buf = my.buf
|
||||
if my.options.contains(reportComments): add(my.a, '\L')
|
||||
of '\L':
|
||||
of '\L':
|
||||
pos = lexbase.handleLF(my, pos)
|
||||
buf = my.buf
|
||||
if my.options.contains(reportComments): add(my.a, '\L')
|
||||
of '/':
|
||||
pos = lexbase.handleRefillChar(my, pos)
|
||||
buf = my.buf
|
||||
if my.options.contains(reportComments): add(my.a, '/')
|
||||
else:
|
||||
if my.options.contains(reportComments): add(my.a, buf[pos])
|
||||
inc(pos)
|
||||
my.bufpos = pos
|
||||
my.kind = xmlComment
|
||||
|
||||
proc parseWhitespace(my: var XmlParser, skip=false) =
|
||||
proc parseWhitespace(my: var XmlParser, skip=false) =
|
||||
var pos = my.bufpos
|
||||
var buf = my.buf
|
||||
while true:
|
||||
while true:
|
||||
case buf[pos]
|
||||
of ' ', '\t':
|
||||
of ' ', '\t':
|
||||
if not skip: add(my.a, buf[pos])
|
||||
inc(pos)
|
||||
of '\c':
|
||||
of '\c':
|
||||
# the specification says that CR-LF, CR are to be transformed to LF
|
||||
pos = lexbase.handleCR(my, pos)
|
||||
buf = my.buf
|
||||
if not skip: add(my.a, '\L')
|
||||
of '\L':
|
||||
of '\L':
|
||||
pos = lexbase.handleLF(my, pos)
|
||||
buf = my.buf
|
||||
if not skip: add(my.a, '\L')
|
||||
@@ -302,10 +310,10 @@ const
|
||||
NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'}
|
||||
NameChar = {'A'..'Z', 'a'..'z', '0'..'9', '.', '-', '_', ':', '\128'..'\255'}
|
||||
|
||||
proc parseName(my: var XmlParser, dest: var string) =
|
||||
proc parseName(my: var XmlParser, dest: var string) =
|
||||
var pos = my.bufpos
|
||||
var buf = my.buf
|
||||
if buf[pos] in NameStartChar:
|
||||
if buf[pos] in NameStartChar:
|
||||
while true:
|
||||
add(dest, buf[pos])
|
||||
inc(pos)
|
||||
@@ -314,14 +322,14 @@ proc parseName(my: var XmlParser, dest: var string) =
|
||||
else:
|
||||
markError(my, errNameExpected)
|
||||
|
||||
proc parseEntity(my: var XmlParser, dest: var string) =
|
||||
proc parseEntity(my: var XmlParser, dest: var string) =
|
||||
var pos = my.bufpos+1
|
||||
var buf = my.buf
|
||||
my.kind = xmlCharData
|
||||
if buf[pos] == '#':
|
||||
var r: int
|
||||
inc(pos)
|
||||
if buf[pos] == 'x':
|
||||
if buf[pos] == 'x':
|
||||
inc(pos)
|
||||
while true:
|
||||
case buf[pos]
|
||||
@@ -331,7 +339,7 @@ proc parseEntity(my: var XmlParser, dest: var string) =
|
||||
else: break
|
||||
inc(pos)
|
||||
else:
|
||||
while buf[pos] in {'0'..'9'}:
|
||||
while buf[pos] in {'0'..'9'}:
|
||||
r = r * 10 + (ord(buf[pos]) - ord('0'))
|
||||
inc(pos)
|
||||
add(dest, toUTF8(Rune(r)))
|
||||
@@ -345,11 +353,11 @@ proc parseEntity(my: var XmlParser, dest: var string) =
|
||||
buf[pos+3] == ';':
|
||||
add(dest, '&')
|
||||
inc(pos, 3)
|
||||
elif buf[pos] == 'a' and buf[pos+1] == 'p' and buf[pos+2] == 'o' and
|
||||
elif buf[pos] == 'a' and buf[pos+1] == 'p' and buf[pos+2] == 'o' and
|
||||
buf[pos+3] == 's' and buf[pos+4] == ';':
|
||||
add(dest, '\'')
|
||||
inc(pos, 4)
|
||||
elif buf[pos] == 'q' and buf[pos+1] == 'u' and buf[pos+2] == 'o' and
|
||||
elif buf[pos] == 'q' and buf[pos+1] == 'u' and buf[pos+2] == 'o' and
|
||||
buf[pos+3] == 't' and buf[pos+4] == ';':
|
||||
add(dest, '"')
|
||||
inc(pos, 4)
|
||||
@@ -357,23 +365,23 @@ proc parseEntity(my: var XmlParser, dest: var string) =
|
||||
my.bufpos = pos
|
||||
parseName(my, dest)
|
||||
pos = my.bufpos
|
||||
if my.err != errNameExpected:
|
||||
if my.err != errNameExpected:
|
||||
my.kind = xmlEntity
|
||||
else:
|
||||
add(dest, '&')
|
||||
if buf[pos] == ';':
|
||||
if buf[pos] == ';':
|
||||
inc(pos)
|
||||
else:
|
||||
markError(my, errSemicolonExpected)
|
||||
my.bufpos = pos
|
||||
|
||||
proc parsePI(my: var XmlParser) =
|
||||
proc parsePI(my: var XmlParser) =
|
||||
inc(my.bufpos, "<?".len)
|
||||
parseName(my, my.a)
|
||||
var pos = my.bufpos
|
||||
var buf = my.buf
|
||||
setLen(my.b, 0)
|
||||
while true:
|
||||
while true:
|
||||
case buf[pos]
|
||||
of '\0':
|
||||
markError(my, errQmGtExpected)
|
||||
@@ -387,29 +395,33 @@ proc parsePI(my: var XmlParser) =
|
||||
of '\c':
|
||||
# the specification says that CR-LF, CR are to be transformed to LF
|
||||
pos = lexbase.handleCR(my, pos)
|
||||
buf = my.buf
|
||||
buf = my.buf
|
||||
add(my.b, '\L')
|
||||
of '\L':
|
||||
of '\L':
|
||||
pos = lexbase.handleLF(my, pos)
|
||||
buf = my.buf
|
||||
add(my.b, '\L')
|
||||
of '/':
|
||||
pos = lexbase.handleRefillChar(my, pos)
|
||||
buf = my.buf
|
||||
add(my.b, '/')
|
||||
else:
|
||||
add(my.b, buf[pos])
|
||||
inc(pos)
|
||||
my.bufpos = pos
|
||||
my.kind = xmlPI
|
||||
|
||||
proc parseSpecial(my: var XmlParser) =
|
||||
proc parseSpecial(my: var XmlParser) =
|
||||
# things that start with <!
|
||||
var pos = my.bufpos + 2
|
||||
var buf = my.buf
|
||||
var opentags = 0
|
||||
while true:
|
||||
while true:
|
||||
case buf[pos]
|
||||
of '\0':
|
||||
markError(my, errGtExpected)
|
||||
break
|
||||
of '<':
|
||||
of '<':
|
||||
inc(opentags)
|
||||
inc(pos)
|
||||
add(my.a, '<')
|
||||
@@ -420,47 +432,55 @@ proc parseSpecial(my: var XmlParser) =
|
||||
dec(opentags)
|
||||
inc(pos)
|
||||
add(my.a, '>')
|
||||
of '\c':
|
||||
of '\c':
|
||||
pos = lexbase.handleCR(my, pos)
|
||||
buf = my.buf
|
||||
add(my.a, '\L')
|
||||
of '\L':
|
||||
of '\L':
|
||||
pos = lexbase.handleLF(my, pos)
|
||||
buf = my.buf
|
||||
add(my.a, '\L')
|
||||
of '/':
|
||||
pos = lexbase.handleRefillChar(my, pos)
|
||||
buf = my.buf
|
||||
add(my.b, '/')
|
||||
else:
|
||||
add(my.a, buf[pos])
|
||||
inc(pos)
|
||||
my.bufpos = pos
|
||||
my.kind = xmlSpecial
|
||||
|
||||
proc parseTag(my: var XmlParser) =
|
||||
proc parseTag(my: var XmlParser) =
|
||||
inc(my.bufpos)
|
||||
parseName(my, my.a)
|
||||
# if we have no name, do not interpret the '<':
|
||||
if my.a.len == 0:
|
||||
if my.a.len == 0:
|
||||
my.kind = xmlCharData
|
||||
add(my.a, '<')
|
||||
return
|
||||
parseWhitespace(my, skip=true)
|
||||
if my.buf[my.bufpos] in NameStartChar:
|
||||
if my.buf[my.bufpos] in NameStartChar:
|
||||
# an attribute follows:
|
||||
my.kind = xmlElementOpen
|
||||
my.state = stateAttr
|
||||
my.c = my.a # save for later
|
||||
else:
|
||||
my.kind = xmlElementStart
|
||||
if my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>':
|
||||
inc(my.bufpos, 2)
|
||||
let slash = my.buf[my.bufpos] == '/'
|
||||
if slash:
|
||||
my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
|
||||
if slash and my.buf[my.bufpos] == '>':
|
||||
inc(my.bufpos)
|
||||
my.state = stateEmptyElementTag
|
||||
my.c = nil
|
||||
elif my.buf[my.bufpos] == '>':
|
||||
inc(my.bufpos)
|
||||
inc(my.bufpos)
|
||||
else:
|
||||
markError(my, errGtExpected)
|
||||
|
||||
proc parseEndTag(my: var XmlParser) =
|
||||
inc(my.bufpos, 2)
|
||||
|
||||
proc parseEndTag(my: var XmlParser) =
|
||||
my.bufpos = lexbase.handleRefillChar(my, my.bufpos+1)
|
||||
#inc(my.bufpos, 2)
|
||||
parseName(my, my.a)
|
||||
parseWhitespace(my, skip=true)
|
||||
if my.buf[my.bufpos] == '>':
|
||||
@@ -469,13 +489,13 @@ proc parseEndTag(my: var XmlParser) =
|
||||
markError(my, errGtExpected)
|
||||
my.kind = xmlElementEnd
|
||||
|
||||
proc parseAttribute(my: var XmlParser) =
|
||||
proc parseAttribute(my: var XmlParser) =
|
||||
my.kind = xmlAttribute
|
||||
setLen(my.a, 0)
|
||||
setLen(my.b, 0)
|
||||
parseName(my, my.a)
|
||||
# if we have no name, we have '<tag attr= key %&$$%':
|
||||
if my.a.len == 0:
|
||||
if my.a.len == 0:
|
||||
markError(my, errGtExpected)
|
||||
return
|
||||
parseWhitespace(my, skip=true)
|
||||
@@ -491,27 +511,27 @@ proc parseAttribute(my: var XmlParser) =
|
||||
var quote = buf[pos]
|
||||
var pendingSpace = false
|
||||
inc(pos)
|
||||
while true:
|
||||
while true:
|
||||
case buf[pos]
|
||||
of '\0':
|
||||
markError(my, errQuoteExpected)
|
||||
break
|
||||
of '&':
|
||||
if pendingSpace:
|
||||
of '&':
|
||||
if pendingSpace:
|
||||
add(my.b, ' ')
|
||||
pendingSpace = false
|
||||
my.bufpos = pos
|
||||
parseEntity(my, my.b)
|
||||
my.kind = xmlAttribute # parseEntity overwrites my.kind!
|
||||
pos = my.bufpos
|
||||
of ' ', '\t':
|
||||
of ' ', '\t':
|
||||
pendingSpace = true
|
||||
inc(pos)
|
||||
of '\c':
|
||||
of '\c':
|
||||
pos = lexbase.handleCR(my, pos)
|
||||
buf = my.buf
|
||||
pendingSpace = true
|
||||
of '\L':
|
||||
of '\L':
|
||||
pos = lexbase.handleLF(my, pos)
|
||||
buf = my.buf
|
||||
pendingSpace = true
|
||||
@@ -520,44 +540,48 @@ proc parseAttribute(my: var XmlParser) =
|
||||
inc(pos)
|
||||
break
|
||||
else:
|
||||
if pendingSpace:
|
||||
if pendingSpace:
|
||||
add(my.b, ' ')
|
||||
pendingSpace = false
|
||||
add(my.b, buf[pos])
|
||||
inc(pos)
|
||||
else:
|
||||
markError(my, errQuoteExpected)
|
||||
markError(my, errQuoteExpected)
|
||||
my.bufpos = pos
|
||||
parseWhitespace(my, skip=true)
|
||||
|
||||
proc parseCharData(my: var XmlParser) =
|
||||
|
||||
proc parseCharData(my: var XmlParser) =
|
||||
var pos = my.bufpos
|
||||
var buf = my.buf
|
||||
while true:
|
||||
while true:
|
||||
case buf[pos]
|
||||
of '\0', '<', '&': break
|
||||
of '\c':
|
||||
of '\c':
|
||||
# the specification says that CR-LF, CR are to be transformed to LF
|
||||
pos = lexbase.handleCR(my, pos)
|
||||
buf = my.buf
|
||||
add(my.a, '\L')
|
||||
of '\L':
|
||||
of '\L':
|
||||
pos = lexbase.handleLF(my, pos)
|
||||
buf = my.buf
|
||||
add(my.a, '\L')
|
||||
of '/':
|
||||
pos = lexbase.handleRefillChar(my, pos)
|
||||
buf = my.buf
|
||||
add(my.a, '/')
|
||||
else:
|
||||
add(my.a, buf[pos])
|
||||
inc(pos)
|
||||
my.bufpos = pos
|
||||
my.kind = xmlCharData
|
||||
|
||||
proc rawGetTok(my: var XmlParser) =
|
||||
proc rawGetTok(my: var XmlParser) =
|
||||
my.kind = xmlError
|
||||
setLen(my.a, 0)
|
||||
var pos = my.bufpos
|
||||
var buf = my.buf
|
||||
case buf[pos]
|
||||
of '<':
|
||||
of '<':
|
||||
case buf[pos+1]
|
||||
of '/':
|
||||
parseEndTag(my)
|
||||
@@ -566,44 +590,44 @@ proc rawGetTok(my: var XmlParser) =
|
||||
buf[pos+5] == 'A' and buf[pos+6] == 'T' and buf[pos+7] == 'A' and
|
||||
buf[pos+8] == '[':
|
||||
parseCDATA(my)
|
||||
elif buf[pos+2] == '-' and buf[pos+3] == '-':
|
||||
elif buf[pos+2] == '-' and buf[pos+3] == '-':
|
||||
parseComment(my)
|
||||
else:
|
||||
else:
|
||||
parseSpecial(my)
|
||||
of '?':
|
||||
parsePI(my)
|
||||
else:
|
||||
else:
|
||||
parseTag(my)
|
||||
of ' ', '\t', '\c', '\l':
|
||||
of ' ', '\t', '\c', '\l':
|
||||
parseWhitespace(my)
|
||||
my.kind = xmlWhitespace
|
||||
of '\0':
|
||||
of '\0':
|
||||
my.kind = xmlEof
|
||||
of '&':
|
||||
parseEntity(my, my.a)
|
||||
else:
|
||||
else:
|
||||
parseCharData(my)
|
||||
assert my.kind != xmlError
|
||||
|
||||
proc getTok(my: var XmlParser) =
|
||||
|
||||
proc getTok(my: var XmlParser) =
|
||||
while true:
|
||||
rawGetTok(my)
|
||||
case my.kind
|
||||
of xmlComment:
|
||||
of xmlComment:
|
||||
if my.options.contains(reportComments): break
|
||||
of xmlWhitespace:
|
||||
of xmlWhitespace:
|
||||
if my.options.contains(reportWhitespace): break
|
||||
else: break
|
||||
|
||||
proc next*(my: var XmlParser) =
|
||||
|
||||
proc next*(my: var XmlParser) =
|
||||
## retrieves the first/next event. This controls the parser.
|
||||
case my.state
|
||||
of stateNormal:
|
||||
getTok(my)
|
||||
getTok(my)
|
||||
of stateStart:
|
||||
my.state = stateNormal
|
||||
getTok(my)
|
||||
if my.kind == xmlPI and my.a == "xml":
|
||||
if my.kind == xmlPI and my.a == "xml":
|
||||
# just skip the first ``<?xml >`` processing instruction
|
||||
getTok(my)
|
||||
of stateAttr:
|
||||
@@ -612,10 +636,14 @@ proc next*(my: var XmlParser) =
|
||||
my.kind = xmlElementClose
|
||||
inc(my.bufpos)
|
||||
my.state = stateNormal
|
||||
elif my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>':
|
||||
my.kind = xmlElementClose
|
||||
inc(my.bufpos, 2)
|
||||
my.state = stateEmptyElementTag
|
||||
elif my.buf[my.bufpos] == '/':
|
||||
my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
|
||||
if my.buf[my.bufpos] == '>':
|
||||
my.kind = xmlElementClose
|
||||
inc(my.bufpos)
|
||||
my.state = stateEmptyElementTag
|
||||
else:
|
||||
markError(my, errGtExpected)
|
||||
else:
|
||||
parseAttribute(my)
|
||||
# state remains the same
|
||||
@@ -624,10 +652,10 @@ proc next*(my: var XmlParser) =
|
||||
my.kind = xmlElementEnd
|
||||
if not my.c.isNil:
|
||||
my.a = my.c
|
||||
of stateError:
|
||||
of stateError:
|
||||
my.kind = xmlError
|
||||
my.state = stateNormal
|
||||
|
||||
|
||||
when not defined(testing) and isMainModule:
|
||||
import os
|
||||
var s = newFileStream(paramStr(1), fmRead)
|
||||
@@ -645,13 +673,13 @@ when not defined(testing) and isMainModule:
|
||||
of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest])
|
||||
of xmlElementStart: echo("<$1>" % x.elementName)
|
||||
of xmlElementEnd: echo("</$1>" % x.elementName)
|
||||
|
||||
of xmlElementOpen: echo("<$1" % x.elementName)
|
||||
of xmlAttribute:
|
||||
|
||||
of xmlElementOpen: echo("<$1" % x.elementName)
|
||||
of xmlAttribute:
|
||||
echo("Key: " & x.attrKey)
|
||||
echo("Value: " & x.attrValue)
|
||||
|
||||
of xmlElementClose: echo(">")
|
||||
|
||||
of xmlElementClose: echo(">")
|
||||
of xmlCData:
|
||||
echo("<![CDATA[$1]]>" % x.charData)
|
||||
of xmlEntity:
|
||||
|
||||
Reference in New Issue
Block a user