fixed a long standing xml parser bug; html parser bugfixes + speed improvements

This commit is contained in:
Araq
2013-06-27 01:04:56 +02:00
parent ff1d68c50b
commit a674c039fb
7 changed files with 252 additions and 84 deletions

View File

@@ -283,6 +283,9 @@ const
sfNoForward* = sfRegister
# forward declarations are not required (per module)
sfNoRoot* = sfBorrow # a local variable is provably no root so it doesn't
# require RC ops
const
# getting ready for the future expr/stmt merge
nkWhen* = nkWhenStmt

View File

@@ -57,14 +57,6 @@ discard """
c()
--> we need a stack of scopes for this analysis
Effect tracking:
We track the effects per proc; forward declarations and indirect calls cause
problems: Forward declarations are computed lazily (we do this pass after
a whole module) and indirect calls are assumed the worst, unless they have
an effect annotation.
"""
type
@@ -75,6 +67,7 @@ type
owner: PSym
init: seq[int] # list of initialized variables
guards: TModel # nested guards
locked: seq[PNode] # locked locations
PEffects = var TEffects
proc isLocalVar(a: PEffects, s: PSym): bool =

View File

@@ -1,7 +1,7 @@
#
#
# Nimrod's Runtime Library
# (c) Copyright 2010 Andreas Rumpf
# (c) Copyright 2013 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
@@ -35,8 +35,12 @@ type
tagAddress, ## the HTML ``address`` element
tagApplet, ## the deprecated HTML ``applet`` element
tagArea, ## the HTML ``area`` element
tagArticle, ## the HTML ``article`` element
tagAside, ## the HTML ``aside`` element
tagAudio, ## the HTML ``audio`` element
tagB, ## the HTML ``b`` element
tagBase, ## the HTML ``base`` element
tagBdi, ## the HTML ``bdi`` element
tagBdo, ## the deprecated HTML ``bdo`` element
tagBasefont, ## the deprecated HTML ``basefont`` element
tagBig, ## the HTML ``big`` element
@@ -44,22 +48,31 @@ type
tagBody, ## the HTML ``body`` element
tagBr, ## the HTML ``br`` element
tagButton, ## the HTML ``button`` element
tagCanvas, ## the HTML ``canvas`` element
tagCaption, ## the HTML ``caption`` element
tagCenter, ## the deprecated HTML ``center`` element
tagCite, ## the HTML ``cite`` element
tagCode, ## the HTML ``code`` element
tagCol, ## the HTML ``col`` element
tagColgroup, ## the HTML ``colgroup`` element
tagCommand, ## the HTML ``command`` element
tagDatalist, ## the HTML ``datalist`` element
tagDd, ## the HTML ``dd`` element
tagDel, ## the HTML ``del`` element
tagDetails, ## the HTML ``details`` element
tagDfn, ## the HTML ``dfn`` element
tagDialog, ## the HTML ``dialog`` element
tagDiv, ## the HTML ``div`` element
tagDir, ## the deprecated HTML ``dir`` element
tagDl, ## the HTML ``dl`` element
tagDt, ## the HTML ``dt`` element
tagEm, ## the HTML ``em`` element
tagEmbed, ## the HTML ``embed`` element
tagFieldset, ## the HTML ``fieldset`` element
tagFigcaption, ## the HTML ``figcaption`` element
tagFigure, ## the HTML ``figure`` element
tagFont, ## the deprecated HTML ``font`` element
tagFooter, ## the HTML ``footer`` element
tagForm, ## the HTML ``form`` element
tagFrame, ## the HTML ``frame`` element
tagFrameset, ## the deprecated HTML ``frameset`` element
@@ -70,6 +83,8 @@ type
tagH5, ## the HTML ``h5`` element
tagH6, ## the HTML ``h6`` element
tagHead, ## the HTML ``head`` element
tagHeader, ## the HTML ``header`` element
tagHgroup, ## the HTML ``hgroup`` element
tagHtml, ## the HTML ``html`` element
tagHr, ## the HTML ``hr`` element
tagI, ## the HTML ``i`` element
@@ -79,13 +94,17 @@ type
tagIns, ## the HTML ``ins`` element
tagIsindex, ## the deprecated HTML ``isindex`` element
tagKbd, ## the HTML ``kbd`` element
tagKeygen, ## the HTML ``keygen`` element
tagLabel, ## the HTML ``label`` element
tagLegend, ## the HTML ``legend`` element
tagLi, ## the HTML ``li`` element
tagLink, ## the HTML ``link`` element
tagMap, ## the HTML ``map`` element
tagMark, ## the HTML ``mark`` element
tagMenu, ## the deprecated HTML ``menu`` element
tagMeta, ## the HTML ``meta`` element
tagMeter, ## the HTML ``meter`` element
tagNav, ## the HTML ``nav`` element
tagNobr, ## the deprecated HTML ``nobr`` element
tagNoframes, ## the deprecated HTML ``noframes`` element
tagNoscript, ## the HTML ``noscript`` element
@@ -93,20 +112,28 @@ type
tagOl, ## the HTML ``ol`` element
tagOptgroup, ## the HTML ``optgroup`` element
tagOption, ## the HTML ``option`` element
tagOutput, ## the HTML ``output`` element
tagP, ## the HTML ``p`` element
tagParam, ## the HTML ``param`` element
tagPre, ## the HTML ``pre`` element
tagProgress, ## the HTML ``progress`` element
tagQ, ## the HTML ``q`` element
tagRp, ## the HTML ``rp`` element
tagRt, ## the HTML ``rt`` element
tagRuby, ## the HTML ``ruby`` element
tagS, ## the deprecated HTML ``s`` element
tagSamp, ## the HTML ``samp`` element
tagScript, ## the HTML ``script`` element
tagSection, ## the HTML ``section`` element
tagSelect, ## the HTML ``select`` element
tagSmall, ## the HTML ``small`` element
tagSource, ## the HTML ``source`` element
tagSpan, ## the HTML ``span`` element
tagStrike, ## the deprecated HTML ``strike`` element
tagStrong, ## the HTML ``strong`` element
tagStyle, ## the HTML ``style`` element
tagSub, ## the HTML ``sub`` element
tagSummary, ## the HTML ``summary`` element
tagSup, ## the HTML ``sup`` element
tagTable, ## the HTML ``table`` element
tagTbody, ## the HTML ``tbody`` element
@@ -115,37 +142,46 @@ type
tagTfoot, ## the HTML ``tfoot`` element
tagTh, ## the HTML ``th`` element
tagThead, ## the HTML ``thead`` element
tagTime, ## the HTML ``time`` element
tagTitle, ## the HTML ``title`` element
tagTr, ## the HTML ``tr`` element
tagTrack, ## the HTML ``track`` element
tagTt, ## the HTML ``tt`` element
tagU, ## the deprecated HTML ``u`` element
tagUl, ## the HTML ``ul`` element
tagVar ## the HTML ``var`` element
tagVar, ## the HTML ``var`` element
tagVideo, ## the HTML ``video`` element
tagWbr ## the HTML ``wbr`` element
const
tagToStr* = [
"a", "abbr", "acronym", "address", "applet", "area",
"b", "base", "basefont", "bdo", "big", "blockquote", "body",
"br", "button", "caption", "center", "cite", "code",
"col", "colgroup", "dd", "del", "dfn", "div",
"dir", "dl", "dt", "em", "fieldset", "font",
"a", "abbr", "acronym", "address", "applet", "area", "article",
"aside", "audio",
"b", "base", "basefont", "bdi", "bdo", "big", "blockquote", "body",
"br", "button", "canvas", "caption", "center", "cite", "code",
"col", "colgroup", "command",
"datalist", "dd", "del", "details", "dfn", "dialog", "div",
"dir", "dl", "dt", "em", "embed", "fieldset",
"figcaption", "figure", "font", "footer",
"form", "frame", "frameset", "h1", "h2", "h3",
"h4", "h5", "h6", "head", "html", "hr",
"h4", "h5", "h6", "head", "header", "hgroup", "html", "hr",
"i", "iframe", "img", "input", "ins", "isindex",
"kbd", "label", "legend", "li", "link", "map",
"menu", "meta", "nobr", "noframes", "noscript", "object", "ol",
"optgroup", "option", "p", "param", "pre", "q",
"s", "samp", "script", "select", "small", "span",
"strike", "strong", "style", "sub", "sup", "table",
"tbody", "td", "textarea", "tfoot", "th", "thead",
"title", "tr", "tt", "u", "ul", "var"]
"kbd", "keygen", "label", "legend", "li", "link", "map", "mark",
"menu", "meta", "meter", "nav", "nobr", "noframes", "noscript",
"object", "ol",
"optgroup", "option", "output", "p", "param", "pre", "progress", "q",
"rp", "rt", "ruby", "s", "samp", "script", "section", "select", "small",
"source", "span", "strike", "strong", "style",
"sub", "summary", "sup", "table",
"tbody", "td", "textarea", "tfoot", "th", "thead", "time",
"title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr"]
InlineTags* = {tagA, tagAbbr, tagAcronym, tagApplet, tagB, tagBasefont,
tagBdo, tagBig, tagBr, tagButton, tagCite, tagCode, tagDel, tagDfn,
tagEm, tagFont, tagI, tagImg, tagIns, tagInput, tagIframe, tagKbd,
tagLabel, tagMap, tagObject, tagQ, tagSamp, tagScript, tagSelect,
tagSmall, tagSpan, tagStrong, tagSub, tagSup, tagTextarea, tagTt,
tagVar, tagApplet, tagBasefont, tagFont, tagIframe, tagU, tagS,
tagStrike}
tagStrike, tagWbr}
BlockTags* = {tagAddress, tagBlockquote, tagCenter, tagDel, tagDir, tagDiv,
tagDl, tagFieldset, tagForm, tagH1, tagH2, tagH3, tagH4,
tagH5, tagH6, tagHr, tagIns, tagIsindex, tagMenu, tagNoframes, tagNoscript,
@@ -153,7 +189,7 @@ const
tagMenu, tagNoframes}
SingleTags* = {tagArea, tagBase, tagBasefont,
tagBr, tagCol, tagFrame, tagHr, tagImg, tagIsindex,
tagLink, tagMeta, tagParam}
tagLink, tagMeta, tagParam, tagWbr}
Entities = [
("nbsp", 0x00A0), ("iexcl", 0x00A1), ("cent", 0x00A2), ("pound", 0x00A3),
@@ -228,28 +264,149 @@ const
("loz", 0x25CA), ("spades", 0x2660), ("clubs", 0x2663),
("hearts", 0x2665), ("diams", 0x2666)]
proc binaryStrSearch(x: openarray[string], y: string): int =
  ## Binary search for `y` in `x`; `x` has to be sorted in ascending ``cmp``
  ## order for the result to be meaningful. Returns the index of an entry
  ## equal to `y`, or -1 if there is none (including when `x` is empty).
  ## XXX put this into the library somewhere!
  var lo = 0
  var hi = high(x)
  while lo <= hi:
    let probe = (lo + hi) div 2
    let order = cmp(x[probe], y)
    if order < 0:
      # x[probe] sorts before y: continue in the upper half
      lo = probe + 1
    elif order > 0:
      # x[probe] sorts after y: continue in the lower half
      hi = probe - 1
    else:
      return probe
  return -1
proc allLower(s: string): bool =
  ## True iff every character of `s` lies in ``'a'..'z'``. Digits, upper-case
  ## letters and any other character make the result false; the empty string
  ## yields true.
  result = true
  for ch in s:
    if ch notin {'a'..'z'}:
      result = false
      break
proc toHtmlTag(s: string): THtmlTag =
## Maps the tag name `s` to its ``THtmlTag`` value through a string ``case``.
## The comparison is exact and every branch literal is lower-case, so `s`
## has to be lower-cased by the caller. Any name without a branch yields
## ``tagUnknown``.
case s
of "a": tagA
of "abbr": tagAbbr
of "acronym": tagAcronym
of "address": tagAddress
of "applet": tagApplet
of "area": tagArea
of "article": tagArticle
of "aside": tagAside
of "audio": tagAudio
of "b": tagB
of "base": tagBase
of "basefont": tagBasefont
of "bdi": tagBdi
of "bdo": tagBdo
of "big": tagBig
of "blockquote": tagBlockquote
of "body": tagBody
of "br": tagBr
of "button": tagButton
of "canvas": tagCanvas
of "caption": tagCaption
of "center": tagCenter
of "cite": tagCite
of "code": tagCode
of "col": tagCol
of "colgroup": tagColgroup
of "command": tagCommand
of "datalist": tagDatalist
of "dd": tagDd
of "del": tagDel
of "details": tagDetails
of "dfn": tagDfn
of "dialog": tagDialog
of "div": tagDiv
of "dir": tagDir
of "dl": tagDl
of "dt": tagDt
of "em": tagEm
of "embed": tagEmbed
of "fieldset": tagFieldset
of "figcaption": tagFigcaption
of "figure": tagFigure
of "font": tagFont
of "footer": tagFooter
of "form": tagForm
of "frame": tagFrame
of "frameset": tagFrameset
of "h1": tagH1
of "h2": tagH2
of "h3": tagH3
of "h4": tagH4
of "h5": tagH5
of "h6": tagH6
of "head": tagHead
of "header": tagHeader
of "hgroup": tagHgroup
of "html": tagHtml
of "hr": tagHr
of "i": tagI
of "iframe": tagIframe
of "img": tagImg
of "input": tagInput
of "ins": tagIns
of "isindex": tagIsindex
of "kbd": tagKbd
of "keygen": tagKeygen
of "label": tagLabel
of "legend": tagLegend
of "li": tagLi
of "link": tagLink
of "map": tagMap
of "mark": tagMark
of "menu": tagMenu
of "meta": tagMeta
of "meter": tagMeter
of "nav": tagNav
of "nobr": tagNobr
of "noframes": tagNoframes
of "noscript": tagNoscript
of "object": tagObject
of "ol": tagOl
of "optgroup": tagOptgroup
of "option": tagOption
of "output": tagOutput
of "p": tagP
of "param": tagParam
of "pre": tagPre
of "progress": tagProgress
of "q": tagQ
of "rp": tagRp
of "rt": tagRt
of "ruby": tagRuby
of "s": tagS
of "samp": tagSamp
of "script": tagScript
of "section": tagSection
of "select": tagSelect
of "small": tagSmall
of "source": tagSource
of "span": tagSpan
of "strike": tagStrike
of "strong": tagStrong
of "style": tagStyle
of "sub": tagSub
of "summary": tagSummary
of "sup": tagSup
of "table": tagTable
of "tbody": tagTbody
of "td": tagTd
of "textarea": tagTextarea
of "tfoot": tagTfoot
of "th": tagTh
of "thead": tagThead
of "time": tagTime
of "title": tagTitle
of "tr": tagTr
of "track": tagTrack
of "tt": tagTt
of "u": tagU
of "ul": tagUl
of "var": tagVar
of "video": tagVideo
of "wbr": tagWbr
# anything that is not one of the names above
else: tagUnknown
proc htmlTag*(n: PXmlNode): THtmlTag =
## gets `n`'s tag as a ``THtmlTag``.
if n.clientData == 0:
n.clientData = binaryStrSearch(tagToStr, n.tag)+1
n.clientData = toHtmlTag(n.tag).ord
result = THtmlTag(n.clientData)
proc htmlTag*(s: string): THtmlTag =
## converts `s` to a ``THtmlTag``. If `s` is no HTML tag, ``tagUnknown`` is
## returned.
result = THtmlTag(binaryStrSearch(tagToStr, s.toLower)+1)
let s = if allLower(s): s else: s.toLower
result = toHtmlTag(s)
proc entityToUtf8*(entity: string): string =
## converts an HTML entity name like ``&Uuml;`` to its UTF-8 equivalent.
@@ -267,11 +424,13 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode
proc expected(x: var TXmlParser, n: PXmlNode): string =
  ## Produces, via ``errorMsg``, the parser error text saying that the
  ## closing tag of `n` was expected.
  let missingEnd = "</" & n.tag & "> expected"
  errorMsg(x, missingEnd)
# Shorthand used by the HTML parser: expands to ``rawData(x)`` for reading
# the current element name.
template elemName(x: expr): expr = rawData(x)
proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
errors: var seq[string]) =
# we parsed e.g. ``<br>`` and don't really expect a ``</br>``:
if result.htmlTag in singleTags:
if x.kind != xmlElementEnd or cmpIgnoreCase(x.elementName, result.tag) != 0:
if x.kind != xmlElementEnd or cmpIgnoreCase(x.elemName, result.tag) != 0:
return
while true:
case x.kind
@@ -279,28 +438,29 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
case result.htmlTag
of tagLi, tagP, tagDt, tagDd, tagInput, tagOption:
# some tags are common to have no ``</end>``, like ``<li>``:
if htmlTag(x.elementName) in {tagLi, tagP, tagDt, tagDd, tagInput,
tagOption}:
if htmlTag(x.elemName) in {tagLi, tagP, tagDt, tagDd, tagInput,
tagOption}:
errors.add(expected(x, result))
break
when false:
if htmlTag(x.elementName) notin InlineTags:
errors.add(expected(x, result))
break
of tagTr, tagTd, tagTh, tagTfoot, tagThead:
if htmlTag(x.elementName) in {tagTr, tagTd, tagTh, tagTfoot, tagThead}:
of tagTd, tagTh, tagTfoot, tagThead:
if htmlTag(x.elemName) in {tagTr, tagTd, tagTh, tagTfoot, tagThead}:
errors.add(expected(x, result))
break
of tagTr:
if htmlTag(x.elemName) == tagTr:
errors.add(expected(x, result))
break
of tagOptgroup:
if htmlTag(x.elementName) in {tagOption, tagOptgroup}:
if htmlTag(x.elemName) in {tagOption, tagOptgroup}:
errors.add(expected(x, result))
break
else: nil
result.addNode(parse(x, errors))
of xmlElementEnd:
if cmpIgnoreCase(x.elementName, result.tag) == 0:
if cmpIgnoreCase(x.elemName, result.tag) == 0:
next(x)
else:
echo "5; expected: ", result.htmltag, " ", x.elemName
errors.add(expected(x, result))
# do not skip it here!
break
@@ -313,10 +473,10 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode =
case x.kind
of xmlComment:
result = newComment(x.charData)
result = newComment(x.rawData)
next(x)
of xmlCharData, xmlWhitespace:
result = newText(x.charData)
result = newText(x.rawData)
next(x)
of xmlPI, xmlSpecial:
# we just ignore processing instructions for now
@@ -325,19 +485,19 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode =
errors.add(errorMsg(x))
next(x)
of xmlElementStart:
result = newElement(x.elementName.toLower)
result = newElement(x.elemName.toLower)
next(x)
untilElementEnd(x, result, errors)
of xmlElementEnd:
errors.add(errorMsg(x, "unexpected ending tag: " & x.elementName))
errors.add(errorMsg(x, "unexpected ending tag: " & x.elemName))
of xmlElementOpen:
result = newElement(x.elementName.toLower)
result = newElement(x.elemName.toLower)
next(x)
result.attrs = newStringTable()
while true:
case x.kind
of xmlAttribute:
result.attrs[x.attrKey] = x.attrValue
result.attrs[x.rawData] = x.rawData2
next(x)
of xmlElementClose:
next(x)
@@ -355,10 +515,10 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode =
errors.add(errorMsg(x, "<some_tag> expected"))
next(x)
of xmlCData:
result = newCData(x.charData)
result = newCData(x.rawData)
next(x)
of xmlEntity:
var u = entityToUtf8(x.entityName)
var u = entityToUtf8(x.rawData)
if u.len != 0: result = newText(u)
next(x)
of xmlEof: nil
@@ -372,15 +532,17 @@ proc parseHtml*(s: PStream, filename: string,
next(x)
# skip the DOCTYPE:
if x.kind == xmlSpecial: next(x)
result = parse(x, errors)
if x.kind != xmlEof:
errors.add(errorMsg(x, "EOF expected"))
result = newElement("document")
result.addNode(parse(x, errors))
#if x.kind != xmlEof:
# errors.add(errorMsg(x, "EOF expected"))
while x.kind != xmlEof:
var oldPos = x.bufpos # little hack to see if we made any progress
result.addNode(parse(x, errors))
if x.bufpos == oldPos:
if x.bufpos == oldPos:
# force progress!
next(x)
next(x)
close(x)
proc parseHtml*(s: PStream): PXmlNode =
@@ -400,22 +562,9 @@ proc loadHtml*(path: string, errors: var seq[string]): PXmlNode =
proc loadHtml*(path: string): PXmlNode =
## Loads and parses HTML from file specified by ``path``, and returns
## a ``PXmlNode``. All parsing errors are ignored.
var errors: seq[string] = @[]
var errors: seq[string] = @[]
result = loadHtml(path, errors)
when true:
nil
else:
proc checkHtmlAux(n: PXmlNode, errors: var seq[string]) =
nil
proc checkHtmlStructure*(n: PXmlNode, errors: var seq[string]) =
## checks the HTML structure after parsing for other errors like
## a ``<h1>`` element within a ``<p>`` element.
if n == nil or n.htmlTag != tagHtml:
errors.add("<html> tag expected")
checkHtmlAux(n, errors)
when isMainModule:
import os
@@ -429,4 +578,3 @@ when isMainModule:
f.close()
else:
quit("cannot write test.txt")

View File

@@ -131,6 +131,10 @@ proc close*(my: var TXmlParser) {.inline.} =
## closes the parser `my` and its associated input stream.
lexbase.close(my)
proc kind*(my: TXmlParser): TXmlEventKind {.inline.} =
  ## yields the type of the event the XML parser `my` currently holds
  result = my.kind
proc charData*(my: TXmlParser): string {.inline.} =
## returns the character data for the events: ``xmlCharData``,
## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
@@ -138,10 +142,6 @@ proc charData*(my: TXmlParser): string {.inline.} =
xmlSpecial})
return my.a
proc kind*(my: TXmlParser): TXmlEventKind {.inline.} =
  ## yields the type of the event the XML parser `my` currently holds
  result = my.kind
proc elementName*(my: TXmlParser): string {.inline.} =
## returns the element name for the events: ``xmlElementStart``,
## ``xmlElementEnd``, ``xmlElementOpen``
@@ -173,6 +173,16 @@ proc PIRest*(my: TXmlParser): string {.inline.} =
assert(my.kind == xmlPI)
return my.b
proc rawData*(my: TXmlParser): string {.inline.} =
## returns the underlying 'data' string by reference.
## This is only used for speed hacks.
## No check of the current event kind is performed here.
# shallowCopy makes `result` refer to `my.a` instead of duplicating it.
# NOTE(review): presumably the returned string must not be modified or kept
# once the parser moves on -- confirm before relying on it.
shallowCopy(result, my.a)
proc rawData2*(my: TXmlParser): string {.inline.} =
## returns the underlying second 'data' string by reference.
## This is only used for speed hacks.
## No check of the current event kind is performed here.
# shallowCopy makes `result` refer to `my.b` instead of duplicating it.
# NOTE(review): presumably the returned string must not be modified or kept
# once the parser moves on -- confirm before relying on it.
shallowCopy(result, my.b)
proc getColumn*(my: TXmlParser): int {.inline.} =
## get the current column the parser has arrived at.
result = getColNumber(my, my.bufPos)
@@ -439,6 +449,7 @@ proc parseTag(my: var TXmlParser) =
if my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>':
inc(my.bufpos, 2)
my.state = stateEmptyElementTag
my.c = nil
elif my.buf[my.bufpos] == '>':
inc(my.bufpos)
else:

View File

@@ -71,6 +71,16 @@ proc text*(n: PXmlNode): string {.inline.} =
assert n.k in {xnText, xnComment, xnCData, xnEntity}
result = n.fText
proc rawText*(n: PXmlNode): string {.inline.} =
## returns the underlying 'text' string by reference.
## This is only used for speed hacks.
## No check of the node kind is performed here.
# shallowCopy makes `result` refer to `n.fText` instead of duplicating it.
# NOTE(review): presumably callers must treat the result as read-only --
# confirm.
shallowCopy(result, n.fText)
proc rawTag*(n: PXmlNode): string {.inline.} =
## returns the underlying 'tag' string by reference.
## This is only used for speed hacks.
## No check of the node kind is performed here.
# shallowCopy makes `result` refer to `n.fTag` instead of duplicating it.
# NOTE(review): presumably callers must treat the result as read-only --
# confirm.
shallowCopy(result, n.fTag)
proc innerText*(n: PXmlNode): string =
## gets the inner text of `n`. `n` has to be an ``xnElement`` node. Only
## ``xnText`` and ``xnEntity`` nodes are considered part of `n`'s inner text,

View File

@@ -19,7 +19,7 @@ when defined(Windows):
OwningThread: int
LockSemaphore: int
Reserved: int32
TSysCond = THandle
proc InitSysLock(L: var TSysLock) {.stdcall, noSideEffect,

View File

@@ -2,7 +2,6 @@ version 0.9.4
=============
- make 'bind' default for templates and introduce 'mixin'
- prove array accesses
- special rule for ``[]=``
- ``=`` should be overloadable; requires specialization for ``=``; general
lift mechanism in the compiler is already implemented for 'fields'
@@ -16,8 +15,11 @@ version 0.9.4
Bugs
====
- simple closure iterator doesn't work
- marshal doesn't support unsigned types
- sfAnon is not respected in renderer
- docgen: sometimes effects are listed twice
- 'result' is not properly cleaned for NRVO
- 'result' is not properly cleaned for NRVO --> use uninit checking instead
- sneaking with qualifiedLookup() is really broken!
- aporia.nim(968, 5) Error: ambiguous identifier: 'DELETE' --
use a qualifier
@@ -140,6 +142,7 @@ Not essential for 1.0.0
- implement closures that support nesting of *procs* > 1
- object constructors: static check for fields if discriminator is known at
compile time
- prove array accesses
Optimizations