further improvements for the HTML parser

This commit is contained in:
rumpf_a@web.de
2010-02-14 09:53:56 +01:00
parent 40a5d6c3b9
commit 597d98e7ee

View File

@@ -138,8 +138,7 @@ const
"s", "samp", "script", "select", "small", "span",
"strike", "strong", "style", "sub", "sup", "table",
"tbody", "td", "textarea", "tfoot", "th", "thead",
"title", "tr", "tt", "u", "ul", "var"
]
"title", "tr", "tt", "u", "ul", "var"]
InlineTags* = {tagA, tagAbbr, tagAcronym, tagApplet, tagB, tagBasefont,
tagBdo, tagBig, tagBr, tagButton, tagCite, tagCode, tagDel, tagDfn,
tagEm, tagFont, tagI, tagImg, tagIns, tagInput, tagIframe, tagKbd,
@@ -154,7 +153,7 @@ const
tagMenu, tagNoframes}
SingleTags* = {tagArea, tagBase, tagBasefont,
tagBr, tagCol, tagFrame, tagHr, tagImg, tagInput, tagIsindex,
tagLink, tagMeta, tagParam} # `tagP` can be both!
tagLink, tagMeta, tagParam}
Entities = [
("nbsp", 0x00A0), ("iexcl", 0x00A1), ("cent", 0x00A2), ("pound", 0x00A3),
@@ -247,13 +246,17 @@ proc htmlTag*(n: PXmlNode): THtmlTag =
n.clientData = binaryStrSearch(tagStrs, n.tag)+1
result = THtmlTag(n.clientData)
proc htmlTag*(s: string): THtmlTag =
## Converts the tag name `s` to a ``THtmlTag``. `s` is lower-cased before it
## is looked up in ``tagStrs``, so the match is case-insensitive. If `s` is
## not an HTML tag, ``tagUnknown`` is returned.
# The search result is shifted by one before the conversion to the enum;
# presumably a failed search yields -1 and thus ordinal 0 -- TODO confirm.
result = THtmlTag(binaryStrSearch(tagStrs, s.toLower)+1)
proc entityToUtf8*(entity: string): string =
## Converts an HTML entity name like ``Uuml`` to its UTF-8 equivalent.
## `entity` is compared verbatim against the names in the entity table
## (e.g. ``nbsp``), i.e. without the surrounding ``&`` and ``;``.
## "" is returned if the entity name is unknown. The HTML parser
## already converts entities to UTF-8.
for name, val in items(entities):
# NOTE(review): the next three lines look like the pre- and post-change
# form of the same check as rendered by the diff view -- confirm and keep
# only one of them in the real source file.
if name == entity:
return toUTF8(TRune(val))
if name == entity: return toUTF8(TRune(val))
# No table entry matched the given name.
result = ""
proc addNode(father, son: PXmlNode) =
@@ -261,6 +264,9 @@ proc addNode(father, son: PXmlNode) =
proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode
proc expected(x: var TXmlParser, n: PXmlNode): string =
## Returns the error message, built by ``errorMsg`` for the parser `x`,
## reporting that the end tag of the element `n` was expected.
# NOTE(review): the ``$1`` inside the literal is handed to ``errorMsg`` as
# part of the message text; confirm whether it is substituted there or
# ends up verbatim in the reported error.
result = errorMsg(x, "</" & n.tag & "$1> expected")
proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
errors: var seq[string]) =
if result.htmlTag in singleTags:
@@ -268,15 +274,28 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
return
while true:
case x.kind
of xmlElementStart, xmlElementOpen:
case result.htmlTag
of tagLi, tagP, tagDt, tagDd, tagOption:
if htmlTag(x.elementName) notin InlineTags:
# some tags, like ``<li>``, commonly have no ``</end>`` tag:
errors.add(expected(x, result))
break
of tagTr, tagTd, tagTh:
if htmlTag(x.elementName) in {tagTr, tagTd, tagTh}:
errors.add(expected(x, result))
break
else: nil
result.addNode(parse(x, errors))
of xmlElementEnd:
if cmpIgnoreCase(x.elementName, result.tag) == 0:
next(x)
else:
errors.add(errorMsg(x, "</" & result.tag & "$1> expected"))
errors.add(expected(x, result))
# do not skip it here!
break
of xmlEof:
errors.add(errorMsg(x, "</" & result.tag & "$1> expected"))
errors.add(expected(x, result))
break
else:
result.addNode(parse(x, errors))
@@ -296,13 +315,13 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode =
errors.add(errorMsg(x))
next(x)
of xmlElementStart:
result = newElement(x.elementName)
result = newElement(x.elementName.toLower)
next(x)
untilElementEnd(x, result, errors)
of xmlElementEnd:
errors.add(errorMsg(x, "unexpected ending tag: " & x.elementName))
of xmlElementOpen:
result = newElement(x.elementName)
result = newElement(x.elementName.toLower)
next(x)
result.attr = newStringTable()
while true: