From 597d98e7ee9cf60ec1ed601a311975384d65cba8 Mon Sep 17 00:00:00 2001 From: "rumpf_a@web.de" <> Date: Sun, 14 Feb 2010 09:53:56 +0100 Subject: [PATCH] further improvements for the HTML parser --- lib/pure/htmlparser.nim | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/lib/pure/htmlparser.nim b/lib/pure/htmlparser.nim index 5c88f211d6..4e4af0e4ba 100644 --- a/lib/pure/htmlparser.nim +++ b/lib/pure/htmlparser.nim @@ -138,8 +138,7 @@ const "s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", - "title", "tr", "tt", "u", "ul", "var" - ] + "title", "tr", "tt", "u", "ul", "var"] InlineTags* = {tagA, tagAbbr, tagAcronym, tagApplet, tagB, tagBasefont, tagBdo, tagBig, tagBr, tagButton, tagCite, tagCode, tagDel, tagDfn, tagEm, tagFont, tagI, tagImg, tagIns, tagInput, tagIframe, tagKbd, @@ -154,7 +153,7 @@ const tagMenu, tagNoframes} SingleTags* = {tagArea, tagBase, tagBasefont, tagBr, tagCol, tagFrame, tagHr, tagImg, tagInput, tagIsindex, - tagLink, tagMeta, tagParam} # `tagP` can be both! + tagLink, tagMeta, tagParam} Entities = [ ("nbsp", 0x00A0), ("iexcl", 0x00A1), ("cent", 0x00A2), ("pound", 0x00A3), @@ -247,13 +246,17 @@ proc htmlTag*(n: PXmlNode): THtmlTag = n.clientData = binaryStrSearch(tagStrs, n.tag)+1 result = THtmlTag(n.clientData) +proc htmlTag*(s: string): THtmlTag = + ## converts `s` to a ``THtmlTag``. If `s` is no HTML tag, ``tagUnknown`` is + ## returned. + result = THtmlTag(binaryStrSearch(tagStrs, s.toLower)+1) + proc entityToUtf8*(entity: string): string = ## converts an HTML entity name like ``Ü`` to its UTF-8 equivalent. ## "" is returned if the entity name is unknown. The HTML parser ## already converts entities to UTF-8. for name, val in items(entities): - if name == entity: - return toUTF8(TRune(val)) + if name == entity: return toUTF8(TRune(val)) result = "" proc addNode(father, son: PXmlNode) = @@ -261,6 +264,9 @@ proc addNode(father, son: PXmlNode) = proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode +proc expected(x: var TXmlParser, n: PXmlNode): string = + result = errorMsg(x, "" & n.tag & "$1> expected") + proc untilElementEnd(x: var TXmlParser, result: PXmlNode, errors: var seq[string]) = if result.htmlTag in singleTags: @@ -268,15 +274,28 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode, return while true: case x.kind + of xmlElementStart, xmlElementOpen: + case result.htmlTag + of tagLi, tagP, tagDt, tagDd, tagOption: + if htmlTag(x.elementName) notin InlineTags: + # some tags are common to have no ````, like ``