Make htmlparser parse unquoted attrib values (#9537)

Fixes #6154
This commit is contained in:
rec
2018-10-29 11:10:00 +01:00
committed by Andreas Rumpf
parent 680f5eeb15
commit 9fd0a71e4d
3 changed files with 65 additions and 2 deletions

View File

@@ -2014,7 +2014,7 @@ proc parseHtml*(s: Stream, filename: string,
## Parses the XML from stream `s` and returns a ``XmlNode``. Every
## occurred parsing error is added to the `errors` sequence.
var x: XmlParser
open(x, s, filename, {reportComments, reportWhitespace})
open(x, s, filename, {reportComments, reportWhitespace, allowUnquotedAttribs})
next(x)
# skip the DOCTYPE:
if x.kind == xmlSpecial: next(x)

View File

@@ -180,6 +180,7 @@ type
errEqExpected, ## ``=`` expected
errQuoteExpected, ## ``"`` or ``'`` expected
errEndOfCommentExpected ## ``-->`` expected
errAttributeValueExpected ## non-empty attribute value expected
ParserState = enum
stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError
@@ -187,6 +188,7 @@ type
XmlParseOption* = enum ## options for the XML parser
reportWhitespace, ## report whitespace
reportComments ## report comments
allowUnquotedAttribs ## allow unquoted attribute values (for HTML)
XmlParser* = object of BaseLexer ## the parser object.
a, b, c: string
@@ -207,7 +209,8 @@ const
"'>' expected",
"'=' expected",
"'\"' or \"'\" expected",
"'-->' expected"
"'-->' expected",
"attribute value expected"
]
proc open*(my: var XmlParser, input: Stream, filename: string,
@@ -669,6 +672,21 @@ proc parseAttribute(my: var XmlParser) =
pendingSpace = false
add(my.b, buf[pos])
inc(pos)
elif allowUnquotedAttribs in my.options:
const disallowedChars = {'"', '\'', '`', '=', '<', '>', ' ',
'\0', '\t', '\L', '\F', '\f'}
let startPos = pos
while (let c = buf[pos]; c notin disallowedChars):
if c == '&':
my.bufpos = pos
parseEntity(my, my.b)
my.kind = xmlAttribute # parseEntity overwrites my.kind!
pos = my.bufpos
else:
add(my.b, c)
inc(pos)
if pos == startPos:
markError(my, errAttributeValueExpected)
else:
markError(my, errQuoteExpected)
# error corrections: guess what was meant

View File

@@ -78,3 +78,48 @@ block t2814:
echo "case " & ltype[0] & " failed !"
quit(2)
echo "true"
# Regression test for issue #6154: parseHtml must accept attribute values
# that are not enclosed in quotes.
block t6154:
# Sample document with five <p> elements, in order: plain unquoted values,
# an unquoted value containing character entities, unquoted values with
# whitespace around '=', ordinary quoted values, and an attribute whose
# value is missing altogether.
let foo = """
<!DOCTYPE html>
<html>
<head>
<title> foobar </title>
</head>
<body>
<p class=foo id=bar></p>
<p something=&#9;foo&#9;bar&#178;></p>
<p something= &#9;foo&#9;bar&#178; foo =bloo></p>
<p class="foo2" id="bar2"></p>
<p wrong= ></p>
</body>
</html>
"""
var errors: seq[string] = @[]
let html = parseHtml(newStringStream(foo), "statichtml", errors=errors)
# Line 11 of the document is the `<p wrong= >` element; its missing value
# must be reported as an error.
# NOTE(review): the column (18) depends on the leading whitespace inside the
# string literal above -- confirm it matches the indentation in the real file.
doAssert "statichtml(11, 18) Error: attribute value expected" in errors
let ps = html.findAll("p")
doAssert ps.len == 5
# ps[0]: two plain unquoted values.
doAssert ps[0].attrsLen == 2
doAssert ps[0].attr("class") == "foo"
doAssert ps[0].attr("id") == "bar"
# NOTE(review): `doassert` is the same identifier as `doAssert` (Nim compares
# all but the first character case-insensitively); consider one spelling.
doassert ps[0].len == 0
# ps[1]: entities inside an unquoted value are decoded
# (&#9; becomes a tab, &#178; becomes '²').
doAssert ps[1].attrsLen == 1
doAssert ps[1].attr("something") == "\tfoo\tbar²"
doassert ps[1].len == 0
# ps[2]: whitespace before or after '=' is not part of the value.
doAssert ps[2].attrsLen == 2
doAssert ps[2].attr("something") == "\tfoo\tbar²"
doAssert ps[2].attr("foo") == "bloo"
doassert ps[2].len == 0
# ps[3]: quoted values keep working as before.
doAssert ps[3].attrsLen == 2
doAssert ps[3].attr("class") == "foo2"
doAssert ps[3].attr("id") == "bar2"
doassert ps[3].len == 0
# ps[4]: despite the reported error, the attribute is still present and its
# value is the empty string.
doAssert ps[4].attrsLen == 1
doAssert ps[4].attr("wrong") == ""