bugfixes for unicode; xmlparser; htmlparser; scanner

This commit is contained in:
rumpf_a@web.de
2010-02-20 19:21:38 +01:00
parent 64da2f1681
commit 6bc16904ed
18 changed files with 226 additions and 74 deletions

View File

@@ -265,7 +265,7 @@ proc addNode(father, son: PXmlNode) =
proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode
proc expected(x: var TXmlParser, n: PXmlNode): string =
result = errorMsg(x, "</" & n.tag & "$1> expected")
result = errorMsg(x, "</" & n.tag & "> expected")
proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
errors: var seq[string]) =
@@ -378,17 +378,19 @@ proc parseHtml*(s: PStream): PXmlNode =
var errors: seq[string] = @[]
result = parseHtml(s, "unknown_html_doc", errors)
proc loadHtml*(path: string, reportErrors = false): PXmlNode =
proc loadHtml*(path: string, errors: var seq[string]): PXmlNode =
## Loads and parses HTML from file specified by ``path``, and returns
## a ``PXmlNode``. If `reportErrors` is true, the parsing errors are
## ``echo``ed, otherwise they are ignored.
## a ``PXmlNode``. Every parsing error that occurred is added to
## the `errors` sequence.
var s = newFileStream(path, fmRead)
if s == nil: raise newException(EIO, "Unable to read file: " & path)
var errors: seq[string] = @[]
result = parseHtml(s, path, errors)
if reportErrors:
for msg in items(errors): echo(msg)
proc loadHtml*(path: string): PXmlNode =
  ## Convenience overload: parses the HTML file at ``path`` and returns the
  ## resulting ``PXmlNode``. Parsing errors are gathered into a local
  ## sequence that is discarded, so the caller never sees them.
  var ignored: seq[string] = @[]
  return loadHtml(path, ignored)
when true:
nil
@@ -402,4 +404,18 @@ else:
if n == nil or n.htmlTag != tagHtml:
errors.add("<html> tag expected")
checkHtmlAux(n, errors)
when isMainModule:
  # Manual smoke test: parse the HTML file named by the first command line
  # argument, print every parsing error, then dump the resulting tree
  # to "test.txt".
  import os
  var parseErrors: seq[string] = @[]
  var tree = loadHtml(paramStr(1), parseErrors)
  for msg in items(parseErrors): echo msg
  var outFile: TFile
  if not open(outFile, "test.txt", fmWrite):
    quit("cannot write test.txt")
  outFile.write($tree)
  outFile.close()

View File

@@ -1,7 +1,7 @@
#
#
# Nimrod's Runtime Library
# (c) Copyright 2009 Andreas Rumpf
# (c) Copyright 2010 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
@@ -619,9 +619,11 @@ proc sameFileContent*(path1, path2: string): bool =
close(a)
close(b)
proc copyFile*(dest, source: string) =
proc copyFile*(dest, source: string) {.deprecated.} =
## Copies a file from `source` to `dest`. If this fails,
## `EOS` is raised.
## **Deprecated since version 0.8.8**: Use this proc with named arguments
## only, because the order will change!
when defined(Windows):
if CopyFileA(source, dest, 0'i32) == 0'i32: OSError()
else:
@@ -647,8 +649,10 @@ proc copyFile*(dest, source: string) =
close(s)
close(d)
proc moveFile*(dest, source: string) =
proc moveFile*(dest, source: string) {.deprecated.} =
## Moves a file from `source` to `dest`. If this fails, `EOS` is raised.
## **Deprecated since version 0.8.8**: Use this proc with named arguments
## only, because the order will change!
if crename(source, dest) != 0'i32: OSError()
proc removeFile*(file: string) =

View File

@@ -83,8 +83,8 @@ proc toUTF8*(c: TRune): string =
result[0] = chr(i)
elif i <=% 0x07FF:
result = newString(2)
result[0] = chr(i shr 6 or 0b110_0000)
result[1] = chr(i and ones(6) or 0b10_000000)
result[0] = chr((i shr 6) or 0b110_00000)
result[1] = chr((i and ones(6)) or 0b10_000000)
elif i <=% 0xFFFF:
result = newString(3)
result[0] = chr(i shr 12 or 0b1110_0000)

View File

@@ -227,7 +227,7 @@ proc createAttributeNS*(doc: PDocument, namespaceURI: string, qualifiedName: str
raise newException(EInvalidCharacterErr, "Invalid character")
# Exceptions
if qualifiedName.contains(':'):
if namespaceURI == nil or namespaceURI == "":
if namespaceURI == nil:
raise newException(ENamespaceErr, "When qualifiedName contains a prefix namespaceURI cannot be nil")
elif qualifiedName.split(':')[0].toLower() == "xml" and namespaceURI != "http://www.w3.org/XML/1998/namespace":
raise newException(ENamespaceErr,
@@ -303,7 +303,7 @@ proc createElement*(doc: PDocument, tagName: string): PElement =
proc createElementNS*(doc: PDocument, namespaceURI: string, qualifiedName: string): PElement =
## Creates an element of the given qualified name and namespace URI.
if qualifiedName.contains(':'):
if namespaceURI == nil or namespaceURI == "":
if namespaceURI == nil:
raise newException(ENamespaceErr, "When qualifiedName contains a prefix namespaceURI cannot be nil")
elif qualifiedName.split(':')[0].toLower() == "xml" and namespaceURI != "http://www.w3.org/XML/1998/namespace":
raise newException(ENamespaceErr,
@@ -464,8 +464,11 @@ proc localName*(n: PNode): string =
proc namespaceURI*(n: PNode): string =
  ## Returns this node's namespace URI
  result = n.FNamespaceURI
proc `namespaceURI=`*(n: PNode, value: string) =
## Sets this node's namespace URI to `value`.
n.FNamespaceURI = value
proc nextSibling*(n: PNode): PNode =
## Returns the next sibling of this node
@@ -507,7 +510,7 @@ proc previousSibling*(n: PNode): PNode =
return n.FParentNode.childNodes[i - 1]
return nil
proc `prefix=`*(n: var PNode, value: string) =
proc `prefix=`*(n: PNode, value: string) =
## Modifies the prefix of this node
# Setter
@@ -530,11 +533,10 @@ proc `prefix=`*(n: var PNode, value: string) =
if n.nodeType == ElementNode:
var el: PElement = PElement(n)
el.FTagName = value & ":" & n.FLocalName
n = PNode(el)
elif n.nodeType == AttributeNode:
var attr: PAttr = PAttr(n)
attr.FName = value & ":" & n.FLocalName
n = PNode(attr)
# Procedures
proc appendChild*(n: PNode, newChild: PNode) =
@@ -1078,4 +1080,4 @@ proc nodeToXml(n: PNode, indent: int = 0): string =
proc `$`*(doc: PDocument): string =
## Converts a PDocument object into a string representation of its XML
result = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
result.add(nodeToXml(doc.documentElement))
result.add(nodeToXml(doc.documentElement))

View File

@@ -14,9 +14,34 @@ import xmldom, os, streams, parsexml, strutils
#XMLDom's Parser - Turns XML into a Document
type
#Parsing errors
# Parsing errors
EMismatchedTag* = object of E_Base ## Raised when a tag is not properly closed
EParserError* = object of E_Base ## Raised when an unexpected XML Parser event occurs
# For namespaces
xmlnsAttr = tuple[name, value: string, ownerElement: PElement]
var nsList: seq[xmlnsAttr] = @[] # Used for storing namespaces
proc getNS(prefix: string): string =
  ## Looks up the namespace URI bound to `prefix` in `nsList`.
  ##
  ## Entries whose key contains a colon (``xmlns:foo``) are matched against
  ## `prefix` by the part after the colon. Entries whose key is exactly
  ## ``xmlns`` are default namespace declarations; they are only used as a
  ## fallback when no prefixed declaration matches, so that they never take
  ## precedence over a normal namespace. Returns ``""`` if nothing is found.
  ##
  ## `nsList` is only ever appended to while parsing, so declarations of
  ## more deeply nested elements come later in the list. The list is scanned
  ## from the back so that the innermost declaration shadows outer ones,
  ## both for prefixed and for default namespaces.
  var defaultNS = ""
  var haveDefault = false
  var i = nsList.len - 1
  while i >= 0:
    var key = nsList[i][0]
    if ":" in key:
      if key.split(':')[1] == prefix:
        return nsList[i][1]
    elif key == "xmlns" and not haveDefault:
      # Remember only the innermost default namespace; keep scanning in
      # case a prefixed declaration further out matches `prefix`.
      defaultNS = nsList[i][1]
      haveDefault = true
    dec(i)
  return defaultNS
proc parseText(x: var TXmlParser, doc: var PDocument): PText =
result = doc.createTextNode(x.charData())
@@ -28,24 +53,33 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
case x.kind()
of xmlEof:
break
of xmlElementStart:
of xmlElementStart, xmlElementOpen:
if n.tagName() != "":
n.appendChild(parseElement(x, doc))
else:
n = doc.createElement(x.elementName)
of xmlElementOpen:
if n.tagName() != "":
n.appendChild(parseElement(x, doc))
else:
if x.elementName.contains(':'):
#TODO: NamespaceURI
n = doc.createElementNS("nil", x.elementName)
else:
n = doc.createElement(x.elementName)
n = doc.createElementNS("", x.elementName)
of xmlElementEnd:
if x.elementName == n.nodeName:
# n.normalize() # Remove any whitespace etc.
var ns: string
if x.elementName.contains(':'):
ns = getNS(x.elementName.split(':')[0])
else:
ns = getNS("")
n.namespaceURI = ns
# Remove any namespaces this element declared
var count = 0 # Variable which keeps the index
# We need to edit it..
for i in low(nsList)..len(nsList)-1:
if nsList[count][2] == n:
nsList.delete(count)
dec(count)
inc(count)
return n
else: #The wrong element is ended
raise newException(EMismatchedTag, "Mismatched tag at line " &
@@ -54,11 +88,15 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
of xmlCharData:
n.appendChild(parseText(x, doc))
of xmlAttribute:
if x.attrKey == "xmlns" or x.attrKey.startsWith("xmlns:"):
nsList.add((x.attrKey, x.attrValue, n))
if x.attrKey.contains(':'):
#TODO: NamespaceURI
n.setAttributeNS("nil", x.attrKey, x.attrValue)
var ns = getNS(x.attrKey)
n.setAttributeNS(ns, x.attrKey, x.attrValue)
else:
n.setAttribute(x.attrKey, x.attrValue)
of xmlCData:
n.appendChild(doc.createCDATASection(x.charData()))
of xmlComment:
@@ -75,16 +113,13 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
raise newException(EMismatchedTag,
"Mismatched tag at line " & $x.getLine() & " column " & $x.getColumn)
proc loadXML*(path: string): PDocument =
## Loads and parses XML from file specified by ``path``, and returns
proc loadXMLStream*(stream: PStream): PDocument =
## Loads and parses XML from a stream specified by ``stream``, and returns
## a ``PDocument``
var s = newFileStream(path, fmRead)
if s == nil: raise newException(EIO, "Unable to read file " & path)
var x: TXmlParser
open(x, s, path, {reportComments})
open(x, stream, nil, {reportComments})
var XmlDoc: PDocument
var DOM: PDOMImplementation = getDOM()
@@ -102,10 +137,32 @@ proc loadXML*(path: string): PDocument =
else:
raise newException(EParserError, "Unexpected XML Parser event")
close(x)
return XmlDoc
proc loadXML*(xml: string): PDocument =
  ## Parses the XML document held in the string ``xml`` and returns it as
  ## a ``PDocument``.
  result = loadXMLStream(newStringStream(xml))
proc loadXMLFile*(path: string): PDocument =
  ## Parses the XML file located at ``path`` and returns it as a
  ## ``PDocument``. Raises ``EIO`` if the file cannot be opened for reading.
  var input = newFileStream(path, fmRead)
  if input != nil:
    result = loadXMLStream(input)
  else:
    raise newException(EIO, "Unable to read file " & path)
when isMainModule:
var xml = loadXML(r"C:\Users\Dominik\Desktop\Code\Nimrod\xmldom\test.xml")
echo($xml)
var xml = loadXMLFile(r"C:\Users\Dominik\Desktop\Code\Nimrod\xmldom\test.xml")
#echo(xml.getElementsByTagName("m:test2")[0].namespaceURI)
#echo(xml.getElementsByTagName("bla:test")[0].namespaceURI)
#echo(xml.getElementsByTagName("test")[0].namespaceURI)
for i in items(xml.getElementsByTagName("*")):
if i.namespaceURI != nil:
echo(i.nodeName, "=", i.namespaceURI)
echo($xml)

View File

@@ -25,6 +25,8 @@ proc raiseInvalidXml(errors: seq[string]) =
proc addNode(father, son: PXmlNode) =
  ## Appends `son` to `father`; a nil `son` is silently ignored.
  if son == nil: return
  add(father, son)
proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode
proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
errors: var seq[string]) =
while true:
@@ -33,11 +35,11 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
if x.elementName == result.tag:
next(x)
else:
errors.add(errorMsg(x, "</" & result.tag & "$1> expected"))
errors.add(errorMsg(x, "</" & result.tag & "> expected"))
# do not skip it here!
break
of xmlEof:
errors.add(errorMsg(x, "</" & result.tag & "$1> expected"))
errors.add(errorMsg(x, "</" & result.tag & "> expected"))
break
else:
result.addNode(parse(x, errors))
@@ -91,7 +93,7 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode =
next(x)
of xmlEntity:
## &entity;
## XXX To implement!
errors.add(errorMsg(x, "unknown entity: " & x.entityName))
next(x)
of xmlEof: nil
@@ -110,6 +112,8 @@ proc parseXml*(s: PStream, filename: string,
of xmlComment, xmlWhitespace: nil # just skip it
of xmlError:
errors.add(errorMsg(x))
of xmlSpecial:
errors.add(errorMsg(x, "<some_tag> expected"))
else:
errors.add(errorMsg(x, "<some_tag> expected"))
break
@@ -122,17 +126,33 @@ proc parseXml*(s: PStream): PXmlNode =
result = parseXml(s, "unknown_html_doc", errors)
if errors.len > 0: raiseInvalidXMl(errors)
proc loadXml*(path: string, reportErrors = false): PXmlNode =
proc loadXml*(path: string, errors: var seq[string]): PXmlNode =
## Loads and parses XML from file specified by ``path``, and returns
## a ``PXmlNode``. If `reportErrors` is true, the parsing errors are
## ``echo``ed, otherwise an exception is thrown.
## a ``PXmlNode``. Every parsing error that occurred is added to the
## `errors` sequence.
var s = newFileStream(path, fmRead)
if s == nil: raise newException(EIO, "Unable to read file: " & path)
var errors: seq[string] = @[]
result = parseXml(s, path, errors)
if reportErrors:
for msg in items(errors): echo(msg)
elif errors.len > 0:
raiseInvalidXMl(errors)
proc loadXml*(path: string): PXmlNode =
  ## Convenience overload: parses the XML file at ``path`` and returns the
  ## resulting ``PXmlNode``. If any parsing error was collected, they are
  ## all reported together by raising an ``EInvalidXML`` exception.
  var collected: seq[string] = @[]
  result = loadXml(path, collected)
  if collected.len != 0: raiseInvalidXMl(collected)
when isMainModule:
  # Manual smoke test: parse the XML file named by the first command line
  # argument, print every parsing error, then dump the resulting tree
  # to "xmltest.txt".
  import os
  var errors: seq[string] = @[]
  var x = loadXml(paramStr(1), errors)
  for e in items(errors): echo e
  var f: TFile
  if open(f, "xmltest.txt", fmWrite):
    f.write($x)
    f.close()
  else:
    # The message names the file that is actually opened above.
    quit("cannot write xmltest.txt")

View File

@@ -153,8 +153,15 @@ proc addIndent(result: var string, indent: int) =
result.add("\n")
for i in 1..indent: result.add(' ')
proc noWhitespace(n: PXmlNode): bool =
  ## Returns true if at least one direct child of `n` is a text or an
  ## entity node, and false otherwise (including when `n` has no children).
  result = false
  var idx = 0
  while idx < n.len:
    if n[idx].kind in {xnText, xnEntity}: return true
    inc(idx)
proc add*(result: var string, n: PXmlNode, indent = 0, indWidth = 2) =
## adds the textual representation of `n` to `result`.
if n == nil: return
case n.k
of xnElement:
result.add('<')
@@ -168,10 +175,19 @@ proc add*(result: var string, n: PXmlNode, indent = 0, indWidth = 2) =
result.add('"')
if n.len > 0:
result.add('>')
for i in 0..n.len-1:
result.addIndent(indent+indWidth)
result.add(n[i], indent+indWidth, indWidth)
result.addIndent(indent)
if n.len > 1:
if noWhitespace(n):
# for mixed leaves, we cannot output whitespace for readability,
# because this would be wrong. For example: ``a<b>b</b>`` is
# different from ``a <b>b</b>``.
for i in 0..n.len-1: result.add(n[i], indent+indWidth, indWidth)
else:
for i in 0..n.len-1:
result.addIndent(indent+indWidth)
result.add(n[i], indent+indWidth, indWidth)
result.addIndent(indent)
else:
result.add(n[0], indent+indWidth, indWidth)
result.add("</")
result.add(n.fTag)
result.add(">")