mirror of
https://github.com/nim-lang/Nim.git
synced 2026-05-02 03:54:44 +00:00
bugfixes for unicode; xmlparser; htmlparser; scanner
This commit is contained in:
@@ -265,7 +265,7 @@ proc addNode(father, son: PXmlNode) =
|
||||
proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode
|
||||
|
||||
proc expected(x: var TXmlParser, n: PXmlNode): string =
|
||||
result = errorMsg(x, "</" & n.tag & "$1> expected")
|
||||
result = errorMsg(x, "</" & n.tag & "> expected")
|
||||
|
||||
proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
|
||||
errors: var seq[string]) =
|
||||
@@ -378,17 +378,19 @@ proc parseHtml*(s: PStream): PXmlNode =
|
||||
var errors: seq[string] = @[]
|
||||
result = parseHtml(s, "unknown_html_doc", errors)
|
||||
|
||||
proc loadHtml*(path: string, reportErrors = false): PXmlNode =
|
||||
proc loadHtml*(path: string, errors: var seq[string]): PXmlNode =
|
||||
## Loads and parses HTML from file specified by ``path``, and returns
|
||||
## a ``PXmlNode``. If `reportErrors` is true, the parsing errors are
|
||||
## ``echo``ed, otherwise they are ignored.
|
||||
## a ``PXmlNode``. Every parsing error that occurs is added to
|
||||
## the `errors` sequence.
|
||||
var s = newFileStream(path, fmRead)
|
||||
if s == nil: raise newException(EIO, "Unable to read file: " & path)
|
||||
|
||||
var errors: seq[string] = @[]
|
||||
result = parseHtml(s, path, errors)
|
||||
if reportErrors:
|
||||
for msg in items(errors): echo(msg)
|
||||
|
||||
proc loadHtml*(path: string): PXmlNode =
|
||||
## Loads and parses HTML from file specified by ``path``, and returns
|
||||
## a ``PXmlNode``. All parsing errors are ignored.
|
||||
var errors: seq[string] = @[]
|
||||
result = loadHtml(path, errors)
|
||||
|
||||
when true:
|
||||
nil
|
||||
@@ -402,4 +404,18 @@ else:
|
||||
if n == nil or n.htmlTag != tagHtml:
|
||||
errors.add("<html> tag expected")
|
||||
checkHtmlAux(n, errors)
|
||||
|
||||
when isMainModule:
|
||||
import os
|
||||
|
||||
var errors: seq[string] = @[]
|
||||
var x = loadHtml(paramStr(1), errors)
|
||||
for e in items(errors): echo e
|
||||
|
||||
var f: TFile
|
||||
if open(f, "test.txt", fmWrite):
|
||||
f.write($x)
|
||||
f.close()
|
||||
else:
|
||||
quit("cannot write test.txt")
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#
|
||||
#
|
||||
# Nimrod's Runtime Library
|
||||
# (c) Copyright 2009 Andreas Rumpf
|
||||
# (c) Copyright 2010 Andreas Rumpf
|
||||
#
|
||||
# See the file "copying.txt", included in this
|
||||
# distribution, for details about the copyright.
|
||||
@@ -619,9 +619,11 @@ proc sameFileContent*(path1, path2: string): bool =
|
||||
close(a)
|
||||
close(b)
|
||||
|
||||
proc copyFile*(dest, source: string) =
|
||||
proc copyFile*(dest, source: string) {.deprecated.} =
|
||||
## Copies a file from `source` to `dest`. If this fails,
|
||||
## `EOS` is raised.
|
||||
## **Deprecated since version 0.8.8**: Use this proc with named arguments
|
||||
## only, because the order will change!
|
||||
when defined(Windows):
|
||||
if CopyFileA(source, dest, 0'i32) == 0'i32: OSError()
|
||||
else:
|
||||
@@ -647,8 +649,10 @@ proc copyFile*(dest, source: string) =
|
||||
close(s)
|
||||
close(d)
|
||||
|
||||
proc moveFile*(dest, source: string) =
|
||||
proc moveFile*(dest, source: string) {.deprecated.} =
|
||||
## Moves a file from `source` to `dest`. If this fails, `EOS` is raised.
|
||||
## **Deprecated since version 0.8.8**: Use this proc with named arguments
|
||||
## only, because the order will change!
|
||||
if crename(source, dest) != 0'i32: OSError()
|
||||
|
||||
proc removeFile*(file: string) =
|
||||
|
||||
@@ -83,8 +83,8 @@ proc toUTF8*(c: TRune): string =
|
||||
result[0] = chr(i)
|
||||
elif i <=% 0x07FF:
|
||||
result = newString(2)
|
||||
result[0] = chr(i shr 6 or 0b110_0000)
|
||||
result[1] = chr(i and ones(6) or 0b10_000000)
|
||||
result[0] = chr((i shr 6) or 0b110_00000)
|
||||
result[1] = chr((i and ones(6)) or 0b10_000000)
|
||||
elif i <=% 0xFFFF:
|
||||
result = newString(3)
|
||||
result[0] = chr(i shr 12 or 0b1110_0000)
|
||||
|
||||
@@ -227,7 +227,7 @@ proc createAttributeNS*(doc: PDocument, namespaceURI: string, qualifiedName: str
|
||||
raise newException(EInvalidCharacterErr, "Invalid character")
|
||||
# Exceptions
|
||||
if qualifiedName.contains(':'):
|
||||
if namespaceURI == nil or namespaceURI == "":
|
||||
if namespaceURI == nil:
|
||||
raise newException(ENamespaceErr, "When qualifiedName contains a prefix namespaceURI cannot be nil")
|
||||
elif qualifiedName.split(':')[0].toLower() == "xml" and namespaceURI != "http://www.w3.org/XML/1998/namespace":
|
||||
raise newException(ENamespaceErr,
|
||||
@@ -303,7 +303,7 @@ proc createElement*(doc: PDocument, tagName: string): PElement =
|
||||
proc createElementNS*(doc: PDocument, namespaceURI: string, qualifiedName: string): PElement =
|
||||
## Creates an element of the given qualified name and namespace URI.
|
||||
if qualifiedName.contains(':'):
|
||||
if namespaceURI == nil or namespaceURI == "":
|
||||
if namespaceURI == nil:
|
||||
raise newException(ENamespaceErr, "When qualifiedName contains a prefix namespaceURI cannot be nil")
|
||||
elif qualifiedName.split(':')[0].toLower() == "xml" and namespaceURI != "http://www.w3.org/XML/1998/namespace":
|
||||
raise newException(ENamespaceErr,
|
||||
@@ -464,8 +464,11 @@ proc localName*(n: PNode): string =
|
||||
|
||||
proc namespaceURI*(n: PNode): string =
|
||||
## Returns this node's namespace URI
|
||||
|
||||
|
||||
return n.FNamespaceURI
|
||||
|
||||
proc `namespaceURI=`*(n: PNode, value: string) =
|
||||
n.FNamespaceURI = value
|
||||
|
||||
proc nextSibling*(n: PNode): PNode =
|
||||
## Returns the next sibling of this node
|
||||
@@ -507,7 +510,7 @@ proc previousSibling*(n: PNode): PNode =
|
||||
return n.FParentNode.childNodes[i - 1]
|
||||
return nil
|
||||
|
||||
proc `prefix=`*(n: var PNode, value: string) =
|
||||
proc `prefix=`*(n: PNode, value: string) =
|
||||
## Modifies the prefix of this node
|
||||
|
||||
# Setter
|
||||
@@ -530,11 +533,10 @@ proc `prefix=`*(n: var PNode, value: string) =
|
||||
if n.nodeType == ElementNode:
|
||||
var el: PElement = PElement(n)
|
||||
el.FTagName = value & ":" & n.FLocalName
|
||||
n = PNode(el)
|
||||
|
||||
elif n.nodeType == AttributeNode:
|
||||
var attr: PAttr = PAttr(n)
|
||||
attr.FName = value & ":" & n.FLocalName
|
||||
n = PNode(attr)
|
||||
|
||||
# Procedures
|
||||
proc appendChild*(n: PNode, newChild: PNode) =
|
||||
@@ -1078,4 +1080,4 @@ proc nodeToXml(n: PNode, indent: int = 0): string =
|
||||
proc `$`*(doc: PDocument): string =
|
||||
## Converts a PDocument object into a string representation of its XML
|
||||
result = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
|
||||
result.add(nodeToXml(doc.documentElement))
|
||||
result.add(nodeToXml(doc.documentElement))
|
||||
@@ -14,9 +14,34 @@ import xmldom, os, streams, parsexml, strutils
|
||||
#XMLDom's Parser - Turns XML into a Document
|
||||
|
||||
type
|
||||
#Parsing errors
|
||||
# Parsing errors
|
||||
EMismatchedTag* = object of E_Base ## Raised when a tag is not properly closed
|
||||
EParserError* = object of E_Base ## Raised when an unexpected XML Parser event occurs
|
||||
|
||||
# For namespaces
|
||||
xmlnsAttr = tuple[name, value: string, ownerElement: PElement]
|
||||
|
||||
var nsList: seq[xmlnsAttr] = @[] # Used for storing namespaces
|
||||
|
||||
proc getNS(prefix: string): string =
|
||||
var defaultNS: seq[string] = @[]
|
||||
|
||||
for key, value, tag in items(nsList):
|
||||
if ":" in key:
|
||||
if key.split(':')[1] == prefix:
|
||||
return value
|
||||
|
||||
if key == "xmlns":
|
||||
defaultNS.add(value)
|
||||
|
||||
# Don't return the default namespaces
|
||||
# in the loop, because then they would have a precedence
|
||||
# over normal namespaces
|
||||
if defaultNS.len() > 0:
|
||||
return defaultNS[0] # Return the first found default namespace
|
||||
# if none are specified for this prefix
|
||||
|
||||
return ""
|
||||
|
||||
proc parseText(x: var TXmlParser, doc: var PDocument): PText =
|
||||
result = doc.createTextNode(x.charData())
|
||||
@@ -28,24 +53,33 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
|
||||
case x.kind()
|
||||
of xmlEof:
|
||||
break
|
||||
of xmlElementStart:
|
||||
of xmlElementStart, xmlElementOpen:
|
||||
if n.tagName() != "":
|
||||
n.appendChild(parseElement(x, doc))
|
||||
else:
|
||||
n = doc.createElement(x.elementName)
|
||||
of xmlElementOpen:
|
||||
if n.tagName() != "":
|
||||
n.appendChild(parseElement(x, doc))
|
||||
else:
|
||||
if x.elementName.contains(':'):
|
||||
#TODO: NamespaceURI
|
||||
n = doc.createElementNS("nil", x.elementName)
|
||||
else:
|
||||
n = doc.createElement(x.elementName)
|
||||
n = doc.createElementNS("", x.elementName)
|
||||
|
||||
of xmlElementEnd:
|
||||
if x.elementName == n.nodeName:
|
||||
# n.normalize() # Remove any whitespace etc.
|
||||
|
||||
var ns: string
|
||||
if x.elementName.contains(':'):
|
||||
ns = getNS(x.elementName.split(':')[0])
|
||||
else:
|
||||
ns = getNS("")
|
||||
|
||||
n.namespaceURI = ns
|
||||
|
||||
# Remove any namespaces this element declared
|
||||
var count = 0 # Variable which keeps the index
|
||||
# We need to edit it..
|
||||
for i in low(nsList)..len(nsList)-1:
|
||||
if nsList[count][2] == n:
|
||||
nsList.delete(count)
|
||||
dec(count)
|
||||
inc(count)
|
||||
|
||||
return n
|
||||
else: #The wrong element is ended
|
||||
raise newException(EMismatchedTag, "Mismatched tag at line " &
|
||||
@@ -54,11 +88,15 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
|
||||
of xmlCharData:
|
||||
n.appendChild(parseText(x, doc))
|
||||
of xmlAttribute:
|
||||
if x.attrKey == "xmlns" or x.attrKey.startsWith("xmlns:"):
|
||||
nsList.add((x.attrKey, x.attrValue, n))
|
||||
|
||||
if x.attrKey.contains(':'):
|
||||
#TODO: NamespaceURI
|
||||
n.setAttributeNS("nil", x.attrKey, x.attrValue)
|
||||
var ns = getNS(x.attrKey)
|
||||
n.setAttributeNS(ns, x.attrKey, x.attrValue)
|
||||
else:
|
||||
n.setAttribute(x.attrKey, x.attrValue)
|
||||
|
||||
of xmlCData:
|
||||
n.appendChild(doc.createCDATASection(x.charData()))
|
||||
of xmlComment:
|
||||
@@ -75,16 +113,13 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
|
||||
|
||||
raise newException(EMismatchedTag,
|
||||
"Mismatched tag at line " & $x.getLine() & " column " & $x.getColumn)
|
||||
|
||||
proc loadXML*(path: string): PDocument =
|
||||
## Loads and parses XML from file specified by ``path``, and returns
|
||||
|
||||
proc loadXMLStream*(stream: PStream): PDocument =
|
||||
## Loads and parses XML from a stream specified by ``stream``, and returns
|
||||
## a ``PDocument``
|
||||
|
||||
var s = newFileStream(path, fmRead)
|
||||
if s == nil: raise newException(EIO, "Unable to read file " & path)
|
||||
|
||||
var x: TXmlParser
|
||||
open(x, s, path, {reportComments})
|
||||
open(x, stream, nil, {reportComments})
|
||||
|
||||
var XmlDoc: PDocument
|
||||
var DOM: PDOMImplementation = getDOM()
|
||||
@@ -102,10 +137,32 @@ proc loadXML*(path: string): PDocument =
|
||||
else:
|
||||
raise newException(EParserError, "Unexpected XML Parser event")
|
||||
|
||||
close(x)
|
||||
return XmlDoc
|
||||
|
||||
proc loadXML*(xml: string): PDocument =
|
||||
## Loads and parses XML from a string specified by ``xml``, and returns
|
||||
## a ``PDocument``
|
||||
var s = newStringStream(xml)
|
||||
return loadXMLStream(s)
|
||||
|
||||
|
||||
proc loadXMLFile*(path: string): PDocument =
|
||||
## Loads and parses XML from a file specified by ``path``, and returns
|
||||
## a ``PDocument``
|
||||
|
||||
var s = newFileStream(path, fmRead)
|
||||
if s == nil: raise newException(EIO, "Unable to read file " & path)
|
||||
return loadXMLStream(s)
|
||||
|
||||
|
||||
when isMainModule:
|
||||
var xml = loadXML(r"C:\Users\Dominik\Desktop\Code\Nimrod\xmldom\test.xml")
|
||||
echo($xml)
|
||||
var xml = loadXMLFile(r"C:\Users\Dominik\Desktop\Code\Nimrod\xmldom\test.xml")
|
||||
#echo(xml.getElementsByTagName("m:test2")[0].namespaceURI)
|
||||
#echo(xml.getElementsByTagName("bla:test")[0].namespaceURI)
|
||||
#echo(xml.getElementsByTagName("test")[0].namespaceURI)
|
||||
for i in items(xml.getElementsByTagName("*")):
|
||||
if i.namespaceURI != nil:
|
||||
echo(i.nodeName, "=", i.namespaceURI)
|
||||
|
||||
|
||||
echo($xml)
|
||||
@@ -25,6 +25,8 @@ proc raiseInvalidXml(errors: seq[string]) =
|
||||
proc addNode(father, son: PXmlNode) =
|
||||
if son != nil: add(father, son)
|
||||
|
||||
proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode
|
||||
|
||||
proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
|
||||
errors: var seq[string]) =
|
||||
while true:
|
||||
@@ -33,11 +35,11 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
|
||||
if x.elementName == result.tag:
|
||||
next(x)
|
||||
else:
|
||||
errors.add(errorMsg(x, "</" & result.tag & "$1> expected"))
|
||||
errors.add(errorMsg(x, "</" & result.tag & "> expected"))
|
||||
# do not skip it here!
|
||||
break
|
||||
of xmlEof:
|
||||
errors.add(errorMsg(x, "</" & result.tag & "$1> expected"))
|
||||
errors.add(errorMsg(x, "</" & result.tag & "> expected"))
|
||||
break
|
||||
else:
|
||||
result.addNode(parse(x, errors))
|
||||
@@ -91,7 +93,7 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode =
|
||||
next(x)
|
||||
of xmlEntity:
|
||||
## &entity;
|
||||
## XXX To implement!
|
||||
errors.add(errorMsg(x, "unknown entity: " & x.entityName))
|
||||
next(x)
|
||||
of xmlEof: nil
|
||||
|
||||
@@ -110,6 +112,8 @@ proc parseXml*(s: PStream, filename: string,
|
||||
of xmlComment, xmlWhitespace: nil # just skip it
|
||||
of xmlError:
|
||||
errors.add(errorMsg(x))
|
||||
of xmlSpecial:
|
||||
errors.add(errorMsg(x, "<some_tag> expected"))
|
||||
else:
|
||||
errors.add(errorMsg(x, "<some_tag> expected"))
|
||||
break
|
||||
@@ -122,17 +126,33 @@ proc parseXml*(s: PStream): PXmlNode =
|
||||
result = parseXml(s, "unknown_html_doc", errors)
|
||||
if errors.len > 0: raiseInvalidXMl(errors)
|
||||
|
||||
proc loadXml*(path: string, reportErrors = false): PXmlNode =
|
||||
proc loadXml*(path: string, errors: var seq[string]): PXmlNode =
|
||||
## Loads and parses XML from file specified by ``path``, and returns
|
||||
## a ``PXmlNode``. If `reportErrors` is true, the parsing errors are
|
||||
## ``echo``ed, otherwise an exception is thrown.
|
||||
## a ``PXmlNode``. Every parsing error that occurs is added to the `errors`
|
||||
## sequence.
|
||||
var s = newFileStream(path, fmRead)
|
||||
if s == nil: raise newException(EIO, "Unable to read file: " & path)
|
||||
|
||||
var errors: seq[string] = @[]
|
||||
result = parseXml(s, path, errors)
|
||||
if reportErrors:
|
||||
for msg in items(errors): echo(msg)
|
||||
elif errors.len > 0:
|
||||
raiseInvalidXMl(errors)
|
||||
|
||||
proc loadXml*(path: string): PXmlNode =
|
||||
## Loads and parses XML from file specified by ``path``, and returns
|
||||
## a ``PXmlNode``. All parsing errors are turned into an ``EInvalidXML``
|
||||
## exception.
|
||||
var errors: seq[string] = @[]
|
||||
result = loadXml(path, errors)
|
||||
if errors.len > 0: raiseInvalidXMl(errors)
|
||||
|
||||
when isMainModule:
|
||||
import os
|
||||
|
||||
var errors: seq[string] = @[]
|
||||
var x = loadXml(paramStr(1), errors)
|
||||
for e in items(errors): echo e
|
||||
|
||||
var f: TFile
|
||||
if open(f, "xmltest.txt", fmWrite):
|
||||
f.write($x)
|
||||
f.close()
|
||||
else:
|
||||
quit("cannot write test.txt")
|
||||
|
||||
@@ -153,8 +153,15 @@ proc addIndent(result: var string, indent: int) =
|
||||
result.add("\n")
|
||||
for i in 1..indent: result.add(' ')
|
||||
|
||||
proc noWhitespace(n: PXmlNode): bool =
|
||||
#for i in 1..n.len-1:
|
||||
# if n[i].kind != n[0].kind: return true
|
||||
for i in 0..n.len-1:
|
||||
if n[i].kind in {xnText, xnEntity}: return true
|
||||
|
||||
proc add*(result: var string, n: PXmlNode, indent = 0, indWidth = 2) =
|
||||
## adds the textual representation of `n` to `result`.
|
||||
if n == nil: return
|
||||
case n.k
|
||||
of xnElement:
|
||||
result.add('<')
|
||||
@@ -168,10 +175,19 @@ proc add*(result: var string, n: PXmlNode, indent = 0, indWidth = 2) =
|
||||
result.add('"')
|
||||
if n.len > 0:
|
||||
result.add('>')
|
||||
for i in 0..n.len-1:
|
||||
result.addIndent(indent+indWidth)
|
||||
result.add(n[i], indent+indWidth, indWidth)
|
||||
result.addIndent(indent)
|
||||
if n.len > 1:
|
||||
if noWhitespace(n):
|
||||
# for mixed leaves, we cannot output whitespace for readability,
|
||||
# because this would be wrong. For example: ``a<b>b</b>`` is
|
||||
# different from ``a <b>b</b>``.
|
||||
for i in 0..n.len-1: result.add(n[i], indent+indWidth, indWidth)
|
||||
else:
|
||||
for i in 0..n.len-1:
|
||||
result.addIndent(indent+indWidth)
|
||||
result.add(n[i], indent+indWidth, indWidth)
|
||||
result.addIndent(indent)
|
||||
else:
|
||||
result.add(n[0], indent+indWidth, indWidth)
|
||||
result.add("</")
|
||||
result.add(n.fTag)
|
||||
result.add(">")
|
||||
|
||||
Reference in New Issue
Block a user