new XML modules

This commit is contained in:
Andreas Rumpf
2010-02-10 03:21:03 +01:00
parent d4107728d3
commit f721ddd75b
11 changed files with 731 additions and 93 deletions

View File

@@ -172,7 +172,16 @@ XML Processing
This module implements the XML DOM Level 2.
* `xmldomparser <xmldomparser.html>`_
This module parses a XML Document into a XML DOM Document representation.
This module parses an XML Document into an XML DOM Document representation.
* `xmltree <xmltree.html>`_
A simple XML tree. More efficient and simpler than the DOM.
* `xmltreeparser <xmltreeparser.html>`_
This module parses an XML document and creates its XML tree representation.
* `htmlparser <htmlparser.html>`_
This module parses an HTML document and creates its XML tree representation.
Code generation

247
lib/pure/htmlparser.nim Normal file
View File

@@ -0,0 +1,247 @@
#
#
# Nimrod's Runtime Library
# (c) Copyright 2010 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## This module parses an HTML document and creates its XML tree representation.
## It is supposed to handle the *wild* HTML the real world uses.
##
## It can be used to parse a wild HTML document and output it as a valid XHTML
## document (if you are lucky):
##
## .. code-block:: nimrod
##
## echo loadHtml("mydirty.html")
##
##
## Every tag in the resulting tree is in lower case.
##
## **Note:** The resulting ``PXmlNode``s already use the ``clientData`` field,
## so it cannot be used by clients of this library.
import streams, parsexml, xmltree
type
  THtmlTag* = enum  ## list of all supported HTML tags; order will always be
                    ## alphabetical (and must match the order of ``tagStrs``)
    tagUnknown,     ## unknown HTML element
    tagA,           ## the HTML ``a`` element
    tagAcronym,     ## the HTML ``acronym`` element
    tagAddress,     ## the HTML ``address`` element
    tagArea,        ## the HTML ``area`` element
    tagB,           ## the HTML ``b`` element
    tagBase,        ## the HTML ``base`` element
    tagBig,         ## the HTML ``big`` element
    tagBlockquote,  ## the HTML ``blockquote`` element
    tagBody,        ## the HTML ``body`` element
    tagBr,          ## the HTML ``br`` element
    tagButton,      ## the HTML ``button`` element
    tagCaption,     ## the HTML ``caption`` element
    tagCite,        ## the HTML ``cite`` element
    tagCode,        ## the HTML ``code`` element
    tagCol,         ## the HTML ``col`` element
    tagColgroup,    ## the HTML ``colgroup`` element
    tagDd,          ## the HTML ``dd`` element
    tagDel,         ## the HTML ``del`` element
    tagDfn,         ## the HTML ``dfn`` element
    tagDiv,         ## the HTML ``div`` element
    tagDl,          ## the HTML ``dl`` element
    tagDt,          ## the HTML ``dt`` element
    tagEm,          ## the HTML ``em`` element
    tagFieldset,    ## the HTML ``fieldset`` element
    tagForm,        ## the HTML ``form`` element
    tagH1,          ## the HTML ``h1`` element
    tagH2,          ## the HTML ``h2`` element
    tagH3,          ## the HTML ``h3`` element
    tagH4,          ## the HTML ``h4`` element
    tagH5,          ## the HTML ``h5`` element
    tagH6,          ## the HTML ``h6`` element
    tagHead,        ## the HTML ``head`` element
    tagHr,          ## the HTML ``hr`` element
    tagHtml,        ## the HTML ``html`` element
    tagI,           ## the HTML ``i`` element
    tagImg,         ## the HTML ``img`` element
    tagInput,       ## the HTML ``input`` element
    tagIns,         ## the HTML ``ins`` element
    tagKbd,         ## the HTML ``kbd`` element
    tagLabel,       ## the HTML ``label`` element
    tagLegend,      ## the HTML ``legend`` element
    tagLi,          ## the HTML ``li`` element
    tagLink,        ## the HTML ``link`` element
    tagMap,         ## the HTML ``map`` element
    tagMeta,        ## the HTML ``meta`` element
    tagNoscript,    ## the HTML ``noscript`` element
    tagObject,      ## the HTML ``object`` element
    tagOl,          ## the HTML ``ol`` element
    tagOptgroup,    ## the HTML ``optgroup`` element
    tagOption,      ## the HTML ``option`` element
    tagP,           ## the HTML ``p`` element
    tagParam,       ## the HTML ``param`` element
    tagPre,         ## the HTML ``pre`` element
    tagQ,           ## the HTML ``q`` element
    tagSamp,        ## the HTML ``samp`` element
    tagScript,      ## the HTML ``script`` element
    tagSelect,      ## the HTML ``select`` element
    tagSmall,       ## the HTML ``small`` element
    tagSpan,        ## the HTML ``span`` element
    tagStrong,      ## the HTML ``strong`` element
    tagStyle,       ## the HTML ``style`` element
    tagSub,         ## the HTML ``sub`` element
    tagSup,         ## the HTML ``sup`` element
    tagTable,       ## the HTML ``table`` element
    tagTbody,       ## the HTML ``tbody`` element
    tagTd,          ## the HTML ``td`` element
    tagTextarea,    ## the HTML ``textarea`` element
    tagTfoot,       ## the HTML ``tfoot`` element
    tagTh,          ## the HTML ``th`` element
    tagThead,       ## the HTML ``thead`` element
    tagTitle,       ## the HTML ``title`` element
    tagTr,          ## the HTML ``tr`` element
    tagTt,          ## the HTML ``tt`` element
    tagUl,          ## the HTML ``ul`` element
    tagVar          ## the HTML ``var`` element

const
  # Tag names in strictly ascending ``cmp`` order. Two invariants must hold:
  #  * the array is sorted, because it is searched with a binary search;
  #  * entry ``i`` names the enum value with ordinal ``i + 1`` (ordinal 0 is
  #    ``tagUnknown``), because the search index is converted to ``THtmlTag``.
  # Note that "hr" sorts before "html".
  tagStrs = [
    "a", "acronym", "address", "area", "b", "base", "big", "blockquote",
    "body", "br", "button", "caption", "cite", "code", "col", "colgroup",
    "dd", "del", "dfn", "div", "dl", "dt", "em", "fieldset",
    "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html",
    "i", "img", "input", "ins", "kbd", "label", "legend", "li", "link",
    "map", "meta", "noscript", "object", "ol", "optgroup", "option",
    "p", "param", "pre", "q", "samp", "script", "select", "small",
    "span", "strong", "style", "sub", "sup", "table", "tbody", "td",
    "textarea", "tfoot", "th", "thead", "title", "tr", "tt", "ul", "var"
  ]
proc binaryStrSearch(x: openarray[string], y: string): int =
  ## Binary search for `y` in `x`; `x` has to be sorted in ascending ``cmp``
  ## order. Returns the index of the matching entry or -1 if `y` is not
  ## contained in `x`.
  ## XXX put this into the library somewhere!
  var lo = 0
  var hi = high(x)
  result = -1
  while lo <= hi:
    var middle = (lo + hi) div 2
    var order = cmp(x[middle], y)
    if order == 0:
      return middle
    elif order < 0:
      # x[middle] is too small: continue in the upper half
      lo = middle + 1
    else:
      # x[middle] is too big: continue in the lower half
      hi = middle - 1
proc htmlTag*(n: PXmlNode): THtmlTag =
  ## gets `n`'s tag as a ``THtmlTag``. The lookup result is stored in `n`'s
  ## ``clientData`` and reused on later calls as long as it is not 0. Even
  ## though results are cached, this can be more expensive than comparing
  ## ``tag`` directly to a string. A tag that is not found in the tag table
  ## yields ``tagUnknown``.
  var cached = n.clientData
  if cached == 0:
    # index in `tagStrs` + 1 is the ordinal of the tag; -1 (not found) thus
    # maps to 0, which is ``tagUnknown``
    cached = binaryStrSearch(tagStrs, n.tag) + 1
    n.clientData = cached
  result = THtmlTag(cached)
proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
## Consumes parser events from `x` and builds one element (with its
## attributes and children) using the factory procs of `doc`. Returns the
## element when its matching end tag is seen; raises ``EMismatchedTag`` if a
## different end tag or the end of input is reached first.
# NOTE(review): this proc uses ``PDocument``, ``PElement``, ``parseText``,
# ``EMismatchedTag`` and ``EParserError``, none of which are defined in the
# visible part of this module or obviously provided by its imports
# (streams, parsexml, xmltree) -- it looks like it was copied from the DOM
# parser; confirm whether it is still needed here.
# `n` starts out as a placeholder element with an empty tag name; the empty
# name is used below to tell "no element opened yet" from "already inside one".
var n = doc.createElement("")
while True:
case x.kind()
of xmlEof:
break
of xmlElementStart:
# An element start while `n` already has a name is a child element.
if n.tagName() != "":
n.appendChild(parseElement(x, doc))
else:
n = doc.createElement(x.elementName)
of xmlElementOpen:
# Same as above, for start tags that carry attributes.
if n.tagName() != "":
n.appendChild(parseElement(x, doc))
else:
if x.elementName.contains(':'):
#TODO: NamespaceURI
n = doc.createElementNS("nil", x.elementName)
else:
n = doc.createElement(x.elementName)
of xmlElementEnd:
if x.elementName == n.nodeName:
# n.normalize() # Remove any whitespace etc.
return n
else: #The wrong element is ended
raise newException(EMismatchedTag, "Mismatched tag at line " &
$x.getLine() & " column " & $x.getColumn)
of xmlCharData:
n.appendChild(parseText(x, doc))
of xmlAttribute:
if x.attrKey.contains(':'):
#TODO: NamespaceURI
n.setAttributeNS("nil", x.attrKey, x.attrValue)
else:
n.setAttribute(x.attrKey, x.attrValue)
of xmlCData:
n.appendChild(doc.createCDATASection(x.charData()))
of xmlComment:
n.appendChild(doc.createComment(x.charData()))
of xmlPI:
n.appendChild(doc.createProcessingInstruction(x.PIName(), x.PIRest()))
of xmlWhitespace, xmlElementClose, xmlEntity, xmlSpecial:
# Unused 'events'
# NOTE(review): this branch contains only a comment and no statement.
else:
raise newException(EParserError, "Unexpected XML Parser event")
x.next()
# Only reached via the ``xmlEof`` break: the input ended before the element
# was closed.
raise newException(EMismatchedTag,
"Mismatched tag at line " & $x.getLine() & " column " & $x.getColumn)
# NOTE(review): this proc header has no body in the visible source; the next
# line already starts another proc. Looks like an unfinished stub -- confirm
# whether it should be implemented or removed.
proc parse*(x: var TXmlParser, father: PXmlNode) =
proc parseHtml*(s: PStream, filename: string,
errors: var seq[string]): PXmlNode =
## parses the HTML from stream `s` and returns a ``PXmlNode``. Every
## parsing error that occurs is meant to be added to the `errors` sequence.
# NOTE(review): nothing in the visible body ever writes to `errors`.
var x: TXmlParser
open(x, s, filename, {reportComments})
# The result is always rooted in a fresh ``html`` element.
result = newElement("html")
while true:
x.next()
case x.kind
of xmlWhitespace: nil # just skip it
of xmlComment:
result.add(newComment(x.text))
# NOTE(review): from here on the body refers to ``XmlDoc``, ``dom``,
# ``PElement`` and ``parseElement(x, XmlDoc)``, which belong to the DOM parser
# and are not declared in this proc; the first loop above also has no visible
# exit. This looks like unfinished work in progress -- confirm intended logic.
while True:
x.next()
case x.kind
of xmlEof: break
of xmlElementStart, xmlElementOpen:
var el: PElement = parseElement(x, XmlDoc)
XmlDoc = dom.createDocument(el)
of xmlWhitespace, xmlElementClose, xmlEntity, xmlSpecial:
# Unused 'events'
else:
raise newException(EParserError, "Unexpected XML Parser event")
close(x)
proc parseHtml*(s: PStream): PXmlNode =
  ## parses the HTML from stream `s` and returns a ``PXmlNode``. Parsing
  ## errors are collected in a local sequence that is thrown away, so they
  ## are effectively ignored.
  var ignored: seq[string] = @[]
  return parseHtml(s, "unknown_html_doc", ignored)
proc loadHtml*(path: string, reportErrors = false): PXmlNode =
  ## Loads and parses HTML from the file specified by ``path`` and returns
  ## a ``PXmlNode``. If `reportErrors` is true, the collected parsing errors
  ## are ``echo``ed. Raises ``EIO`` if the file cannot be opened for reading.
  var input = newFileStream(path, fmRead)
  if input == nil:
    raise newException(EIO, "Unable to read file: " & path)
  var collected: seq[string] = @[]
  result = parseHtml(input, path, collected)
  if not reportErrors: return
  for problem in items(collected):
    echo(problem)

View File

@@ -364,7 +364,7 @@ proc parsePI(my: var TXmlParser) =
break
add(my.b, '?')
inc(pos)
of '\c':
of '\c':
# the specification says that CR-LF, CR are to be transformed to LF
pos = lexbase.HandleCR(my, pos)
buf = my.buf

View File

@@ -127,13 +127,13 @@ template `=~` *(s: string, pattern: TRegEx): expr =
##
## if line =~ re"\s*(\w+)\s*\=\s*(\w+)":
## # matches a key=value pair:
## echo("Key: ", matches[1])
## echo("Value: ", matches[2])
## echo("Key: ", matches[0])
## echo("Value: ", matches[1])
## elif line =~ re"\s*(\#.*)":
## # matches a comment
## # note that the implicit ``matches`` array is different from the
## # ``matches`` array of the first branch
## echo("comment: ", matches[1])
## echo("comment: ", matches[0])
## else:
## echo("syntax error")
##

View File

@@ -9,33 +9,30 @@
import strutils
## This module implements the XML DOM Level 2
## This module implements the XML DOM Level 2 Core specification (http://www.w3.org/TR/2000/REC-DOM-Level-2-Core-20001113/core.html)
#http://www.w3.org/TR/2000/REC-DOM-Level-2-Core-20001113/core.html
#DOMString = String
#DOMTimeStamp = int16 ??
#DECLARATIONS
#Exceptions
type
EDOMException* = object of E_Base #Base exception object for all DOM Exceptions
EDOMStringSizeErr* = object of EDOMException #If the specified range of text does not fit into a DOMString
#Currently not used(Since DOMString is just string)
EHierarchyRequestErr* = object of EDOMException #If any node is inserted somewhere it doesn't belong
EIndexSizeErr* = object of EDOMException #If index or size is negative, or greater than the allowed value
EInuseAttributeErr* = object of EDOMException #If an attempt is made to add an attribute that is already in use elsewhere
EInvalidAccessErr* = object of EDOMException #If a parameter or an operation is not supported by the underlying object.
EInvalidCharacterErr* = object of EDOMException #This exception is raised when a string parameter contains an illegal character
EInvalidModificationErr* = object of EDOMException #If an attempt is made to modify the type of the underlying object.
EInvalidStateErr* = object of EDOMException #If an attempt is made to use an object that is not, or is no longer, usable.
ENamespaceErr* = object of EDOMException #If an attempt is made to create or change an object in a way which is incorrect with regard to namespaces.
ENotFoundErr* = object of EDOMException #If an attempt is made to reference a node in a context where it does not exist
ENotSupportedErr* = object of EDOMException #If the implementation does not support the requested type of object or operation.
ENoDataAllowedErr* = object of EDOMException #If data is specified for a node which does not support data
ENoModificationAllowedErr* = object of EDOMException #If an attempt is made to modify an object where modifications are not allowed
ESyntaxErr* = object of EDOMException #If an invalid or illegal string is specified.
EWrongDocumentErr* = object of EDOMException #If a node is used in a different document than the one that created it (that doesn't support it)
EDOMException* = object of E_Base ## Base exception object for all DOM Exceptions
EDOMStringSizeErr* = object of EDOMException ## If the specified range of text does not fit into a DOMString
## Currently not used(Since DOMString is just string)
EHierarchyRequestErr* = object of EDOMException ## If any node is inserted somewhere it doesn't belong
EIndexSizeErr* = object of EDOMException ## If index or size is negative, or greater than the allowed value
EInuseAttributeErr* = object of EDOMException ## If an attempt is made to add an attribute that is already in use elsewhere
EInvalidAccessErr* = object of EDOMException ## If a parameter or an operation is not supported by the underlying object.
EInvalidCharacterErr* = object of EDOMException ## This exception is raised when a string parameter contains an illegal character
EInvalidModificationErr* = object of EDOMException ## If an attempt is made to modify the type of the underlying object.
EInvalidStateErr* = object of EDOMException ## If an attempt is made to use an object that is not, or is no longer, usable.
ENamespaceErr* = object of EDOMException ## If an attempt is made to create or change an object in a way which is incorrect with regard to namespaces.
ENotFoundErr* = object of EDOMException ## If an attempt is made to reference a node in a context where it does not exist
ENotSupportedErr* = object of EDOMException ## If the implementation does not support the requested type of object or operation.
ENoDataAllowedErr* = object of EDOMException ## If data is specified for a node which does not support data
ENoModificationAllowedErr* = object of EDOMException ## If an attempt is made to modify an object where modifications are not allowed
ESyntaxErr* = object of EDOMException ## If an invalid or illegal string is specified.
EWrongDocumentErr* = object of EDOMException ## If a node is used in a different document than the one that created it (that doesn't support it)
template newException(exceptn, message: expr): expr =
block: # open a new scope
@@ -65,24 +62,24 @@ type
Feature = tuple[name: string, version: string]
PDOMImplementation* = ref DOMImplementation
DOMImplementation = object
Features: seq[Feature] #Read-Only
Features: seq[Feature] # Read-Only
PNode* = ref Node
Node = object
attributes: seq[PAttr] #Read-only
childNodes*: seq[PNode] #Read-only
FLocalName: string #Read-only
FNamespaceURI: string #Read-only
FNodeName: string #Read-only
attributes*: seq[PAttr]
childNodes*: seq[PNode]
FLocalName: string # Read-only
FNamespaceURI: string # Read-only
FNodeName: string # Read-only
nodeValue*: string
FNodeType: int #Read-only
FOwnerDocument: PDocument #Read-Only
FParentNode: PNode #Read-Only
FNodeType: int # Read-only
FOwnerDocument: PDocument # Read-Only
FParentNode: PNode # Read-Only
prefix*: string # Setting this should change some values... TODO!
PElement* = ref Element
Element = object of Node
FTagName: string #Read-only
FTagName: string # Read-only
PCharacterData = ref CharacterData
CharacterData = object of Node
@@ -90,15 +87,15 @@ type
PDocument* = ref Document
Document = object of Node
FImplementation: PDOMImplementation #Read-only
FDocumentElement: PElement #Read-only
FImplementation: PDOMImplementation # Read-only
FDocumentElement: PElement # Read-only
PAttr* = ref Attr
Attr = object of Node
FName: string #Read-only
FSpecified: bool #Read-only
FName: string # Read-only
FSpecified: bool # Read-only
value*: string
FOwnerElement: PElement #Read-only
FOwnerElement: PElement # Read-only
PDocumentFragment* = ref DocumentFragment
DocumentFragment = object of Node
@@ -115,18 +112,18 @@ type
PProcessingInstruction* = ref ProcessingInstruction
ProcessingInstruction = object of Node
data*: string
FTarget: string #Read-only
FTarget: string # Read-only
#DOMImplementation
# DOMImplementation
proc getDOM*(): PDOMImplementation =
##Returns a DOMImplementation
## Returns a DOMImplementation
var DOMImpl: PDOMImplementation
new(DOMImpl)
DOMImpl.Features = @[(name: "core", version: "2.0"), (name: "core", version: "1.0"), (name: "XML", version: "2.0")]
return DOMImpl
proc createDocument*(dom: PDOMImplementation, namespaceURI: string, qualifiedName: string): PDocument =
##Creates an XML Document object of the specified type with its document element.
## Creates an XML Document object of the specified type with its document element.
var doc: PDocument
new(doc)
doc.FNamespaceURI = namespaceURI
@@ -142,8 +139,9 @@ proc createDocument*(dom: PDOMImplementation, namespaceURI: string, qualifiedNam
return doc
proc createDocument*(dom: PDOMImplementation, n: PElement): PDocument =
##Creates an XML Document object of the specified type with its document element.
#This procedure is not in the specification, it's provided for the parser.
## Creates an XML Document object of the specified type with its document element.
# This procedure is not in the specification, it's provided for the parser.
var doc: PDocument
new(doc)
doc.FDocumentElement = n
@@ -153,7 +151,7 @@ proc createDocument*(dom: PDOMImplementation, n: PElement): PDocument =
return doc
proc hasFeature*(dom: PDOMImplementation, feature: string, version: string = ""): bool =
##Returns ``true`` if this ``version`` of the DomImplementation implements ``feature``, otherwise ``false``
## Returns ``true`` if this ``version`` of the DomImplementation implements ``feature``, otherwise ``false``
for iName, iVersion in items(dom.Features):
if iName == feature:
if version == "":
@@ -164,8 +162,8 @@ proc hasFeature*(dom: PDOMImplementation, feature: string, version: string = "")
return False
#Document
#Attributes
# Document
# Attributes
proc implementation*(doc: PDocument): PDOMImplementation =
return doc.FImplementation
@@ -173,9 +171,9 @@ proc implementation*(doc: PDocument): PDOMImplementation =
proc documentElement*(doc: PDocument): PElement =
return doc.FDocumentElement
#Internal procedures
# Internal procedures
proc findNodes(nl: PNode, name: string): seq[PNode] =
#Made for getElementsByTagName
# Made for getElementsByTagName
var r: seq[PNode] = @[]
if nl.childNodes == nil: return @[]
if nl.childNodes.len() == 0: return @[]
@@ -192,7 +190,7 @@ proc findNodes(nl: PNode, name: string): seq[PNode] =
return r
proc findNodesNS(nl: PNode, namespaceURI: string, localName: string): seq[PNode] =
#Made for getElementsByTagNameNS
# Made for getElementsByTagNameNS
var r: seq[PNode] = @[]
if nl.childNodes == nil: return @[]
if nl.childNodes.len() == 0: return @[]
@@ -211,10 +209,10 @@ proc findNodesNS(nl: PNode, namespaceURI: string, localName: string): seq[PNode]
#Procedures
proc createAttribute*(doc: PDocument, name: string): PAttr =
##Creates an Attr of the given name. Note that the Attr instance can then be set on an Element using the setAttributeNode method.
##To create an attribute with a qualified name and namespace URI, use the createAttributeNS method.
## Creates an Attr of the given name. Note that the Attr instance can then be set on an Element using the setAttributeNode method.
## To create an attribute with a qualified name and namespace URI, use the createAttributeNS method.
#Check if name contains illegal characters
# Check if name contains illegal characters
if illegalChars in name:
raise newException(EInvalidCharacterErr, "Invalid character")
@@ -230,12 +228,12 @@ proc createAttribute*(doc: PDocument, name: string): PAttr =
return AttrNode
proc createAttributeNS*(doc: PDocument, namespaceURI: string, qualifiedName: string): PAttr =
##Creates an attribute of the given qualified name and namespace URI
## Creates an attribute of the given qualified name and namespace URI
#Check if name contains illegal characters
# Check if name contains illegal characters
if illegalChars in namespaceURI or illegalChars in qualifiedName:
raise newException(EInvalidCharacterErr, "Invalid character")
#Exceptions
# Exceptions
if qualifiedName.contains(':'):
if namespaceURI == nil or namespaceURI == "":
raise newException(ENamespaceErr, "When qualifiedName contains a prefix namespaceURI cannot be nil")
@@ -264,17 +262,17 @@ proc createAttributeNS*(doc: PDocument, namespaceURI: string, qualifiedName: str
return AttrNode
proc createCDATASection*(doc: PDocument, data: string): PCDATASection =
##Creates a CDATASection node whose value is the specified string.
## Creates a CDATASection node whose value is the specified string.
var CData: PCDATASection
new(CData)
CData.data = data
CData.nodeValue = data
CData.FNodeName = "#text" #Not sure about this, but this is technically a TextNode
CData.FNodeName = "#text" # Not sure about this, but this is technically a TextNode
CData.FNodeType = CDataSectionNode
return CData
proc createComment*(doc: PDocument, data: string): PComment =
##Creates a Comment node given the specified string.
## Creates a Comment node given the specified string.
var Comm: PComment
new(Comm)
Comm.data = data
@@ -284,15 +282,15 @@ proc createComment*(doc: PDocument, data: string): PComment =
return Comm
proc createDocumentFragment*(doc: PDocument): PDocumentFragment =
##Creates an empty DocumentFragment object.
## Creates an empty DocumentFragment object.
var DF: PDocumentFragment
new(DF)
return DF
proc createElement*(doc: PDocument, tagName: string): PElement =
##Creates an element of the type specified.
## Creates an element of the type specified.
#Check if name contains illegal characters
# Check if name contains illegal characters
if illegalChars in tagName:
raise newException(EInvalidCharacterErr, "Invalid character")
@@ -311,7 +309,7 @@ proc createElement*(doc: PDocument, tagName: string): PElement =
return elNode
proc createElementNS*(doc: PDocument, namespaceURI: string, qualifiedName: string): PElement =
##Creates an element of the given qualified name and namespace URI.
## Creates an element of the given qualified name and namespace URI.
if qualifiedName.contains(':'):
if namespaceURI == nil or namespaceURI == "":
raise newException(ENamespaceErr, "When qualifiedName contains a prefix namespaceURI cannot be nil")
@@ -319,7 +317,7 @@ proc createElementNS*(doc: PDocument, namespaceURI: string, qualifiedName: strin
raise newException(ENamespaceErr,
"When the namespace prefix is \"xml\" namespaceURI has to be \"http://www.w3.org/XML/1998/namespace\"")
#Check if name contains illegal characters
# Check if name contains illegal characters
if illegalChars in namespaceURI or illegalChars in qualifiedName:
raise newException(EInvalidCharacterErr, "Invalid character")
@@ -342,7 +340,7 @@ proc createElementNS*(doc: PDocument, namespaceURI: string, qualifiedName: strin
return elNode
proc createProcessingInstruction*(doc: PDocument, target: string, data: string): PProcessingInstruction =
##Creates a ProcessingInstruction node given the specified name and data strings.
## Creates a ProcessingInstruction node given the specified name and data strings.
#Check if name contains illegal characters
if illegalChars in target:
@@ -356,7 +354,7 @@ proc createProcessingInstruction*(doc: PDocument, target: string, data: string):
return PI
proc createTextNode*(doc: PDocument, data: string): PText = #Propably TextNode
##Creates a Text node given the specified string.
## Creates a Text node given the specified string.
var txtNode: PText
new(txtNode)
txtNode.data = data
@@ -371,8 +369,8 @@ discard """proc getElementById*(doc: PDocument, elementId: string): PElement =
#TODO"""
proc getElementsByTagName*(doc: PDocument, tagName: string): seq[PNode] =
##Returns a NodeList of all the Elements with a given tag name in
##the order in which they are encountered in a preorder traversal of the Document tree.
## Returns a NodeList of all the Elements with a given tag name in
## the order in which they are encountered in a preorder traversal of the Document tree.
var result: seq[PNode] = @[]
if doc.FDocumentElement.FNodeName == tagName or tagName == "*":
result.add(doc.FDocumentElement)
@@ -381,8 +379,8 @@ proc getElementsByTagName*(doc: PDocument, tagName: string): seq[PNode] =
return result
proc getElementsByTagNameNS*(doc: PDocument, namespaceURI: string, localName: string): seq[PNode] =
##Returns a NodeList of all the Elements with a given localName and namespaceURI
##in the order in which they are encountered in a preorder traversal of the Document tree.
## Returns a NodeList of all the Elements with a given localName and namespaceURI
## in the order in which they are encountered in a preorder traversal of the Document tree.
var result: seq[PNode] = @[]
if doc.FDocumentElement.FLocalName == localName or localName == "*":
if doc.FDocumentElement.FNamespaceURI == namespaceURI or namespaceURI == "*":
@@ -450,57 +448,76 @@ proc importNode*(doc: PDocument, importedNode: PNode, deep: bool): PNode =
# Node
# Attributes
proc Attributes*(n: PNode): seq[PAttr] =
  ## Returns this node's attribute list. A ``nil`` list is replaced by an
  ## empty sequence first.
  if n.attributes == nil:
    n.attributes = @[]
  result = n.attributes
proc firstChild*(n: PNode): PNode =
  ## Returns this node's first child, or ``nil`` if it has no children
  result = nil
  if n.childNodes.len() > 0:
    result = n.childNodes[0]
proc lastChild*(n: PNode): PNode =
  ## Returns this node's last child, or ``nil`` if it has no children
  result = nil
  var count = n.childNodes.len()
  if count > 0:
    result = n.childNodes[count - 1]
proc localName*(n: PNode): string =
  ## Returns this node's local name (read-only accessor for ``FLocalName``)
  result = n.FLocalName
proc namespaceURI*(n: PNode): string =
  ## Returns this node's namespace URI (read-only accessor for
  ## ``FNamespaceURI``)
  result = n.FNamespaceURI
proc nextSibling*(n: PNode): PNode =
  ## Returns the next sibling of this node, or ``nil`` if this node has no
  ## parent, is not found among its parent's children, or is the last child.
  var parent = n.FParentNode
  if parent == nil or parent.childNodes == nil:
    return nil
  for i in low(parent.childNodes)..high(parent.childNodes):
    if parent.childNodes[i] == n:
      # The last child has no next sibling; indexing [i + 1] would be out of
      # bounds in that case.
      if i < high(parent.childNodes):
        return parent.childNodes[i + 1]
      return nil
  return nil
proc nodeName*(n: PNode): string =
  ## Returns the name of this node (read-only accessor for ``FNodeName``)
  result = n.FNodeName
proc nodeType*(n: PNode): int =
  ## Returns the type of this node (read-only accessor for ``FNodeType``)
  result = n.FNodeType
proc ownerDocument*(n: PNode): PDocument =
  ## Returns the owner document of this node (read-only accessor for
  ## ``FOwnerDocument``)
  result = n.FOwnerDocument
proc parentNode*(n: PNode): PNode =
  ## Returns the parent node of this node (read-only accessor for
  ## ``FParentNode``)
  result = n.FParentNode
proc previousSibling*(n: PNode): PNode =
  ## Returns the previous sibling of this node, or ``nil`` if this node has no
  ## parent, is not found among its parent's children, or is the first child.
  var parent = n.FParentNode
  if parent == nil or parent.childNodes == nil:
    return nil
  for i in low(parent.childNodes)..high(parent.childNodes):
    if parent.childNodes[i] == n:
      # The first child has no previous sibling; indexing [i - 1] would be
      # out of bounds in that case.
      if i > low(parent.childNodes):
        return parent.childNodes[i - 1]
      return nil
  return nil
proc `prefix=`*(n: var PNode, value: string) =
## Modifies the prefix of this node
# Setter
# Check if name contains illegal characters
if illegalChars in value:
@@ -532,8 +549,11 @@ proc appendChild*(n: PNode, newChild: PNode) =
## Adds the node newChild to the end of the list of children of this node.
## If the newChild is already in the tree, it is first removed.
# TODO - Check if n contains newChild
# TODO - Exceptions
# Check if n contains newChild
if n.childNodes != nil:
for i in low(n.childNodes)..high(n.childNodes):
if n.childNodes[i] == newChild:
raise newException(EHierarchyRequestErr, "The node to append is already in this nodes children.")
# Check if newChild is from this nodes document
if n.FOwnerDocument != newChild.FOwnerDocument:
@@ -542,6 +562,9 @@ proc appendChild*(n: PNode, newChild: PNode) =
if n == newChild:
raise newException(EHierarchyRequestErr, "You can't add a node into itself")
if n.nodeType in childlessObjects:
raise newException(ENoModificationAllowedErr, "Cannot append children to a childless node")
if n.childNodes == nil: n.childNodes = @[]
newChild.FParentNode = n
@@ -604,10 +627,43 @@ proc isSupported*(n: PNode, feature: string, version: string): bool =
## feature and that feature is supported by this node.
return n.FOwnerDocument.FImplementation.hasFeature(feature, version)
proc isEmpty(s: string): bool =
  # Returns true if `s` is nil, has length 0 or consists only of space
  # characters (' '); any other character makes it non-empty.
  result = True
  if s == "" or s == nil: return
  for ch in items(s):
    if ch != ' ':
      result = False
      break
proc normalize*(n: PNode) =
  ## Puts all Text nodes in the full depth of the sub-tree underneath this Node
  ## into a normal form: adjacent Text nodes are merged into the first node
  ## of the run, and Text nodes that are empty (or consist only of spaces)
  ## are removed.
  if n.childNodes == nil: return
  var curTextNode: PNode = nil # first Text node of the current run, if any
  var newChildNodes: seq[PNode] = @[]
  for i in low(n.childNodes)..high(n.childNodes):
    var child = n.childNodes[i]
    if child.nodeType == TextNode:
      # Empty Text nodes are dropped; they do not interrupt a run.
      if PText(child).data.isEmpty():
        continue
      if curTextNode == nil:
        # Start of a new run: keep this node, later ones are merged into it.
        curTextNode = child
        newChildNodes.add(curTextNode)
      else:
        PText(curTextNode).data.add(PText(child).data)
        curTextNode.nodeValue.add(PText(child).data)
    else:
      # Any other node ends the current run of Text nodes.
      curTextNode = nil
      normalize(child)
      newChildNodes.add(child)
  n.childNodes = newChildNodes
proc removeChild*(n: PNode, oldChild: PNode): PNode =
## Removes the child node indicated by ``oldChild`` from the list of children, and returns it.
@@ -791,26 +847,32 @@ proc setNamedItemNS*(NList: var seq[PAttr], arg: PAttr): PAttr =
NList[index] = arg
return item # Return the replaced node
# TODO - Maybe implement a ChildlessNode!^
# CharacterData - Decided to implement this,
# Didn't add the procedures, because you can just edit .data
# Attr
# Attributes
proc name*(a: PAttr): string =
  ## Returns the name of the Attribute (read-only accessor for ``FName``)
  result = a.FName
proc specified*(a: PAttr): bool =
  ## Returns the attribute's ``FSpecified`` flag, i.e. whether this attribute
  ## was specified in the original document
  result = a.FSpecified
proc ownerElement*(a: PAttr): PElement =
  ## Returns this Attribute's owner element (read-only accessor for
  ## ``FOwnerElement``)
  result = a.FOwnerElement
# Element
# Attributes
proc tagName*(el: PElement): string =
  ## Returns the Element's tag name (read-only accessor for ``FTagName``)
  result = el.FTagName
# Procedures
@@ -960,11 +1022,29 @@ proc setAttributeNS*(el: PElement, namespaceURI, localName, value: string) =
proc splitData*(TextNode: PText, offset: int): PText =
  ## Breaks this node into two nodes at the specified offset,
  ## keeping both in the tree as siblings. Afterwards this node holds the
  ## characters before `offset`; the returned new node holds the characters
  ## from `offset` on and, if this node has a parent, is inserted directly
  ## after this node.
  ## Raises ``EIndexSizeErr`` if `offset` is negative or greater than the
  ## length of the data.
  if offset < 0 or offset > TextNode.data.len():
    raise newException(EIndexSizeErr, "Index out of bounds")

  # Take both halves before the data is overwritten. ``copy`` treats its
  # last index as inclusive, hence ``offset - 1`` for the left part.
  var left: string = TextNode.data.copy(0, offset - 1)
  var right: string = TextNode.data.copy(offset)
  TextNode.data = left
  # normalize() updates ``nodeValue`` together with ``data``; do the same here
  TextNode.nodeValue = left

  var newNode: PText = TextNode.FOwnerDocument.createTextNode(right)
  var parent = TextNode.FParentNode
  if parent != nil and parent.childNodes != nil:
    for i in low(parent.childNodes)..high(parent.childNodes):
      if parent.childNodes[i] == TextNode:
        # The new node becomes the next sibling of this node.
        newNode.FParentNode = parent
        parent.childNodes.insert(newNode, i + 1)
        break
  return newNode
# TODO - need insert(seq[T])
# ProcessingInstruction
proc target*(PI: PProcessingInstruction): string =
proc target*(PI: PProcessingInstruction): string =
## Returns the Processing Instructions target
return PI.FTarget

View File

@@ -16,6 +16,7 @@ import xmldom, os, streams, parsexml, strutils
type
#Parsing errors
EMismatchedTag* = object of E_Base ## Raised when a tag is not properly closed
EParserError* = object of E_Base ## Raised when an unexpected XML Parser event occurs
template newException(exceptn, message: expr): expr =
block: # open a new scope
@@ -52,6 +53,7 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
of xmlElementEnd:
if x.elementName == n.nodeName:
# n.normalize() # Remove any whitespace etc.
return n
else: #The wrong element is ended
raise newException(EMismatchedTag, "Mismatched tag at line " &
@@ -71,8 +73,12 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
n.appendChild(doc.createComment(x.charData()))
of xmlPI:
n.appendChild(doc.createProcessingInstruction(x.PIName(), x.PIRest()))
of xmlWhitespace, xmlElementClose, xmlEntity, xmlSpecial:
# Unused 'events'
else:
# echo(x.kind()) # XXX do nothing here!?
raise newException(EParserError, "Unexpected XML Parser event")
x.next()
raise newException(EMismatchedTag,
@@ -99,9 +105,12 @@ proc loadXML*(path: string): PDocument =
of xmlElementStart, xmlElementOpen:
var el: PElement = parseElement(x, XmlDoc)
XmlDoc = dom.createDocument(el)
of xmlWhitespace, xmlElementClose, xmlEntity, xmlSpecial:
# Unused 'events'
else:
# echo(x.kind())
raise newException(EParserError, "Unexpected XML Parser event")
close(x)
return XmlDoc

View File

@@ -21,6 +21,10 @@
##
## <h1><a href="http://force7.de/nimrod">Nimrod</a></h1>
##
## **Deprecated since version 0.8.8.** Use the macro ``<>`` in xmltree
## instead.
{.deprecated.}
import
macros, strutils
@@ -52,8 +56,8 @@ proc xmlCheckedTag*(e: PNimrodNode, tag: string,
# copy the attributes; when iterating over them these lists
# will be modified, so that each attribute is only given one value
var req = splitSeq(reqAttr)
var opt = splitSeq(optAttr)
var req = split(reqAttr)
var opt = split(optAttr)
result = newNimNode(nnkBracket, e)
result.add(newStrLitNode("<"))
result.add(newStrLitNode(tag))

231
lib/pure/xmltree.nim Normal file
View File

@@ -0,0 +1,231 @@
#
#
# Nimrod's Runtime Library
# (c) Copyright 2010 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## A simple XML tree. More efficient and simpler than the DOM.
import macros, strtabs
type
# Core data model of the XML tree: a reference type for nodes, the enum that
# discriminates them, the attribute table alias and the variant object itself.
PXmlNode* = ref TXmlNode ## an XML tree consists of ``PXmlNode``s.
TXmlNodeKind* = enum ## different kinds of ``PXmlNode``s
xnText, ## a text element
xnElement, ## an element with 0 or more children
xnCData, ## a CDATA node
xnComment ## an XML comment
PXmlAttributes* = PStringTable ## an alias for a string to string mapping
# The node object is not exported; clients go through the accessor procs.
TXmlNode {.pure, final, acyclic.} = object
case k: TXmlNodeKind
of xnText, xnComment, xnCData:
# text, comment and CDATA nodes only carry their character data
fText: string
of xnElement:
# `s` holds the child nodes; `fAttr` stays nil until attributes are assigned
fTag: string
s: seq[PXmlNode]
fAttr: PXmlAttributes
# NOTE(review): read and written by ``clientData``/``clientData=`` without a
# kind check, so presumably shared by all node kinds; the indentation is lost
# in this view -- confirm against the real file.
fClientData: int ## for other clients
proc newXmlNode(kind: TXmlNodeKind): PXmlNode =
  ## Allocates a fresh ``PXmlNode`` and tags it with `kind`. No other field
  ## is set here; the public constructors fill in the rest.
  result.new()
  result.k = kind
proc newElement*(tag: string): PXmlNode =
  ## Creates a new ``PXmlNode`` of kind ``xnElement`` with the given `tag`
  ## and an empty list of children.
  result = newXmlNode(xnElement)
  result.s = @[]
  result.fTag = tag
  # the attribute table is deliberately not created here: it is initialised
  # lazily to save memory for elements without attributes
proc newText*(text: string): PXmlNode =
  ## Creates a new ``PXmlNode`` of kind ``xnText`` holding `text`.
  new(result)
  result.k = xnText
  result.fText = text
proc newComment*(comment: string): PXmlNode =
  ## Creates a new ``PXmlNode`` of kind ``xnComment`` holding `comment`.
  new(result)
  result.k = xnComment
  result.fText = comment
proc newCData*(cdata: string): PXmlNode =
  ## Creates a new ``PXmlNode`` of kind ``xnCData`` holding `cdata`.
  new(result)
  result.k = xnCData
  result.fText = cdata
proc text*(n: PXmlNode): string {.inline.} =
  ## Returns the character data stored in `n`. `n` must be a text, comment
  ## or CDATA node; this is checked with an assertion.
  assert n.k in {xnText, xnComment, xnCData}
  return n.fText
proc tag*(n: PXmlNode): string {.inline.} =
  ## Returns the tag name of `n`. `n` must be an ``xnElement`` node; this is
  ## checked with an assertion.
  assert n.k == xnElement
  return n.fTag
proc add*(father, son: PXmlNode) {.inline.} =
  ## Appends `son` as the last child of `father`.
  father.s.add(son)
proc len*(n: PXmlNode): int {.inline.} =
  ## Returns the number of `n`'s children. Nodes that are not elements have
  ## no children and report 0.
  if n.k != xnElement: return 0
  return n.s.len
proc kind*(n: PXmlNode): TXmlNodeKind {.inline.} =
  ## Returns the kind of the node `n`.
  return n.k
proc `[]`* (n: PXmlNode, i: int): PXmlNode {.inline.} =
  ## Returns the child of `n` at index `i`. `n` must be an ``xnElement``
  ## node; this is checked with an assertion.
  assert n.k == xnElement
  return n.s[i]
iterator items*(n: PXmlNode): PXmlNode {.inline.} =
  ## Yields every child of `n` in order. `n` must be an ``xnElement`` node;
  ## this is checked with an assertion.
  assert n.k == xnElement
  # the child count is read once, before the first child is yielded
  var count = n.len
  var idx = 0
  while idx < count:
    yield n.s[idx]
    inc(idx)
proc attr*(n: PXmlNode): PXmlAttributes {.inline.} =
  ## Returns the attribute table of the element `n`. The table is created
  ## lazily, so the result may be nil.
  assert n.k == xnElement
  return n.fAttr
proc `attr=`*(n: PXmlNode, attr: PXmlAttributes) {.inline.} =
  ## Replaces the attribute table of the element `n` with `attr`.
  assert(n.k == xnElement)
  n.fAttr = attr
proc attrLen*(n: PXmlNode): int {.inline.} =
  ## Returns how many attributes the element `n` has; 0 when no attribute
  ## table has been assigned yet.
  assert n.k == xnElement
  if isNil(n.fAttr):
    result = 0
  else:
    result = n.fAttr.len
proc clientData*(n: PXmlNode): int {.inline.} =
  ## Returns the client data stored in `n`. This field is used by the HTML
  ## parser and generator.
  return n.fClientData
proc `clientData=`*(n: PXmlNode, data: int) {.inline.} =
  ## Stores `data` as the client data of `n`. This field is used by the HTML
  ## parser and generator.
  n.fClientData = data
proc addEscaped*(result: var string, s: string) =
  ## Appends `s` to `result`, replacing ``<``, ``>``, ``&`` and ``"`` by
  ## their XML entities. Same as ``result.add(escape(s))``, but more
  ## efficient because no temporary string is built.
  for ch in items(s):
    if ch == '<': result.add("&lt;")
    elif ch == '>': result.add("&gt;")
    elif ch == '&': result.add("&amp;")
    elif ch == '"': result.add("&quot;")
    else: result.add(ch)
proc escape*(s: string): string =
  ## Returns a copy of `s` that is safe to include in an XML document.
  ## The following characters are replaced:
  ##
  ## ------------    -------------------
  ## char            is converted to
  ## ------------    -------------------
  ##  ``<``          ``&lt;``
  ##  ``>``          ``&gt;``
  ##  ``&``          ``&amp;``
  ##  ``"``          ``&quot;``
  ## ------------    -------------------
  # allocate a string of the input's length, then truncate it to empty
  # before the escaped text is appended
  result = newString(s.len)
  result.setLen(0)
  result.addEscaped(s)
proc addIndent(result: var string, indent: int) =
  ## Starts a new line in `result` and pads it with `indent` spaces.
  result.add("\n")
  var remaining = indent
  while remaining > 0:
    result.add(' ')
    dec(remaining)
proc add*(result: var string, n: PXmlNode, indent = 0, indWidth = 2) =
  ## Appends the textual representation of `n` to `result`.
  ##
  ## An element with children puts every child on a line of its own,
  ## indented by `indWidth` more spaces than the element itself, whose own
  ## indentation is `indent`. An element without children is written as
  ## ``<tag />``. Text, comment text and attribute values are escaped;
  ## CDATA content is written verbatim.
  case n.k
  of xnText:
    result.addEscaped(n.fText)
  of xnComment:
    result.add("<!-- ")
    result.addEscaped(n.fText)
    result.add(" -->")
  of xnCData:
    result.add("<![CDATA[")
    result.add(n.fText)
    result.add("]]>")
  of xnElement:
    result.add('<')
    result.add(n.fTag)
    if not isNil(n.fAttr):
      for name, value in pairs(n.fAttr):
        result.add(' ')
        result.add(name)
        result.add("=\"")
        result.addEscaped(value)
        result.add('"')
    if n.len == 0:
      # no children: emit a self-closing tag
      result.add(" />")
    else:
      result.add('>')
      for child in items(n):
        result.addIndent(indent + indWidth)
        result.add(child, indent + indWidth, indWidth)
      # the closing tag goes on its own line at the element's indentation
      result.addIndent(indent)
      result.add("</")
      result.add(n.fTag)
      result.add(">")
proc `$`*(n: PXmlNode): string =
  ## Converts `n` into its string representation: an XML declaration
  ## followed by the serialized tree.
  result = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
  add(result, n)
proc newXmlTree*(tag: string, children: openArray[PXmlNode],
                 attributes: PXmlAttributes = nil): PXmlNode =
  ## Creates a new element named `tag` whose children are copies of the
  ## references in `children` and whose attribute table is `attributes`
  ## (which may be nil).
  result = newElement(tag)
  for child in items(children): result.s.add(child)
  result.fAttr = attributes
proc xmlConstructor(e: PNimrodNode): PNimrodNode {.compileTime.} =
  ## Translates the argument of the ``<>`` macro into a call of
  ## ``newXmlTree``.
  ##
  ## `e` must have exactly one child. If that child is a call such as
  ## ``a(href="x", child)``, every ``key=value`` argument is turned into an
  ## attribute and every other argument into a child node. Otherwise the
  ## child is taken as a bare tag name and an element without children and
  ## without attributes is constructed.
  expectLen(e, 1)
  var a = e[0]
  if a.kind == nnkCall:
    result = newCall("newXmlTree", toStrLit(a[0]))
    var attrs = newCall("newStringTable", [])
    var bracket = newNimNode(nnkBracket, a)
    for i in 1..a.len-1:
      if a[i].kind == nnkExprEqExpr:
        # key=value: the key is stringified, the value expression kept as is
        attrs.add(toStrLit(a[i][0]))
        attrs.add(a[i][1])
      else:
        bracket.add(a[i])
    result.add(bracket)
    # the first child of the call node is the callee, so a length above 1
    # means that at least one attribute was collected
    if attrs.len > 1: result.add(attrs)
  else:
    # ``newXmlTree`` has no default for its `children` parameter, so a bare
    # tag must pass an explicit empty list, just like ``<>tag()`` does
    result = newCall("newXmlTree", toStrLit(a))
    result.add(newNimNode(nnkBracket, a))
macro `<>`*(x: expr): expr =
  ## Constructor macro for XML trees. Arguments of the form ``key=value``
  ## become attributes of the element, all other arguments become its
  ## children. Example usage:
  ##
  ## .. code-block:: nimrod
  ##   <>a(href="http://force7.de/nimrod", "Nimrod rules.")
  ##
  ## This builds an XML tree for::
  ##
  ##   <a href="http://force7.de/nimrod">Nimrod rules.</a>
  ##
  result = xmlConstructor(x)

View File

@@ -0,0 +1,52 @@
#
#
# Nimrod's Runtime Library
# (c) Copyright 2010 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## This module parses an XML document and creates its XML tree representation.
import streams, parsexml, xmltree
# NOTE(review): in this revision the exported proc below consists of its
# header only -- the next line already starts another proc -- so the module
# cannot compile as is. Its intended behaviour (presumably: parse nodes from
# `x` and add them to `father`) is not visible here; implement or remove it.
proc parse*(x: var TXmlParser, father: PXmlNode) =
proc parseXml*(s: PStream, filename: string,
               errors: var seq[string]): PXmlNode =
  ## Parses the XML from stream `s` and returns a ``PXmlNode``. `filename`
  ## is handed to the parser when it is opened. Every parsing error that
  ## occurs is added to the `errors` sequence. The parser is closed before
  ## the proc returns.
  ##
  ## NOTE(review): only top-level comments are collected so far and the
  ## returned root element is always named "html"; all other events,
  ## including element events, are ignored. Element parsing is still to be
  ## written.
  var x: TXmlParser
  open(x, s, filename, {reportComments})
  result = newElement("html")
  while true:
    x.next()
    case x.kind
    of xmlEof:
      # end of input: leave the loop, otherwise ``close`` is never reached
      break
    of xmlError:
      errors.add(errorMsg(x))
    of xmlComment:
      # the text of a comment event is obtained with ``charData``
      result.add(newComment(x.charData))
    else:
      nil # whitespace and everything not handled yet is skipped
  close(x)
proc parseXml*(s: PStream): PXmlNode =
  ## Parses the XML from stream `s` and returns a ``PXmlNode``. Parsing
  ## errors are collected internally and then discarded.
  var ignored: seq[string] = @[]
  return parseXml(s, "unknown_html_doc", ignored)
proc loadXml*(path: string, reportErrors = false): PXmlNode =
  ## Opens the file at `path`, parses it as XML and returns the resulting
  ## ``PXmlNode``. Raises ``EIO`` if the file cannot be opened. If
  ## `reportErrors` is true, every parsing error is written with ``echo``.
  var input = newFileStream(path, fmRead)
  if input == nil:
    raise newException(EIO, "Unable to read file: " & path)
  var problems: seq[string] = @[]
  result = parseXml(input, path, problems)
  if not reportErrors: return
  for problem in items(problems): echo(problem)

View File

@@ -1257,7 +1257,9 @@ proc echo*[Ty](x: openarray[Ty]) {.magic: "Echo".}
## equivalent to ``writeln(stdout, x); flush(stdout)``. BUT: This is
## available for the ECMAScript target too!
template newException(exceptn, message: expr): expr =
template newException*(exceptn, message: expr): expr =
## creates an exception object of type "exceptn" and sets its ``msg`` field
## to `message`. Returns the new exception object.
block: # open a new scope
var
e: ref exceptn

View File

@@ -26,6 +26,7 @@ Additions
- Added ``system.cstringArrayToSeq``.
- Added ``system.lines(f: TFile)`` iterator.
- Added ``system.delete``, ``system.del`` and ``system.insert`` for sequences.
- Exported ``system.newException`` template.
- Added ``cgi.decodeData(data: string): tuple[key, value: string]``.
- Added ``ropes`` module.
- Added ``sockets`` module.
@@ -36,6 +37,9 @@ Additions
- Added ``unidecode`` module.
- Added ``xmldom`` module.
- Added ``xmldomparser`` module.
- Added ``xmltree`` module.
- Added ``xmltreeparser`` module.
- Added ``htmlparser`` module.
- Many wrappers now do not contain redundant name prefixes (like ``GTK_``,
``lua``). The new wrappers are available in ``lib/newwrap``. Change
your configuration file to use these.
@@ -100,7 +104,7 @@ Changes affecting backwards compatibility
- The compiler does not skip the linking step anymore even if no file
has changed.
- ``os.splitFile(".xyz")`` now returns ``("", ".xyz", "")`` instead of
``("", "", ".xyz")``. Filenames starting with a dot are handled
``("", "", ".xyz")``. So filenames starting with a dot are handled
differently.
- ``strutils.split(s: string, seps: set[char])`` never yields the empty string
anymore. This behaviour is probably more appropriate for whitespace splitting.