bugfixes for unicode; xmlparser; htmlparser; scanner

2026-05-02 03:54:44 +00:00 · 2010-02-20 19:21:38 +01:00
parent 64da2f1681
commit 6bc16904ed
18 changed files with 226 additions and 74 deletions
--- a/lib/pure/htmlparser.nim
+++ b/lib/pure/htmlparser.nim
@@ -265,7 +265,7 @@ proc addNode(father, son: PXmlNode) =
 proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode

 proc expected(x: var TXmlParser, n: PXmlNode): string =
-  result = errorMsg(x, "</" & n.tag & "$1> expected")
+  result = errorMsg(x, "</" & n.tag & "> expected")

 proc untilElementEnd(x: var TXmlParser, result: PXmlNode, 
                     errors: var seq[string]) =
@@ -378,17 +378,19 @@ proc parseHtml*(s: PStream): PXmlNode =
  var errors: seq[string] = @[]
  result = parseHtml(s, "unknown_html_doc", errors)

-proc loadHtml*(path: string, reportErrors = false): PXmlNode = 
+proc loadHtml*(path: string, errors: var seq[string]): PXmlNode = 
  ## Loads and parses HTML from file specified by ``path``, and returns 
-  ## a ``PXmlNode``. If `reportErrors` is true, the parsing errors are
-  ## ``echo``ed, otherwise they are ignored.
+  ## a ``PXmlNode``.  Every parsing error that occurs is added to
+  ## the `errors` sequence.
  var s = newFileStream(path, fmRead)
  if s == nil: raise newException(EIO, "Unable to read file: " & path)
-  
-  var errors: seq[string] = @[]
  result = parseHtml(s, path, errors)
-  if reportErrors: 
-    for msg in items(errors): echo(msg)
+
+proc loadHtml*(path: string): PXmlNode = 
+  ## Loads and parses HTML from file specified by ``path``, and returns 
+  ## a ``PXmlNode``. All parsing errors are ignored.
+  var errors: seq[string] = @[]  # filled by the other overload, then dropped
+  result = loadHtml(path, errors)

 when true:
  nil
@@ -402,4 +404,18 @@ else:
    if n == nil or n.htmlTag != tagHtml: 
      errors.add("<html> tag expected")
    checkHtmlAux(n, errors)
+  
+when isMainModule:
+  # Manual smoke test: parses the HTML file named by the first command line
+  # argument, echoes the collected errors and writes the tree to "test.txt".
+  import os
+
+  var errors: seq[string] = @[]  
+  var x = loadHtml(paramStr(1), errors)
+  for e in items(errors): echo e
+  
+  var f: TFile
+  if open(f, "test.txt", fmWrite):
+    f.write($x)
+    f.close()
+  else:
+    quit("cannot write test.txt")
  
--- a/lib/pure/os.nim
+++ b/lib/pure/os.nim
@@ -1,7 +1,7 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2009 Andreas Rumpf
+#        (c) Copyright 2010 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -619,9 +619,11 @@ proc sameFileContent*(path1, path2: string): bool =
  close(a)
  close(b)

-proc copyFile*(dest, source: string) =
+proc copyFile*(dest, source: string) {.deprecated.} =
  ## Copies a file from `source` to `dest`. If this fails,
  ## `EOS` is raised.
+  ## **Deprecated since version 0.8.8**: Use this proc with named arguments
+  ## only, because the order will change!
  when defined(Windows):
    if CopyFileA(source, dest, 0'i32) == 0'i32: OSError()
  else:
@@ -647,8 +649,10 @@ proc copyFile*(dest, source: string) =
    close(s)
    close(d)

-proc moveFile*(dest, source: string) =
+proc moveFile*(dest, source: string) {.deprecated.} =
  ## Moves a file from `source` to `dest`. If this fails, `EOS` is raised.
+  ## **Deprecated since version 0.8.8**: Use this proc with named arguments
+  ## only, because the order will change!
  if crename(source, dest) != 0'i32: OSError()

 proc removeFile*(file: string) =
--- a/lib/pure/unicode.nim
+++ b/lib/pure/unicode.nim
@@ -83,8 +83,8 @@ proc toUTF8*(c: TRune): string =
    result[0] = chr(i)
  elif i <=% 0x07FF:
    result = newString(2)
-    result[0] = chr(i shr 6 or 0b110_0000)
-    result[1] = chr(i and ones(6) or 0b10_000000)
+    result[0] = chr((i shr 6) or 0b110_00000)
+    result[1] = chr((i and ones(6)) or 0b10_000000)
  elif i <=% 0xFFFF:
    result = newString(3)
    result[0] = chr(i shr 12 or 0b1110_0000)
--- a/lib/pure/xmldom.nim
+++ b/lib/pure/xmldom.nim
@@ -227,7 +227,7 @@ proc createAttributeNS*(doc: PDocument, namespaceURI: string, qualifiedName: str
    raise newException(EInvalidCharacterErr, "Invalid character")
  # Exceptions
  if qualifiedName.contains(':'):
-    if namespaceURI == nil or namespaceURI == "":
+    if namespaceURI == nil:
      raise newException(ENamespaceErr, "When qualifiedName contains a prefix namespaceURI cannot be nil")
    elif qualifiedName.split(':')[0].toLower() == "xml" and namespaceURI != "http://www.w3.org/XML/1998/namespace":
      raise newException(ENamespaceErr, 
@@ -303,7 +303,7 @@ proc createElement*(doc: PDocument, tagName: string): PElement =
 proc createElementNS*(doc: PDocument, namespaceURI: string, qualifiedName: string): PElement =
  ## Creates an element of the given qualified name and namespace URI.
  if qualifiedName.contains(':'):
-    if namespaceURI == nil or namespaceURI == "":
+    if namespaceURI == nil:
      raise newException(ENamespaceErr, "When qualifiedName contains a prefix namespaceURI cannot be nil")
    elif qualifiedName.split(':')[0].toLower() == "xml" and namespaceURI != "http://www.w3.org/XML/1998/namespace":
      raise newException(ENamespaceErr, 
@@ -464,8 +464,11 @@ proc localName*(n: PNode): string =

 proc namespaceURI*(n: PNode): string =
  ## Returns this nodes namespace URI
-
+  
  return n.FNamespaceURI
+  
+proc `namespaceURI=`*(n: PNode, value: string) = 
+  ## Sets this node's namespace URI to `value`.
+  n.FNamespaceURI = value

 proc nextSibling*(n: PNode): PNode =
  ## Returns the next sibling of this node
@@ -507,7 +510,7 @@ proc previousSibling*(n: PNode): PNode =
      return n.FParentNode.childNodes[i - 1]
  return nil
  
-proc `prefix=`*(n: var PNode, value: string) =
+proc `prefix=`*(n: PNode, value: string) =
  ## Modifies the prefix of this node

  # Setter
@@ -530,11 +533,10 @@ proc `prefix=`*(n: var PNode, value: string) =
  if n.nodeType == ElementNode:
    var el: PElement = PElement(n)
    el.FTagName = value & ":" & n.FLocalName
-    n = PNode(el)
+
  elif n.nodeType == AttributeNode:
    var attr: PAttr = PAttr(n)
    attr.FName = value & ":" & n.FLocalName
-    n = PNode(attr)

 # Procedures
 proc appendChild*(n: PNode, newChild: PNode) =
@@ -1078,4 +1080,4 @@ proc nodeToXml(n: PNode, indent: int = 0): string =
 proc `$`*(doc: PDocument): string =
  ## Converts a PDocument object into a string representation of it's XML
  result = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
-  result.add(nodeToXml(doc.documentElement))
+  result.add(nodeToXml(doc.documentElement))
--- a/lib/pure/xmldomparser.nim
+++ b/lib/pure/xmldomparser.nim
@@ -14,9 +14,34 @@ import xmldom, os, streams, parsexml, strutils
 #XMLDom's Parser - Turns XML into a Document

 type
-  #Parsing errors
+  # Parsing errors
  EMismatchedTag* = object of E_Base ## Raised when a tag is not properly closed
  EParserError* = object of E_Base ## Raised when an unexpected XML Parser event occurs
+
+  # For namespaces
+  xmlnsAttr = tuple[name, value: string, ownerElement: PElement]
+
+var nsList: seq[xmlnsAttr] = @[] # Used for storing namespaces
+
+proc getNS(prefix: string): string =
+  ## Returns the namespace URI bound to `prefix` by an ``xmlns:prefix``
+  ## entry of `nsList`; without one, the default (``xmlns``) namespace
+  ## is returned, or "" if there is none.
+  var defaultNS = ""
+  var foundDefault = false
+
+  # Search from the newest entry backwards so that a declaration on an
+  # inner element shadows the same declaration on an outer element.
+  for i in countdown(len(nsList)-1, 0):
+    var key = nsList[i].name
+    if ":" in key:
+      if key.split(':')[1] == prefix:
+        return nsList[i].value
+    elif key == "xmlns" and not foundDefault:
+      # Prefixed declarations take precedence, so only remember this one.
+      defaultNS = nsList[i].value
+      foundDefault = true
+  return defaultNS
    
 proc parseText(x: var TXmlParser, doc: var PDocument): PText =
  result = doc.createTextNode(x.charData())
@@ -28,24 +53,33 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
    case x.kind()
    of xmlEof:
      break
-    of xmlElementStart:
+    of xmlElementStart, xmlElementOpen:
      if n.tagName() != "":
        n.appendChild(parseElement(x, doc))
      else:
-        n = doc.createElement(x.elementName)
-    of xmlElementOpen:
-      if n.tagName() != "":
-        n.appendChild(parseElement(x, doc))
-      else:
-        if x.elementName.contains(':'):
-          #TODO: NamespaceURI
-          n = doc.createElementNS("nil", x.elementName)
-        else:  
-          n = doc.createElement(x.elementName)
+        n = doc.createElementNS("", x.elementName)
        
    of xmlElementEnd:
      if x.elementName == n.nodeName:
        # n.normalize() # Remove any whitespace etc.
+        
+        var ns: string
+        if x.elementName.contains(':'):
+          ns = getNS(x.elementName.split(':')[0])
+        else:
+          ns = getNS("")
+        
+        n.namespaceURI = ns
+        
+        # Remove any namespaces this element declared
+        var count = 0 # Variable which keeps the index
+                      # We need to edit it..
+        for i in low(nsList)..len(nsList)-1:
+          if nsList[count][2] == n:
+            nsList.delete(count)
+            dec(count)
+          inc(count)
+
        return n
      else: #The wrong element is ended
        raise newException(EMismatchedTag, "Mismatched tag at line " & 
@@ -54,11 +88,15 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
    of xmlCharData:
      n.appendChild(parseText(x, doc))
    of xmlAttribute:
+      if x.attrKey == "xmlns" or x.attrKey.startsWith("xmlns:"):
+        nsList.add((x.attrKey, x.attrValue, n))
+        
      if x.attrKey.contains(':'):
-        #TODO: NamespaceURI
-        n.setAttributeNS("nil", x.attrKey, x.attrValue)
+        var ns = getNS(x.attrKey)
+        n.setAttributeNS(ns, x.attrKey, x.attrValue)
      else:
        n.setAttribute(x.attrKey, x.attrValue)
+
    of xmlCData:
      n.appendChild(doc.createCDATASection(x.charData()))
    of xmlComment:
@@ -75,16 +113,13 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =

  raise newException(EMismatchedTag, 
    "Mismatched tag at line " & $x.getLine() & " column " & $x.getColumn)
-    
-proc loadXML*(path: string): PDocument =
-  ## Loads and parses XML from file specified by ``path``, and returns 
+
+proc loadXMLStream*(stream: PStream): PDocument =
+  ## Loads and parses XML from a stream specified by ``stream``, and returns 
  ## a ``PDocument``
-  
-  var s = newFileStream(path, fmRead)
-  if s == nil: raise newException(EIO, "Unable to read file " & path)

  var x: TXmlParser
-  open(x, s, path, {reportComments})
+  open(x, stream, nil, {reportComments})
  
  var XmlDoc: PDocument
  var DOM: PDOMImplementation = getDOM()
@@ -102,10 +137,32 @@ proc loadXML*(path: string): PDocument =
    else:
      raise newException(EParserError, "Unexpected XML Parser event")

-  close(x)
  return XmlDoc

+proc loadXML*(xml: string): PDocument =
+  ## Loads and parses XML from a string specified by ``xml``, and returns 
+  ## a ``PDocument``
+  var s = newStringStream(xml)  # wrap the text so the stream loader can read it
+  return loadXMLStream(s)
+  
+    
+proc loadXMLFile*(path: string): PDocument =
+  ## Loads and parses XML from the file specified by ``path`` and returns
+  ## a ``PDocument``.  The file is closed once parsing finished or failed.
+  var s = newFileStream(path, fmRead)
+  if s == nil: raise newException(EIO, "Unable to read file " & path)
+  try: result = loadXMLStream(s)
+  finally: close(s)
+

 when isMainModule:
-  var xml = loadXML(r"C:\Users\Dominik\Desktop\Code\Nimrod\xmldom\test.xml")
-  echo($xml)
+  var xml = loadXMLFile(r"C:\Users\Dominik\Desktop\Code\Nimrod\xmldom\test.xml")
+  #echo(xml.getElementsByTagName("m:test2")[0].namespaceURI)
+  #echo(xml.getElementsByTagName("bla:test")[0].namespaceURI)
+  #echo(xml.getElementsByTagName("test")[0].namespaceURI)
+  for i in items(xml.getElementsByTagName("*")):
+    if i.namespaceURI != nil:
+      echo(i.nodeName, "=", i.namespaceURI)
+
+    
+  echo($xml)
--- a/lib/pure/xmltreeparser.nim
+++ b/lib/pure/xmltreeparser.nim
@@ -25,6 +25,8 @@ proc raiseInvalidXml(errors: seq[string]) =
 proc addNode(father, son: PXmlNode) = 
  if son != nil: add(father, son)

+proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode
+
 proc untilElementEnd(x: var TXmlParser, result: PXmlNode, 
                     errors: var seq[string]) =
  while true:
@@ -33,11 +35,11 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
      if x.elementName == result.tag: 
        next(x)
      else:
-        errors.add(errorMsg(x, "</" & result.tag & "$1> expected"))
+        errors.add(errorMsg(x, "</" & result.tag & "> expected"))
        # do not skip it here!
      break
    of xmlEof:
-      errors.add(errorMsg(x, "</" & result.tag & "$1> expected"))
+      errors.add(errorMsg(x, "</" & result.tag & "> expected"))
      break
    else:
      result.addNode(parse(x, errors))
@@ -91,7 +93,7 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode =
    next(x)
  of xmlEntity:
    ## &entity;
-    ## XXX To implement!
+    errors.add(errorMsg(x, "unknown entity: " & x.entityName))
    next(x)
  of xmlEof: nil

@@ -110,6 +112,8 @@ proc parseXml*(s: PStream, filename: string,
    of xmlComment, xmlWhitespace: nil # just skip it
    of xmlError:
      errors.add(errorMsg(x))
+    of xmlSpecial:
+      errors.add(errorMsg(x, "<some_tag> expected"))      
    else:
      errors.add(errorMsg(x, "<some_tag> expected"))
      break
@@ -122,17 +126,33 @@ proc parseXml*(s: PStream): PXmlNode =
  result = parseXml(s, "unknown_html_doc", errors)
  if errors.len > 0: raiseInvalidXMl(errors)

-proc loadXml*(path: string, reportErrors = false): PXmlNode = 
+proc loadXml*(path: string, errors: var seq[string]): PXmlNode = 
  ## Loads and parses XML from file specified by ``path``, and returns 
-  ## a ``PXmlNode``. If `reportErrors` is true, the parsing errors are
-  ## ``echo``ed, otherwise an exception is thrown.
+  ## a ``PXmlNode``. Every parsing error that occurs is added to the `errors`
+  ## sequence.
  var s = newFileStream(path, fmRead)
  if s == nil: raise newException(EIO, "Unable to read file: " & path)
-  
-  var errors: seq[string] = @[]
  result = parseXml(s, path, errors)
-  if reportErrors: 
-    for msg in items(errors): echo(msg)
-  elif errors.len > 0: 
-    raiseInvalidXMl(errors)

+proc loadXml*(path: string): PXmlNode = 
+  ## Loads and parses XML from file specified by ``path``, and returns 
+  ## a ``PXmlNode``.  All parsing errors are turned into an ``EInvalidXML``
+  ## exception.  
+  var errors: seq[string] = @[]  # receives the messages of this parse run
+  result = loadXml(path, errors)
+  # A single collected message is enough to reject the whole document.
+  if errors.len > 0: raiseInvalidXMl(errors)
+
+when isMainModule:
+  import os
+  # Manual smoke test: parse the XML file named by the first command line
+  # argument and echo every collected error.
+  var errors: seq[string] = @[]
+  var x = loadXml(paramStr(1), errors)
+  for e in items(errors): echo e
+  # Dump the parsed tree so that it can be inspected by hand.
+  var f: TFile
+  if open(f, "xmltest.txt", fmWrite):
+    f.write($x)
+    f.close()
+  else:
+    quit("cannot write xmltest.txt")
+    
--- a/lib/pure/xmltree.nim
+++ b/lib/pure/xmltree.nim
@@ -153,8 +153,15 @@ proc addIndent(result: var string, indent: int) =
  result.add("\n")
  for i in 1..indent: result.add(' ')
  
+proc noWhitespace(n: PXmlNode): bool =
+  ## Returns true if at least one child of `n` is of kind ``xnText`` or
+  ## ``xnEntity``; otherwise the implicit `result` stays false.
+  # Disabled alternative that compared every child's kind with the first:
+  #for i in 1..n.len-1:
+  #  if n[i].kind != n[0].kind: return true
+  for i in 0..n.len-1:
+    if n[i].kind in {xnText, xnEntity}: return true
+  
 proc add*(result: var string, n: PXmlNode, indent = 0, indWidth = 2) = 
  ## adds the textual representation of `n` to `result`.
+  if n == nil: return
  case n.k
  of xnElement:
    result.add('<')
@@ -168,10 +175,19 @@ proc add*(result: var string, n: PXmlNode, indent = 0, indWidth = 2) =
        result.add('"')
    if n.len > 0:
      result.add('>')
-      for i in 0..n.len-1:
-        result.addIndent(indent+indWidth)
-        result.add(n[i], indent+indWidth, indWidth)
-      result.addIndent(indent)
+      if n.len > 1:
+        if noWhitespace(n):
+          # for mixed leaves, we cannot output whitespace for readability,
+          # because this would be wrong. For example: ``a<b>b</b>`` is
+          # different from ``a <b>b</b>``.
+          for i in 0..n.len-1: result.add(n[i], indent+indWidth, indWidth)
+        else: 
+          for i in 0..n.len-1:
+            result.addIndent(indent+indWidth)
+            result.add(n[i], indent+indWidth, indWidth)
+          result.addIndent(indent)
+      else:
+        result.add(n[0], indent+indWidth, indWidth)
      result.add("</")
      result.add(n.fTag)
      result.add(">")