progress for httpclient

2026-06-02 01:48:00 +00:00 · 2010-01-30 20:05:29 +01:00
parent e20293818c
commit d318f2eb35
3 changed files with 291 additions and 171 deletions
--- a/lib/devel/httpclient.nim
+++ b/lib/devel/httpclient.nim
@@ -1,26 +1,113 @@
-import sockets, strutils, parseurl, pegs
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2010 Dominik Picheta, Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module implements a simple HTTP client that can be used to retrieve
+## webpages/other data.
+
+# neuer Code:
+import sockets, strutils, parseurl, pegs, os, parseutils

 type
-  response = tuple[version: string, status: string, headers: seq[header], body: string]
-  header = tuple[htype: string, hvalue: string] 
+  TResponse* = tuple[
+    version: string, status: string, headers: seq[THeader],
+    body: string]
+  THeader* = tuple[htype: string, hvalue: string]

  EInvalidHttp* = object of EBase ## exception that is raised when server does
                                  ## not conform to the implemented HTTP
                                  ## protocol

+  EHttpRequestErr* = object of EBase ## Thrown in the ``getContent`` proc,
+                                     ## when the server returns an error
+
+template newException(exceptn, message: expr): expr =
+  block: # open a new scope
+    var
+      e: ref exceptn
+    new(e)
+    e.msg = message
+    e
+
 proc httpError(msg: string) =
  var e: ref EInvalidHttp
  new(e)
  e.msg = msg
  raise e
+  
+proc fileError(msg: string) =
+  var e: ref EIO
+  new(e)
+  e.msg = msg
+  raise e

-proc parseResponse(data: string): response =
+proc getHeaderValue*(headers: seq[THeader], name: string): string =
+  ## Retrieves a header by ``name``, from ``headers``.
+  ## Returns "" if a header is not found
+  for i in low(headers)..high(headers):
+    if cmpIgnoreCase(headers[i].htype, name) == 0:
+      return headers[i].hvalue
+  return ""
+
+proc parseBody(data: var string, start: int, s: TSocket,
+               headers: seq[THeader]): string =
+  if getHeaderValue(headers, "Transfer-Encoding") == "chunked":
+    # get chunks:
+    var i = start
+    result = ""
+    while true:
+      var chunkSize = 0
+      var j = parseHex(data, chunkSize, i)
+      if j <= 0: break
+      inc(i, j)
+      while data[i] notin {'\C', '\L', '\0'}: inc(i)
+      if data[i] == '\C': inc(i)
+      if data[i] == '\L': inc(i)
+      if chunkSize <= 0: break
+      result.add(copy(data, i, i+chunkSize-1))
+      if i + chunkSize > data.len:
+        echo "i: ", i, " size: ", chunkSize, " len: ", data.len
+      
+      assert(i + chunkSize <= data.len)
+      i = i + chunkSize
+      # skip trailing CR-LF:
+      #if data[i] == '\C': inc(i)
+      #if data[i] == '\L': inc(i)
+      
+      echo "came here"
+      data.add(s.recv())
+  else:
+    result = copy(data, start)
+    # -REGION- Content-Length
+    # (http://tools.ietf.org/html/rfc2616#section-4.4) NR.3
+    var contentLengthHeader = getHeaderValue(headers, "Content-Length")
+    if contentLengthHeader != "":
+      var length = contentLengthHeader.parseint()
+      while result.len() < length: result.add(s.recv())
+    else:
+      # (http://tools.ietf.org/html/rfc2616#section-4.4) NR.4 TODO
+      
+      # -REGION- Connection: Close
+      # (http://tools.ietf.org/html/rfc2616#section-4.4) NR.5
+      if getHeaderValue(headers, "Connection") == "close":
+        while True:
+          var moreData = recv(s)
+          if moreData.len == 0: break
+          result.add(moreData)
+
+proc parseResponse(s: TSocket): TResponse =
+  var data = s.recv()
  var i = 0

-  #Parse the version
-  #Parses the first line of the headers
-  #``HTTP/1.1`` 200 OK
-    
+  # Parse the version
+  # Parses the first line of the headers
+  # ``HTTP/1.1`` 200 OK
+
  var matches: array[0..1, string]
  var L = data.matchLen(peg"\i 'HTTP/' {'1.1'/'1.0'} \s+ {(!\n .)*}\n",
                        matches, i)
@@ -30,9 +117,9 @@ proc parseResponse(data: string): response =
  result.status = matches[1]
  inc(i, L)
  
-  #Parse the headers
-  #Everything after the first line leading up to the body
-  #htype: hvalue
+  # Parse the headers
+  # Everything after the first line leading up to the body
+  # htype: hvalue

  result.headers = @[]
  while true:
@@ -42,7 +129,7 @@ proc parseResponse(data: string): response =
      key.add(data[i])
      inc(i)
    inc(i) # skip ':'
-    if data[i] == ' ': inc(i)
+    if data[i] == ' ': inc(i) # skip if the character is a space
    var val = ""
    while data[i] notin {'\C', '\L', '\0'}:
      val.add(data[i])
@@ -59,58 +146,9 @@ proc parseResponse(data: string): response =
      inc(i)
      break
    
-  #Parse the body
-  #Everything after the headers(The first double CRLF)
-  result.body = data.copy(i)
-  
+  result.body = parseBody(data, i, s, result.headers) 

-proc readChunked(data: var string, s: TSocket): response =
-  #Read data from socket until the terminating chunk size is found(0\c\L\c\L)
-  while true:
-    data.add(s.recv())
-    #Contains because 
-    #trailers might be present
-    #after the terminating chunk size
-    if data.contains("0\c\L\c\L"): 
-      break
-      
-  result = parseResponse(data) #Re-parse the body
-  
-  var count, length, chunkLength: int = 0
-  var newBody: string = ""
-  var bodySplit: seq[string] = result.body.splitLines()
-  #Remove the chunks
-  for i in items(bodySplit):
-    if count == 1: #Get the first chunk size
-      chunkLength = ParseHexInt(i) - i.len() - 1
-    else:
-      if length >= chunkLength:
-        #The chunk size determines how much text is left
-        #Until the next chunk size
-        chunkLength = ParseHexInt(i)
-        length = 0
-      else:
-        #Break if the terminating chunk size is found
-        #This should ignore the `trailers`
-        if bodySplit[count] == "0": #This might cause problems...
-          break
-        
-        #Add the text to the newBody
-        newBody.add(i & "\c\L")
-        length = length + i.len()
-    inc(count)
-  #Make the parsed body the new body
-  result.body = newBody
-    
-proc getHeaderValue*(headers: seq[header], name: string): string =
-  ## Retrieves a header by ``name``, from ``headers``.
-  ## Returns "" if a header is not found
-  for i in low(headers)..high(headers):
-    if cmpIgnoreCase(headers[i].htype, name) == 0:
-      return headers[i].hvalue
-  return ""
-
-proc request*(url: string): response =
+proc request*(url: string): TResponse =
  var r = parse(url)
  
  var headers: string
@@ -119,58 +157,46 @@ proc request*(url: string): response =
  else:
    headers = "GET / HTTP/1.1\c\L"
  
-  headers = headers & "Host: " & r.subdomain & r.domain & "\c\L\c\L"
-  
+  add(headers, "Host: " & r.hostname & "\c\L\c\L")
+
  var s = socket()
-  s.connect(r.subdomain & r.domain, TPort(80))
+  s.connect(r.hostname, TPort(80))
  s.send(headers)
-  
-  var data = s.recv()
-  
-  result = parseResponse(data)
-
-  #-REGION- Transfer-Encoding 
-  #-Takes precedence over Content-Length
-  #(http://tools.ietf.org/html/rfc2616#section-4.4) NR.2
-  var transferEncodingHeader = getHeaderValue(result.headers, "Transfer-Encoding")
-  if transferEncodingHeader == "chunked":
-    result = readChunked(data, s)
-  
-  #-REGION- Content-Length
-  #(http://tools.ietf.org/html/rfc2616#section-4.4) NR.3
-  var contentLengthHeader = getHeaderValue(result.headers, "Content-Length")
-  if contentLengthHeader != "":
-    var length = contentLengthHeader.parseint()
-
-    while data.len() < length:
-      data.add(s.recv())
-      
-    result = parseResponse(data)
-    
-  #(http://tools.ietf.org/html/rfc2616#section-4.4) NR.4 TODO
-    
-  #-REGION- Connection: Close
-  #(http://tools.ietf.org/html/rfc2616#section-4.4) NR.5
-  var connectionHeader = getHeaderValue(result.headers, "Connection")
-  if connectionHeader == "close":
-    while True:
-      var nD = s.recv()
-      if nD == "": break
-      data.add(nD)
-    result = parseResponse(data)
-  
+  result = parseResponse(s)
  s.close()
-
-proc get*(url: string): response =
-  result = request(url)
  
+proc redirection(status: string): bool =
+  const redirectionNRs = ["301", "302", "303", "307"]
+  for i in items(redirectionNRs):
+    if status.startsWith(i):
+      return True
+  
+proc get*(url: string, maxRedirects = 5): TResponse =
+  ## low-level proc similar to ``request`` which handles redirection
+  result = request(url)
+  for i in 1..maxRedirects:
+    if result.status.redirection():
+      var locationHeader = getHeaderValue(result.headers, "Location")
+      if locationHeader == "": httpError("location header expected")
+      result = request(locationHeader)
+      
+proc getContent*(url: string): string =
+  ## GET's the body and returns it as a string
+  ## Raises exceptions for the status codes ``4xx`` and ``5xx``
+  var r = get(url)
+  if r.status[0] in {'4','5'}:
+    raise newException(EHTTPRequestErr, r.status)
+  else:
+    return r.body
+  
+proc downloadFile*(url: string, outputFilename: string) =
+  var f: TFile
+  if open(f, outputFilename, fmWrite):
+    f.write(getContent(url))
+    f.close()
+  else:
+    fileError("Unable to open file")

-var r = get("http://www.google.co.uk/index.html")
-#var r = get("http://www.crunchyroll.com")
-echo("===================================")
-echo(r.version & " " & r.status)

-for htype, hvalue in items(r.headers):
-  echo(htype, ": ", hvalue)
-echo("---------------------------------")
-echo(r.body)
+when isMainModule:
+  downloadFile("http://www.google.com", "GoogleTest.txt")
--- a/lib/devel/parseurl.nim
+++ b/lib/devel/parseurl.nim
@@ -1,64 +1,95 @@
-import regexprs, strutils
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2010 Dominik Picheta
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## Parses & constructs URLs.
+
+import strutils

 type
-  TURL* = tuple[protocol, username, password,
-    subdomain, domain, port, path, query, anchor: string]
-
+  TURL* = tuple[      ## represents a *Uniform Resource Locator* (URL)
+                      ## any optional component is "" if it does not exist
+    scheme, username, password, 
+    hostname, port, path, query, anchor: string]
+    
 proc parse*(url: string): TURL =
-  const pattern = r"([a-zA-Z]+://)?(.+@)?(.+\.)?(\w+)(\.\w+)(:[0-9]+)?(/.+)?"
-  var m: array[0..7, string] #Array with the matches
-  discard regexprs.match(url, pattern, m)
- 
-  var msplit = m[2].split(':')
+  var i: int = 0

-  var username: string = ""
-  var password: string = ""
-  if m[2] != "":
-    username = msplit[0]
-    if msplit.len() == 2:
-      password = msplit[1].replace("@", "")
+  var scheme, username, password: string = ""
+  var hostname, port, path, query, anchor: string = ""

-  var path: string = ""
-  var query: string = ""
-  var anchor: string = ""
-     
-  if m[7] != nil:
-    msplit = m[7].split('?')
-    path = msplit[0]
-    query = ""
-    anchor = ""
-    if msplit.len() == 2:
-      query = "?" & msplit[1]
-     
-    msplit = path.split('#')
-    if msplit.len() == 2:
-      anchor = "#" & msplit[1]
-      path = msplit[0]
-    msplit = query.split('#')
-    if msplit.len() == 2:
-      anchor = "#" & msplit[1]
-      query = msplit[0]
- 
-  result = (protocol: m[1], username: username, password: password,
-    subdomain: m[3], domain: m[4] & m[5], port: m[6], path: path, query: query, anchor: anchor)
- 
-when isMainModule:
-  proc test(r: TURL) =
-    echo("protocol=" & r.protocol)
-    echo("username=" & r.username)
-    echo("password=" & r.password)
-    echo("subdomain=" & r.subdomain)
-    echo("domain=" & r.domain)
-    echo("port=" & r.port)
-    echo("path=" & r.path)
-    echo("query=" & r.query)
-    echo("anchor=" & r.anchor)
-    echo("---------------")
-   
-  var r: TURL
-  r = parse(r"http://google.co.uk/search?var=bleahdhsad")
-  test(r)
-  r = parse(r"http://dom96:test@google.com:80/search.php?q=562gs6&foo=6gs6&bar=7hs6#test")
-  test(r)
-  r = parse(r"http://www.google.co.uk/search?q=multiple+subdomains&ie=utf-8&oe=utf-8&aq=t&rls=org.mozilla:pl:official&client=firefox-a")
-  test(r)
+  var temp: string = ""
+  
+  if url[i] != '/': #url isn't a relative path
+    while True:
+      #Scheme
+      if url[i] == ':':
+        if url[i+1] == '/' and url[i+2] == '/':
+          scheme = temp
+          temp = ""
+          inc(i, 3) #Skip the //
+      #Authority(username, password)
+      if url[i] == '@':
+        username = temp.split(':')[0]
+        if temp.split(':').len() > 1:
+          password = temp.split(':')[1]
+        temp = ""
+        inc(i) #Skip the @ 
+      #hostname(subdomain, domain, port)
+      if url[i] == '/' or url[i] == '\0':
+        #TODO
+        hostname = temp
+        if hostname.split(':').len() > 1:
+          port = hostname.split(':')[1]
+          hostname = hostname.split(':')[0]
+        
+        temp = ""
+        break
+      
+      temp.add(url[i])
+      inc(i)
+
+  #Path
+  while True:
+    if url[i] == '?':
+      path = temp
+      temp = ""
+    if url[i] == '#':
+      if temp[0] == '?':
+        query = temp
+      else:
+        path = temp
+      temp = ""
+      
+    if url[i] == '\0':
+      if temp[0] == '?':
+        query = temp
+      elif temp[0] == '#':
+        anchor = temp
+      else:
+        path = temp
+      break
+      
+    temp.add(url[i])
+    inc(i)
+    
+  return (scheme, username, password, hostname, port, path, query, anchor)
+
+proc `$`*(t: TURL): string =
+  result = ""
+  if t.scheme != "": result.add(t.scheme & "://")
+  if t.username != "":
+    if t.password != "":
+      result.add(t.username & ":" & t.password & "@")
+    else:
+      result.add(t.username & "@")
+  if t.hostname != "": result.add(t.hostname)
+  if t.port != "": result.add(":" & t.port)
+  if t.path != "": result.add(t.path)
+  if t.query != "": result.add(t.query)
+  if t.anchor != "": result.add(t.anchor)
--- a/lib/devel/parseutils.nim
+++ b/lib/devel/parseutils.nim
@@ -0,0 +1,63 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2010 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## Helpers for parsing.
+
+import strutils
+
+proc parseHex*(s: string, number: var int, start = 0): int = 
+  ## parses a hexadecimal number and stores its value in ``number``. Returns
+  ## the number of the parsed characters or 0 in case of an error.
+  var i = start
+  var foundDigit = false
+  if s[i] == '0' and (s[i+1] == 'x' or s[i+1] == 'X'): inc(i, 2)
+  elif s[i] == '#': inc(i)
+  while true: 
+    case s[i]
+    of '_': nil
+    of '0'..'9':
+      number = number shl 4 or (ord(s[i]) - ord('0'))
+      foundDigit = true
+    of 'a'..'f':
+      number = number shl 4 or (ord(s[i]) - ord('a') + 10)
+      foundDigit = true
+    of 'A'..'F':
+      number = number shl 4 or (ord(s[i]) - ord('A') + 10)
+      foundDigit = true
+    else: break
+    inc(i)
+  if foundDigit: result = i-start
+
+proc parseIdent*(s: string, ident: var string, start = 0): int =
+  ## parses an identifier and stores it in ``ident``. Returns
+  ## the number of the parsed characters or 0 in case of an error.
+  var i = start
+  if s[i] in IdentStartChars:
+    inc(i)
+    while s[i] in IdentChars: inc(i)
+    ident = copy(s, start, i-1)
+    result = i-start
+
+proc skipWhitespace*(s: string, start = 0): int {.inline.} =
+  while s[start+result] in Whitespace: inc(result)
+
+proc skip*(s, token: string, start = 0): int =
+  while result < token.len and s[result+start] == token[result]: inc(result)
+  if result != token.len: result = 0
+  
+proc skipIgnoreCase*(s, token: string, start = 0): int =
+  while result < token.len and
+      toLower(s[result+start]) == toLower(token[result]): inc(result)
+  if result != token.len: result = 0  
+
+proc parseBiggestInt*(s: string, number: var biggestInt, start = 0): int =
+  assert(false) # to implement
+
+proc parseBiggestFloat*(s: string, number: var biggestFloat, start = 0): int = 
+  assert(false) # to implement