progress for httpclient

This commit is contained in:
rumpf_a@web.de
2010-01-30 20:05:29 +01:00
parent e20293818c
commit d318f2eb35
3 changed files with 291 additions and 171 deletions

View File

@@ -1,26 +1,113 @@
import sockets, strutils, parseurl, pegs
#
#
# Nimrod's Runtime Library
# (c) Copyright 2010 Dominik Picheta, Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## This module implements a simple HTTP client that can be used to retrieve
## webpages/other data.
# new code:
import sockets, strutils, parseurl, pegs, os, parseutils
type
response = tuple[version: string, status: string, headers: seq[header], body: string]
header = tuple[htype: string, hvalue: string]
TResponse* = tuple[
version: string, status: string, headers: seq[THeader],
body: string]
THeader* = tuple[htype: string, hvalue: string]
EInvalidHttp* = object of EBase ## exception that is raised when server does
## not conform to the implemented HTTP
## protocol
EHttpRequestErr* = object of EBase ## Thrown in the ``getContent`` proc,
## when the server returns an error
# Builds (but does not raise) an exception object: allocates a
# ``ref exceptn``, stores ``message`` in its ``msg`` field and yields the
# reference as the value of the template.
template newException(exceptn, message: expr): expr =
  block: # separate scope, so the temporary stays local to the expansion
    var e: ref exceptn
    new(e)
    e.msg = message
    e
proc httpError(msg: string) =
  ## Raises ``EInvalidHttp`` with ``msg`` as the exception message.
  raise newException(EInvalidHttp, msg)
proc fileError(msg: string) =
  ## Raises ``EIO`` with ``msg`` as the exception message.
  raise newException(EIO, msg)
proc parseResponse(data: string): response =
proc getHeaderValue*(headers: seq[THeader], name: string): string =
  ## Looks up the header called ``name`` in ``headers`` and returns the value
  ## of the first match. Header names are compared case-insensitively.
  ## Returns "" if no such header is present.
  result = ""
  for h in items(headers):
    if cmpIgnoreCase(h.htype, name) == 0:
      result = h.hvalue
      break
proc recvMore(data: var string, s: TSocket): bool =
  ## Appends the next packet received from ``s`` to ``data``. Returns false
  ## (leaving ``data`` untouched) when ``recv`` delivered nothing, which the
  ## callers treat as "no further data will arrive".
  var packet = s.recv()
  result = packet.len > 0
  if result: data.add(packet)

proc parseBody(data: var string, start: int, s: TSocket,
               headers: seq[THeader]): string =
  ## Returns the message body of the response buffered in ``data``, where
  ## ``start`` is the index of the first byte after the headers. Whatever is
  ## still missing is received from ``s``; in chunked mode the received
  ## bytes are appended to ``data``.
  ##
  ## * ``Transfer-Encoding: chunked``: the chunks are decoded and
  ##   concatenated. Decoding stops at the zero-sized last chunk (trailers
  ##   are ignored) or where no chunk size can be parsed.
  ##   ``EInvalidHttp`` is raised if the data ends in the middle of a chunk.
  ## * ``Content-Length``: data is received until that many bytes are
  ##   available or no more data arrives.
  ## * ``Connection: close``: data is received until no more data arrives.
  ## * otherwise only the already buffered part is returned.
  result = ""
  if getHeaderValue(headers, "Transfer-Encoding") == "chunked":
    var i = start
    while true:
      # Buffer the complete chunk-size line; it ends with a line feed.
      var eol = i
      while true:
        while eol < data.len and data[eol] != '\L': inc(eol)
        if eol < data.len: break
        if not recvMore(data, s): httpError("incomplete chunk header")
      var chunkSize = 0
      if parseHex(data, chunkSize, i) <= 0: break # no chunk size: stop here
      if chunkSize <= 0: break # the zero-sized last chunk ends the body
      # The chunk data starts right after the size line; anything between
      # the size and the line end (a chunk extension) is skipped.
      i = eol + 1
      # Only receive when needed: wait for the whole chunk and the CR-LF
      # that terminates it, so a fully buffered response never blocks.
      while data.len < i + chunkSize + 2:
        if not recvMore(data, s): httpError("incomplete chunk")
      result.add(copy(data, i, i+chunkSize-1))
      inc(i, chunkSize)
      # Skip the trailing CR-LF; otherwise the next size would be parsed
      # starting at the carriage return and decoding would stop after the
      # first chunk.
      if data[i] == '\C': inc(i)
      if data[i] == '\L': inc(i)
  else:
    result = copy(data, start)
    # -REGION- Content-Length
    # (http://tools.ietf.org/html/rfc2616#section-4.4) NR.3
    var contentLengthHeader = getHeaderValue(headers, "Content-Length")
    if contentLengthHeader != "":
      var length = contentLengthHeader.parseint()
      while result.len() < length:
        var moreData = s.recv()
        if moreData.len == 0: break # nothing more arrives: do not spin
        result.add(moreData)
    # (http://tools.ietf.org/html/rfc2616#section-4.4) NR.4 TODO
    # -REGION- Connection: Close
    # (http://tools.ietf.org/html/rfc2616#section-4.4) NR.5
    elif getHeaderValue(headers, "Connection") == "close":
      while true:
        var moreData = recv(s)
        if moreData.len == 0: break
        result.add(moreData)
proc parseResponse(s: TSocket): TResponse =
var data = s.recv()
var i = 0
#Parse the version
#Parses the first line of the headers
#``HTTP/1.1`` 200 OK
# Parse the version
# Parses the first line of the headers
# ``HTTP/1.1`` 200 OK
var matches: array[0..1, string]
var L = data.matchLen(peg"\i 'HTTP/' {'1.1'/'1.0'} \s+ {(!\n .)*}\n",
matches, i)
@@ -30,9 +117,9 @@ proc parseResponse(data: string): response =
result.status = matches[1]
inc(i, L)
#Parse the headers
#Everything after the first line leading up to the body
#htype: hvalue
# Parse the headers
# Everything after the first line leading up to the body
# htype: hvalue
result.headers = @[]
while true:
@@ -42,7 +129,7 @@ proc parseResponse(data: string): response =
key.add(data[i])
inc(i)
inc(i) # skip ':'
if data[i] == ' ': inc(i)
if data[i] == ' ': inc(i) # skip if the character is a space
var val = ""
while data[i] notin {'\C', '\L', '\0'}:
val.add(data[i])
@@ -59,58 +146,9 @@ proc parseResponse(data: string): response =
inc(i)
break
#Parse the body
#Everything after the headers(The first double CRLF)
result.body = data.copy(i)
result.body = parseBody(data, i, s, result.headers)
# Receives the remainder of a chunked response from ``s`` into ``data``,
# re-parses the whole buffer with ``parseResponse`` and then rebuilds
# ``result.body`` line by line, reading hexadecimal chunk-size lines with
# ``ParseHexInt``. Every line that is kept gets "\c\L" appended.
# NOTE(review): receiving stops as soon as "0\c\L\c\L" occurs anywhere in
# ``data`` -- presumably the terminating chunk; confirm that the same bytes
# cannot occur inside the payload.
proc readChunked(data: var string, s: TSocket): response =
#Read data from socket until the terminating chunk size is found(0\c\L\c\L)
while true:
data.add(s.recv())
#Contains because
#trailers might be present
#after the terminating chunk size
if data.contains("0\c\L\c\L"):
break
result = parseResponse(data) #Re-parse the body
# count: index of the current line; length: characters collected since the
# last chunk-size line; chunkLength: size announced by that line
var count, length, chunkLength: int = 0
var newBody: string = ""
var bodySplit: seq[string] = result.body.splitLines()
#Remove the chunks
for i in items(bodySplit):
if count == 1: #Get the first chunk size
chunkLength = ParseHexInt(i) - i.len() - 1
else:
if length >= chunkLength:
#The chunk size determines how much text is left
#Until the next chunk size
chunkLength = ParseHexInt(i)
length = 0
else:
#Break if the terminating chunk size is found
#This should ignore the `trailers`
if bodySplit[count] == "0": #This might cause problems...
break
#Add the text to the newBody
newBody.add(i & "\c\L")
length = length + i.len()
inc(count)
#Make the parsed body the new body
result.body = newBody
proc getHeaderValue*(headers: seq[header], name: string): string =
  ## Looks up the header called ``name`` in ``headers`` and returns the value
  ## of the first match. Header names are compared case-insensitively.
  ## Returns "" if no such header is present.
  result = ""
  for h in items(headers):
    if cmpIgnoreCase(h.htype, name) == 0:
      result = h.hvalue
      break
proc request*(url: string): response =
proc request*(url: string): TResponse =
var r = parse(url)
var headers: string
@@ -119,58 +157,46 @@ proc request*(url: string): response =
else:
headers = "GET / HTTP/1.1\c\L"
headers = headers & "Host: " & r.subdomain & r.domain & "\c\L\c\L"
add(headers, "Host: " & r.hostname & "\c\L\c\L")
var s = socket()
s.connect(r.subdomain & r.domain, TPort(80))
s.connect(r.hostname, TPort(80))
s.send(headers)
var data = s.recv()
result = parseResponse(data)
#-REGION- Transfer-Encoding
#-Takes precedence over Content-Length
#(http://tools.ietf.org/html/rfc2616#section-4.4) NR.2
var transferEncodingHeader = getHeaderValue(result.headers, "Transfer-Encoding")
if transferEncodingHeader == "chunked":
result = readChunked(data, s)
#-REGION- Content-Length
#(http://tools.ietf.org/html/rfc2616#section-4.4) NR.3
var contentLengthHeader = getHeaderValue(result.headers, "Content-Length")
if contentLengthHeader != "":
var length = contentLengthHeader.parseint()
while data.len() < length:
data.add(s.recv())
result = parseResponse(data)
#(http://tools.ietf.org/html/rfc2616#section-4.4) NR.4 TODO
#-REGION- Connection: Close
#(http://tools.ietf.org/html/rfc2616#section-4.4) NR.5
var connectionHeader = getHeaderValue(result.headers, "Connection")
if connectionHeader == "close":
while True:
var nD = s.recv()
if nD == "": break
data.add(nD)
result = parseResponse(data)
result = parseResponse(s)
s.close()
proc get*(url: string): response =
  ## Requests ``url``; simply forwards to ``request`` and returns its result.
  return request(url)
proc redirection(status: string): bool =
  ## Returns true if ``status`` starts with one of the status codes 301,
  ## 302, 303 or 307.
  const redirectCodes = ["301", "302", "303", "307"]
  result = false
  for code in items(redirectCodes):
    if status.startsWith(code):
      result = true
      break
proc get*(url: string, maxRedirects = 5): TResponse =
  ## low-level proc similar to ``request`` which handles redirection:
  ## while the status is a redirection, the URL from the ``Location`` header
  ## is requested instead, at most ``maxRedirects`` times. Raises
  ## ``EInvalidHttp`` if a redirecting response has no ``Location`` header.
  result = request(url)
  var hops = 0
  while hops < maxRedirects and result.status.redirection():
    var target = getHeaderValue(result.headers, "Location")
    if target == "": httpError("location header expected")
    result = request(target)
    inc(hops)
proc getContent*(url: string): string =
  ## GETs ``url`` (via ``get``) and returns the body as a string.
  ## Raises ``EHttpRequestErr`` carrying the status line if the status
  ## starts with ``4`` or ``5``.
  var resp = get(url)
  case resp.status[0]
  of '4', '5': raise newException(EHttpRequestErr, resp.status)
  else: result = resp.body
proc downloadFile*(url: string, outputFilename: string) =
  ## Downloads the body of ``url`` with ``getContent`` and writes it to the
  ## file ``outputFilename``. Exceptions of ``getContent`` are passed on;
  ## ``EIO`` is raised if the file cannot be opened for writing.
  # Download before touching the file: a failed request must neither create
  # nor truncate the output file.
  var content = getContent(url)
  var f: TFile
  if open(f, outputFilename, fmWrite):
    try:
      f.write(content)
    finally:
      f.close() # the handle is released even if the write fails
  else:
    fileError("Unable to open file")
# Manual smoke test: requests a page and prints the status line, every
# header and the body.
# NOTE(review): these statements come before the ``isMainModule`` guard, so
# they presumably also run when the module is imported -- confirm.
var r = get("http://www.google.co.uk/index.html")
#var r = get("http://www.crunchyroll.com")
echo("===================================")
echo(r.version & " " & r.status)
for htype, hvalue in items(r.headers):
echo(htype, ": ", hvalue)
echo("---------------------------------")
echo(r.body)
# Only when compiled as the main module: download a page into GoogleTest.txt.
when isMainModule:
downloadFile("http://www.google.com", "GoogleTest.txt")

View File

@@ -1,64 +1,95 @@
import regexprs, strutils
#
#
# Nimrod's Runtime Library
# (c) Copyright 2010 Dominik Picheta
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## Parses & constructs URLs.
import strutils
type
TURL* = tuple[protocol, username, password,
subdomain, domain, port, path, query, anchor: string]
TURL* = tuple[ ## represents a *Uniform Resource Locator* (URL)
## any optional component is "" if it does not exist
scheme, username, password,
hostname, port, path, query, anchor: string]
proc parse*(url: string): TURL =
const pattern = r"([a-zA-Z]+://)?(.+@)?(.+\.)?(\w+)(\.\w+)(:[0-9]+)?(/.+)?"
var m: array[0..7, string] #Array with the matches
discard regexprs.match(url, pattern, m)
var msplit = m[2].split(':')
var i: int = 0
var username: string = ""
var password: string = ""
if m[2] != "":
username = msplit[0]
if msplit.len() == 2:
password = msplit[1].replace("@", "")
var scheme, username, password: string = ""
var hostname, port, path, query, anchor: string = ""
var path: string = ""
var query: string = ""
var anchor: string = ""
if m[7] != nil:
msplit = m[7].split('?')
path = msplit[0]
query = ""
anchor = ""
if msplit.len() == 2:
query = "?" & msplit[1]
msplit = path.split('#')
if msplit.len() == 2:
anchor = "#" & msplit[1]
path = msplit[0]
msplit = query.split('#')
if msplit.len() == 2:
anchor = "#" & msplit[1]
query = msplit[0]
result = (protocol: m[1], username: username, password: password,
subdomain: m[3], domain: m[4] & m[5], port: m[6], path: path, query: query, anchor: anchor)
when isMainModule:
proc test(r: TURL) =
echo("protocol=" & r.protocol)
echo("username=" & r.username)
echo("password=" & r.password)
echo("subdomain=" & r.subdomain)
echo("domain=" & r.domain)
echo("port=" & r.port)
echo("path=" & r.path)
echo("query=" & r.query)
echo("anchor=" & r.anchor)
echo("---------------")
var r: TURL
r = parse(r"http://google.co.uk/search?var=bleahdhsad")
test(r)
r = parse(r"http://dom96:test@google.com:80/search.php?q=562gs6&foo=6gs6&bar=7hs6#test")
test(r)
r = parse(r"http://www.google.co.uk/search?q=multiple+subdomains&ie=utf-8&oe=utf-8&aq=t&rls=org.mozilla:pl:official&client=firefox-a")
test(r)
var temp: string = ""
if url[i] != '/': #url isn't a relative path
while True:
#Scheme
if url[i] == ':':
if url[i+1] == '/' and url[i+2] == '/':
scheme = temp
temp = ""
inc(i, 3) #Skip the //
#Authority(username, password)
if url[i] == '@':
username = temp.split(':')[0]
if temp.split(':').len() > 1:
password = temp.split(':')[1]
temp = ""
inc(i) #Skip the @
#hostname(subdomain, domain, port)
if url[i] == '/' or url[i] == '\0':
#TODO
hostname = temp
if hostname.split(':').len() > 1:
port = hostname.split(':')[1]
hostname = hostname.split(':')[0]
temp = ""
break
temp.add(url[i])
inc(i)
#Path
while True:
if url[i] == '?':
path = temp
temp = ""
if url[i] == '#':
if temp[0] == '?':
query = temp
else:
path = temp
temp = ""
if url[i] == '\0':
if temp[0] == '?':
query = temp
elif temp[0] == '#':
anchor = temp
else:
path = temp
break
temp.add(url[i])
inc(i)
return (scheme, username, password, hostname, port, path, query, anchor)
proc `$`*(t: TURL): string =
  ## Converts ``t`` back into its textual form. Components that are "" are
  ## left out; the password is only written if there is a username too.
  ## ``path``, ``query`` and ``anchor`` are appended as stored, without
  ## adding separators.
  result = ""
  if t.scheme != "":
    result.add(t.scheme)
    result.add("://")
  if t.username != "":
    result.add(t.username)
    if t.password != "":
      result.add(":")
      result.add(t.password)
    result.add("@")
  if t.hostname != "": result.add(t.hostname)
  if t.port != "":
    result.add(":")
    result.add(t.port)
  for part in items([t.path, t.query, t.anchor]):
    if part != "": result.add(part)

63
lib/devel/parseutils.nim Normal file
View File

@@ -0,0 +1,63 @@
#
#
# Nimrod's Runtime Library
# (c) Copyright 2010 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## Helpers for parsing.
import strutils
proc parseHex*(s: string, number: var int, start = 0): int =
  ## parses a hexadecimal number starting at ``s[start]`` and stores its
  ## value in ``number``. An optional ``0x``, ``0X`` or ``#`` prefix is
  ## skipped and ``_`` is accepted as a digit separator. Returns
  ## the number of the parsed characters or 0 in case of an error; in the
  ## error case ``number`` is left unchanged.
  var i = start
  var value = 0 # built up locally, so the previous ``number`` never leaks in
  var foundDigit = false
  if i+1 < s.len and s[i] == '0' and (s[i+1] == 'x' or s[i+1] == 'X'):
    inc(i, 2)
  elif i < s.len and s[i] == '#':
    inc(i)
  while i < s.len:
    var digit: int
    case s[i]
    of '0'..'9': digit = ord(s[i]) - ord('0')
    of 'a'..'f': digit = ord(s[i]) - ord('a') + 10
    of 'A'..'F': digit = ord(s[i]) - ord('A') + 10
    of '_': digit = -1 # separator: consumed, but contributes no value
    else: break
    if digit >= 0:
      value = (value shl 4) or digit
      foundDigit = true
    inc(i)
  if foundDigit:
    number = value
    result = i-start
proc parseIdent*(s: string, ident: var string, start = 0): int =
  ## parses an identifier starting at ``s[start]`` and stores it in
  ## ``ident``. Returns the number of the parsed characters or 0 in case of
  ## an error; ``ident`` is not modified in the error case.
  if s[start] notin IdentStartChars: return 0
  var stop = start + 1
  while s[stop] in IdentChars: inc(stop)
  ident = copy(s, start, stop-1)
  result = stop - start
proc skipWhitespace*(s: string, start = 0): int {.inline.} =
  ## Returns the number of consecutive ``Whitespace`` characters in ``s``
  ## beginning at index ``start``.
  var pos = start
  while s[pos] in Whitespace: inc(pos)
  result = pos - start
proc skip*(s, token: string, start = 0): int =
  ## Checks whether ``token`` occurs in ``s`` at index ``start``. Returns
  ## ``token.len`` if it does and 0 otherwise (so also for an empty token).
  result = token.len
  for k in 0..token.len-1:
    if s[start+k] != token[k]:
      result = 0
      break
proc skipIgnoreCase*(s, token: string, start = 0): int =
  ## Like ``skip``, but the characters are compared after ``toLower``:
  ## returns ``token.len`` if ``token`` occurs in ``s`` at index ``start``
  ## ignoring case and 0 otherwise.
  result = token.len
  for k in 0..token.len-1:
    if toLower(s[start+k]) != toLower(token[k]):
      result = 0
      break
proc parseBiggestInt*(s: string, number: var biggestInt, start = 0): int =
  ## Not implemented yet: every call fails the assertion below and
  ## ``number`` is never written.
  assert(false) # to implement
proc parseBiggestFloat*(s: string, number: var biggestFloat, start = 0): int =
  ## Not implemented yet: every call fails the assertion below and
  ## ``number`` is never written.
  assert(false) # to implement