Files
Nim/tests/niminaction/Chapter6/WikipediaStats/parallel_counts.nim
2017-10-01 17:17:40 +01:00

72 lines
2.2 KiB
Nim

import os, parseutils, threadpool, strutils
type
  Stats = ref object  ## One parsed page-count record (reference semantics).
    domainCode: string  ## First field of a line; compared against "en".
    pageTitle: string   ## Second field of a line.
    countViews: int     ## Third field; used to rank records.
    totalSize: int      ## Fourth field of a line.
proc newStats(): Stats =
  ## Allocates a fresh `Stats` whose strings are empty and whose counters
  ## are zero.
  new(result)
  result.domainCode = ""
  result.pageTitle = ""
  result.countViews = 0
  result.totalSize = 0
proc `$`(stats: Stats): string =
  ## Renders all four fields of `stats` as one parenthesised,
  ## comma-separated `name: value` listing.
  result = "(domainCode: " & stats.domainCode
  result.add ", pageTitle: " & stats.pageTitle
  result.add ", countViews: " & $stats.countViews
  result.add ", totalSize: " & $stats.totalSize
  result.add ")"
proc parse(line: string, domainCode, pageTitle: var string,
           countViews, totalSize: var int) =
  ## Splits one space-separated record into its four fields, writing them
  ## into the `var` parameters.
  ##
  ## An empty `line` leaves every output parameter untouched. A numeric
  ## field that cannot be parsed is left at 0.
  if line.len > 0:
    var pos = 0
    # Each step consumes one field, then "+ 1" skips the separating space.
    domainCode.setLen(0)
    pos += parseUntil(line, domainCode, {' '}, pos) + 1
    pageTitle.setLen(0)
    pos += parseUntil(line, pageTitle, {' '}, pos) + 1
    countViews = 0
    pos += parseInt(line, countViews, pos) + 1
    totalSize = 0
    # Last field: the number of characters consumed is no longer needed.
    discard parseInt(line, totalSize, pos)
proc parseChunk(chunk: string): Stats =
  ## Scans every line of `chunk` and returns the record whose domain code is
  ## "en" with the largest `countViews`. When no line qualifies, the result
  ## is the zeroed record produced by `newStats`.
  var
    code = ""
    title = ""
    views = 0
    size = 0
  result = newStats()
  for record in chunk.splitLines:
    parse(record, code, title, views, size)
    # Skip anything that is not an English page or does not beat the leader.
    if code != "en" or views <= result.countViews:
      continue
    result = Stats(domainCode: code, pageTitle: title,
                   countViews: views, totalSize: size)
proc readPageCounts(filename: string, chunkSize = 1_000_000) =
  ## Reads `filename` in buffers of about `chunkSize` bytes, spawns
  ## `parseChunk` on each run of complete lines, and echoes the record with
  ## the highest `countViews` found across all chunks.
  ##
  ## A final line without a trailing newline is parsed too, and a line longer
  ## than the buffer makes the buffer grow instead of stalling the loop.
  ##
  ## Raises `IOError` if the file cannot be opened.
  var file = open(filename)
  # Release the handle on every exit path, including exceptions.
  defer: file.close()
  var responses = newSeq[FlowVar[Stats]]()
  # At least one byte, so that every read can make progress.
  var buffer = newString(max(chunkSize, 1))
  # Bytes of an unfinished trailing line carried over from the previous read.
  var oldBufferLen = 0
  while not endOfFile(file):
    let reqSize = buffer.len - oldBufferLen
    let readSize = file.readChars(buffer, oldBufferLen, reqSize) + oldBufferLen
    # Back up to just past the last newline so only whole lines are handed
    # to a worker; `> 0` keeps the index from reaching -1.
    var chunkLen = readSize
    while chunkLen > 0 and buffer[chunkLen - 1] notin NewLines:
      chunkLen.dec
    if chunkLen > 0:
      responses.add(spawn parseChunk(buffer[0 ..< chunkLen]))
      oldBufferLen = readSize - chunkLen
      if oldBufferLen > 0:
        # Move the partial line to the front. Both slices have the same
        # length, so the buffer is never resized by this assignment.
        buffer[0 ..< oldBufferLen] = buffer[chunkLen ..< readSize]
    else:
      # No newline in the buffered data: keep all of it for the next round.
      oldBufferLen = readSize
      if readSize == buffer.len:
        # The line is longer than the buffer; make room for the rest of it.
        buffer.setLen(buffer.len * 2)
  if oldBufferLen > 0:
    # The file did not end with a newline; parse the remaining line as well.
    responses.add(spawn parseChunk(buffer[0 ..< oldBufferLen]))
  var mostPopular = newStats()
  for resp in responses:
    # `^` blocks until the spawned task has delivered its result.
    let statistic = ^resp
    if statistic.countViews > mostPopular.countViews:
      mostPopular = statistic
  echo("Most popular is: ", mostPopular)
when isMainModule:
  # Entry point: analyse the page-count dump located in the current
  # working directory.
  const dumpName = "pagecounts-20160101-050000"
  readPageCounts(getCurrentDir() / dumpName)