mirror of
https://github.com/nim-lang/Nim.git
synced 2026-01-08 22:13:29 +00:00
72 lines
2.2 KiB
Nim
72 lines
2.2 KiB
Nim
import os, parseutils, threadpool, strutils
|
|
|
|
type
  Stats = ref object
    ## Heap-allocated record holding the four values taken from one
    ## page-count line.
    domainCode: string  ## first text field of the line
    pageTitle: string   ## second text field of the line
    countViews: int     ## first numeric field of the line
    totalSize: int      ## second numeric field of the line
|
|
|
|
proc newStats(): Stats =
  ## Allocates a `Stats` whose text fields are empty strings and whose
  ## counters are zero.
  new(result)
  result.domainCode = ""
  result.pageTitle = ""
  result.countViews = 0
  result.totalSize = 0
|
|
|
|
proc `$`(stats: Stats): string =
  ## Renders `stats` as a single parenthesised `name: value` list covering
  ## all four fields.
  const layout =
    "(domainCode: $#, pageTitle: $#, countViews: $#, totalSize: $#)"
  let fields = [
    stats.domainCode,
    stats.pageTitle,
    $stats.countViews,
    $stats.totalSize
  ]
  result = layout % fields
|
|
|
|
proc parse(line: string, domainCode, pageTitle: var string,
           countViews, totalSize: var int) =
  ## Splits one page-count line of the form
  ## `domainCode pageTitle countViews totalSize` (fields separated by a
  ## single space) into the four output parameters.
  ##
  ## All outputs are reset before parsing, so an empty or truncated line
  ## yields `""` / `0` for every missing field instead of leaving values
  ## from a previous call behind. A numeric field that does not start with
  ## an integer is reported as `0`.
  domainCode.setLen(0)
  pageTitle.setLen(0)
  countViews = 0
  totalSize = 0
  if line.len == 0: return

  var i = 0
  i.inc parseUntil(line, domainCode, {' '}, i)
  i.inc  # step over the separating space
  # Stop as soon as the cursor reaches the end of the line: the remaining
  # fields are absent and scanning from past the end is not safe.
  if i >= line.len: return

  i.inc parseUntil(line, pageTitle, {' '}, i)
  i.inc
  if i >= line.len: return

  i.inc parseInt(line, countViews, i)
  i.inc
  if i >= line.len: return

  discard parseInt(line, totalSize, i)
|
|
|
|
proc parseChunk(chunk: string): Stats =
  ## Parses every line of `chunk` and returns the entry with domain code
  ## "en" that has the highest view count (the first one wins on ties).
  ## When no "en" line has a positive view count, the result is the empty
  ## record produced by `newStats()`.
  var
    code = ""
    title = ""
    views = 0
    size = 0
  result = newStats()
  for line in chunk.splitLines:
    parse(line, code, title, views, size)
    # Skip anything that is not an English entry beating the current best.
    if code != "en" or views <= result.countViews:
      continue
    result = Stats(
      domainCode: code,
      pageTitle: title,
      countViews: views,
      totalSize: size
    )
|
|
|
|
proc readPageCounts(filename: string, chunkSize = 1_000_000) =
  ## Reads `filename` in buffers of roughly `chunkSize` bytes, hands every
  ## run of complete lines to `parseChunk` on the thread pool, and echoes
  ## the entry with the highest view count across all chunks.
  ##
  ## Each buffer is cut at its last line terminator; the unterminated tail
  ## is carried over to the front of the next read. A line longer than the
  ## buffer makes the buffer grow, and a final line without a trailing
  ## newline is still parsed. The file is closed when the proc exits,
  ## including when an exception is raised.
  let file = open(filename)
  defer: file.close()

  var responses = newSeq[FlowVar[Stats]]()
  var buffer = newString(chunkSize)
  var oldBufferLen = 0  # bytes carried over from the previous read

  while not endOfFile(file):
    # The carried-over tail fills the whole buffer (one very long line):
    # grow it, otherwise there would be no room to read and no progress.
    if oldBufferLen == buffer.len:
      buffer.setLen(max(buffer.len * 2, 1))

    let reqSize = buffer.len - oldBufferLen
    let readSize = file.readChars(buffer, oldBufferLen, reqSize) + oldBufferLen
    if readSize == oldBufferLen:
      break  # nothing was read; avoid spinning forever

    # Walk back to the last line terminator so a chunk never ends in the
    # middle of a line. `chunkLen > 0` keeps the index within bounds.
    var chunkLen = readSize
    while chunkLen > 0 and buffer[chunkLen - 1] notin NewLines:
      chunkLen.dec

    if chunkLen > 0:
      responses.add(spawn parseChunk(buffer[0 ..< chunkLen]))

    # Move the unterminated tail to the front of the buffer. Only the bytes
    # actually read are copied; destination precedes source, so a forward
    # copy is safe even when the ranges overlap.
    oldBufferLen = readSize - chunkLen
    for j in 0 ..< oldBufferLen:
      buffer[j] = buffer[chunkLen + j]

  # The file may end without a newline: parse what is left over.
  if oldBufferLen > 0:
    responses.add(spawn parseChunk(buffer[0 ..< oldBufferLen]))

  var mostPopular = newStats()
  for resp in responses:
    let statistic = ^resp  # blocks until this chunk has been parsed
    if statistic.countViews > mostPopular.countViews:
      mostPopular = statistic

  echo("Most popular is: ", mostPopular)
|
|
|
|
when isMainModule:
  # Run the analysis on the sample dump expected in the current working
  # directory.
  const file = "pagecounts-20160101-050000"
  let filename = joinPath(getCurrentDir(), file)
  readPageCounts(filename)