implemented a stack trace profiler

This commit is contained in:
Araq
2012-09-15 00:21:10 +02:00
parent 4a435a8fb4
commit 214c7a2ebd
11 changed files with 241 additions and 65 deletions

View File

@@ -265,6 +265,10 @@ proc genWhileStmt(p: BProc, t: PNode) =
let label = assignLabel(p.blocks[p.breakIdx])
lineF(p, cpsStmts, "if (!$1) goto $2;$n", [rdLoc(a), label])
genStmts(p, t.sons[1])
if optProfiler in p.options:
# invoke at loop body exit:
lineCg(p, cpsStmts, "#nimProfile();$n")
endBlock(p)
dec(p.withinLoop)

View File

@@ -680,27 +680,14 @@ proc genProcAux(m: BModule, prc: PSym) =
app(generatedProc, initFrame(p, procname, filename))
else:
app(generatedProc, p.s(cpsLocals))
if (optProfiler in prc.options) and (gCmd != cmdCompileToLLVM):
if gProcProfile >= 64 * 1024:
InternalError(prc.info, "too many procedures for profiling")
discard cgsym(m, "profileData")
appf(p.s(cpsLocals), "\tticks NIM_profilingStart;$n")
if prc.loc.a < 0:
appf(m.s[cfsDebugInit], "\tprofileData[$1].procname = $2;$n", [
toRope(gProcProfile),
makeCString(prc.name.s)])
prc.loc.a = gProcProfile
inc(gProcProfile)
prepend(p.s(cpsInit), ropef("\tNIM_profilingStart = getticks();$n"))
if (optProfiler in prc.options) and (gCmd != cmdCompileToLLVM):
# invoke at proc entry for recursion:
appcg(p, cpsInit, "\t#nimProfile();$n", [])
app(generatedProc, p.s(cpsInit))
app(generatedProc, p.s(cpsStmts))
if p.beforeRetNeeded: appf(generatedProc, "\tBeforeRet: ;$n")
app(generatedProc, deinitGCFrame(p))
if optStackTrace in prc.options: app(generatedProc, deinitFrame(p))
if (optProfiler in prc.options) and (gCmd != cmdCompileToLLVM):
appf(generatedProc,
"\tprofileData[$1].total += elapsed(getticks(), NIM_profilingStart);$n",
[toRope(prc.loc.a)])
app(generatedProc, returnStmt)
appf(generatedProc, "}$N")
app(m.s[cfsProcs], generatedProc)

View File

@@ -115,7 +115,6 @@ type
var
mainModProcs*, mainModInit*, mainDatInit*: PRope # parts of the main module
gMapping*: PRope # the generated mapping file (if requested)
gProcProfile*: Natural # proc profile counter
gPendingModules*: seq[BModule] = @[] # list of modules that are not
# finished with code generation
gModules*: seq[BModule] = @[] # list of all compiled modules

View File

@@ -20,6 +20,10 @@ import
when hasTinyCBackend:
import tccgen
when defined(profiler):
{.hint: "Profiling support is turned on!".}
import nimprof
var
arguments: string = "" # the arguments to be passed to the program that
# should be run

View File

@@ -294,6 +294,7 @@ proc crcFromRopeAux(r: PRope, startVal: TCrc32): TCrc32 =
result = crcFromRopeAux(r.right, result)
proc newCrcFromRopeAux(r: PRope, startVal: TCrc32): TCrc32 =
# XXX profiling shows this is actually expensive
var stack: TRopeSeq = @[r]
result = startVal
while len(stack) > 0:

125
lib/pure/nimprof.nim Normal file
View File

@@ -0,0 +1,125 @@
#
#
# Nimrod's Runtime Library
# (c) Copyright 2012 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## Profiling support for Nimrod. This is an embedded profiler that requires
## ``--profiler:on``. You only need to import this module to get a profiling
## report at program exit.
when not defined(profiler):
{.error: "Profiling support is turned off!".}
# We don't want to profile the profiling code ...
{.push profiler: off.}
import hashes, algorithm, strutils
# We use a simple hash table of bounded size to keep track of the stack traces:
type
TProfileEntry = object
total: int
st: system.TStackTrace
TProfileData = array [0..64*1024-1, ref TProfileEntry]
proc `==`(a, b: TStackTrace): bool =
for i in 0 .. high(a):
if a[i] != b[i]: return false
result = true
# XXX extract this data structure; it is generally useful ;-)
var
profileData: TProfileData
emptySlots = profileData.len * 3 div 2
maxChainLen = 0
totalCalls = 0
proc hook(st: system.TStackTrace) {.nimcall.} =
# this is quite performance sensitive!
inc totalCalls
var last = high(st)
while last > 0 and isNil(st[last]): dec last
var h = hash(pointer(st[last])) and high(profileData)
# we use probing for maxChainLen entries and replace the encountered entry
# with the minimal 'total' value:
if emptySlots == 0:
var minIdx = h
var probes = maxChainLen
while probes >= 0:
if profileData[h].st == st:
# wow, same entry found:
inc profileData[h].total
return
if profileData[minIdx].total < profileData[h].total:
minIdx = h
h = ((5 * h) + 1) and high(profileData)
dec probes
profileData[minIdx].total = 1
profileData[minIdx].st = st
else:
var chain = 0
while true:
if profileData[h] == nil:
GC_disable()
new profileData[h]
GC_enable()
profileData[h].total = 1
profileData[h].st = st
dec emptySlots
maxChainLen = max(maxChainLen, chain)
return
if profileData[h].st == st:
# wow, same entry found:
inc profileData[h].total
maxChainLen = max(maxChainLen, chain)
return
h = ((5 * h) + 1) and high(profileData)
inc chain
proc getTotal(x: ref TProfileEntry): int =
result = if isNil(x): 0 else: x.total
proc cmpEntries(a, b: ref TProfileEntry): int =
result = b.getTotal - a.getTotal
proc writeProfile() {.noconv.} =
stopProfiling()
const filename = "profile_results"
var f: TFile
var j = 1
while open(f, filename & $j & ".txt"):
close(f)
inc(j)
let filen = filename & $j & ".txt"
echo "writing ", filen, "..."
if open(f, filen, fmWrite):
sort(profileData, cmpEntries)
writeln(f, "total executions of each stack trace:")
var entries = 0
for i in 0..high(profileData):
if profileData[i] != nil: inc entries
var sum = 0
# only write the first 100 entries:
for i in 0..min(100, high(profileData)):
if profileData[i] != nil and profileData[i].total > 1:
inc sum, profileData[i].total
writeln(f, "Entry: ", i, "/", entries, " Calls: ",
profileData[i].total, "/", totalCalls, " [sum: ", sum, "; ",
formatFloat(sum / totalCalls * 100.0, ffDefault, 2), "%]")
for ii in 0..high(TStackTrace):
let procname = profileData[i].st[ii]
if isNil(procname): break
writeln(f, " ", procname)
close(f)
echo "... done"
system.profilerHook = hook
addQuitProc(writeProfile)
{.pop.}

View File

@@ -829,7 +829,8 @@ when taintMode:
else:
type TaintedString* = string
when defined(profiler):
proc nimProfile() {.compilerProc, noinline.}
when hasThreadSupport:
{.pragma: rtlThreadVar, threadvar.}
else:

View File

@@ -11,6 +11,7 @@
# TODO:
# - eliminate "used" field
# - make searching for block O(1)
{.push profiler:off.}
# ------------ platform specific chunk allocation code -----------------------
@@ -794,3 +795,4 @@ template InstantiateForRegion(allocator: expr) =
else:
result = realloc(p, newsize)
{.pop.}

View File

@@ -7,7 +7,6 @@
# distribution, for details about the copyright.
#
# Garbage Collector
#
# The basic algorithm is *Deferrent Reference Counting* with cycle detection.
@@ -17,6 +16,7 @@
# Special care has been taken to avoid recursion as far as possible to avoid
# stack overflows when traversing deep datastructures. It is well-suited
# for soft real time applications (like games).
{.push profiler:off.}
const
CycleIncrease = 2 # is a multiplicative increase
@@ -26,7 +26,7 @@ const
# this seems to be a good value
withRealTime = defined(useRealtimeGC)
when withRealTime:
when withRealTime and not defined(getTicks):
include "system/timers"
const
@@ -426,6 +426,8 @@ proc rawNewObj(typ: PNimType, size: int, gch: var TGcHeap): pointer =
result = cellToUsr(res)
sysAssert(allocInv(gch.region), "rawNewObj end")
{.pop.}
proc newObj(typ: PNimType, size: int): pointer {.compilerRtl.} =
result = rawNewObj(typ, size, gch)
zeroMem(result, size)
@@ -514,6 +516,8 @@ proc growObj(old: pointer, newsize: int, gch: var TGcHeap): pointer =
proc growObj(old: pointer, newsize: int): pointer {.rtl.} =
result = growObj(old, newsize, gch)
{.push profiler:off.}
# ---------------- cycle collector -------------------------------------------
proc doOperation(p: pointer, op: TWalkOp) =
@@ -917,3 +921,5 @@ when not defined(useNimRtl):
"[GC] max pause time [ms]: " & $(gch.stat.maxPause div 1000_000)
when traceGC: writeLeakage()
GC_enable()
{.pop.}

View File

@@ -8,54 +8,100 @@
#
# This file implements the Nimrod profiler. The profiler needs support by the
# code generator.
# code generator. The idea is to inject the instruction stream
# with 'nimProfile()' calls. These calls are injected at every loop end
# (except perhaps loops that have no side-effects). At every Nth call a
# stack trace is taken. A stack tace is a list of cstrings. We have a count
# table of those.
#
# The nice thing about this profiler is that it's completely time independent!
{.push profiler: off.}
when not defined(getTicks):
include "system/timers"
const
MaxTraceLen = 20 # tracking the last 20 calls is enough
type
TProfileData {.compilerproc, final.} = object
procname: cstring
total: float
TStackTrace* = array [0..MaxTraceLen-1, cstring]
TProfilerHook* = proc (st: TStackTrace) {.nimcall.}
proc captureStackTrace(f: PFrame, st: var TStackTrace) =
const
firstCalls = 5
var
it = f
i = 0
total = 0
while it != nil and i <= high(st)-(firstCalls-1):
# the (-1) is for a nil entry that marks where the '...' should occur
st[i] = it.procname
inc(i)
inc(total)
it = it.prev
var b = it
while it != nil:
inc(total)
it = it.prev
for j in 1..total-i-(firstCalls-1):
if b != nil: b = b.prev
if total != i:
st[i] = "..."
inc(i)
while b != nil and i <= high(st):
st[i] = b.procname
inc(i)
b = b.prev
var
profileData {.compilerproc.}: array [0..64*1024-1, TProfileData]
profilerHook*: TProfilerHook
## set this variable to provide a procedure that implements a profiler in
## user space. See the `nimprof` module for a reference implementation.
SamplingInterval = 50_000
# set this to change the default sampling interval
gTicker = SamplingInterval
interval: TNanos = 5_000_000 # 5ms
proc sortProfile(a: var array[0..64*1024-1, TProfileData], N: int) =
# we use shellsort here; fast enough and simple
var h = 1
while true:
h = 3 * h + 1
if h > N: break
while true:
h = h div 3
for i in countup(h, N - 1):
var v = a[i]
var j = i
while a[j-h].total <= v.total:
a[j] = a[j-h]
j = j-h
if j < h: break
a[j] = v
if h == 1: break
proc callProfilerHook(hook: TProfilerHook) {.noinline.} =
# 'noinline' so that 'nimProfile' does not perform the stack allocation
# in the common case.
var st: TStackTrace
captureStackTrace(framePtr, st)
hook(st)
proc writeProfile() {.noconv.} =
const filename = "profile_results"
var i = 0
var f: TFile
var j = 1
while open(f, filename & $j & ".txt"):
close(f)
inc(j)
if open(f, filename & $j & ".txt", fmWrite):
var N = 0
# we have to compute the actual length of the array:
while profileData[N].procname != nil: inc(N)
sortProfile(profileData, N)
writeln(f, "total running time of each proc" &
" (interpret these numbers relatively)")
while profileData[i].procname != nil:
write(f, profileData[i].procname)
write(f, ": ")
writeln(f, profileData[i].total)
inc(i)
close(f)
proc setProfilingInterval*(intervalInUs: int): TNanos =
## set this to change the sampling interval. Default value is 5ms.
interval = intervalInUs * 1000
addQuitProc(writeProfile)
var t0: TTicks
var usefulCall* = 0
var uselessCall* = 0
proc nimProfile() =
## This is invoked by the compiler in every loop and on every proc entry!
dec gTicker
if gTicker == 0:
gTicker = -1
let t1 = getticks()
if getticks() - t0 > interval:
inc usefulCall
if not isNil(profilerHook):
# disable recursive calls: XXX should use try..finally,
# but that's too expensive!
let oldHook = profilerHook
profilerHook = nil
callProfilerHook(oldHook)
profilerHook = oldHook
t0 = getticks()
else:
inc uselessCall
gTicker = SamplingInterval
proc stopProfiling*() =
## call this to stop profiling; should be called by the client profiler.
profilerHook = nil
{.pop.}

View File

@@ -1,7 +1,6 @@
version 0.9.0
=============
- make 'm: stmt' use overloading resolution
- make 'bind' default for templates and introduce 'mixin'
- implicit deref for parameter matching
@@ -22,6 +21,7 @@ Bugs
- bug: the parser is not strict enough with newlines: 'echo "a" echo "b"'
compiles
- bug: blocks can "export" an identifier but the CCG generates {} for them ...
- bug: what if we pass an iterator to a proc var? iterators use tyProc too ...
version 0.9.XX
@@ -38,6 +38,7 @@ version 0.9.XX
echo a
echo b)
- make 'm: stmt' use overloading resolution
- implement "closure tuple consists of a single 'ref'" optimization
- implement for loop transformation for first class iterators
- JS gen: