bugfix: 'set' overloadable; further steps for multithreading support

Araq
2011-07-08 01:29:15 +02:00
parent 170573a87f
commit 99bcc233cd
34 changed files with 934 additions and 459 deletions

View File

@@ -57,7 +57,7 @@ type
nkStrLit, # a string literal ""
nkRStrLit, # a raw string literal r""
nkTripleStrLit, # a triple string literal """
nkMetaNode, # difficult to explan; represents itself
nkMetaNode, # difficult to explain; represents itself
# (used for macros)
nkNilLit, # the nil literal
# end of atoms
@@ -217,7 +217,7 @@ type
sfDiscriminant, # field is a discriminant in a record/object
sfDeprecated, # symbol is deprecated
sfInClosure, # variable is accessed by a closure
sfTypeCheck, # wether macro parameters should be type checked
sfThread, # proc will run as a thread
sfCompileTime, # proc can be evaluated at compile time
sfThreadVar, # variable is a thread variable
sfMerge, # proc can be merged with itself
@@ -274,7 +274,8 @@ type
tfFinal, # is the object final?
tfAcyclic, # type is acyclic (for GC optimization)
tfEnumHasHoles, # enum cannot be mapped into a range
tfShallow # type can be shallow copied on assignment
tfShallow, # type can be shallow copied on assignment
tfThread # proc type is marked as ``thread``
TTypeFlags* = set[TTypeFlag]
@@ -310,7 +311,7 @@ type
TMagic* = enum # symbols that require compiler magic:
mNone, mDefined, mDefinedInScope, mLow, mHigh, mSizeOf, mIs,
mEcho, mCreateThread, mShallowCopy,
mEcho, mShallowCopy,
mUnaryLt, mSucc,
mPred, mInc, mDec, mOrd, mNew, mNewFinalize, mNewSeq, mLengthOpenArray,
mLengthStr, mLengthArray, mLengthSeq, mIncl, mExcl, mCard, mChr, mGCref,

View File

@@ -1452,7 +1452,6 @@ proc genMagicExpr(p: BProc, e: PNode, d: var TLoc, op: TMagic) =
of mIncl, mExcl, mCard, mLtSet, mLeSet, mEqSet, mMulSet, mPlusSet, mMinusSet,
mInSet:
genSetOp(p, e, d, op)
of mCreateThread: genCall(p, e, d)
of mNewString, mNewStringOfCap, mCopyStr, mCopyStrLast, mExit:
var opr = e.sons[0].sym
if lfNoDecl notin opr.loc.flags:

View File

@@ -10,24 +10,23 @@
## Thread var support for crappy architectures that lack native support for
## thread local storage.
proc emulatedThreadVars(): bool {.inline.} =
result = optThreads in gGlobalOptions
# NOW: Use the work-around everywhere, because it should be faster anyway.
#platform.OS[targetOS].props.contains(ospLacksThreadVars)
proc AccessThreadLocalVar(p: BProc, s: PSym) =
if optThreads in gGlobalOptions:
if platform.OS[targetOS].props.contains(ospLacksThreadVars):
if not p.ThreadVarAccessed:
p.ThreadVarAccessed = true
p.module.usesThreadVars = true
appf(p.s[cpsLocals], "NimThreadVars* NimTV;$n")
appcg(p, cpsInit, "NimTV=(NimThreadVars*)#GetThreadLocalVars();$n")
if emulatedThreadVars() and not p.ThreadVarAccessed:
p.ThreadVarAccessed = true
p.module.usesThreadVars = true
appf(p.s[cpsLocals], "NimThreadVars* NimTV;$n")
appcg(p, cpsInit, "NimTV=(NimThreadVars*)#GetThreadLocalVars();$n")
var
nimtv: PRope # nimrod thread vars
nimtvDeps: seq[PType] = @[]
nimtvDeclared = initIntSet()
proc emulatedThreadVars(): bool {.inline.} =
result = optThreads in gGlobalOptions and
platform.OS[targetOS].props.contains(ospLacksThreadVars)
proc declareThreadVar(m: BModule, s: PSym, isExtern: bool) =
if emulatedThreadVars():
# we gather all thread local vars into a struct; we need to allocate
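To illustrate the user-facing side, a minimal sketch (the variable name is made up, and it assumes the pragma is spelled ``threadvar``, matching the ``sfThreadVar`` flag above):

var reqCount {.threadvar.}: int # on targets lacking native TLS this presumably
                                # becomes a field of the NimThreadVars struct

proc bumpCount() =
  inc(reqCount) # compiled to an access through NimTV, as emitted above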

View File

@@ -94,7 +94,7 @@ type
warnSmallLshouldNotBeUsed, warnUnknownMagic, warnRedefinitionOfLabel,
warnUnknownSubstitutionX, warnLanguageXNotSupported, warnCommentXIgnored,
warnXisPassedToProcVar, warnDerefDeprecated, warnAnalysisLoophole,
warnDifferentHeaps,
warnDifferentHeaps, warnWriteToForeignHeap,
warnUser,
hintSuccess, hintSuccessX,
hintLineTooLong, hintXDeclaredButNotUsed, hintConvToBaseNotNeeded,
@@ -328,7 +328,8 @@ const
warnXisPassedToProcVar: "\'$1\' is passed to a procvar; deprecated [XisPassedToProcVar]",
warnDerefDeprecated: "p^ is deprecated; use p[] instead [DerefDeprecated]",
warnAnalysisLoophole: "thread analysis incomplete due to unknown call '$1' [AnalysisLoophole]",
warnDifferentHeaps: "possible inconsistency of thread local heaps",
warnDifferentHeaps: "possible inconsistency of thread local heaps [DifferentHeaps]",
warnWriteToForeignHeap: "write to foreign heap [WriteToForeignHeap]",
warnUser: "$1 [User]",
hintSuccess: "operation successful [Success]",
hintSuccessX: "operation successful ($1 lines compiled; $2 sec total) [SuccessX]",
@@ -345,13 +346,13 @@ const
hintPath: "added path: '$1' [Path]",
hintUser: "$1 [User]"]
const
WarningsToStr*: array[0..17, string] = ["CannotOpenFile", "OctalEscape",
const
WarningsToStr*: array[0..18, string] = ["CannotOpenFile", "OctalEscape",
"XIsNeverRead", "XmightNotBeenInit", "CannotWriteMO2", "CannotReadMO2",
"Deprecated", "SmallLshouldNotBeUsed", "UnknownMagic",
"RedefinitionOfLabel", "UnknownSubstitutionX", "LanguageXNotSupported",
"CommentXIgnored", "XisPassedToProcVar", "DerefDeprecated",
"AnalysisLoophole", "DifferentHeaps", "User"]
"AnalysisLoophole", "DifferentHeaps", "WriteToForeignHeap", "User"]
HintsToStr*: array[0..13, string] = ["Success", "SuccessX", "LineTooLong",
"XDeclaredButNotUsed", "ConvToBaseNotNeeded", "ConvFromXtoItselfNotNeeded",

View File

@@ -22,11 +22,11 @@ const
procPragmas* = {FirstCallConv..LastCallConv, wImportc, wExportc, wNodecl,
wMagic, wNosideEffect, wSideEffect, wNoreturn, wDynLib, wHeader,
wCompilerProc, wPure, wProcVar, wDeprecated, wVarargs, wCompileTime, wMerge,
wBorrow, wExtern, wImportCompilerProc}
wBorrow, wExtern, wImportCompilerProc, wThread}
converterPragmas* = procPragmas
methodPragmas* = procPragmas
macroPragmas* = {FirstCallConv..LastCallConv, wImportc, wExportc, wNodecl,
wMagic, wNosideEffect, wCompilerProc, wDeprecated, wTypeCheck, wExtern}
wMagic, wNosideEffect, wCompilerProc, wDeprecated, wExtern}
iteratorPragmas* = {FirstCallConv..LastCallConv, wNosideEffect, wSideEffect,
wImportc, wExportc, wNodecl, wMagic, wDeprecated, wBorrow, wExtern}
stmtPragmas* = {wChecks, wObjChecks, wFieldChecks, wRangechecks, wBoundchecks,
@@ -37,7 +37,7 @@ const
wInfChecks, wNanChecks, wPragma, wEmit, wUnroll, wLinearScanEnd}
lambdaPragmas* = {FirstCallConv..LastCallConv, wImportc, wExportc, wNodecl,
wNosideEffect, wSideEffect, wNoreturn, wDynLib, wHeader, wPure,
wDeprecated, wExtern}
wDeprecated, wExtern, wThread}
typePragmas* = {wImportc, wExportc, wDeprecated, wMagic, wAcyclic, wNodecl,
wPure, wHeader, wCompilerProc, wFinal, wSize, wExtern, wShallow}
fieldPragmas* = {wImportc, wExportc, wDeprecated, wExtern}
@@ -45,7 +45,8 @@ const
wMagic, wHeader, wDeprecated, wCompilerProc, wDynLib, wExtern}
constPragmas* = {wImportc, wExportc, wHeader, wDeprecated, wMagic, wNodecl,
wExtern}
procTypePragmas* = {FirstCallConv..LastCallConv, wVarargs, wNosideEffect}
procTypePragmas* = {FirstCallConv..LastCallConv, wVarargs, wNosideEffect,
wThread}
proc pragma*(c: PContext, sym: PSym, n: PNode, validPragmas: TSpecialWords)
proc pragmaAsm*(c: PContext, n: PNode): char
@@ -125,10 +126,9 @@ proc processMagic(c: PContext, n: PNode, s: PSym) =
s.magic = m
break
if s.magic == mNone: Message(n.info, warnUnknownMagic, v)
if s.magic != mCreateThread:
# magics don't need an implementation, so we
# treat them as imported, instead of modifying a lot of working code:
incl(s.flags, sfImportc)
# magics don't need an implementation, so we
# treat them as imported, instead of modifying a lot of working code:
incl(s.flags, sfImportc)
proc wordToCallConv(sw: TSpecialWord): TCallingConvention =
# this assumes that the order of special words and calling conventions is
@@ -500,9 +500,11 @@ proc pragma(c: PContext, sym: PSym, n: PNode, validPragmas: TSpecialWords) =
noVal(it)
if sym.typ == nil: invalidPragma(it)
incl(sym.typ.flags, tfShallow)
of wTypeCheck:
of wThread:
noVal(it)
incl(sym.flags, sfTypeCheck)
incl(sym.flags, sfThread)
incl(sym.flags, sfProcVar)
if sym.typ != nil: incl(sym.typ.flags, tfThread)
of wHint: Message(it.info, hintUser, expectStrLit(c, it))
of wWarning: Message(it.info, warnUser, expectStrLit(c, it))
of wError: LocalError(it.info, errUser, expectStrLit(c, it))
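A small sketch of the new pragma in use (per the handler above, ``thread`` takes no value, sets ``sfThread`` and ``sfProcVar``, and tags the proc's type with ``tfThread``; the proc itself is made up):

proc worker(interval: int) {.thread.} =
  # intended to run as a thread; sfProcVar also makes it usable as a proc var
  echo(interval)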

View File

@@ -203,8 +203,9 @@ proc myProcess(context: PPassContext, n: PNode): PNode =
result = ast.emptyNode
proc checkThreads(c: PContext) =
if not needsGlobalAnalysis(): return
for i in 0 .. c.threadEntries.len-1:
semthreads.AnalyseThread(c.threadEntries[i])
semthreads.AnalyseThreadProc(c.threadEntries[i])
proc myClose(context: PPassContext, n: PNode): PNode =
var c = PContext(context)

View File

@@ -41,7 +41,7 @@ type
generics*: PNode # a list of the things to compile; list of
# nkExprEqExpr nodes which contain the
# generic symbol and the instantiated symbol
threadEntries*: PNode # list of thread entries to check
threadEntries*: TSymSeq # list of thread entries to check
lastGenericIdx*: int # used for the generics stack
tab*: TSymTab # each module has its own symbol table
AmbiguousSymbols*: TIntSet # ids of all ambiguous symbols (cannot
@@ -125,7 +125,7 @@ proc newContext(module: PSym, nimfile: string): PContext =
append(result.optionStack, newOptionEntry())
result.module = module
result.generics = newNode(nkStmtList)
result.threadEntries = newNode(nkStmtList)
result.threadEntries = @[]
result.converters = @[]
result.filename = nimfile
result.includedFiles = initIntSet()

View File

@@ -41,8 +41,8 @@ proc semSymGenericInstantiation(c: PContext, n: PNode, s: PSym): PNode =
proc semSym(c: PContext, n: PNode, s: PSym, flags: TExprFlags): PNode =
case s.kind
of skProc, skMethod, skIterator, skConverter:
if not (sfProcVar in s.flags) and (s.typ.callConv == ccDefault) and
(getModule(s).id != c.module.id):
if sfProcVar notin s.flags and s.typ.callConv == ccDefault and
getModule(s).id != c.module.id:
LocalError(n.info, errXCannotBePassedToProcVar, s.name.s)
result = symChoice(c, n, s)
of skConst:
@@ -103,8 +103,8 @@ proc checkConvertible(info: TLineInfo, castDest, src: PType) =
d = base(d)
s = base(s)
if d == nil:
GlobalError(info, errGenerated, `%`(msgKindToString(errIllegalConvFromXtoY), [
typeToString(src), typeToString(castDest)]))
GlobalError(info, errGenerated, msgKindToString(errIllegalConvFromXtoY) % [
src.typeToString, castDest.typeToString])
elif d.Kind == tyObject and s.Kind == tyObject:
checkConversionBetweenObjects(info, d, s)
elif (skipTypes(castDest, abstractVarRange).Kind in IntegralTypes) and
@@ -195,15 +195,13 @@ proc semIs(c: PContext, n: PNode): PNode =
if sonsLen(n) == 3:
n.sons[1] = semExprWithType(c, n.sons[1], {efAllowType})
n.sons[2] = semExprWithType(c, n.sons[2], {efAllowType})
var a = n.sons[1].typ
var b = n.sons[2].typ
# a and b can be nil in case of an error:
if a != nil and b != nil:
if (b.kind != tyObject) or (a.kind != tyObject):
GlobalError(n.info, errIsExpectsObjectTypes)
while (b != nil) and (b.id != a.id): b = b.sons[0]
if b == nil:
GlobalError(n.info, errXcanNeverBeOfThisSubtype, typeToString(a))
var a = skipTypes(n.sons[1].typ, abstractPtrs)
var b = skipTypes(n.sons[2].typ, abstractPtrs)
if b.kind != tyObject or a.kind != tyObject:
GlobalError(n.info, errIsExpectsObjectTypes)
while b != nil and b.id != a.id: b = b.sons[0]
if b == nil:
GlobalError(n.info, errXcanNeverBeOfThisSubtype, typeToString(a))
n.typ = getSysType(tyBool)
else:
GlobalError(n.info, errIsExpectsTwoArguments)
@@ -338,13 +336,13 @@ proc isAssignable(n: PNode): TAssignableResult =
result = arNone
case n.kind
of nkSym:
if (n.sym.kind in {skVar, skTemp}): result = arLValue
if n.sym.kind in {skVar, skTemp}: result = arLValue
of nkDotExpr:
if skipTypes(n.sons[0].typ, abstractInst).kind in {tyVar, tyPtr, tyRef}:
result = arLValue
else:
result = isAssignable(n.sons[0])
if (result == arLValue) and (sfDiscriminant in n.sons[1].sym.flags):
if result == arLValue and sfDiscriminant in n.sons[1].sym.flags:
result = arDiscriminant
of nkBracketExpr:
if skipTypes(n.sons[0].typ, abstractInst).kind in {tyVar, tyPtr, tyRef}:
@@ -400,7 +398,7 @@ proc analyseIfAddressTakenInCall(c: PContext, n: PNode) =
mAppendSeqElem, mNewSeq, mReset, mShallowCopy}
checkMinSonsLen(n, 1)
var t = n.sons[0].typ
if (n.sons[0].kind == nkSym) and (n.sons[0].sym.magic in FakeVarParams):
if n.sons[0].kind == nkSym and n.sons[0].sym.magic in FakeVarParams:
# BUGFIX: check for L-Value still needs to be done for the arguments!
for i in countup(1, sonsLen(n) - 1):
if i < sonsLen(t) and t.sons[i] != nil and
@@ -409,8 +407,8 @@ proc analyseIfAddressTakenInCall(c: PContext, n: PNode) =
LocalError(n.sons[i].info, errVarForOutParamNeeded)
return
for i in countup(1, sonsLen(n) - 1):
if (i < sonsLen(t)) and
(skipTypes(t.sons[i], abstractInst).kind == tyVar):
if i < sonsLen(t) and
skipTypes(t.sons[i], abstractInst).kind == tyVar:
n.sons[i] = analyseIfAddressTaken(c, n.sons[i])
proc semDirectCallAnalyseEffects(c: PContext, n: PNode,
@@ -466,7 +464,7 @@ proc semIndirectOp(c: PContext, n: PNode, flags: TExprFlags): PNode =
result = m.call
# we assume that a procedure that calls something indirectly
# has side-effects:
if not (tfNoSideEffect in t.flags): incl(c.p.owner.flags, sfSideEffect)
if tfNoSideEffect notin t.flags: incl(c.p.owner.flags, sfSideEffect)
else:
result = overloadedCallOpr(c, n)
# Now that nkSym does not imply an iteration over the proc/iterator space,
@@ -845,10 +843,6 @@ proc semMagic(c: PContext, n: PNode, s: PSym, flags: TExprFlags): PNode =
of mSizeOf: result = semSizeof(c, setMs(n, s))
of mIs: result = semIs(c, setMs(n, s))
of mEcho: result = semEcho(c, setMs(n, s))
of mCreateThread:
result = semDirectOp(c, n, flags)
if semthreads.needsGlobalAnalysis():
c.threadEntries.add(result)
of mShallowCopy:
if sonsLen(n) == 3:
# XXX ugh this is really a hack: shallowCopy() can be overloaded only
@@ -1103,12 +1097,17 @@ proc semExpr(c: PContext, n: PNode, flags: TExprFlags = {}): PNode =
of skTemplate: result = semTemplateExpr(c, n, s)
of skType:
if n.kind != nkCall: GlobalError(n.info, errXisNotCallable, s.name.s)
# XXX does this check make any sense?
result = semConv(c, n, s)
# XXX think about this more (``set`` procs)
if n.len == 2:
result = semConv(c, n, s)
elif Contains(c.AmbiguousSymbols, s.id):
LocalError(n.info, errUseQualifier, s.name.s)
elif s.magic == mNone: result = semDirectOp(c, n, flags)
else: result = semMagic(c, n, s, flags)
of skProc, skMethod, skConverter, skIterator:
if s.magic == mNone: result = semDirectOp(c, n, flags)
else: result = semMagic(c, n, s, flags)
else:
#liMessage(n.info, warnUser, renderTree(n));
result = semIndirectOp(c, n, flags)
elif n.sons[0].kind == nkSymChoice:
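A hypothetical sketch of what the relaxed branch permits, going by the commit title ('set' overloadable): a call to a symbol that names a type is forced into a conversion only when it has exactly one argument, so a proc sharing a type's name can now be resolved:

proc set(dest: var int, val: int) = dest = val # hypothetical overload of a type name

var x: int
set(x, 7)        # more than one argument: overload resolution, not a conversion
var f = float(7) # single argument: still treated as a type conversion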

View File

@@ -549,10 +549,12 @@ proc sideEffectsCheck(c: PContext, s: PSym) =
if {sfNoSideEffect, sfSideEffect} * s.flags ==
{sfNoSideEffect, sfSideEffect}:
LocalError(s.info, errXhasSideEffects, s.name.s)
elif sfThread in s.flags and semthreads.needsGlobalAnalysis():
c.threadEntries.add(s)
proc addResult(c: PContext, t: PType, info: TLineInfo) =
if t != nil:
var s = newSym(skVar, getIdent("result"), getCurrOwner())
var s = newSym(skVar, getIdent"result", getCurrOwner())
s.info = info
s.typ = t
incl(s.flags, sfResult)
@@ -566,7 +568,7 @@ proc addResultNode(c: PContext, n: PNode) =
proc semLambda(c: PContext, n: PNode): PNode =
result = n
checkSonsLen(n, codePos + 1)
var s = newSym(skProc, getIdent(":anonymous"), getCurrOwner())
var s = newSym(skProc, getIdent":anonymous", getCurrOwner())
s.info = n.info
s.ast = n
n.sons[namePos] = newSymNode(s)
@@ -594,10 +596,11 @@ proc semLambda(c: PContext, n: PNode): PNode =
popProcCon(c)
else:
LocalError(n.info, errImplOfXexpected, s.name.s)
sideEffectsCheck(c, s)
closeScope(c.tab) # close scope for parameters
popOwner()
result.typ = s.typ
proc semProcAux(c: PContext, n: PNode, kind: TSymKind,
validPragmas: TSpecialWords): PNode =
var

View File

@@ -18,7 +18,7 @@
## The only crucial operation that can violate the heap invariants is the
## write access. The analysis needs to distinguish between 'unknown', 'mine',
## and 'theirs' memory and pointers. Assignments 'whatever <- unknown' are
## invalid, and so are 'theirs <- mine' but not 'mine <- theirs'. Since
## invalid, and so are 'theirs <- whatever' but not 'mine <- theirs'. Since
## strings and sequences are heap allocated they are affected too:
##
## .. code-block:: nimrod
@@ -30,8 +30,9 @@
## If the type system would distinguish between 'ref' and '!ref' and threads
## could not have '!ref' as input parameters the analysis could simply need to
## reject any write access to a global variable which contains GC'ed data.
## However, '!ref' is not implemented yet and this scheme would be too
## restrictive anyway.
## Thanks to the write barrier of the GC, this is exactly what needs to be
## done! Every write access to a global that contains GC'ed data needs to
## be prevented! Unfortunately '!ref' is not implemented yet...
##
## The assignment target is essential for the algorithm: only
## write access to heap locations and global variables are critical and need
@@ -42,7 +43,8 @@
##
## var x = globalVar # 'x' points to 'theirs'
## while true:
## globalVar = x # OK: 'theirs <- theirs'
## globalVar = x # NOT OK: 'theirs <- theirs' invalid due to
## # write barrier!
## x = "new string" # ugh: 'x is toUnknown'!
##
## --> Solution: toUnknown is never allowed anywhere!
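A minimal sketch of a write the analysis is meant to flag, assuming globals holding GC'ed data count as 'theirs' inside a thread proc (see ``analyseSym`` below):

var cache: seq[string] = @[] # global GC'ed data: foreign heap to a thread

proc worker(arg: string) {.thread.} =
  add(cache, arg) # 'theirs <- whatever': should trigger the new
                  # warnWriteToForeignHeap warning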
@@ -106,10 +108,12 @@ proc analyseSym(c: PProcCtx, n: PNode): TThreadOwner =
if result != toUndefined: return
case v.kind
of skVar:
result = toNil
if sfGlobal in v.flags:
result = if sfThreadVar in v.flags: toMine else: toTheirs
else:
result = toNil
if sfThreadVar in v.flags:
result = toMine
elif containsTyRef(v.typ):
result = toTheirs
of skTemp, skForVar: result = toNil
of skConst: result = toMine
of skParam:
@@ -136,7 +140,8 @@ proc writeAccess(c: PProcCtx, n: PNode, owner: TThreadOwner) =
of toNil:
c.mapping[v.id] = owner # fine, toNil can be overwritten
of toVoid, toUndefined: InternalError(n.info, "writeAccess")
of toTheirs, toMine:
of toTheirs: Message(n.info, warnWriteToForeignHeap)
of toMine:
if lastOwner != owner and owner != toNil:
Message(n.info, warnDifferentHeaps)
else:
@@ -145,7 +150,8 @@ proc writeAccess(c: PProcCtx, n: PNode, owner: TThreadOwner) =
case lastOwner
of toNil: nil # fine, toNil can be overwritten
of toVoid, toUndefined: InternalError(n.info, "writeAccess")
of toTheirs, toMine:
of toTheirs: Message(n.info, warnWriteToForeignHeap)
of toMine:
if lastOwner != owner and owner != toNil:
Message(n.info, warnDifferentHeaps)
@@ -171,7 +177,8 @@ proc analyseCall(c: PProcCtx, n: PNode): TThreadOwner =
newCtx.mapping[formal.id] = call.args[i-1]
pushInfoContext(n.info)
result = analyse(newCtx, prc.ast.sons[codePos])
if prc.ast.sons[codePos].kind == nkEmpty and sfNoSideEffect notin prc.flags:
if prc.ast.sons[codePos].kind == nkEmpty and
{sfNoSideEffect, sfThread} * prc.flags == {}:
Message(n.info, warnAnalysisLoophole, renderTree(n))
if prc.typ.sons[0] != nil:
if prc.ast.len > resultPos:
@@ -228,7 +235,7 @@ proc analyseArgs(c: PProcCtx, n: PNode, start = 1) =
proc analyseOp(c: PProcCtx, n: PNode): TThreadOwner =
if n[0].kind != nkSym or n[0].sym.kind != skProc:
if tfNoSideEffect notin n[0].typ.flags:
if {tfNoSideEffect, tfThread} * n[0].typ.flags == {}:
Message(n.info, warnAnalysisLoophole, renderTree(n))
result = toNil
else:
@@ -335,22 +342,26 @@ proc analyse(c: PProcCtx, n: PNode): TThreadOwner =
result = toVoid
else: InternalError(n.info, "analysis not implemented for: " & $n.kind)
proc analyseThreadCreationCall(n: PNode) =
# thread proc is second param of ``createThread``:
if n[2].kind != nkSym or n[2].sym.kind != skProc:
Message(n.info, warnAnalysisLoophole, renderTree(n))
return
var prc = n[2].sym
proc analyseThreadProc*(prc: PSym) =
var c = newProcCtx(prc)
var formal = skipTypes(prc.typ, abstractInst).n.sons[1].sym
c.mapping[formal.id] = toTheirs # thread receives foreign data!
var formals = skipTypes(prc.typ, abstractInst).n
for i in 1 .. formals.len-1:
var formal = formals.sons[i].sym
c.mapping[formal.id] = toTheirs # thread receives foreign data!
discard analyse(c, prc.ast.sons[codePos])
when false:
proc analyseThreadCreationCall(n: PNode) =
# thread proc is second param of ``createThread``:
if n[2].kind != nkSym or n[2].sym.kind != skProc:
Message(n.info, warnAnalysisLoophole, renderTree(n))
return
analyseProc(n[2].sym)
proc AnalyseThread*(threadCreation: PNode) =
analyseThreadCreationCall(threadCreation)
proc needsGlobalAnalysis*: bool =
result = gGlobalOptions * {optThreads, optThreadAnalysis} ==
{optThreads, optThreadAnalysis}
proc AnalyseThread*(threadCreation: PNode) =
if needsGlobalAnalysis():
analyseThreadCreationCall(threadCreation)

View File

@@ -220,6 +220,9 @@ proc procTypeRel(mapping: var TIdTable, f, a: PType): TTypeRelation =
result = isNone
if tfNoSideEffect in f.flags and tfNoSideEffect notin a.flags:
result = isNone
elif tfThread in f.flags and a.flags * {tfThread, tfNoSideEffect} == {}:
# noSideEffect implies ``tfThread``!
result = isNone
else: nil
proc typeRel(mapping: var TIdTable, f, a: PType): TTypeRelation =
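A sketch of the resulting proc-type compatibility (all names illustrative):

type TWorker = proc (x: int) {.thread.}

proc a(x: int) {.thread.} = nil
proc b(x: int) {.noSideEffect.} = nil
proc c(x: int) = echo(x)

var w: TWorker
w = a # ok: formal and actual are both ``thread``
w = b # ok: ``noSideEffect`` counts as thread-safe here
w = c # isNone: neither ``thread`` nor ``noSideEffect``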

View File

@@ -331,6 +331,13 @@ proc containsGarbageCollectedRef(typ: PType): bool =
# that are garbage-collected)
result = searchTypeFor(typ, isGBCRef)
proc isTyRef(t: PType): bool =
result = t.kind == tyRef
proc containsTyRef*(typ: PType): bool =
# returns true if typ contains a 'ref'
result = searchTypeFor(typ, isTyRef)
proc isHiddenPointer(t: PType): bool =
result = t.kind in {tyString, tySequence}
@@ -484,6 +491,9 @@ proc TypeToString(typ: PType, prefer: TPreferedDesc = preferName): string =
if tfNoSideEffect in t.flags:
addSep(prag)
add(prag, "noSideEffect")
if tfThread in t.flags:
addSep(prag)
add(prag, "thread")
if len(prag) != 0: add(result, "{." & prag & ".}")
else:
result = typeToStr[t.kind]

View File

@@ -33,7 +33,7 @@ type
wWithout, wXor, wYield,
wColon, wColonColon, wEquals, wDot, wDotDot, wHat, wStar, wMinus,
wMagic, wTypeCheck, wFinal, wProfiler, wObjChecks,
wMagic, wThread, wFinal, wProfiler, wObjChecks,
wImportCompilerProc,
wImportc, wExportc, wExtern,
wAlign, wNodecl, wPure, wVolatile, wRegister, wSideeffect, wHeader,
@@ -80,7 +80,7 @@ const
"yield",
":", "::", "=", ".", "..", "^", "*", "-",
"magic", "typecheck", "final", "profiler", "objchecks",
"magic", "thread", "final", "profiler", "objchecks",
"importcompilerproc", "importc", "exportc", "extern",
"align", "nodecl", "pure", "volatile", "register", "sideeffect",
"header", "nosideeffect", "noreturn", "merge", "lib", "dynlib",

View File

@@ -52,6 +52,8 @@ Collections and algorithms
* `lists <lists.html>`_
Nimrod linked list support. Contains singly and doubly linked lists and
circular lists ("rings").
* `queues <queues.html>`_
Implementation of a queue. The underlying implementation uses a ``seq``.
* `intsets <intsets.html>`_
Efficient implementation of a set of ints as a sparse bit set.

View File

@@ -45,9 +45,9 @@ manually. An alternative is to create a symbolic link in ``/usr/bin``::
[sudo] ln -s $your_install_dir/bin/nimrod /usr/bin/nimrod
From version 0.7.10 onwards ``install.sh`` and ``deinstall.sh`` scripts are
provided for distributing the files over the UNIX hierarchy. However,
updating your Nimrod installation is more cumbersome then.
There are also ``install.sh`` and ``deinstall.sh`` scripts for distributing
the files over the UNIX hierarchy. However, updating your Nimrod installation
is then more cumbersome.
Installation on the Macintosh

View File

@@ -0,0 +1,89 @@
#
#
# Nimrod's Runtime Library
# (c) Copyright 2011 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## Implementation of a queue. The underlying implementation uses a ``seq``.
import math
type
TQueue* {.pure, final.}[T] = object ## a queue
data: seq[T]
rd, wr, count, mask: int
proc initQueue*[T](initialSize=4): TQueue[T] =
## creates a new queue. `initialSize` needs to be a power of 2.
assert IsPowerOfTwo(initialSize)
result.mask = initialSize-1
newSeq(result.data, initialSize)
proc len*[T](q: TQueue[T]): int =
## returns the number of elements of `q`.
result = q.count
iterator items*[T](q: TQueue[T]): T =
## yields every element of `q`.
var i = q.rd
var c = q.count
while c > 0:
dec c
yield q.data[i]
i = (i + 1) and q.mask
proc add*[T](q: var TQueue[T], item: T) =
## adds an `item` to the end of the queue `q`.
var cap = q.mask+1
if q.count >= cap:
var n: seq[T]
newSeq(n, cap*2)
var i = 0
for x in items(q):
shallowCopy(n[i], x)
inc i
shallowCopy(q.data, n)
q.mask = cap*2 - 1
q.wr = q.count
q.rd = 0
inc q.count
q.data[q.wr] = item
q.wr = (q.wr + 1) and q.mask
proc enqueue*[T](q: var TQueue[T], item: T) =
## alias for the ``add`` operation.
add(q, item)
proc dequeue*[T](q: var TQueue[T]): T =
## removes and returns the first element of the queue `q`.
assert q.count > 0
dec q.count
result = q.data[q.rd]
q.rd = (q.rd + 1) and q.mask
proc `$`*[T](q: TQueue[T]): string =
## turns a queue into its string representation.
result = "["
for x in items(q):
if result.len > 1: result.add(", ")
result.add($x)
result.add("]")
when isMainModule:
var q = initQueue[int]()
q.add(123)
q.add(9)
q.add(4)
var first = q.dequeue
q.add(56)
q.add(6)
var second = q.dequeue
q.add(789)
assert first == 123
assert second == 9
assert($q == "[4, 56, 6, 789]")
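A short usage sketch of the new module (imported as ``queues``, matching the library index entry added above):

import queues

var q = initQueue[int]() # initial size defaults to 4, a power of two
q.enqueue(1)
q.enqueue(2)
echo(q.dequeue()) # --> 1
echo(q)           # --> [2]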

View File

@@ -8,7 +8,26 @@
#
## This module contains procs for serialization and deserialization of
## arbitrary Nimrod data structures. The serialization format uses JSON.
##
## **Restriction**: For objects their type is **not** serialized. This means
## essentially that it does not work if the object has some other runtime
## type than its compiletime type:
##
## .. code-block:: nimrod
##
## type
## TA = object
## TB = object of TA
## f: int
##
## var
## a: ref TA
## b: ref TB
##
## new(b)
## a = b
## echo($$a[]) # produces "{}", not "{f: 0}"
import streams, typeinfo, json, intsets, tables
@@ -286,3 +305,15 @@ when isMainModule:
echo($$test7)
testit(test7)
type
TA = object
TB = object of TA
f: int
var
a: ref TA
b: ref TB
new(b)
a = b
echo($$a[]) # produces "{}", not "{f: 0}"

View File

@@ -77,11 +77,14 @@ proc startProcess*(command: string,
## If ``env == nil`` the environment is inherited of
## the parent process. `options` are additional flags that may be passed
## to `startProcess`. See the documentation of ``TProcessOption`` for the
## meaning of these flags.
## meaning of these flags. You need to `close` the process when done.
##
## Return value: The newly created process object. Nil is never returned,
## but ``EOS`` is raised in case of an error.
proc close*(p: PProcess) {.rtl, extern: "nosp$1".}
## When the process has finished executing, clean up related handles
proc suspend*(p: PProcess) {.rtl, extern: "nosp$1".}
## Suspends the process `p`.
@@ -179,6 +182,7 @@ proc execProcesses*(cmds: openArray[string],
err.add("\n")
echo(err)
result = max(waitForExit(q[r]), result)
if q[r] != nil: close(q[r])
q[r] = startProcessAux(cmds[i], options=options)
r = (r + 1) mod n
else:
@@ -189,15 +193,18 @@ proc execProcesses*(cmds: openArray[string],
if not running(q[r]):
#echo(outputStream(q[r]).readLine())
result = max(waitForExit(q[r]), result)
if q[r] != nil: close(q[r])
q[r] = startProcessAux(cmds[i], options=options)
inc(i)
if i > high(cmds): break
for i in 0..m-1:
if q[i] != nil: close(q[i])
result = max(waitForExit(q[i]), result)
else:
for i in 0..high(cmds):
var p = startProcessAux(cmds[i], options=options)
result = max(waitForExit(p), result)
close(p)
proc select*(readfds: var seq[PProcess], timeout = 500): int
## `select` with a sensible Nimrod interface. `timeout` is in milliseconds.
@@ -215,6 +222,8 @@ when not defined(useNimRtl):
while running(p) or not outp.atEnd(outp):
result.add(outp.readLine())
result.add("\n")
outp.close(outp)
close(p)
when false:
proc deallocCStringArray(a: cstringArray) =
@@ -356,6 +365,12 @@ when defined(Windows) and not defined(useNimRtl):
result.FProcessHandle = procInfo.hProcess
result.id = procInfo.dwProcessID
proc close(p: PProcess) =
discard CloseHandle(p.inputHandle)
discard CloseHandle(p.outputHandle)
discard CloseHandle(p.errorHandle)
discard CloseHandle(p.FProcessHandle)
proc suspend(p: PProcess) =
discard SuspendThread(p.FProcessHandle)
@@ -523,6 +538,11 @@ elif not defined(useNimRtl):
discard close(p_stdin[readIdx])
discard close(p_stdout[writeIdx])
proc close(p: PProcess) =
discard close(p.inputHandle)
discard close(p.outputHandle)
discard close(p.errorHandle)
proc suspend(p: PProcess) =
discard kill(p.id, SIGSTOP)
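A minimal lifetime sketch for the new ``close`` (assuming the remaining ``startProcess`` parameters all have defaults, as the doc comment's ``env == nil`` case suggests):

import osproc

var p = startProcess("ls")
discard waitForExit(p)
close(p) # per the new doc comment: clean up the handles when done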

View File

@@ -785,6 +785,10 @@ when hasThreadSupport and not hasSharedHeap:
else:
{.pragma: rtlThreadVar.}
template sysAssert(cond: expr) =
# change this to activate system asserts
nil
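Activating the checks is then presumably a one-line change, along these lines:

template sysAssert(cond: expr) =
  # activated variant: forward to the regular assert
  assert(cond)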
include "system/inclrtl"
when not defined(ecmascript) and not defined(nimrodVm):
@@ -1251,7 +1255,7 @@ proc each*[T](data: var openArray[T], op: proc (x: var T)) =
for i in 0..data.len-1: op(data[i])
iterator fields*[T: tuple](x: T): expr {.magic: "Fields", noSideEffect.}
## iterates over every field of `x`. Warning: This is really transforms
## iterates over every field of `x`. Warning: This really transforms
## the 'for' and unrolls the loop. The current implementation also has a bug
## that affects symbol binding in the loop body.
iterator fields*[S: tuple, T: tuple](x: S, y: T): tuple[a, b: expr] {.
@@ -1261,13 +1265,13 @@ iterator fields*[S: tuple, T: tuple](x: S, y: T): tuple[a, b: expr] {.
## The current implementation also has a bug that affects symbol binding
## in the loop body.
iterator fieldPairs*[T: tuple](x: T): expr {.magic: "FieldPairs", noSideEffect.}
## iterates over every field of `x`. Warning: This is really transforms
## iterates over every field of `x`. Warning: This really transforms
## the 'for' and unrolls the loop. The current implementation also has a bug
## that affects symbol binding in the loop body.
iterator fieldPairs*[S: tuple, T: tuple](x: S, y: T): tuple[a, b: expr] {.
magic: "FieldPairs", noSideEffect.}
## iterates over every field of `x` and `y`.
## Warning: This is really transforms the 'for' and unrolls the loop.
## Warning: This really transforms the 'for' and unrolls the loop.
## The current implementation also has a bug that affects symbol binding
## in the loop body.
@@ -1703,10 +1707,10 @@ when not defined(EcmaScript) and not defined(NimrodVM):
# ----------------------------------------------------------------------------
proc atomicInc*(memLoc: var int, x: int): int {.inline.}
proc atomicInc*(memLoc: var int, x: int = 1): int {.inline.}
## atomic increment of `memLoc`. Returns the value after the operation.
proc atomicDec*(memLoc: var int, x: int): int {.inline.}
proc atomicDec*(memLoc: var int, x: int = 1): int {.inline.}
## atomic decrement of `memLoc`. Returns the value after the operation.
include "system/atomics"
@@ -1719,6 +1723,7 @@ when not defined(EcmaScript) and not defined(NimrodVM):
context: C_JmpBuf
when hasThreadSupport:
include "system/syslocks"
include "system/threads"
else:
initStackBottom()
@@ -1739,14 +1744,14 @@ when not defined(EcmaScript) and not defined(NimrodVM):
proc reprAny(p: pointer, typ: PNimType): string {.compilerRtl.}
proc getDiscriminant(aa: Pointer, n: ptr TNimNode): int =
assert(n.kind == nkCase)
sysAssert(n.kind == nkCase)
var d: int
var a = cast[TAddress](aa)
case n.typ.size
of 1: d = ze(cast[ptr int8](a +% n.offset)[])
of 2: d = ze(cast[ptr int16](a +% n.offset)[])
of 4: d = int(cast[ptr int32](a +% n.offset)[])
else: assert(false)
else: sysAssert(false)
return d
proc selectBranch(aa: Pointer, n: ptr TNimNode): ptr TNimNode =
@@ -1764,6 +1769,8 @@ when not defined(EcmaScript) and not defined(NimrodVM):
{.pop.}
include "system/sysio"
when hasThreadSupport:
include "system/inboxes"
iterator lines*(filename: string): string =
## Iterate over any line in the file named `filename`.

View File

@@ -128,12 +128,12 @@ template bigChunkOverhead(): expr = sizeof(TBigChunk)-sizeof(TAlignType)
proc roundup(x, v: int): int {.inline.} =
result = (x + (v-1)) and not (v-1)
assert(result >= x)
sysAssert(result >= x)
#return ((-x) and (v-1)) +% x
assert(roundup(14, PageSize) == PageSize)
assert(roundup(15, 8) == 16)
assert(roundup(65, 8) == 72)
sysAssert(roundup(14, PageSize) == PageSize)
sysAssert(roundup(15, 8) == 16)
sysAssert(roundup(65, 8) == 72)
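(A worked instance of the bit trick: roundup(65, 8) = (65 + 7) and not 7 = 72; adding v-1 and then clearing the low bits rounds x up to the next multiple of the power of two v.)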
# ------------- chunk table ---------------------------------------------------
# We use a PtrSet of chunk starts and a table[Page, chunksize] for chunk
@@ -149,35 +149,35 @@ type
acc: int # accumulator
next: PLLChunk # next low-level chunk; only needed for dealloc
TAllocator {.final, pure.} = object
TMemRegion {.final, pure.} = object
llmem: PLLChunk
currMem, maxMem, freeMem: int # memory sizes (allocated from OS)
lastSize: int # needed for the case that OS gives us pages linearly
freeSmallChunks: array[0..SmallChunkSize div MemAlign-1, PSmallChunk]
freeChunksList: PBigChunk # XXX make this a datastructure with O(1) access
chunkStarts: TIntSet
proc incCurrMem(a: var TAllocator, bytes: int) {.inline.} =
proc incCurrMem(a: var TMemRegion, bytes: int) {.inline.} =
inc(a.currMem, bytes)
proc decCurrMem(a: var TAllocator, bytes: int) {.inline.} =
proc decCurrMem(a: var TMemRegion, bytes: int) {.inline.} =
a.maxMem = max(a.maxMem, a.currMem)
dec(a.currMem, bytes)
proc getMaxMem(a: var TAllocator): int =
proc getMaxMem(a: var TMemRegion): int =
# Since we update maxPagesCount only when freeing pages,
# maxPagesCount may not be up to date. Thus we use the
# maximum of these both values here:
return max(a.currMem, a.maxMem)
proc llAlloc(a: var TAllocator, size: int): pointer =
proc llAlloc(a: var TMemRegion, size: int): pointer =
# *low-level* alloc for the memory manager's data structures. Deallocation
# is done at the end of the allocator's lifetime.
if a.llmem == nil or size > a.llmem.size:
# the requested size is ``roundup(size+sizeof(TLLChunk), PageSize)``, but
# since we know ``size`` is a (small) constant, we know the requested size
# is one page:
assert roundup(size+sizeof(TLLChunk), PageSize) == PageSize
sysAssert roundup(size+sizeof(TLLChunk), PageSize) == PageSize
var old = a.llmem # can be nil and is correct with nil
a.llmem = cast[PLLChunk](osAllocPages(PageSize))
incCurrMem(a, PageSize)
@@ -189,7 +189,7 @@ proc llAlloc(a: var TAllocator, size: int): pointer =
inc(a.llmem.acc, size)
zeroMem(result, size)
proc llDeallocAll(a: var TAllocator) =
proc llDeallocAll(a: var TMemRegion) =
var it = a.llmem
while it != nil:
# we know each block in the list has the size of 1 page:
@@ -204,7 +204,7 @@ proc IntSetGet(t: TIntSet, key: int): PTrunk =
it = it.next
result = nil
proc IntSetPut(a: var TAllocator, t: var TIntSet, key: int): PTrunk =
proc IntSetPut(a: var TMemRegion, t: var TIntSet, key: int): PTrunk =
result = IntSetGet(t, key)
if result == nil:
result = cast[PTrunk](llAlloc(a, sizeof(result[])))
@@ -220,7 +220,7 @@ proc Contains(s: TIntSet, key: int): bool =
else:
result = false
proc Incl(a: var TAllocator, s: var TIntSet, key: int) =
proc Incl(a: var TMemRegion, s: var TIntSet, key: int) =
var t = IntSetPut(a, s, key shr TrunkShift)
var u = key and TrunkMask
t.bits[u shr IntShift] = t.bits[u shr IntShift] or (1 shl (u and IntMask))
@@ -259,13 +259,13 @@ proc pageIndex(p: pointer): int {.inline.} =
proc pageAddr(p: pointer): PChunk {.inline.} =
result = cast[PChunk](cast[TAddress](p) and not PageMask)
#assert(Contains(allocator.chunkStarts, pageIndex(result)))
#sysAssert(Contains(allocator.chunkStarts, pageIndex(result)))
proc requestOsChunks(a: var TAllocator, size: int): PBigChunk =
proc requestOsChunks(a: var TMemRegion, size: int): PBigChunk =
incCurrMem(a, size)
inc(a.freeMem, size)
result = cast[PBigChunk](osAllocPages(size))
assert((cast[TAddress](result) and PageMask) == 0)
sysAssert((cast[TAddress](result) and PageMask) == 0)
#zeroMem(result, size)
result.next = nil
result.prev = nil
@@ -273,7 +273,7 @@ proc requestOsChunks(a: var TAllocator, size: int): PBigChunk =
result.size = size
# update next.prevSize:
var nxt = cast[TAddress](result) +% size
assert((nxt and PageMask) == 0)
sysAssert((nxt and PageMask) == 0)
var next = cast[PChunk](nxt)
if pageIndex(next) in a.chunkStarts:
#echo("Next already allocated!")
@@ -281,7 +281,7 @@ proc requestOsChunks(a: var TAllocator, size: int): PBigChunk =
# set result.prevSize:
var lastSize = if a.lastSize != 0: a.lastSize else: PageSize
var prv = cast[TAddress](result) -% lastSize
assert((nxt and PageMask) == 0)
sysAssert((nxt and PageMask) == 0)
var prev = cast[PChunk](prv)
if pageIndex(prev) in a.chunkStarts and prev.size == lastSize:
#echo("Prev already allocated!")
@@ -290,11 +290,11 @@ proc requestOsChunks(a: var TAllocator, size: int): PBigChunk =
result.prevSize = 0 # unknown
a.lastSize = size # for next request
proc freeOsChunks(a: var TAllocator, p: pointer, size: int) =
proc freeOsChunks(a: var TMemRegion, p: pointer, size: int) =
# update next.prevSize:
var c = cast[PChunk](p)
var nxt = cast[TAddress](p) +% c.size
assert((nxt and PageMask) == 0)
sysAssert((nxt and PageMask) == 0)
var next = cast[PChunk](nxt)
if pageIndex(next) in a.chunkStarts:
next.prevSize = 0 # XXX used
@@ -304,7 +304,7 @@ proc freeOsChunks(a: var TAllocator, p: pointer, size: int) =
dec(a.freeMem, size)
#c_fprintf(c_stdout, "[Alloc] back to OS: %ld\n", size)
proc isAccessible(a: TAllocator, p: pointer): bool {.inline.} =
proc isAccessible(a: TMemRegion, p: pointer): bool {.inline.} =
result = Contains(a.chunkStarts, pageIndex(p))
proc contains[T](list, x: T): bool =
@@ -313,7 +313,7 @@ proc contains[T](list, x: T): bool =
if it == x: return true
it = it.next
proc writeFreeList(a: TAllocator) =
proc writeFreeList(a: TMemRegion) =
var it = a.freeChunksList
c_fprintf(c_stdout, "freeChunksList: %p\n", it)
while it != nil:
@@ -322,23 +322,23 @@ proc writeFreeList(a: TAllocator) =
it = it.next
proc ListAdd[T](head: var T, c: T) {.inline.} =
assert(c notin head)
assert c.prev == nil
assert c.next == nil
sysAssert(c notin head)
sysAssert c.prev == nil
sysAssert c.next == nil
c.next = head
if head != nil:
assert head.prev == nil
sysAssert head.prev == nil
head.prev = c
head = c
proc ListRemove[T](head: var T, c: T) {.inline.} =
assert(c in head)
sysAssert(c in head)
if c == head:
head = c.next
assert c.prev == nil
sysAssert c.prev == nil
if head != nil: head.prev = nil
else:
assert c.prev != nil
sysAssert c.prev != nil
c.prev.next = c.next
if c.next != nil: c.next.prev = c.prev
c.next = nil
@@ -350,22 +350,22 @@ proc isSmallChunk(c: PChunk): bool {.inline.} =
proc chunkUnused(c: PChunk): bool {.inline.} =
result = not c.used
proc updatePrevSize(a: var TAllocator, c: PBigChunk,
proc updatePrevSize(a: var TMemRegion, c: PBigChunk,
prevSize: int) {.inline.} =
var ri = cast[PChunk](cast[TAddress](c) +% c.size)
assert((cast[TAddress](ri) and PageMask) == 0)
sysAssert((cast[TAddress](ri) and PageMask) == 0)
if isAccessible(a, ri):
ri.prevSize = prevSize
proc freeBigChunk(a: var TAllocator, c: PBigChunk) =
proc freeBigChunk(a: var TMemRegion, c: PBigChunk) =
var c = c
assert(c.size >= PageSize)
sysAssert(c.size >= PageSize)
inc(a.freeMem, c.size)
when coalescRight:
var ri = cast[PChunk](cast[TAddress](c) +% c.size)
assert((cast[TAddress](ri) and PageMask) == 0)
sysAssert((cast[TAddress](ri) and PageMask) == 0)
if isAccessible(a, ri) and chunkUnused(ri):
assert(not isSmallChunk(ri))
sysAssert(not isSmallChunk(ri))
if not isSmallChunk(ri):
ListRemove(a.freeChunksList, cast[PBigChunk](ri))
inc(c.size, ri.size)
@@ -373,9 +373,9 @@ proc freeBigChunk(a: var TAllocator, c: PBigChunk) =
when coalescLeft:
if c.prevSize != 0:
var le = cast[PChunk](cast[TAddress](c) -% c.prevSize)
assert((cast[TAddress](le) and PageMask) == 0)
sysAssert((cast[TAddress](le) and PageMask) == 0)
if isAccessible(a, le) and chunkUnused(le):
assert(not isSmallChunk(le))
sysAssert(not isSmallChunk(le))
if not isSmallChunk(le):
ListRemove(a.freeChunksList, cast[PBigChunk](le))
inc(le.size, c.size)
@@ -390,9 +390,9 @@ proc freeBigChunk(a: var TAllocator, c: PBigChunk) =
else:
freeOsChunks(a, c, c.size)
proc splitChunk(a: var TAllocator, c: PBigChunk, size: int) =
proc splitChunk(a: var TMemRegion, c: PBigChunk, size: int) =
var rest = cast[PBigChunk](cast[TAddress](c) +% size)
assert(rest notin a.freeChunksList)
sysAssert(rest notin a.freeChunksList)
rest.size = c.size - size
rest.used = false
rest.next = nil
@@ -403,14 +403,14 @@ proc splitChunk(a: var TAllocator, c: PBigChunk, size: int) =
incl(a, a.chunkStarts, pageIndex(rest))
ListAdd(a.freeChunksList, rest)
proc getBigChunk(a: var TAllocator, size: int): PBigChunk =
proc getBigChunk(a: var TMemRegion, size: int): PBigChunk =
# use first fit for now:
assert((size and PageMask) == 0)
assert(size > 0)
sysAssert((size and PageMask) == 0)
sysAssert(size > 0)
result = a.freeChunksList
block search:
while result != nil:
assert chunkUnused(result)
sysAssert chunkUnused(result)
if result.size == size:
ListRemove(a.freeChunksList, result)
break search
@@ -419,7 +419,7 @@ proc getBigChunk(a: var TAllocator, size: int): PBigChunk =
splitChunk(a, result, size)
break search
result = result.next
assert result != a.freeChunksList
sysAssert result != a.freeChunksList
if size < InitialMemoryRequest:
result = requestOsChunks(a, InitialMemoryRequest)
splitChunk(a, result, size)
@@ -430,10 +430,10 @@ proc getBigChunk(a: var TAllocator, size: int): PBigChunk =
incl(a, a.chunkStarts, pageIndex(result))
dec(a.freeMem, size)
proc getSmallChunk(a: var TAllocator): PSmallChunk =
proc getSmallChunk(a: var TMemRegion): PSmallChunk =
var res = getBigChunk(a, PageSize)
assert res.prev == nil
assert res.next == nil
sysAssert res.prev == nil
sysAssert res.next == nil
result = cast[PSmallChunk](res)
# -----------------------------------------------------------------------------
@@ -442,9 +442,13 @@ proc getCellSize(p: pointer): int {.inline.} =
var c = pageAddr(p)
result = c.size
proc rawAlloc(a: var TAllocator, requestedSize: int): pointer =
assert(roundup(65, 8) == 72)
assert requestedSize >= sizeof(TFreeCell)
proc memSize(a: TMemRegion, p: pointer): int {.inline.} =
var c = pageAddr(p)
result = c.size
proc rawAlloc(a: var TMemRegion, requestedSize: int): pointer =
sysAssert(roundup(65, 8) == 72)
sysAssert requestedSize >= sizeof(TFreeCell)
var size = roundup(requestedSize, MemAlign)
#c_fprintf(c_stdout, "alloc; size: %ld; %ld\n", requestedSize, size)
if size <= SmallChunkSize-smallChunkOverhead():
@@ -454,7 +458,7 @@ proc rawAlloc(a: var TAllocator, requestedSize: int): pointer =
if c == nil:
c = getSmallChunk(a)
c.freeList = nil
assert c.size == PageSize
sysAssert c.size == PageSize
c.size = size
c.acc = size
c.free = SmallChunkSize - smallChunkOverhead() - size
@@ -462,36 +466,40 @@ proc rawAlloc(a: var TAllocator, requestedSize: int): pointer =
c.prev = nil
ListAdd(a.freeSmallChunks[s], c)
result = addr(c.data)
assert((cast[TAddress](result) and (MemAlign-1)) == 0)
sysAssert((cast[TAddress](result) and (MemAlign-1)) == 0)
else:
assert c.next != c
sysAssert c.next != c
#if c.size != size:
# c_fprintf(c_stdout, "csize: %lld; size %lld\n", c.size, size)
assert c.size == size
sysAssert c.size == size
if c.freeList == nil:
assert(c.acc + smallChunkOverhead() + size <= SmallChunkSize)
sysAssert(c.acc + smallChunkOverhead() + size <= SmallChunkSize)
result = cast[pointer](cast[TAddress](addr(c.data)) +% c.acc)
inc(c.acc, size)
else:
result = c.freeList
assert(c.freeList.zeroField == 0)
sysAssert(c.freeList.zeroField == 0)
c.freeList = c.freeList.next
dec(c.free, size)
assert((cast[TAddress](result) and (MemAlign-1)) == 0)
sysAssert((cast[TAddress](result) and (MemAlign-1)) == 0)
if c.free < size:
ListRemove(a.freeSmallChunks[s], c)
else:
size = roundup(requestedSize+bigChunkOverhead(), PageSize)
# allocate a large block
var c = getBigChunk(a, size)
assert c.prev == nil
assert c.next == nil
assert c.size == size
sysAssert c.prev == nil
sysAssert c.next == nil
sysAssert c.size == size
result = addr(c.data)
assert((cast[TAddress](result) and (MemAlign-1)) == 0)
assert(isAccessible(a, result))
sysAssert((cast[TAddress](result) and (MemAlign-1)) == 0)
sysAssert(isAccessible(a, result))
proc rawDealloc(a: var TAllocator, p: pointer) =
proc rawAlloc0(a: var TMemRegion, requestedSize: int): pointer =
result = rawAlloc(a, requestedSize)
zeroMem(result, requestedSize)
proc rawDealloc(a: var TMemRegion, p: pointer) =
var c = pageAddr(p)
if isSmallChunk(c):
# `p` is within a small chunk:
@@ -499,7 +507,7 @@ proc rawDealloc(a: var TAllocator, p: pointer) =
var s = c.size
var f = cast[ptr TFreeCell](p)
#echo("setting to nil: ", $cast[TAddress](addr(f.zeroField)))
assert(f.zeroField != 0)
sysAssert(f.zeroField != 0)
f.zeroField = 0
f.next = c.freeList
c.freeList = f
@@ -509,7 +517,7 @@ proc rawDealloc(a: var TAllocator, p: pointer) =
s -% sizeof(TFreeCell))
# check if it is not in the freeSmallChunks[s] list:
if c.free < s:
assert c notin a.freeSmallChunks[s div memAlign]
sysAssert c notin a.freeSmallChunks[s div memAlign]
# add it to the freeSmallChunks[s] array:
ListAdd(a.freeSmallChunks[s div memAlign], c)
inc(c.free, s)
@@ -525,7 +533,7 @@ proc rawDealloc(a: var TAllocator, p: pointer) =
# free big chunk
freeBigChunk(a, cast[PBigChunk](c))
proc isAllocatedPtr(a: TAllocator, p: pointer): bool =
proc isAllocatedPtr(a: TMemRegion, p: pointer): bool =
if isAccessible(a, p):
var c = pageAddr(p)
if not chunkUnused(c):
@@ -539,40 +547,40 @@ proc isAllocatedPtr(a: TAllocator, p: pointer): bool =
var c = cast[PBigChunk](c)
result = p == addr(c.data) and cast[ptr TFreeCell](p).zeroField >% 1
proc deallocOsPages(a: var TAllocator) =
# we free every 'ordinarily' allocated page by iterating over the page
# bits:
for p in elements(a.chunkStarts):
proc deallocOsPages(a: var TMemRegion) =
# we free every 'ordinarily' allocated page by iterating over the page bits:
for p in elements(a.chunkStarts):
var page = cast[PChunk](p shl pageShift)
var size = if page.size < PageSize: PageSize else: page.size
osDeallocPages(page, size)
# And then we free the pages that are in use for the page bits:
llDeallocAll(a)
var
allocator {.rtlThreadVar.}: TAllocator
proc getFreeMem(a: TMemRegion): int {.inline.} = result = a.freeMem
proc getTotalMem(a: TMemRegion): int {.inline.} = result = a.currMem
proc getOccupiedMem(a: TMemRegion): int {.inline.} =
result = a.currMem - a.freeMem
proc deallocOsPages = deallocOsPages(allocator)
# ---------------------- thread memory region -------------------------------
# ---------------------- interface to programs -------------------------------
template InstantiateForRegion(allocator: expr) =
proc deallocOsPages = deallocOsPages(allocator)
when not defined(useNimRtl):
proc unlockedAlloc(size: int): pointer {.inline.} =
proc unlockedAlloc(size: int): pointer =
result = rawAlloc(allocator, size+sizeof(TFreeCell))
cast[ptr TFreeCell](result).zeroField = 1 # mark it as used
assert(not isAllocatedPtr(allocator, result))
sysAssert(not isAllocatedPtr(allocator, result))
result = cast[pointer](cast[TAddress](result) +% sizeof(TFreeCell))
proc unlockedAlloc0(size: int): pointer {.inline.} =
proc unlockedAlloc0(size: int): pointer =
result = unlockedAlloc(size)
zeroMem(result, size)
proc unlockedDealloc(p: pointer) {.inline.} =
proc unlockedDealloc(p: pointer) =
var x = cast[pointer](cast[TAddress](p) -% sizeof(TFreeCell))
assert(cast[ptr TFreeCell](x).zeroField == 1)
sysAssert(cast[ptr TFreeCell](x).zeroField == 1)
rawDealloc(allocator, x)
assert(not isAllocatedPtr(allocator, x))
sysAssert(not isAllocatedPtr(allocator, x))
proc alloc(size: int): pointer =
when hasThreadSupport and hasSharedHeap: AcquireSys(HeapLock)
@@ -601,37 +609,18 @@ when not defined(useNimRtl):
elif p != nil:
dealloc(p)
proc countFreeMem(): int =
# only used for assertions
var it = allocator.freeChunksList
while it != nil:
inc(result, it.size)
it = it.next
when false:
proc countFreeMem(): int =
# only used for assertions
var it = allocator.freeChunksList
while it != nil:
inc(result, it.size)
it = it.next
proc getFreeMem(): int =
result = allocator.freeMem
#assert(result == countFreeMem())
#sysAssert(result == countFreeMem())
proc getTotalMem(): int = return allocator.currMem
proc getOccupiedMem(): int = return getTotalMem() - getFreeMem()
when isMainModule:
const iterations = 4000_000
incl(allocator.chunkStarts, 11)
assert 11 in allocator.chunkStarts
excl(allocator.chunkStarts, 11)
assert 11 notin allocator.chunkStarts
var p: array [1..iterations, pointer]
for i in 7..7:
var x = i * 8
for j in 1.. iterations:
p[j] = alloc(allocator, x)
for j in 1..iterations:
assert isAllocatedPtr(allocator, p[j])
echo($i, " used memory: ", $(allocator.currMem))
for j in countdown(iterations, 1):
#echo("j: ", $j)
dealloc(allocator, p[j])
assert(not isAllocatedPtr(allocator, p[j]))
echo($i, " after freeing: ", $(allocator.currMem))

View File

@@ -24,7 +24,7 @@ proc genericAssignAux(dest, src: Pointer, n: ptr TNimNode, shallow: bool) =
n.typ.size)
var m = selectBranch(src, n)
if m != nil: genericAssignAux(dest, src, m, shallow)
of nkNone: assert(false)
of nkNone: sysAssert(false)
#else:
# echo "ugh memory corruption! ", n.kind
# quit 1
@@ -33,7 +33,7 @@ proc genericAssignAux(dest, src: Pointer, mt: PNimType, shallow: bool) =
var
d = cast[TAddress](dest)
s = cast[TAddress](src)
assert(mt != nil)
sysAssert(mt != nil)
case mt.Kind
of tyString:
var x = cast[ppointer](dest)
@@ -50,7 +50,7 @@ proc genericAssignAux(dest, src: Pointer, mt: PNimType, shallow: bool) =
# this can happen! nil sequences are allowed
unsureAsgnRef(x, s2)
return
assert(dest != nil)
sysAssert(dest != nil)
unsureAsgnRef(x, newObj(mt, seq.len * mt.base.size + GenericSeqSize))
var dst = cast[taddress](cast[ppointer](dest)[])
for i in 0..seq.len-1:
@@ -101,7 +101,7 @@ proc objectInit(dest: Pointer, typ: PNimType) {.compilerProc.}
proc objectInitAux(dest: Pointer, n: ptr TNimNode) =
var d = cast[TAddress](dest)
case n.kind
of nkNone: assert(false)
of nkNone: sysAssert(false)
of nkSLot: objectInit(cast[pointer](d +% n.offset), n.typ)
of nkList:
for i in 0..n.len-1:
@@ -134,7 +134,7 @@ proc genericReset(dest: Pointer, mt: PNimType) {.compilerProc.}
proc genericResetAux(dest: Pointer, n: ptr TNimNode) =
var d = cast[TAddress](dest)
case n.kind
of nkNone: assert(false)
of nkNone: sysAssert(false)
of nkSlot: genericReset(cast[pointer](d +% n.offset), n.typ)
of nkList:
for i in 0..n.len-1: genericResetAux(dest, n.sons[i])
@@ -145,7 +145,7 @@ proc genericResetAux(dest: Pointer, n: ptr TNimNode) =
proc genericReset(dest: Pointer, mt: PNimType) =
var d = cast[TAddress](dest)
assert(mt != nil)
sysAssert(mt != nil)
case mt.Kind
of tyString, tyRef, tySequence:
unsureAsgnRef(cast[ppointer](dest), nil)
@@ -173,4 +173,4 @@ proc FieldDiscriminantCheck(oldDiscVal, newDiscVal: int,
if newBranch != oldBranch and oldDiscVal != 0:
raise newException(EInvalidField,
"assignment to discriminant changes object branch")

View File

@@ -22,14 +22,14 @@ else:
inc(p, val)
result = p
proc atomicInc(memLoc: var int, x: int): int =
proc atomicInc(memLoc: var int, x: int = 1): int =
when hasThreadSupport:
result = sync_add_and_fetch(memLoc, x)
else:
inc(memLoc, x)
result = memLoc
proc atomicDec(memLoc: var int, x: int): int =
proc atomicDec(memLoc: var int, x: int = 1): int =
when hasThreadSupport:
when defined(sync_sub_and_fetch):
result = sync_sub_and_fetch(memLoc, x)
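With the new default the common case needs no explicit argument; a tiny sketch:

var rc: int
discard atomicInc(rc)    # x now defaults to 1
discard atomicDec(rc, 2) # an explicit amount still works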

View File

@@ -102,9 +102,9 @@ proc CellSetGet(t: TCellSet, key: TAddress): PPageDesc =
proc CellSetRawInsert(t: TCellSet, data: PPageDescArray, desc: PPageDesc) =
var h = cast[int](desc.key) and t.max
while data[h] != nil:
assert(data[h] != desc)
sysAssert(data[h] != desc)
h = nextTry(h, t.max)
assert(data[h] == nil)
sysAssert(data[h] == nil)
data[h] = desc
proc CellSetEnlarge(t: var TCellSet) =
@@ -130,7 +130,7 @@ proc CellSetPut(t: var TCellSet, key: TAddress): PPageDesc =
inc(t.counter)
h = cast[int](key) and t.max
while t.data[h] != nil: h = nextTry(h, t.max)
assert(t.data[h] == nil)
sysAssert(t.data[h] == nil)
# the new page descriptor goes into result
result = cast[PPageDesc](unlockedAlloc0(sizeof(TPageDesc)))
result.next = t.head

View File

@@ -408,7 +408,7 @@ proc NimCopy(x: pointer, ti: PNimType): pointer {.compilerproc.}
proc NimCopyAux(dest, src: Pointer, n: ptr TNimNode) {.exportc.} =
case n.kind
of nkNone: assert(false)
of nkNone: sysAssert(false)
of nkSlot:
asm "`dest`[`n`.offset] = NimCopy(`src`[`n`.offset], `n`.typ);"
of nkList:

View File

@@ -53,17 +53,20 @@ type
TGcHeap {.final, pure.} = object # this contains the zero count and
# non-zero count table
stackBottom: pointer
cycleThreshold: int
zct: TCellSeq # the zero count table
decStack: TCellSeq # cells in the stack that are to decref again
cycleRoots: TCellSet
tempStack: TCellSeq # temporary stack for recursion elimination
recGcLock: int # prevent recursion via finalizers; no thread lock
region: TMemRegion # garbage collected region
stat: TGcStat
var
stackBottom {.rtlThreadVar.}: pointer
gch {.rtlThreadVar.}: TGcHeap
cycleThreshold {.rtlThreadVar.}: int
InstantiateForRegion(gch.region)
proc acquire(gch: var TGcHeap) {.inline.} =
when hasThreadSupport and hasSharedHeap:
@@ -124,30 +127,30 @@ when traceGC:
of csAllocated:
if c in states[csAllocated]:
writeCell("attempt to alloc an already allocated cell", c)
assert(false)
sysAssert(false)
excl(states[csCycFreed], c)
excl(states[csZctFreed], c)
of csZctFreed:
if c in states[csZctFreed]:
writeCell("attempt to free zct cell twice", c)
assert(false)
sysAssert(false)
if c in states[csCycFreed]:
writeCell("attempt to free with zct, but already freed with cyc", c)
assert(false)
sysAssert(false)
if c notin states[csAllocated]:
writeCell("attempt to free not an allocated cell", c)
assert(false)
sysAssert(false)
excl(states[csAllocated], c)
of csCycFreed:
if c notin states[csAllocated]:
writeCell("attempt to free a not allocated cell", c)
assert(false)
sysAssert(false)
if c in states[csCycFreed]:
writeCell("attempt to free cyc cell twice", c)
assert(false)
sysAssert(false)
if c in states[csZctFreed]:
writeCell("attempt to free with cyc, but already freed with zct", c)
assert(false)
sysAssert(false)
excl(states[csAllocated], c)
incl(states[state], c)
@@ -216,7 +219,7 @@ proc decRef(c: PCell) {.inline.} =
when stressGC:
if c.refcount <% rcIncrement:
writeCell("broken cell", c)
assert(c.refcount >=% rcIncrement)
sysAssert(c.refcount >=% rcIncrement)
#if c.refcount <% rcIncrement: quit("leck mich")
if --c.refcount:
rtlAddZCT(c)
@@ -233,7 +236,7 @@ proc nimGCunref(p: pointer) {.compilerProc, inline.} = decRef(usrToCell(p))
proc asgnRef(dest: ppointer, src: pointer) {.compilerProc, inline.} =
# the code generator calls this proc!
assert(not isOnStack(dest))
sysAssert(not isOnStack(dest))
# BUGFIX: first incRef then decRef!
if src != nil: incRef(usrToCell(src))
if dest[] != nil: decRef(usrToCell(dest[]))
@@ -267,7 +270,7 @@ proc initGC() =
when not defined(useNimRtl):
when traceGC:
for i in low(TCellState)..high(TCellState): Init(states[i])
cycleThreshold = InitialCycleThreshold
gch.cycleThreshold = InitialCycleThreshold
gch.stat.stackScans = 0
gch.stat.cycleCollections = 0
gch.stat.maxThreshold = 0
@@ -289,7 +292,7 @@ proc forAllSlotsAux(dest: pointer, n: ptr TNimNode, op: TWalkOp) =
of nkCase:
var m = selectBranch(dest, n)
if m != nil: forAllSlotsAux(dest, m, op)
of nkNone: assert(false)
of nkNone: sysAssert(false)
proc forAllChildrenAux(dest: Pointer, mt: PNimType, op: TWalkOp) =
var d = cast[TAddress](dest)
@@ -306,9 +309,9 @@ proc forAllChildrenAux(dest: Pointer, mt: PNimType, op: TWalkOp) =
else: nil
proc forAllChildren(cell: PCell, op: TWalkOp) =
assert(cell != nil)
assert(cell.typ != nil)
assert cell.typ.kind in {tyRef, tySequence, tyString}
sysAssert(cell != nil)
sysAssert(cell.typ != nil)
sysAssert cell.typ.kind in {tyRef, tySequence, tyString}
case cell.typ.Kind
of tyRef: # common case
forAllChildrenAux(cellToUsr(cell), cell.typ.base, op)
@@ -321,12 +324,7 @@ proc forAllChildren(cell: PCell, op: TWalkOp) =
GenericSeqSize), cell.typ.base, op)
else: nil
proc checkCollection {.inline.} =
# checks if a collection should be done
if gch.recGcLock == 0:
collectCT(gch)
proc addNewObjToZCT(res: PCell) {.inline.} =
proc addNewObjToZCT(res: PCell, gch: var TGcHeap) {.inline.} =
# we check the last 8 entries (cache line) for a slot that could be reused.
# In 63% of all cases we succeed here! But we have to optimize the heck
# out of this small linear search so that ``newObj`` is not slowed down.
@@ -370,14 +368,14 @@ proc addNewObjToZCT(res: PCell) {.inline.} =
return
add(gch.zct, res)
proc newObj(typ: PNimType, size: int): pointer {.compilerRtl.} =
proc newObj(typ: PNimType, size: int, gch: var TGcHeap): pointer =
# generates a new object and sets its reference counter to 0
acquire(gch)
assert(typ.kind in {tyRef, tyString, tySequence})
checkCollection()
var res = cast[PCell](rawAlloc(allocator, size + sizeof(TCell)))
sysAssert(typ.kind in {tyRef, tyString, tySequence})
collectCT(gch)
var res = cast[PCell](rawAlloc(gch.region, size + sizeof(TCell)))
zeroMem(res, size+sizeof(TCell))
assert((cast[TAddress](res) and (MemAlign-1)) == 0)
sysAssert((cast[TAddress](res) and (MemAlign-1)) == 0)
# now it is buffered in the ZCT
res.typ = typ
when debugGC and not hasThreadSupport:
@@ -385,13 +383,16 @@ proc newObj(typ: PNimType, size: int): pointer {.compilerRtl.} =
res.filename = framePtr.prev.filename
res.line = framePtr.prev.line
res.refcount = rcZct # refcount is zero, but mark it to be in the ZCT
assert(isAllocatedPtr(allocator, res))
sysAssert(isAllocatedPtr(gch.region, res))
# its refcount is zero, so add it to the ZCT:
addNewObjToZCT(res)
addNewObjToZCT(res, gch)
when logGC: writeCell("new cell", res)
gcTrace(res, csAllocated)
release(gch)
result = cellToUsr(res)
result = cellToUsr(res)
proc newObj(typ: PNimType, size: int): pointer {.compilerRtl.} =
result = newObj(typ, size, gch)
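The shape of this refactoring recurs throughout the hunk: the worker proc takes the heap as an explicit `var TGcHeap` parameter, and a thin `compilerRtl` wrapper passes the global `gch`, which paves the way for one heap per thread. A minimal sketch of the same pattern on a hypothetical toy heap (all names here are illustrative):

type
  TToyHeap = object
    allocated: int

var toyHeap: TToyHeap            # the single global instance, for now

proc toyAlloc(size: int, h: var TToyHeap): int =
  # the worker operates only on the heap it was handed
  inc(h.allocated, size)
  result = h.allocated

proc toyAlloc(size: int): int =
  # the public entry point forwards the global heap, exactly like newObj
  result = toyAlloc(size, toyHeap)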
proc newSeq(typ: PNimType, len: int): pointer {.compilerRtl.} =
# `newObj` already uses locks, so no need for them here.
@@ -399,23 +400,22 @@ proc newSeq(typ: PNimType, len: int): pointer {.compilerRtl.} =
cast[PGenericSeq](result).len = len
cast[PGenericSeq](result).space = len
proc growObj(old: pointer, newsize: int): pointer {.rtl.} =
proc growObj(old: pointer, newsize: int, gch: var TGcHeap): pointer =
acquire(gch)
checkCollection()
collectCT(gch)
var ol = usrToCell(old)
assert(ol.typ != nil)
assert(ol.typ.kind in {tyString, tySequence})
var res = cast[PCell](rawAlloc(allocator, newsize + sizeof(TCell)))
sysAssert(ol.typ != nil)
sysAssert(ol.typ.kind in {tyString, tySequence})
var res = cast[PCell](rawAlloc(gch.region, newsize + sizeof(TCell)))
var elemSize = 1
if ol.typ.kind != tyString:
elemSize = ol.typ.base.size
if ol.typ.kind != tyString: elemSize = ol.typ.base.size
var oldsize = cast[PGenericSeq](old).len*elemSize + GenericSeqSize
copyMem(res, ol, oldsize + sizeof(TCell))
zeroMem(cast[pointer](cast[TAddress](res)+% oldsize +% sizeof(TCell)),
newsize-oldsize)
assert((cast[TAddress](res) and (MemAlign-1)) == 0)
assert(res.refcount shr rcShift <=% 1)
sysAssert((cast[TAddress](res) and (MemAlign-1)) == 0)
sysAssert(res.refcount shr rcShift <=% 1)
#if res.refcount <% rcIncrement:
# add(gch.zct, res)
#else: # XXX: what to do here?
@@ -434,29 +434,32 @@ proc growObj(old: pointer, newsize: int): pointer {.rtl.} =
writeCell("growObj new cell", res)
gcTrace(ol, csZctFreed)
gcTrace(res, csAllocated)
when reallyDealloc: rawDealloc(allocator, ol)
when reallyDealloc: rawDealloc(gch.region, ol)
else:
assert(ol.typ != nil)
sysAssert(ol.typ != nil)
zeroMem(ol, sizeof(TCell))
release(gch)
result = cellToUsr(res)
proc growObj(old: pointer, newsize: int): pointer {.rtl.} =
result = growObj(old, newsize, gch)
# ---------------- cycle collector -------------------------------------------
proc doOperation(p: pointer, op: TWalkOp) =
if p == nil: return
var c: PCell = usrToCell(p)
assert(c != nil)
sysAssert(c != nil)
case op # faster than function pointers because of easy prediction
of waZctDecRef:
assert(c.refcount >=% rcIncrement)
sysAssert(c.refcount >=% rcIncrement)
c.refcount = c.refcount -% rcIncrement
when logGC: writeCell("decref (from doOperation)", c)
if c.refcount <% rcIncrement: addZCT(gch.zct, c)
of waPush:
add(gch.tempStack, c)
of waCycleDecRef:
assert(c.refcount >=% rcIncrement)
sysAssert(c.refcount >=% rcIncrement)
c.refcount = c.refcount -% rcIncrement
# we now use a much simpler and non-recursive algorithm for cycle removal
@@ -496,20 +499,20 @@ proc collectCycles(gch: var TGcHeap) =
prepareDealloc(c)
gcTrace(c, csCycFreed)
when logGC: writeCell("cycle collector dealloc cell", c)
when reallyDealloc: rawDealloc(allocator, c)
when reallyDealloc: rawDealloc(gch.region, c)
else:
assert(c.typ != nil)
sysAssert(c.typ != nil)
zeroMem(c, sizeof(TCell))
Deinit(gch.cycleRoots)
Init(gch.cycleRoots)
proc gcMark(p: pointer) {.inline.} =
proc gcMark(gch: var TGcHeap, p: pointer) {.inline.} =
# the addresses on the stack are user pointers, not cells, so turn them into cells:
var cell = usrToCell(p)
var c = cast[TAddress](cell)
if c >% PageSize and (c and (MemAlign-1)) == 0:
# fast check: does it look like a cell?
if isAllocatedPtr(allocator, cell):
if isAllocatedPtr(gch.region, cell):
# mark the cell:
cell.refcount = cell.refcount +% rcIncrement
add(gch.decStack, cell)
@@ -520,13 +523,13 @@ proc markThreadStacks(gch: var TGcHeap) =
var it = threadList
while it != nil:
# mark registers:
for i in 0 .. high(it.registers): gcMark(it.registers[i])
for i in 0 .. high(it.registers): gcMark(gch, it.registers[i])
var sp = cast[TAddress](it.stackBottom)
var max = cast[TAddress](it.stackTop)
# XXX stack direction?
# XXX unroll this loop:
while sp <=% max:
gcMark(cast[ppointer](sp)[])
gcMark(gch, cast[ppointer](sp)[])
sp = sp +% sizeof(pointer)
it = it.next
@@ -545,24 +548,24 @@ when not defined(useNimRtl):
proc setStackBottom(theStackBottom: pointer) =
#c_fprintf(c_stdout, "stack bottom: %p;\n", theStackBottom)
# the first init must be the one that defines the stack bottom:
if stackBottom == nil: stackBottom = theStackBottom
if gch.stackBottom == nil: gch.stackBottom = theStackBottom
else:
var a = cast[TAddress](theStackBottom) # and not PageMask - PageSize*2
var b = cast[TAddress](stackBottom)
var b = cast[TAddress](gch.stackBottom)
when stackIncreases:
stackBottom = cast[pointer](min(a, b))
gch.stackBottom = cast[pointer](min(a, b))
else:
stackBottom = cast[pointer](max(a, b))
gch.stackBottom = cast[pointer](max(a, b))
proc stackSize(): int {.noinline.} =
var stackTop {.volatile.}: pointer
result = abs(cast[int](addr(stackTop)) - cast[int](stackBottom))
result = abs(cast[int](addr(stackTop)) - cast[int](gch.stackBottom))
when defined(sparc): # For SPARC architecture.
proc isOnStack(p: pointer): bool =
var stackTop {.volatile.}: pointer
stackTop = addr(stackTop)
var b = cast[TAddress](stackBottom)
var b = cast[TAddress](gch.stackBottom)
var a = cast[TAddress](stackTop)
var x = cast[TAddress](p)
result = a <=% x and x <=% b
@@ -574,13 +577,13 @@ when defined(sparc): # For SPARC architecture.
asm """"ta 0x3 ! ST_FLUSH_WINDOWS\n" """
var
max = stackBottom
max = gch.stackBottom
sp: PPointer
stackTop: array[0..1, pointer]
sp = addr(stackTop[0])
# Addresses decrease as the stack grows.
while sp <= max:
gcMark(sp[])
gcMark(gch, sp[])
sp = cast[ppointer](cast[TAddress](sp) +% sizeof(pointer))
elif defined(ELATE):
@@ -593,7 +596,7 @@ elif stackIncreases:
proc isOnStack(p: pointer): bool =
var stackTop {.volatile.}: pointer
stackTop = addr(stackTop)
var a = cast[TAddress](stackBottom)
var a = cast[TAddress](gch.stackBottom)
var b = cast[TAddress](stackTop)
var x = cast[TAddress](p)
result = a <=% x and x <=% b
@@ -606,12 +609,12 @@ elif stackIncreases:
proc markStackAndRegisters(gch: var TGcHeap) {.noinline, cdecl.} =
var registers: C_JmpBuf
if c_setjmp(registers) == 0'i32: # To fill the C stack with registers.
var max = cast[TAddress](stackBottom)
var max = cast[TAddress](gch.stackBottom)
var sp = cast[TAddress](addr(registers)) +% jmpbufSize -% sizeof(pointer)
# sp will traverse the JMP_BUF as well (jmp_buf size is added,
# otherwise sp would be below the registers structure).
while sp >=% max:
gcMark(cast[ppointer](sp)[])
gcMark(gch, cast[ppointer](sp)[])
sp = sp -% sizeof(pointer)
else:
@@ -621,7 +624,7 @@ else:
proc isOnStack(p: pointer): bool =
var stackTop {.volatile.}: pointer
stackTop = addr(stackTop)
var b = cast[TAddress](stackBottom)
var b = cast[TAddress](gch.stackBottom)
var a = cast[TAddress](stackTop)
var x = cast[TAddress](p)
result = a <=% x and x <=% b
@@ -633,22 +636,22 @@ else:
type PStackSlice = ptr array [0..7, pointer]
var registers: C_JmpBuf
if c_setjmp(registers) == 0'i32: # To fill the C stack with registers.
var max = cast[TAddress](stackBottom)
var max = cast[TAddress](gch.stackBottom)
var sp = cast[TAddress](addr(registers))
# loop unrolled:
while sp <% max - 8*sizeof(pointer):
gcMark(cast[PStackSlice](sp)[0])
gcMark(cast[PStackSlice](sp)[1])
gcMark(cast[PStackSlice](sp)[2])
gcMark(cast[PStackSlice](sp)[3])
gcMark(cast[PStackSlice](sp)[4])
gcMark(cast[PStackSlice](sp)[5])
gcMark(cast[PStackSlice](sp)[6])
gcMark(cast[PStackSlice](sp)[7])
gcMark(gch, cast[PStackSlice](sp)[0])
gcMark(gch, cast[PStackSlice](sp)[1])
gcMark(gch, cast[PStackSlice](sp)[2])
gcMark(gch, cast[PStackSlice](sp)[3])
gcMark(gch, cast[PStackSlice](sp)[4])
gcMark(gch, cast[PStackSlice](sp)[5])
gcMark(gch, cast[PStackSlice](sp)[6])
gcMark(gch, cast[PStackSlice](sp)[7])
sp = sp +% sizeof(pointer)*8
# last few entries:
while sp <=% max:
gcMark(cast[ppointer](sp)[])
gcMark(gch, cast[ppointer](sp)[])
sp = sp +% sizeof(pointer)
# ----------------------------------------------------------------------------
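Throughout the marking code above, gcMark applies a cheap conservative filter to every stack word before asking the allocator. A self-contained sketch of that filter, with assumed constants (the real values come from the allocator):

const
  PageSize = 4096        # assumed page size
  MemAlign = 8           # assumed cell alignment

proc looksLikeCell(a: int): bool =
  # small integers and misaligned words can never be cell addresses;
  # only survivors of this test are worth an isAllocatedPtr lookup
  result = a > PageSize and (a and (MemAlign - 1)) == 0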
@@ -664,7 +667,7 @@ proc CollectZCT(gch: var TGcHeap) =
while L[] > 0:
var c = gch.zct.d[0]
# remove from ZCT:
assert((c.refcount and colorMask) == rcZct)
sysAssert((c.refcount and colorMask) == rcZct)
c.refcount = c.refcount and not colorMask
gch.zct.d[0] = gch.zct.d[L[] - 1]
dec(L[])
@@ -683,41 +686,42 @@ proc CollectZCT(gch: var TGcHeap) =
# access invalid memory. This is done by prepareDealloc():
prepareDealloc(c)
forAllChildren(c, waZctDecRef)
when reallyDealloc: rawDealloc(allocator, c)
when reallyDealloc: rawDealloc(gch.region, c)
else:
assert(c.typ != nil)
sysAssert(c.typ != nil)
zeroMem(c, sizeof(TCell))
proc unmarkStackAndRegisters(gch: var TGcHeap) =
var d = gch.decStack.d
for i in 0..gch.decStack.len-1:
assert isAllocatedPtr(allocator, d[i])
sysAssert isAllocatedPtr(gch.region, d[i])
# decRef(d[i]) inlined: cannot create a cycle and must not acquire lock
var c = d[i]
# XXX no need for an atomic dec here:
if --c.refcount:
addZCT(gch.zct, c)
assert c.typ != nil
sysAssert c.typ != nil
gch.decStack.len = 0
proc collectCT(gch: var TGcHeap) =
if gch.zct.len >= ZctThreshold or (cycleGC and
getOccupiedMem() >= cycleThreshold) or stressGC:
if (gch.zct.len >= ZctThreshold or (cycleGC and
getOccupiedMem(gch.region) >= gch.cycleThreshold) or stressGC) and
gch.recGcLock == 0:
gch.stat.maxStackSize = max(gch.stat.maxStackSize, stackSize())
assert(gch.decStack.len == 0)
sysAssert(gch.decStack.len == 0)
markStackAndRegisters(gch)
markThreadStacks(gch)
gch.stat.maxStackCells = max(gch.stat.maxStackCells, gch.decStack.len)
inc(gch.stat.stackScans)
collectZCT(gch)
when cycleGC:
if getOccupiedMem() >= cycleThreshold or stressGC:
if getOccupiedMem() >= gch.cycleThreshold or stressGC:
collectCycles(gch)
collectZCT(gch)
inc(gch.stat.cycleCollections)
cycleThreshold = max(InitialCycleThreshold, getOccupiedMem() *
cycleIncrease)
gch.stat.maxThreshold = max(gch.stat.maxThreshold, cycleThreshold)
gch.cycleThreshold = max(InitialCycleThreshold, getOccupiedMem() *
cycleIncrease)
gch.stat.maxThreshold = max(gch.stat.maxThreshold, gch.cycleThreshold)
unmarkStackAndRegisters(gch)
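The rebasing of `gch.cycleThreshold` at the end of collectCT keeps the cost of cycle collection proportional to live data. A worked sketch with assumed constants (the real `InitialCycleThreshold` and `cycleIncrease` are defined elsewhere in gc.nim):

const
  InitialCycleThreshold = 4 * 1024 * 1024   # assumed value
  cycleIncrease = 2                         # assumed growth factor

proc nextCycleThreshold(occupied: int): int =
  result = max(InitialCycleThreshold, occupied * cycleIncrease)

echo nextCycleThreshold(1_000_000)    # --> 4194304; the floor keeps small heaps lazy
echo nextCycleThreshold(10_000_000)   # --> 20000000; grows with the live set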
when not defined(useNimRtl):
@@ -741,18 +745,18 @@ when not defined(useNimRtl):
of gcOptimizeTime: nil
proc GC_enableMarkAndSweep() =
cycleThreshold = InitialCycleThreshold
gch.cycleThreshold = InitialCycleThreshold
proc GC_disableMarkAndSweep() =
cycleThreshold = high(cycleThreshold)-1
gch.cycleThreshold = high(gch.cycleThreshold)-1
# set to the max value to suppress the cycle detector
proc GC_fullCollect() =
acquire(gch)
var oldThreshold = cycleThreshold
cycleThreshold = 0 # forces cycle collection
var oldThreshold = gch.cycleThreshold
gch.cycleThreshold = 0 # forces cycle collection
collectCT(gch)
cycleThreshold = oldThreshold
gch.cycleThreshold = oldThreshold
release(gch)
proc GC_getStatistics(): string =

203 lib/system/inboxes.nim Normal file
View File

@@ -0,0 +1,203 @@
#
#
# Nimrod's Runtime Library
# (c) Copyright 2011 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## Message passing for threads. The current implementation is slow and does
## not work with cyclic data structures. But hey, it's better than nothing.
type
pbytes = ptr array[0.. 0xffff, byte]
TInbox {.pure, final.} = object ## msg queue for a thread
rd, wr, count, mask: int
data: pbytes
lock: TSysLock
cond: TSysCond
elemType: PNimType
region: TMemRegion
PInbox = ptr TInbox
TLoadStoreMode = enum mStore, mLoad
proc initInbox(p: pointer) =
var inbox = cast[PInbox](p)
initSysLock(inbox.lock)
initSysCond(inbox.cond)
inbox.mask = -1
proc freeInbox(p: pointer) =
var inbox = cast[PInbox](p)
deallocOsPages(inbox.region)
deinitSys(inbox.lock)
deinitSysCond(inbox.cond)
proc storeAux(dest, src: Pointer, mt: PNimType, t: PInbox, mode: TLoadStoreMode)
proc storeAux(dest, src: Pointer, n: ptr TNimNode, t: PInbox,
mode: TLoadStoreMode) =
var
d = cast[TAddress](dest)
s = cast[TAddress](src)
case n.kind
of nkSlot: storeAux(cast[pointer](d +% n.offset),
cast[pointer](s +% n.offset), n.typ, t, mode)
of nkList:
for i in 0..n.len-1: storeAux(dest, src, n.sons[i], t, mode)
of nkCase:
copyMem(cast[pointer](d +% n.offset), cast[pointer](s +% n.offset),
n.typ.size)
var m = selectBranch(src, n)
if m != nil: storeAux(dest, src, m, t, mode)
of nkNone: sysAssert(false)
proc storeAux(dest, src: Pointer, mt: PNimType, t: PInbox,
mode: TLoadStoreMode) =
var
d = cast[TAddress](dest)
s = cast[TAddress](src)
sysAssert(mt != nil)
case mt.Kind
of tyString:
if mode == mStore:
var x = cast[ppointer](dest)
var s2 = cast[ppointer](s)[]
if s2 == nil:
x[] = nil
else:
var ss = cast[NimString](s2)
var ns = cast[NimString](rawAlloc(t.region, ss.len+1 + GenericSeqSize))
copyMem(ns, ss, ss.len+1 + GenericSeqSize)
x[] = ns
else:
var x = cast[ppointer](dest)
var s2 = cast[ppointer](s)[]
if s2 == nil:
unsureAsgnRef(x, s2)
else:
unsureAsgnRef(x, copyString(cast[NimString](s2)))
rawDealloc(t.region, s2)
of tySequence:
var s2 = cast[ppointer](src)[]
var seq = cast[PGenericSeq](s2)
var x = cast[ppointer](dest)
if s2 == nil:
if mode == mStore:
x[] = nil
else:
unsureAsgnRef(x, nil)
else:
sysAssert(dest != nil)
if mode == mStore:
x[] = rawAlloc(t.region, seq.len *% mt.base.size +% GenericSeqSize)
else:
unsureAsgnRef(x, newObj(mt, seq.len * mt.base.size + GenericSeqSize))
var dst = cast[TAddress](cast[ppointer](dest)[])
for i in 0..seq.len-1:
storeAux(
cast[pointer](dst +% i*% mt.base.size +% GenericSeqSize),
cast[pointer](cast[TAddress](s2) +% i *% mt.base.size +%
GenericSeqSize),
mt.Base, t, mode)
var dstseq = cast[PGenericSeq](dst)
dstseq.len = seq.len
dstseq.space = seq.len
if mode != mStore: rawDealloc(t.region, s2)
of tyObject:
# copy type field:
var pint = cast[ptr PNimType](dest)
# XXX use dynamic type here!
pint[] = mt
storeAux(dest, src, mt.node, t, mode)
of tyTuple, tyPureObject:
storeAux(dest, src, mt.node, t, mode)
of tyArray, tyArrayConstr:
for i in 0..(mt.size div mt.base.size)-1:
storeAux(cast[pointer](d +% i*% mt.base.size),
cast[pointer](s +% i*% mt.base.size), mt.base, t, mode)
of tyRef:
var s = cast[ppointer](src)[]
var x = cast[ppointer](dest)
if s == nil:
if mode == mStore:
x[] = nil
else:
unsureAsgnRef(x, nil)
else:
if mode == mStore:
x[] = rawAlloc(t.region, mt.base.size)
else:
# XXX we should use the dynamic type here too, but that is not stored in
# the inbox at all --> use source[]'s object type? but how? we need a
# tyRef to the object!
var obj = newObj(mt.base, mt.base.size)
unsureAsgnRef(x, obj)
storeAux(x[], s, mt.base, t, mode)
if mode != mStore: rawDealloc(t.region, s)
else:
copyMem(dest, src, mt.size) # copy raw bits
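The module doc comment above warns that cyclic data is unsupported; the reason is visible here: storeAux follows every ref and seq without keeping a visited set. A hedged toy illustration of the kind of value that would make it recurse forever:

type
  PNode = ref TNode
  TNode = object
    next: PNode

var n: PNode
new(n)
n.next = n        # self-reference: a visited-set-free deep copy never terminates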
proc rawSend(q: PInbox, data: pointer, typ: PNimType) =
## adds `data` to the end of the queue `q`.
var cap = q.mask+1
if q.count >= cap:
# start with capacity for 2 entries in the queue:
if cap == 0: cap = 1
var n = cast[pbytes](rawAlloc0(q.region, cap*2*typ.size))
var z = 0
var i = q.rd
var c = q.count
while c > 0:
dec c
copyMem(addr(n[z*typ.size]), addr(q.data[i*typ.size]), typ.size)
i = (i + 1) and q.mask
inc z
if q.data != nil: rawDealloc(q.region, q.data)
q.data = n
q.mask = cap*2 - 1
q.wr = q.count
q.rd = 0
#echo "came here"
storeAux(addr(q.data[q.wr * typ.size]), data, typ, q, mStore)
inc q.count
q.wr = (q.wr + 1) and q.mask
proc rawRecv(q: PInbox, data: pointer, typ: PNimType) =
assert q.count > 0
dec q.count
storeAux(data, addr(q.data[q.rd * typ.size]), typ, q, mLoad)
q.rd = (q.rd + 1) and q.mask
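rawSend and rawRecv rely on the queue capacity always being a power of two, so `and q.mask` replaces a modulo on every index update. A self-contained sketch of the same indexing scheme on a fixed-size buffer:

var
  data: array[0..7, int]   # capacity 8, a power of two
  mask = 7                 # capacity - 1
  rd, wr, count: int

proc put(x: int) =
  assert(count <= mask)    # not full
  data[wr] = x
  wr = (wr + 1) and mask   # cheap wrap-around instead of `mod`
  inc(count)

proc take(): int =
  assert(count > 0)
  result = data[rd]
  rd = (rd + 1) and mask
  dec(count)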
template lockInbox(q: expr, action: stmt) =
acquireSys(q.lock)
action
releaseSys(q.lock)
proc send*[TMsg](receiver: var TThread[TMsg], msg: TMsg) =
## sends a message to a thread. `msg` is deeply copied.
var q = cast[PInbox](getInBoxMem(receiver))
acquireSys(q.lock)
var m: TMsg
shallowCopy(m, msg)
rawSend(q, addr(m), cast[PNimType](getTypeInfo(msg)))
releaseSys(q.lock)
SignalSysCond(q.cond)
proc recv*[TMsg](): TMsg =
## receives a message from its internal message queue. This blocks until
## a message has arrived! You may use ``peek`` to avoid the blocking.
var q = cast[PInbox](getInBoxMem())
acquireSys(q.lock)
while q.count <= 0:
WaitSysCond(q.cond, q.lock)
rawRecv(q, addr(result), cast[PNimType](getTypeInfo(result)))
releaseSys(q.lock)
proc peek*(): int =
## returns the current number of messages in the inbox.
var q = cast[PInbox](getInBoxMem())
lockInbox(q):
result = q.count
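Putting send, recv and peek together, a hedged usage sketch (assumes compilation with thread support and the threads module in scope; `joinThread` is assumed to be the single-thread variant of `joinThreads`):

var worker: TThread[string]

proc workerProc(greeting: string) {.thread.} =
  echo(greeting)
  var msg = recv[string]()      # blocks until the parent sends something
  echo("worker got: ", msg)

createThread(worker, workerProc, "worker starting")
send(worker, "hello")           # deep-copied into the worker's inbox
joinThread(worker)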

View File

@@ -62,11 +62,10 @@ when defined(boehmgc):
const boehmLib = "boehmgc.dll"
elif defined(macosx):
const boehmLib = "libgc.dylib"
proc boehmGCinit {.importc: "GC_init", dynlib: boehmLib.}
else:
const boehmLib = "/usr/lib/libgc.so.1"
proc boehmGCinit {.importc: "GC_init", dynlib: boehmLib.}
proc boehmGC_disable {.importc: "GC_disable", dynlib: boehmLib.}
proc boehmGC_enable {.importc: "GC_enable", dynlib: boehmLib.}
proc boehmGCincremental {.
@@ -177,12 +176,20 @@ elif defined(nogc):
proc asgnRefNoCycle(dest: ppointer, src: pointer) {.compilerproc, inline.} =
dest[] = src
var allocator {.rtlThreadVar.}: TMemRegion
InstantiateForRegion(allocator)
include "system/cellsets"
else:
include "system/alloc"
proc unlockedAlloc(size: int): pointer {.inline.}
proc unlockedAlloc0(size: int): pointer {.inline.}
proc unlockedDealloc(p: pointer) {.inline.}
include "system/cellsets"
assert(sizeof(TCell) == sizeof(TFreeCell))
sysAssert(sizeof(TCell) == sizeof(TFreeCell))
include "system/gc"
{.pop.}

View File

@@ -158,7 +158,7 @@ when not defined(useNimRtl):
proc reprRecordAux(result: var string, p: pointer, n: ptr TNimNode,
cl: var TReprClosure) =
case n.kind
of nkNone: assert(false)
of nkNone: sysAssert(false)
of nkSlot:
add result, $n.name
add result, " = "
@@ -206,7 +206,7 @@ when not defined(useNimRtl):
var t = cast[ptr PNimType](p)[]
reprRecord(result, p, t, cl)
of tyRef, tyPtr:
assert(p != nil)
sysAssert(p != nil)
if cast[ppointer](p)[] == nil: add result, "nil"
else: reprRef(result, cast[ppointer](p)[], typ, cl)
of tySequence:

101 lib/system/syslocks.nim Normal file
View File

@@ -0,0 +1,101 @@
#
#
# Nimrod's Runtime Library
# (c) Copyright 2011 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## Low level system locks and condition vars.
when defined(Windows):
type
THandle = int
TSysLock {.final, pure.} = object # CRITICAL_SECTION in WinApi
DebugInfo: pointer
LockCount: int32
RecursionCount: int32
OwningThread: int
LockSemaphore: int
Reserved: int32
TSysCond = THandle
proc InitSysLock(L: var TSysLock) {.stdcall, noSideEffect,
dynlib: "kernel32", importc: "InitializeCriticalSection".}
## Initializes the lock `L`.
proc TryAcquireSysAux(L: var TSysLock): int32 {.stdcall, noSideEffect,
dynlib: "kernel32", importc: "TryEnterCriticalSection".}
## Tries to acquire the lock `L`.
proc TryAcquireSys(L: var TSysLock): bool {.inline.} =
result = TryAcquireSysAux(L) != 0'i32
proc AcquireSys(L: var TSysLock) {.stdcall, noSideEffect,
dynlib: "kernel32", importc: "EnterCriticalSection".}
## Acquires the lock `L`.
proc ReleaseSys(L: var TSysLock) {.stdcall, noSideEffect,
dynlib: "kernel32", importc: "LeaveCriticalSection".}
## Releases the lock `L`.
proc DeinitSys(L: var TSysLock) {.stdcall, noSideEffect,
dynlib: "kernel32", importc: "DeleteCriticalSection".}
proc CreateEvent(lpEventAttributes: pointer,
bManualReset, bInitialState: int32,
lpName: cstring): TSysCond {.stdcall, noSideEffect,
dynlib: "kernel32", importc: "CreateEvent".}
proc CloseHandle(hObject: THandle) {.stdcall, noSideEffect,
dynlib: "kernel32", importc: "CloseHandle".}
proc WaitForSingleObject(hHandle: THandle, dwMilliseconds: int32): int32 {.
stdcall, dynlib: "kernel32", importc: "WaitForSingleObject".}
proc SignalSysCond(hEvent: TSysCond) {.stdcall, noSideEffect,
dynlib: "kernel32", importc: "SetEvent".}
proc InitSysCond(cond: var TSysCond) {.inline.} =
cond = CreateEvent(nil, 0'i32, 0'i32, nil)
proc DeinitSysCond(cond: var TSysCond) {.inline.} =
CloseHandle(cond)
proc WaitSysCond(cond: var TSysCond, lock: var TSysLock) =
releaseSys(lock)
discard WaitForSingleObject(cond, -1'i32)
acquireSys(lock)
else:
type
TSysLock {.importc: "pthread_mutex_t", pure, final,
header: "<sys/types.h>".} = object
TSysCond {.importc: "pthread_cond_t", pure, final,
header: "<sys/types.h>".} = object
proc InitSysLock(L: var TSysLock, attr: pointer = nil) {.
importc: "pthread_mutex_init", header: "<pthread.h>", noSideEffect.}
proc AcquireSys(L: var TSysLock) {.noSideEffect,
importc: "pthread_mutex_lock", header: "<pthread.h>".}
proc TryAcquireSysAux(L: var TSysLock): cint {.noSideEffect,
importc: "pthread_mutex_trylock", header: "<pthread.h>".}
proc TryAcquireSys(L: var TSysLock): bool {.inline.} =
result = TryAcquireSysAux(L) == 0'i32
proc ReleaseSys(L: var TSysLock) {.noSideEffect,
importc: "pthread_mutex_unlock", header: "<pthread.h>".}
proc DeinitSys(L: var TSysLock) {.
importc: "pthread_mutex_destroy", header: "<pthread.h>".}
proc InitSysCond(cond: var TSysCond, cond_attr: pointer = nil) {.
importc: "pthread_cond_init", header: "<pthread.h>".}
proc WaitSysCond(cond: var TSysCond, lock: var TSysLock) {.
importc: "pthread_cond_wait", header: "<pthread.h>".}
proc SignalSysCond(cond: var TSysCond) {.
importc: "pthread_cond_signal", header: "<pthread.h>".}
proc DeinitSysCond(cond: var TSysCond) {.
importc: "pthread_cond_destroy", header: "<pthread.h>".}

View File

@@ -25,8 +25,8 @@
## thr: array [0..4, TThread[tuple[a,b: int]]]
## L: TLock
##
## proc threadFunc(interval: tuple[a,b: int]) {.procvar.} =
## for i in interval.a..interval.b:
## proc threadFunc(interval: tuple[a,b: int]) {.thread.} =
## for i in interval.a..interval.b:
## Acquire(L) # lock stdout
## echo i
## Release(L)
@@ -41,38 +41,13 @@ const
maxRegisters = 256 # don't think there is an arch with more registers
maxLocksPerThread* = 10 ## max number of locks a thread can hold
## at the same time
useStackMaskHack = false ## use the stack mask hack for better performance
StackGuardSize = 4096
ThreadStackMask = 1024*256*sizeof(int)-1
ThreadStackSize = ThreadStackMask+1 - StackGuardSize
when defined(Windows):
when defined(windows):
type
TSysLock {.final, pure.} = object # CRITICAL_SECTION in WinApi
DebugInfo: pointer
LockCount: int32
RecursionCount: int32
OwningThread: int
LockSemaphore: int
Reserved: int32
proc InitSysLock(L: var TSysLock) {.stdcall, noSideEffect,
dynlib: "kernel32", importc: "InitializeCriticalSection".}
## Initializes the lock `L`.
proc TryAcquireSysAux(L: var TSysLock): int32 {.stdcall, noSideEffect,
dynlib: "kernel32", importc: "TryEnterCriticalSection".}
## Tries to acquire the lock `L`.
proc TryAcquireSys(L: var TSysLock): bool {.inline.} =
result = TryAcquireSysAux(L) != 0'i32
proc AcquireSys(L: var TSysLock) {.stdcall, noSideEffect,
dynlib: "kernel32", importc: "EnterCriticalSection".}
## Acquires the lock `L`.
proc ReleaseSys(L: var TSysLock) {.stdcall, noSideEffect,
dynlib: "kernel32", importc: "LeaveCriticalSection".}
## Releases the lock `L`.
type
THandle = int
TSysThread = THandle
TWinThreadProc = proc (x: pointer): int32 {.stdcall.}
@@ -95,9 +70,6 @@ when defined(Windows):
dwMilliseconds: int32): int32 {.
stdcall, dynlib: "kernel32", importc: "WaitForMultipleObjects".}
proc WaitForSingleObject(hHandle: TSysThread, dwMilliseconds: int32): int32 {.
stdcall, dynlib: "kernel32", importc: "WaitForSingleObject".}
proc TerminateThread(hThread: TSysThread, dwExitCode: int32): int32 {.
stdcall, dynlib: "kernel32", importc: "TerminateThread".}
@@ -115,24 +87,6 @@ else:
{.passL: "-pthread".}
{.passC: "-pthread".}
type
TSysLock {.importc: "pthread_mutex_t", pure, final,
header: "<sys/types.h>".} = object
proc InitSysLock(L: var TSysLock, attr: pointer = nil) {.
importc: "pthread_mutex_init", header: "<pthread.h>", noSideEffect.}
proc AcquireSys(L: var TSysLock) {.noSideEffect,
importc: "pthread_mutex_lock", header: "<pthread.h>".}
proc TryAcquireSysAux(L: var TSysLock): cint {.noSideEffect,
importc: "pthread_mutex_trylock", header: "<pthread.h>".}
proc TryAcquireSys(L: var TSysLock): bool {.inline.} =
result = TryAcquireSysAux(L) == 0'i32
proc ReleaseSys(L: var TSysLock) {.noSideEffect,
importc: "pthread_mutex_unlock", header: "<pthread.h>".}
type
TSysThread {.importc: "pthread_t", header: "<sys/types.h>",
final, pure.} = object
@@ -191,57 +145,71 @@ else:
proc ThreadVarGetValue(s: TThreadVarSlot): pointer {.inline.} =
result = pthread_getspecific(s)
const emulatedThreadVars = defined(macosx)
when useStackMaskHack:
proc pthread_attr_setstack(attr: var TPthread_attr, stackaddr: pointer,
size: int): cint {.
importc: "pthread_attr_setstack", header: "<pthread.h>".}
const
emulatedThreadVars = true
when emulatedThreadVars:
# the compiler generates this proc for us, so that we can get the size of
# the thread local var block:
# the thread local var block; we use this only for sanity checking though
proc NimThreadVarsSize(): int {.noconv, importc: "NimThreadVarsSize".}
proc ThreadVarsAlloc(size: int): pointer =
result = c_malloc(size)
zeroMem(result, size)
proc ThreadVarsDealloc(p: pointer) {.importc: "free", nodecl.}
# we preallocate a fixed size for thread local storage, so that no heap
# allocations are needed. Currently less than 7K are used on a 64bit machine.
# We use ``float`` for proper alignment:
type
TThreadLocalStorage = array [0..1_000, float]
PGcThread = ptr TGcThread
TGcThread {.pure.} = object
sys: TSysThread
next, prev: PGcThread
stackBottom, stackTop, threadLocalStorage: pointer
stackBottom, stackTop: pointer
stackSize: int
locksLen: int
locks: array [0..MaxLocksPerThread-1, pointer]
registers: array[0..maxRegisters-1, pointer] # register contents for GC
inbox: TThreadLocalStorage
when emulatedThreadVars and not useStackMaskHack:
tls: TThreadLocalStorage
else:
nil
# XXX it'd be more efficient to not use a global variable for the
# thread storage slot, but to rely on the implementation to assign slot 0
# for us... ;-)
var globalsSlot = ThreadVarAlloc()
#const globalsSlot = TThreadVarSlot(0)
#assert checkSlot.int == globalsSlot.int
proc ThisThread(): PGcThread {.compilerRtl, inl.} =
result = cast[PGcThread](ThreadVarGetValue(globalsSlot))
#sysAssert checkSlot.int == globalsSlot.int
proc GetThreadLocalVars(): pointer {.compilerRtl, inl.} =
result = cast[PGcThread](ThreadVarGetValue(globalsSlot)).threadLocalStorage
result = addr(cast[PGcThread](ThreadVarGetValue(globalsSlot)).tls)
when useStackMaskHack:
proc MaskStackPointer(offset: int): pointer {.compilerRtl, inl.} =
var x {.volatile.}: pointer
x = addr(x)
result = cast[pointer]((cast[int](x) and not ThreadStackMask) +%
(0) +% offset)
# create for the main thread. Note: do not insert this data into the list
# of all threads; it's not to be stopped etc.
when not defined(useNimRtl):
var mainThread: TGcThread
ThreadVarSetValue(globalsSlot, addr(mainThread))
when emulatedThreadVars:
mainThread.threadLocalStorage = ThreadVarsAlloc(NimThreadVarsSize())
initStackBottom()
initGC()
when not useStackMaskHack:
var mainThread: TGcThread
ThreadVarSetValue(globalsSlot, addr(mainThread))
initStackBottom()
initGC()
var heapLock: TSysLock
InitSysLock(HeapLock)
when emulatedThreadVars:
if NimThreadVarsSize() > sizeof(TThreadLocalStorage):
echo "too large thread local storage size requested"
quit 1
var
threadList: PGcThread
@@ -251,11 +219,11 @@ when not defined(useNimRtl):
t.prev = nil
t.next = threadList
if threadList != nil:
assert(threadList.prev == nil)
sysAssert(threadList.prev == nil)
threadList.prev = t
threadList = t
ReleaseSys(HeapLock)
proc unregisterThread(t: PGcThread) =
# we need to use the GC global lock here!
AcquireSys(HeapLock)
@@ -270,9 +238,7 @@ when not defined(useNimRtl):
# on UNIX, the GC uses ``SIGFREEZE`` to tell every thread to stop so that
# the GC can examine the stacks?
proc stopTheWorld() =
nil
proc stopTheWorld() = nil
# We jump through some hoops here to ensure that Nimrod thread procs can have
# the Nimrod calling convention. This is needed because thread procs are
@@ -286,26 +252,33 @@ type
fn: proc (p: TParam)
data: TParam
proc initInbox(p: pointer)
proc freeInbox(p: pointer)
when not defined(boehmgc) and not hasSharedHeap:
proc deallocOsPages()
template ThreadProcWrapperBody(closure: expr) =
ThreadVarSetValue(globalsSlot, closure)
var t = cast[ptr TThread[TParam]](closure)
when emulatedThreadVars:
t.threadLocalStorage = ThreadVarsAlloc(NimThreadVarsSize())
when useStackMaskHack:
var tls: TThreadLocalStorage
when not defined(boehmgc) and not hasSharedHeap:
# init the GC for this thread:
setStackBottom(addr(t))
initGC()
t.stackBottom = addr(t)
registerThread(t)
initInbox(addr(t.inbox))
try:
when false:
var a = addr(tls)
var b = MaskStackPointer(1293920-372736-303104-36864)
c_fprintf(c_stdout, "TLS: %p\nmasked: %p\ndiff: %ld\n",
a, b, cast[int](a) - cast[int](b))
t.fn(t.data)
finally:
# XXX shut-down is not executed when the thread is forced down!
when emulatedThreadVars:
ThreadVarsDealloc(t.threadLocalStorage)
freeInbox(addr(t.inbox))
unregisterThread(t)
when defined(deallocOsPages): deallocOsPages()
@@ -330,7 +303,7 @@ proc joinThreads*[TParam](t: openArray[TThread[TParam]]) =
## waits for every thread in `t` to finish.
when hostOS == "windows":
var a: array[0..255, TSysThread]
assert a.len >= t.len
sysAssert a.len >= t.len
for i in 0..t.high: a[i] = t[i].sys
discard WaitForMultipleObjects(t.len, cast[ptr TSysThread](addr(a)), 1, -1)
else:
@@ -338,7 +311,7 @@ proc joinThreads*[TParam](t: openArray[TThread[TParam]]) =
when false:
# XXX a thread should really release its heap here somehow:
proc destroyThread*[TParam](t: var TThread[TParam]) {.inline.} =
proc destroyThread*[TParam](t: var TThread[TParam]) =
## forces the thread `t` to terminate. This is potentially dangerous if
## you don't have full control over `t` and its acquired resources.
when hostOS == "windows":
@@ -348,28 +321,32 @@ when false:
unregisterThread(addr(t))
proc createThread*[TParam](t: var TThread[TParam],
tp: proc (param: TParam),
param: TParam,
stackSize = 1024*256*sizeof(int)) {.
magic: "CreateThread".} =
tp: proc (param: TParam) {.thread.},
param: TParam) =
## creates a new thread `t` and starts its execution. Entry point is the
## proc `tp`. `param` is passed to `tp`.
t.data = param
t.fn = tp
t.stackSize = stackSize
t.stackSize = ThreadStackSize
when hostOS == "windows":
var dummyThreadId: int32
t.sys = CreateThread(nil, stackSize, threadProcWrapper[TParam],
t.sys = CreateThread(nil, ThreadStackSize, threadProcWrapper[TParam],
addr(t), 0'i32, dummyThreadId)
if t.sys <= 0:
raise newException(EResourceExhausted, "cannot create thread")
else:
var a: Tpthread_attr
pthread_attr_init(a)
pthread_attr_setstacksize(a, stackSize)
pthread_attr_setstacksize(a, ThreadStackSize)
if pthread_create(t.sys, a, threadProcWrapper[TParam], addr(t)) != 0:
raise newException(EResourceExhausted, "cannot create thread")
when useStackMaskHack:
proc runMain(tp: proc (dummy: pointer) {.thread.}) {.compilerproc.} =
var mainThread: TThread[pointer]
createThread(mainThread, tp, nil)
joinThread(mainThread)
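The stack mask hack that runMain enables works because every thread stack (now including the main thread's) is aligned to its power-of-two size, so per-thread data can be found by masking any stack address with no OS TLS call. A minimal sketch of the address arithmetic:

const ThreadStackMask = 1024*256*sizeof(int) - 1   # as in the const section above

proc stackBase(spInsideStack: int): int =
  # assumes the stack is aligned to ThreadStackMask+1 bytes
  result = spInsideStack and not ThreadStackMask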
# --------------------------- lock handling ----------------------------------
type
@@ -380,18 +357,20 @@ const
when nodeadlocks:
var
deadlocksPrevented* = 0 ## counts the number of times a
deadlocksPrevented*: int ## counts the number of times a
## deadlock has been prevented
locksLen {.threadvar.}: int
locks {.threadvar.}: array [0..MaxLocksPerThread-1, pointer]
proc OrderedLocks(): bool =
for i in 0 .. locksLen-2:
if locks[i] >= locks[i+1]: return false
result = true
proc InitLock*(lock: var TLock) {.inline.} =
## Initializes the lock `lock`.
InitSysLock(lock)
proc OrderedLocks(g: PGcThread): bool =
for i in 0 .. g.locksLen-2:
if g.locks[i] >= g.locks[i+1]: return false
result = true
proc TryAcquire*(lock: var TLock): bool {.inline.} =
## Tries to acquire the lock `lock`. Returns `true` on success.
result = TryAcquireSys(lock)
@@ -399,88 +378,93 @@ proc TryAcquire*(lock: var TLock): bool {.inline.} =
if not result: return
# we have to add it to the ordered list. Oh, and we might fail if
# there is no space in the array left ...
var g = ThisThread()
if g.locksLen >= len(g.locks):
if locksLen >= len(locks):
ReleaseSys(lock)
raise newException(EResourceExhausted, "cannot acquire additional lock")
# find the position to add:
var p = addr(lock)
var L = g.locksLen-1
var L = locksLen-1
var i = 0
while i <= L:
assert g.locks[i] != nil
if g.locks[i] < p: inc(i) # in correct order
elif g.locks[i] == p: return # thread already holds lock
sysAssert locks[i] != nil
if locks[i] < p: inc(i) # in correct order
elif locks[i] == p: return # thread already holds lock
else:
# do the crazy stuff here:
while L >= i:
g.locks[L+1] = g.locks[L]
locks[L+1] = locks[L]
dec L
g.locks[i] = p
inc(g.locksLen)
assert OrderedLocks(g)
locks[i] = p
inc(locksLen)
sysAssert OrderedLocks()
return
# simply add to the end:
g.locks[g.locksLen] = p
inc(g.locksLen)
assert OrderedLocks(g)
locks[locksLen] = p
inc(locksLen)
sysAssert OrderedLocks()
proc Acquire*(lock: var TLock) =
## Acquires the lock `lock`.
when nodeadlocks:
var g = ThisThread()
var p = addr(lock)
var L = g.locksLen-1
var L = locksLen-1
var i = 0
while i <= L:
assert g.locks[i] != nil
if g.locks[i] < p: inc(i) # in correct order
elif g.locks[i] == p: return # thread already holds lock
sysAssert locks[i] != nil
if locks[i] < p: inc(i) # in correct order
elif locks[i] == p: return # thread already holds lock
else:
# do the crazy stuff here:
if g.locksLen >= len(g.locks):
if locksLen >= len(locks):
raise newException(EResourceExhausted,
"cannot acquire additional lock")
while L >= i:
ReleaseSys(cast[ptr TSysLock](g.locks[L])[])
g.locks[L+1] = g.locks[L]
ReleaseSys(cast[ptr TSysLock](locks[L])[])
locks[L+1] = locks[L]
dec L
# acquire the current lock:
AcquireSys(lock)
g.locks[i] = p
inc(g.locksLen)
locks[i] = p
inc(locksLen)
# acquire old locks in proper order again:
L = g.locksLen-1
L = locksLen-1
inc i
while i <= L:
AcquireSys(cast[ptr TSysLock](g.locks[i])[])
AcquireSys(cast[ptr TSysLock](locks[i])[])
inc(i)
# DANGER: We can only modify this global var if we gained every lock!
# NO! We need an atomic increment. Crap.
discard system.atomicInc(deadlocksPrevented, 1)
assert OrderedLocks(g)
sysAssert OrderedLocks()
return
# simply add to the end:
if g.locksLen >= len(g.locks):
if locksLen >= len(locks):
raise newException(EResourceExhausted, "cannot acquire additional lock")
AcquireSys(lock)
g.locks[g.locksLen] = p
inc(g.locksLen)
assert OrderedLocks(g)
locks[locksLen] = p
inc(locksLen)
sysAssert OrderedLocks()
else:
AcquireSys(lock)
proc Release*(lock: var TLock) =
## Releases the lock `lock`.
when nodeadlocks:
var g = ThisThread()
var p = addr(lock)
var L = g.locksLen
var L = locksLen
for i in countdown(L-1, 0):
if g.locks[i] == p:
for j in i..L-2: g.locks[j] = g.locks[j+1]
dec g.locksLen
if locks[i] == p:
for j in i..L-2: locks[j] = locks[j+1]
dec locksLen
break
ReleaseSys(lock)
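The deadlock prevention above hinges on one invariant: every thread holds its locks in increasing address order, so a cycle of waiters cannot form. Acquire enforces automatically what callers would otherwise do by hand; a hedged sketch of that manual discipline for two locks (`TLock`, `InitLock`, `Acquire` as declared above):

var a, b: TLock
InitLock(a)
InitLock(b)

proc lockBoth(x, y: var TLock) =
  # taking locks in address order means no two threads can ever
  # wait on each other's second lock
  if cast[int](addr(x)) < cast[int](addr(y)):
    Acquire(x)
    Acquire(y)
  else:
    Acquire(y)
    Acquire(x)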
# ------------------------ message passing support ---------------------------
proc getInBoxMem*[TMsg](t: var TThread[TMsg]): pointer {.inline.} =
result = addr(t.inbox)
proc getInBoxMem*(): pointer {.inline.} =
result = addr(cast[PGcThread](ThreadVarGetValue(globalsSlot)).inbox)

View File

@@ -14,7 +14,7 @@ var
proc doNothing() = nil
proc threadFunc(interval: tuple[a, b: int]) {.procvar.} =
proc threadFunc(interval: tuple[a, b: int]) {.thread.} =
doNothing()
for i in interval.a..interval.b:
when nodeadlocks:

View File

@@ -1,7 +1,15 @@
High priority (version 0.8.12)
==============================
* test threads on windows; thread analysis needs to be even more restrictive!
* implement message passing built-ins: channels/queues
* test threads on windows
* test thread analysis:
var x = globalString # ok, copied; `x` is mine!
vs
var x = globalRef # read access, `x` is theirs!
* test message passing built-ins
* make threadvar efficient again on linux after testing
* document Nimrod's threads
* document Nimrod's two phase symbol lookup for generics
* bug: {:}.toTable[int, string]()
@@ -11,6 +19,7 @@ version 0.9.0
- add --deadlock_prevention:on|off switch? timeout for locks?
- bug: tfFinal not passed to generic
- bug: forward proc for generic seems broken
- ``var T`` as a return type; easy to prove that location is not on the stack
- test the sort implementation again
- warning for implicit openArray -> varargs convention
- implement explicit varargs
@@ -74,7 +83,6 @@ Low priority
- ``when T is int`` for generic code
- ``when validCode( proc () )`` for generic code
- macros: ``typecheck`` pragma; this allows transformations based on types!
- find a way for easy constructors and destructors; (destructors are much more
important than constructors)
- code generated for type information is wasteful

View File

@@ -56,6 +56,7 @@ Additions
- Added ``lists`` module which contains generic linked lists.
- Added ``sets`` module which contains generic hash sets.
- Added ``tables`` module which contains generic hash tables.
- Added ``queues`` module which contains generic sequence based queues.
- Added ``intsets`` module which contains a specialized int set data type.
- Added ``scgi`` module.
- Added ``smtp`` module.

View File

@@ -39,7 +39,7 @@ srcdoc: "pure/xmlparser;pure/htmlparser;pure/xmltree;pure/colors"
srcdoc: "pure/json;pure/base64;pure/scgi;pure/redis;impure/graphics"
srcdoc: "impure/rdstdin;wrappers/zmq;wrappers/sphinx"
srcdoc: "pure/collections/tables;pure/collections/sets;pure/collections/lists"
srcdoc: "pure/collections/intsets;pure/encodings"
srcdoc: "pure/collections/intsets;pure/collections/queues;pure/encodings"
webdoc: "wrappers/libcurl;pure/md5;wrappers/mysql;wrappers/iup"
webdoc: "wrappers/sqlite3;wrappers/postgres;wrappers/tinyc"