initial non-compiling version of 'parallel'

2026-06-06 03:44:14 +00:00 · 2014-05-12 11:12:37 +02:00
parent bdb2d21f27
commit 6195dbe491
16 changed files with 1058 additions and 91 deletions
--- a/compiler/guards.nim
+++ b/compiler/guards.nim
@@ -9,7 +9,8 @@

 ## This module implements the 'implies' relation for guards.

-import ast, astalgo, msgs, magicsys, nimsets, trees, types, renderer, idents
+import ast, astalgo, msgs, magicsys, nimsets, trees, types, renderer, idents,
+  saturate

 const
  someEq = {mEqI, mEqI64, mEqF64, mEqEnum, mEqCh, mEqB, mEqRef, mEqProc,
@@ -25,6 +26,17 @@ const

  someIn = {mInRange, mInSet}

+  someHigh = {mHigh}
+  # we don't list unsigned here because wrap around semantics suck for
+  # proving anything:
+  someAdd = {mAddI, mAddI64, mAddF64, mSucc}
+  someSub = {mSubI, mSubI64, mSubF64, mPred}
+  someMul = {mMulI, mMulI64, mMulF64}
+  someDiv = {mDivI, mDivI64, mDivF64}
+  someMod = {mModI, mModI64}
+  someMax = {mMaxI, mMaxI64, mMaxF64}
+  someMin = {mMinI, mMinI64, mMinF64}
+
 proc isValue(n: PNode): bool = n.kind in {nkCharLit..nkNilLit}
 proc isLocation(n: PNode): bool = not n.isValue

@@ -69,19 +81,24 @@ proc isLetLocation(m: PNode, isApprox: bool): bool =

 proc interestingCaseExpr*(m: PNode): bool = isLetLocation(m, true)

-proc getMagicOp(name: string, m: TMagic): PSym =
+proc createMagic*(name: string, m: TMagic): PSym =
  result = newSym(skProc, getIdent(name), nil, unknownLineInfo())
  result.magic = m

 let
-  opLe = getMagicOp("<=", mLeI)
-  opLt = getMagicOp("<", mLtI)
-  opAnd = getMagicOp("and", mAnd)
-  opOr = getMagicOp("or", mOr)
-  opNot = getMagicOp("not", mNot)
-  opIsNil = getMagicOp("isnil", mIsNil)
-  opContains = getMagicOp("contains", mInSet)
-  opEq = getMagicOp("==", mEqI)
+  opLe = createMagic("<=", mLeI)
+  opLt = createMagic("<", mLtI)
+  opAnd = createMagic("and", mAnd)
+  opOr = createMagic("or", mOr)
+  opNot = createMagic("not", mNot)
+  opIsNil = createMagic("isnil", mIsNil)
+  opContains = createMagic("contains", mInSet)
+  opEq = createMagic("==", mEqI)
+  opAdd = createMagic("+", mAddI)
+  opSub = createMagic("-", mSubI)
+  opMul = createMagic("*", mMulI)
+  opDiv = createMagic("div", mDivI)
+  opLen = createMagic("len", mLengthSeq)

 proc swapArgs(fact: PNode, newOp: PSym): PNode =
  result = newNodeI(nkCall, fact.info, 3)
@@ -137,17 +154,118 @@ proc neg(n: PNode): PNode =
    result.sons[0] = newSymNode(opNot)
    result.sons[1] = n

-proc buildIsNil(arg: PNode): PNode =
-  result = newNodeI(nkCall, arg.info, 2)
-  result.sons[0] = newSymNode(opIsNil)
-  result.sons[1] = arg
+proc buildCall(op: PSym; a: PNode): PNode =
+  ## Builds the unary call node ``op(a)``. The new node reuses the line
+  ## information of ``a``.
+  result = newNodeI(nkCall, a.info, 2)
+  result.sons[0] = newSymNode(op)
+  result.sons[1] = a
+
+proc buildCall(op: PSym; a, b: PNode): PNode =
+  ## Builds the binary call node ``op(a, b)``. The new node reuses the line
+  ## information of ``a``.
+  result = newNodeI(nkCall, a.info, 3)
+  result.sons[0] = newSymNode(op)
+  result.sons[1] = a
+  result.sons[2] = b
+
+proc `+@`*(a: PNode; b: BiggestInt): PNode =
+  ## Builds the call ``a + b`` (using ``opAdd``) where ``b`` is wrapped in an
+  ## ``nkIntLit`` node. The result is not canonicalized.
+  opAdd.buildCall(a, nkIntLit.newIntNode(b))
+
+proc `|+|`(a, b: PNode): PNode =
+  ## Folds the two literal nodes ``a`` and ``b`` into their sum; the result is
+  ## a copy of ``a`` that carries the new value. Integer-like literals are
+  ## combined with ``|+|`` on their ``intVal`` (presumably the saturating
+  ## addition of the 'saturate' module -- confirm), all others as floats.
+  result = copyNode(a)
+  if a.kind in {nkCharLit..nkUInt64Lit}: result.intVal = a.intVal |+| b.intVal
+  else: result.floatVal = a.floatVal + b.floatVal
+
+proc `|*|`(a, b: PNode): PNode =
+  ## Folds the two literal nodes ``a`` and ``b`` into their product; the
+  ## result is a copy of ``a`` that carries the new value. Integer-like
+  ## literals are combined with ``|*|`` on their ``intVal`` (presumably the
+  ## saturating multiplication of the 'saturate' module -- confirm), all
+  ## others as floats.
+  result = copyNode(a)
+  if a.kind in {nkCharLit..nkUInt64Lit}: result.intVal = a.intVal |*| b.intVal
+  else: result.floatVal = a.floatVal * b.floatVal
+
+# Fresh ``nkIntLit`` nodes for the constants 0, 1 and -1. Every call creates
+# a new node.
+proc zero(): PNode = nkIntLit.newIntNode(0)
+proc one(): PNode = nkIntLit.newIntNode(1)
+proc minusOne(): PNode = nkIntLit.newIntNode(-1)
+
+# Lower index bound of ``x`` as an int literal: ``firstOrd`` of ``x``'s type.
+proc lowBound*(x: PNode): PNode = nkIntLit.newIntNode(firstOrd(x.typ))
+proc highBound*(x: PNode): PNode =
+  ## Upper index bound of ``x``: for arrays the int literal ``lastOrd`` of the
+  ## type, for everything else the symbolic expression ``len(x) + (-1)``.
+  if x.typ.skipTypes(abstractInst).kind == tyArray:
+    nkIntLit.newIntNode(lastOrd(x.typ))
+  else:
+    opAdd.buildCall(opLen.buildCall(x), minusOne())
+
+proc canon*(n: PNode): PNode =
+  ## Rewrites the expression ``n`` into the canonical form that ``proveLe``
+  ## and ``addFactLe`` work on. The sons are canonicalized first; afterwards
+  ## the following rewrites are applied to the node itself:
+  ##
+  ## * symmetric operators get their literal operand moved to the right
+  ## * ``high(x)`` becomes ``len(x) + (-1)``
+  ## * ``-x`` becomes ``x * (-1)``
+  ## * ``x - lit`` becomes ``x + (-lit)``
+  ## * every 'len' magic is mapped to the single ``opLen`` symbol
+  ## * ``(x + a) + b`` and ``(x * a) * b`` are folded for literals ``a``, ``b``
+  ## * in ``<`` and ``<=`` a literal offset is moved away from a let location
+  # XXX for now only the new code in 'semparallel' uses this
+  if n.safeLen >= 1:
+    result = newNodeI(n.kind, n.info, n.len)
+    for i in 0 .. < n.safeLen:
+      result.sons[i] = canon(n.sons[i])
+  else:
+    result = n
+  case result.getMagic
+  of someEq, someAdd, someMul, someMin, someMax:
+    # these are symmetric; put value as last:
+    if result.sons[1].isValue and not result.sons[2].isValue:
+      result = swapArgs(result, result.sons[0].sym)
+      # (4 + foo) + 2 --> (foo + 4) + 2
+  of someHigh:
+    # high == len+(-1)
+    result = opAdd.buildCall(opLen.buildCall(result[1]), minusOne())
+  of mUnaryMinusI, mUnaryMinusI64:
+    # -x  -->  x * (-1); rewriting it to 'x + (-1)' would change the value
+    result = buildCall(opMul, result[1], minusOne())
+  of someSub:
+    # x - 4  -->  x + (-4)
+    var b = result[2]
+    # low(BiggestInt) cannot be negated without overflow, so it is left alone
+    if b.kind in {nkCharLit..nkUInt64Lit} and b.intVal != low(BiggestInt):
+      b = copyNode(b)
+      b.intVal = -b.intVal
+      result = buildCall(opAdd, result[1], b)
+    elif b.kind in {nkFloatLit..nkFloat64Lit}:
+      b = copyNode(b)
+      b.floatVal = -b.floatVal
+      result = buildCall(opAdd, result[1], b)
+  of someLen:
+    result.sons[0] = opLen.newSymNode
+  else: discard
+
+  # re-association:
+  # (foo+5)+5 --> foo+10;  same for '*'
+  case result.getMagic
+  of someAdd:
+    if result[2].isValue and
+        result[1].getMagic in someAdd and result[1][2].isValue:
+      result = opAdd.buildCall(result[1][1], result[1][2] |+| result[2])
+  of someMul:
+    if result[2].isValue and
+        result[1].getMagic in someMul and result[1][2].isValue:
+      # (foo*2)*3 --> foo*6; the folded node has to stay a multiplication
+      result = opMul.buildCall(result[1][1], result[1][2] |*| result[2])
+  else: discard
+
+  # most important rule: (x-4) < a.len -->  x < a.len+4
+  case result.getMagic
+  of someLe, someLt:
+    let x = result[1]
+    let y = result[2]
+    if x.kind in nkCallKinds and x.len == 3 and x[2].isValue and
+        isLetLocation(x[1], true):
+      case x.getMagic
+      of someSub:
+        result = buildCall(result[0].sym, x[1], opAdd.buildCall(y, x[2]))
+      of someAdd:
+        result = buildCall(result[0].sym, x[1], opSub.buildCall(y, x[2]))
+      else: discard
+    elif y.kind in nkCallKinds and y.len == 3 and y[2].isValue and
+        isLetLocation(y[1], true):
+      # a.len < x-3  -->  a.len+3 < x
+      # The let location stays on the right-hand side; putting it on the
+      # left would reverse the inequality.
+      case y.getMagic
+      of someSub:
+        result = buildCall(result[0].sym, opAdd.buildCall(x, y[2]), y[1])
+      of someAdd:
+        result = buildCall(result[0].sym, opSub.buildCall(x, y[2]), y[1])
+      else: discard
+  else: discard

 proc usefulFact(n: PNode): PNode =
  case n.getMagic
  of someEq:
    if skipConv(n.sons[2]).kind == nkNilLit and (
        isLetLocation(n.sons[1], false) or isVar(n.sons[1])):
-      result = buildIsNil(n.sons[1])
+      result = opIsNil.buildCall(n.sons[1])
    else:
      if isLetLocation(n.sons[1], true) or isLetLocation(n.sons[2], true):
        # XXX algebraic simplifications!  'i-1 < a.len' --> 'i < a.len+1'
@@ -217,7 +335,7 @@ proc addFactNeg*(m: var TModel, n: PNode) =
  let n = n.neg
  if n != nil: addFact(m, n)

-proc sameTree(a, b: PNode): bool = 
+proc sameTree*(a, b: PNode): bool = 
  result = false
  if a == b:
    result = true
@@ -519,7 +637,46 @@ proc doesImply*(facts: TModel, prop: PNode): TImplication =
      if result != impUnknown: return

 proc impliesNotNil*(facts: TModel, arg: PNode): TImplication =
-  result = doesImply(facts, buildIsNil(arg).neg)
+  result = doesImply(facts, opIsNil.buildCall(arg).neg)
+
+proc proveLe*(m: TModel; a, b: PNode): TImplication =
+  ## Tries to prove ``a <= b``. The comparison is canonicalized with
+  ## ``canon`` first, then a couple of hard-coded axioms are tried and
+  ## finally the facts stored in ``m`` are consulted via ``doesImply``.
+  ## Returns ``impYes``/``impNo`` if the relation could be decided and
+  ## whatever ``doesImply`` returns otherwise.
+  let res = canon(opLe.buildCall(a, b))
+  # we hardcode lots of axioms here:
+  let a = res[1]
+  let b = res[2]
+  #   0 <= 3
+  if a.isValue and b.isValue:
+    return if leValue(a, b): impYes else: impNo
+
+  # use type information too:  x <= 4  if  high(x) <= 4
+  if b.isValue and a.typ != nil and a.typ.isOrdinalType:
+    if lastOrd(a.typ) <= b.intVal: return impYes
+  # 3 <= x   if  3 <= low(x)
+  # (the literal has to be at or below the smallest value 'x' can take)
+  if a.isValue and b.typ != nil and b.typ.isOrdinalType:
+    if a.intVal <= firstOrd(b.typ): return impYes
+
+  # x <= x
+  if sameTree(a, b): return impYes
+
+  #   x <= x+c  iff 0 <= c
+  if b.getMagic in someAdd and sameTree(a, b[1]):
+    return proveLe(m, zero(), b[2])
+
+  #   x <= x*c  if  1 <= c and 0 <= x:
+  if b.getMagic in someMul and sameTree(a, b[1]):
+    if proveLe(m, one(), b[2]) == impYes and proveLe(m, zero(), a) == impYes:
+      return impYes
+
+  #   x div c <= x   if   1 <= c  and  0 <= x:
+  if a.getMagic in someDiv and sameTree(a[1], b):
+    if proveLe(m, one(), a[2]) == impYes and proveLe(m, zero(), b) == impYes:
+      return impYes
+
+  # use the knowledge base:
+  return doesImply(m, res)
+
+proc addFactLe*(m: var TModel; a, b: PNode) =
+  ## Adds the fact ``a <= b`` to ``m``. The fact is stored in canonical form
+  ## (see ``canon``).
+  m.add canon(opLe.buildCall(a, b))

 proc settype(n: PNode): PType =
  result = newType(tySet, n.typ.owner)
--- a/compiler/lowerings.nim
+++ b/compiler/lowerings.nim
@@ -114,11 +114,15 @@ proc callCodegenProc*(name: string, arg1: PNode;
    if arg3 != nil: result.add arg3

 proc createWrapperProc(f: PNode; threadParam, argsParam: PSym;
-                       varSection, call: PNode): PSym =
+                       varSection, call, barrier: PNode): PSym =
  var body = newNodeI(nkStmtList, f.info)
  body.add varSection
+  if barrier != nil:
+    body.add callCodeGenProc("barrierEnter", barrier)
  body.add callCodeGenProc("nimArgsPassingDone", newSymNode(threadParam))
  body.add call
+  if barrier != nil:
+    body.add callCodeGenProc("barrierLeave", barrier)

  var params = newNodeI(nkFormalParams, f.info)
  params.add emptyNode
@@ -146,7 +150,7 @@ proc createCastExpr(argsParam: PSym; objType: PType): PNode =
  result.typ = newType(tyPtr, objType.owner)
  result.typ.rawAddSon(objType)

-proc wrapProcForSpawn*(owner: PSym; n: PNode): PNode =
+proc wrapProcForSpawn*(owner: PSym; n: PNode; barrier: PNode = nil): PNode =
  result = newNodeI(nkStmtList, n.info)
  if n.kind notin nkCallKinds or not n.typ.isEmptyType:
    localError(n.info, "'spawn' takes a call expression of type void")
@@ -162,6 +166,7 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode): PNode =
    threadParam.typ = ptrType
    argsParam.typ = ptrType
    argsParam.position = 1
+
  var objType = createObj(owner, n.info)
  incl(objType.flags, tfFinal)
  let castExpr = createCastExpr(argsParam, objType)
@@ -223,6 +228,17 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode): PNode =

    call.add(newSymNode(temp))

-  let wrapper = createWrapperProc(fn, threadParam, argsParam, varSection, call)
+  var barrierAsExpr: PNode = nil
+  if barrier != nil:
+    let typ = newType(tyPtr, owner)
+    typ.rawAddSon(magicsys.getCompilerProc("Barrier").typ)
+    var field = newSym(skField, getIdent"barrier", owner, n.info)
+    field.typ = typ
+    objType.addField(field)
+    result.add newFastAsgnStmt(newDotExpr(scratchObj, field), barrier)
+    barrierAsExpr = indirectAccess(castExpr, field, n.info)
+
+  let wrapper = createWrapperProc(fn, threadParam, argsParam, varSection, call,
+                                  barrierAsExpr)
  result.add callCodeGenProc("nimSpawn", wrapper.newSymNode,
                             genAddrOf(scratchObj.newSymNode))
--- a/compiler/semparallel.nim
+++ b/compiler/semparallel.nim
@@ -0,0 +1,414 @@
+#
+#
+#           The Nimrod Compiler
+#        (c) Copyright 2014 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## Semantic checking for 'parallel'.
+
+# - slices should become "nocopy" to openArray (+)
+#   - need to perform bound checks (+)
+#
+# - parallel needs to insert a barrier (+)
+# - passed arguments need to be ensured to be "const"
+#   - what about 'f(a)'? --> f shouldn't have side effects anyway
+# - passed arrays need to be ensured not to alias
+# - passed slices need to be ensured to be disjoint (+)
+# - output slices need special logic
+
+import lowerings, guards, sempass2
+
+discard """
+
+one major problem:
+  spawn f(a[i])
+  inc i
+  spawn f(a[i])
+is valid, but
+  spawn f(a[i])
+  spawn f(a[i])
+  inc i
+is not! However, 
+  spawn f(a[i])
+  if guard: inc i
+  spawn f(a[i])
+is not valid either! --> We need a flow dependent analysis here.
+
+However:
+  while foo:
+    spawn f(a[i])
+    inc i
+    spawn f(a[i])
+
+Is not valid either! --> We should really restrict 'inc' to loop endings?
+
+The heuristic that we implement here (that has no false positives) is: Usage
+of 'i' in a slice *after* we determined the stride is invalid!
+"""
+
+type
+  TDirection = enum
+    ascending, descending
+  MonotonicVar = object   # bookkeeping for one tracked local variable
+    v: PSym               # the variable itself
+    lower, upper, stride: PNode # 'lower' is taken from the initializer in
+                          # the var section, 'stride' from an 'inc'/'+='
+    dir: TDirection
+    blacklisted: bool     # blacklisted variables that are not monotonic
+  AnalysisCtx = object
+    locals: seq[MonotonicVar]
+    # every analysed slice 'x[a..b]' together with the spawn it belongs to
+    # and whether it occurred inside a loop:
+    slices: seq[tuple[x,a,b: PNode, spawnId: int, inLoop: bool]]
+    guards: TModel      # nested guards
+    args: seq[PSym]     # args must be deeply immutable
+    spawns: int         # we can check that at least 1 spawn is used in
+                        # the 'parallel' section
+    currentSpawnId: int
+    inLoop: int         # nesting depth of 'while' loops
+
+let opSlice = createMagic("slice", mSlice)
+
+proc initAnalysisCtx(): AnalysisCtx =
+  ## Returns a context with all sequences initialized to empty; the counters
+  ## keep their default value.
+  result.locals = @[]
+  result.slices = @[]
+  result.args = @[]
+  result.guards = @[]
+
+proc getSlot(c: var AnalysisCtx; s: PSym): ptr MonotonicVar =
+  ## Returns a pointer to the slot of ``s`` in ``c.locals``. If ``s`` has no
+  ## slot yet, a new one is appended; apart from ``v`` its fields are left
+  ## at their defaults.
+  ## NOTE(review): the pointer refers into the seq, so it presumably must
+  ## not be kept across another ``getSlot`` call that grows ``c.locals``.
+  var L = c.locals.len
+  for i in 0.. <L:
+    if c.locals[i].v == s: return addr(c.locals[i])
+  c.locals.setLen(L+1)
+  c.locals[L].v = s
+  return addr(c.locals[L])
+
+proc getRoot(n: PNode): PSym =
+  ## ``getRoot`` takes a *path* ``n``. A path is an lvalue expression
+  ## like ``obj.x[i].y``. The *root* of a path is the symbol that can be
+  ## determined as the owner; ``obj`` in the example.
+  ## The result is nil if no root can be determined.
+  case n.kind
+  of nkSym:
+    # only variable-like symbols are accepted as a root
+    if n.sym.kind in {skVar, skResult, skTemp, skLet, skForVar}:
+      result = n.sym
+  of nkDotExpr, nkBracketExpr, nkHiddenDeref, nkDerefExpr,
+      nkObjUpConv, nkObjDownConv, nkCheckedFieldExpr:
+    result = getRoot(n.sons[0])
+  of nkHiddenStdConv, nkHiddenSubConv, nkConv:
+    # for conversions the converted expression is the second son
+    result = getRoot(n.sons[1])
+  of nkCallKinds:
+    # a call only has a root if it is the 'slice' magic
+    if getMagic(n) == mSlice: result = getRoot(n.sons[1])
+  else: discard
+
+proc gatherArgs(c: var AnalysisCtx; n: PNode) =
+  ## Recursively collects the roots (see ``getRoot``) of all sons of ``n``
+  ## in ``c.args``. A root that is already contained is not added again.
+  for i in 0.. <n.safeLen:
+    let root = getRoot n[i]
+    if root != nil:
+      block addRoot:
+        # linear search; leaving the block skips the 'add' below
+        for r in items(c.args):
+          if r == root: break addRoot
+        c.args.add root
+    gatherArgs(c, n[i])
+
+proc isLocal(s: PSym): bool =
+  ## True if ``s`` is a variable-like symbol (result, temp, for-var, var or
+  ## let) that is neither flagged as global nor as having its address taken.
+  s.kind in {skResult, skTemp, skForVar, skVar, skLet} and
+        {sfAddrTaken, sfGlobal} * s.flags == {}
+
+proc checkLocal(c: var AnalysisCtx; n: PNode) =
+  ## Reports an error for every local variable occurring in ``n`` whose
+  ## stride is already known, that is, which is used after its increment
+  ## has been analysed.
+  if n.kind == nkSym and isLocal(n.sym):
+    # a symbol node has no sons; the slot is keyed by the symbol itself
+    let slot = c.getSlot(n.sym)
+    if slot.stride != nil:
+      localError(n.info, "invalid usage of counter after increment")
+  else:
+    for i in 0 .. <n.safeLen: checkLocal(c, n.sons[i])
+
+proc checkLe(c: AnalysisCtx; a, b: PNode) =
+  ## Reports an error unless ``a <= b`` can be proven from the guards that
+  ## are currently active in ``c``. The error is attached to the line
+  ## information of ``a``.
+  case proveLe(c.guards, a, b)
+  of impUnknown:
+    localError(a.info, "cannot prove: " & a.renderTree & " <= " & b.renderTree)
+  of impYes: discard
+  of impNo:
+    localError(a.info, "can prove: " & a.renderTree & " > " & b.renderTree)
+
+proc checkBounds(c: AnalysisCtx; arr, idx: PNode) =
+  ## Statically checks ``lowBound(arr) <= idx <= highBound(arr)``; every
+  ## half that cannot be proven is reported by ``checkLe``.
+  checkLe(c, arr.lowBound, idx)
+  checkLe(c, idx, arr.highBound)
+
+proc addLowerBoundAsFacts(c: var AnalysisCtx) =
+  ## Adds the fact ``lower <= v`` to ``c.guards`` for every tracked local
+  ## that is not blacklisted and has a known lower bound.
+  for v in c.locals:
+    # slots created by 'getSlot' for variables without an analysed
+    # initializer have no lower bound; no fact can be derived for them
+    if not v.blacklisted and v.lower != nil:
+      c.guards.addFactLe(v.lower, newSymNode(v.v))
+
+proc addSlice(c: var AnalysisCtx; n: PNode; x, le, ri: int) =
+  ## Registers the slice found in ``n``. ``x``, ``le`` and ``ri`` are the
+  ## indexes of the sons of ``n`` that hold the sliced expression and its
+  ## left and right bound. Both bounds are bounds-checked statically right
+  ## away; the slice is then stored in ``c.slices`` together with the current
+  ## spawn id and whether it occurs inside a loop.
+  checkLocal(c, n)
+  # from here on the names denote the nodes, not the indexes:
+  let le = n.sons[le]
+  let ri = n.sons[ri]
+  let x = n.sons[x]
+  # perform static bounds checking here; and not later!
+  let oldState = c.guards.len
+  addLowerBoundAsFacts(c)
+  c.checkBounds(x, le)
+  c.checkBounds(x, ri)
+  # drop the temporarily added lower bound facts again:
+  c.guards.setLen(oldState)
+  c.slices.add((x, le, ri, c.currentSpawnId, c.inLoop > 0))
+
+# shortcut for rendering a node in the error messages below
+template `?`(x): expr = x.renderTree
+
+proc overlap(m: TModel; x,y,c,d: PNode) =
+  ## Reports an error unless the ranges ``x..y`` and ``c..d`` can be proven
+  ## to be disjoint under the facts in ``m``. Errors are attached to the line
+  ## information of ``x``.
+  #  X..Y and C..D overlap iff (X <= D and C <= Y)
+  case proveLe(m, x, d)
+  of impUnknown:
+    # 'x > d' cannot be proven, but 'c > y' alone implies disjointness too:
+    if proveLe(m, c, y) != impNo:
+      localError(x.info,
+        "cannot prove: $# > $#; required for $#..$# disjoint from $#..$#" %
+          [?x, ?d, ?x, ?y, ?c, ?d])
+  of impYes:
+    case proveLe(m, c, y)
+    of impUnknown:
+      localError(x.info,
+        "cannot prove: $# > $#; required for $#..$# disjoint from $#..$#" %
+          [?c, ?y, ?x, ?y, ?c, ?d])
+    of impYes:
+      localError(x.info, "$#..$# not disjoint from $#..$#" % [?x, ?y, ?c, ?d])
+    of impNo: discard
+  of impNo: discard
+
+proc stride(c: AnalysisCtx; n: PNode): BiggestInt =
+  ## Sums up the known strides of all local variables that occur in ``n``.
+  ## Variables without a known stride contribute 0.
+  # note: 0 if it cannot be determined is just right because then
+  # we analyse 'i..i' and 'i+0 .. i+0' and these are not disjoint!
+  if n.kind == nkSym and isLocal(n.sym):
+    # read-only lookup: 'c' is not a 'var' parameter, so 'getSlot' (which
+    # may append a new slot) cannot be used here
+    for v in c.locals:
+      if v.v == n.sym:
+        if v.stride != nil: result = v.stride.intVal
+        break
+  else:
+    for i in 0 .. <n.safeLen: inc(result, stride(c, n.sons[i]))
+
+proc checkSlicesAreDisjoint(c: var AnalysisCtx) =
+  ## Checks all slices collected in ``c.slices``: a slice used in a loop is
+  ## checked against itself shifted by the stride, and slices of the same
+  ## expression that belong to different spawns are checked pairwise.
+  # this is the only thing that we need to perform after we have traversed
+  # the whole tree so that the strides are available.
+  # First we need to add all the computed lower bounds:
+  addLowerBoundAsFacts(c)
+  # Every slice used in a loop needs to be disjoint with itself:
+  for x,a,b,id,inLoop in items(c.slices):
+    if inLoop: overlap(c.guards, a,b, a+@c.stride(a), b+@c.stride(b))
+  # Another tricky example is:
+  #   while true:
+  #     spawn f(a[i])
+  #     spawn f(a[i+1])
+  #     inc i  # inc i, 2  would be correct here
+  #
+  # Or even worse:
+  #   while true:
+  #     spawn f(a[i+1 .. i+3])
+  #     spawn f(a[i+4 .. i+5])
+  #     inc i, 4
+  # Prove that i*k*stride + 3 != i*k'*stride + 5
+  # For the correct example this amounts to
+  #   i*k*2 != i*k'*2 + 1
+  # which is true.
+  # For now, we don't try to prove things like that at all, even though it'd
+  # be feasible for many useful examples. Instead we attach the slice to
+  # a spawn and if the attached spawns differ, we bail out:
+  for i in 0 .. high(c.slices):
+    for j in 0 .. high(c.slices):
+      let x = c.slices[i]
+      let y = c.slices[j]
+      if i != j and x.spawnId != y.spawnId and guards.sameTree(x.x, y.x):
+        if not x.inLoop and not y.inLoop:
+          overlap(c.guards, x.a, x.b, y.a, y.b)
+        else:
+          # ah, I cannot resist the temptation to add another sweet heuristic:
+          # if both slices have the form (i+c)..(i+c)  and (i+d)..(i+d) we
+          # check they are disjoint and c <= stride and d <= stride:
+          # XXX
+          localError(x.x.info, "cannot prove $#..$# disjoint from $#..$#" %
+            [?x.a, ?x.b, ?y.a, ?y.b])
+
+proc analyse(c: var AnalysisCtx; n: PNode)
+
+proc analyseSons(c: var AnalysisCtx; n: PNode) =
+  ## Runs ``analyse`` on every son of ``n``, in order.
+  for i in 0 .. <safeLen(n): analyse(c, n[i])
+
+proc min(a, b: PNode): PNode =
+  ## Returns the literal node with the smaller ``intVal``. A nil ``a`` yields
+  ## ``b``; ``b`` itself must not be nil.
+  if a.isNil: result = b
+  elif a.intVal < b.intVal: result = a
+  else: result = b
+
+proc analyseCall(c: var AnalysisCtx; n: PNode; op: PSym) =
+  ## Analyses the direct call ``n`` of the routine ``op``:
+  ##
+  ## * ``spawn`` gets a fresh spawn id and its arguments are gathered
+  ## * ``inc``/``+=`` with a positive literal on a local records the stride
+  ## * system's ``[]`` and ``[]=`` are registered as slices
+  ##
+  ## In every case the sons of ``n`` are analysed too.
+  if op.magic == mSpawn:
+    inc c.spawns
+    let oldSpawnId = c.currentSpawnId
+    c.currentSpawnId = c.spawns
+    gatherArgs(c, n[1])
+    analyseSons(c, n)
+    c.currentSpawnId = oldSpawnId
+  elif op.magic == mInc or (op.name.s == "+=" and sfSystemModule in op.owner.flags):
+    if n.len > 2 and n[1].kind == nkSym and n[1].sym.isLocal:
+      # n[1] is the counter, n[2] the amount it is incremented by
+      let incr = n[2].skipConv
+      if incr.kind in {nkCharLit..nkUInt32Lit} and incr.intVal > 0:
+        let slot = c.getSlot(n[1].sym)
+        # several increments of the same counter: keep the smallest one
+        slot.stride = min(slot.stride, incr)
+    analyseSons(c, n)
+  elif op.name.s == "[]" and sfSystemModule in op.owner.flags:
+    # NOTE(review): assumes the bounds are the sons 2 and 3 of the call --
+    # confirm against the AST that sem produces for 'a[x..y]'
+    c.addSlice(n, 1, 2, 3)
+    analyseSons(c, n)
+  elif op.name.s == "[]=" and sfSystemModule in op.owner.flags:
+    c.addSlice(n, 1, 2, 3)
+    analyseSons(c, n)
+  else:
+    analyseSons(c, n)
+
+proc analyseCase(c: var AnalysisCtx; n: PNode) =
+  ## Analyses a ``case`` statement: first the selector, then every branch
+  ## under the facts that ``addCaseBranchFacts`` derives for it. The guards
+  ## are reset to their previous length before each branch and at the end.
+  analyse(c, n.sons[0])
+  #let oldState = c.locals.len
+  let oldFacts = c.guards.len
+  for i in 1.. <n.len:
+    let branch = n.sons[i]
+    #setLen(c.locals, oldState)
+    setLen(c.guards, oldFacts)
+    addCaseBranchFacts(c.guards, n, i)
+    for i in 0 .. <branch.len:
+      analyse(c, branch.sons[i])
+  #setLen(c.locals, oldState)
+  setLen(c.guards, oldFacts)
+
+proc analyseIf(c: var AnalysisCtx; n: PNode) =
+  ## Analyses an ``if`` statement/expression. The first branch is analysed
+  ## with its condition added as a fact; every later branch is analysed with
+  ## the negated conditions of all preceding branches plus its own condition
+  ## (if it has one). The guards are reset to their previous length at the
+  ## end.
+  analyse(c, n.sons[0].sons[0])
+  let oldFacts = c.guards.len
+  addFact(c.guards, n.sons[0].sons[0])
+  #let oldState = c.locals.len
+
+  analyse(c, n.sons[0].sons[1])
+  for i in 1.. <n.len:
+    let branch = n.sons[i]
+    setLen(c.guards, oldFacts)
+    for j in 0..i-1:
+      addFactNeg(c.guards, n.sons[j].sons[0])
+    # a branch with more than one son carries a condition as its first son
+    if branch.len > 1:
+      addFact(c.guards, branch.sons[0])
+    #setLen(c.locals, oldState)
+    for i in 0 .. <branch.len:
+      analyse(c, branch.sons[i])
+  #setLen(c.locals, oldState)
+  setLen(c.guards, oldFacts)
+
+proc analyse(c: var AnalysisCtx; n: PNode) =
+  ## First pass over the body of a 'parallel' section. Collects the tracked
+  ## locals, slices and spawns in ``c`` and keeps ``c.guards`` in sync with
+  ## the control flow. Control flow that is not supported inside 'parallel'
+  ## (``return``, ``raise``, ``try``) is reported as an error.
+  case n.kind
+  of nkAsgn, nkFastAsgn:
+    # since we already ensure sfAddrTaken is not in s.flags, we only need to
+    # prevent direct assignments to the monotonic variable:
+    if n[0].kind == nkSym and n[0].sym.isLocal:
+      let slot = c.getSlot(n[0].sym)
+      slot.blacklisted = true
+    invalidateFacts(c.guards, n.sons[0])
+    analyseSons(c, n)
+    addAsgnFact(c.guards, n.sons[0], n.sons[1])
+  of nkCallKinds:
+    # direct call:
+    if n[0].kind == nkSym: analyseCall(c, n, n[0].sym)
+    else: analyseSons(c, n)
+  of nkBracketExpr:
+    # plain index expression 'a[i]': treated as the slice 'a[i..i]'
+    c.addSlice(n, 0, 1, 1)
+    analyseSons(c, n)
+  of nkReturnStmt, nkRaiseStmt, nkTryStmt:
+    localError(n.info, "invalid control flow for 'parallel'")
+    # 'break' that leaves the 'parallel' section is not valid either
+    # or maybe we should generate a 'try' XXX
+  of nkVarSection:
+    for it in n:
+      # only definitions with an initializer provide a lower bound
+      if it.sons[it.len-1].kind != nkEmpty:
+        for j in 0 .. it.len-3:
+          if it[j].kind == nkSym and it[j].sym.isLocal:
+            let slot = c.getSlot(it[j].sym)
+            if slot.lower.isNil: slot.lower = it.sons[it.len-1]
+            else: internalError(it.info, "slot already has a lower bound")
+    analyseSons(c, n)
+
+  of nkCaseStmt: analyseCase(c, n)
+  of nkIfStmt, nkIfExpr: analyseIf(c, n)
+  of nkWhileStmt:
+    analyse(c, n.sons[0])
+    # 'while true' loop?
+    inc c.inLoop
+    if isTrue(n.sons[0]):
+      analyseSons(c, n.sons[1])
+    else:
+      # loop may never execute:
+      let oldState = c.locals.len
+      let oldFacts = c.guards.len
+      addFact(c.guards, n.sons[0])
+      analyse(c, n.sons[1])
+      setLen(c.locals, oldState)
+      setLen(c.guards, oldFacts)
+      # we know after the loop the negation holds:
+      if not containsNode(n.sons[1], nkBreakStmt):
+        addFactNeg(c.guards, n.sons[0])
+    dec c.inLoop
+  of nkTypeSection, nkProcDef, nkConverterDef, nkMethodDef, nkIteratorDef,
+      nkMacroDef, nkTemplateDef, nkConstSection, nkPragma:
+    discard
+  else:
+    analyseSons(c, n)
+
+proc transformSlices(n: PNode): PNode =
+  ## Returns a copy of ``n`` in which every call of system's ``[]`` is
+  ## redirected to the ``opSlice`` magic. The arguments of such a call are
+  ## copied unchanged; leaf nodes are shared with the input tree.
+  if n.kind in nkCallKinds and n[0].kind == nkSym:
+    let op = n[0].sym
+    if op.name.s == "[]" and sfSystemModule in op.owner.flags:
+      result = copyTree(n)
+      # the callee has to be a symbol *node*, not the bare symbol
+      result.sons[0] = opSlice.newSymNode
+      return result
+  if n.safeLen > 0:
+    # copy the node itself and rebuild its sons from the transformed ones
+    result = copyNode(n)
+    for i in 0 .. < n.len:
+      result.add transformSlices(n.sons[i])
+  else:
+    result = n
+
+proc transformSpawn(owner: PSym; n, barrier: PNode): PNode =
+  ## Returns a copy of ``n`` in which every ``spawn`` call is replaced by
+  ## the statements that ``wrapProcForSpawn`` generates for it; ``barrier``
+  ## is passed on to that proc. Slices inside a spawn are rewritten by
+  ## ``transformSlices`` first.
+  if n.kind in nkCallKinds and n[0].kind == nkSym and
+      n[0].sym.magic == mSpawn:
+    return wrapProcForSpawn(owner, transformSlices(n), barrier)
+  # every other node, including calls that are not a 'spawn', is rebuilt
+  # from its transformed sons so that nested spawns are found as well
+  if n.safeLen > 0:
+    result = copyNode(n)
+    for i in 0 .. < n.len:
+      result.add transformSpawn(owner, n.sons[i], barrier)
+  else:
+    result = n
+
+proc liftParallel*(owner: PSym; n: PNode): PNode =
+  ## Checks the 'parallel' section ``n`` and returns the statement list that
+  ## replaces it: a barrier variable, ``openBarrier``, the body with its
+  ## spawns transformed and finally ``closeBarrier``.
+  # this needs to be called after the 'for' loop elimination
+
+  # first pass:
+  # - detect monotonic local integer variables
+  # - detect used slices
+  # - detect used arguments
+
+  var a = initAnalysisCtx()
+  let body = n.lastSon
+  analyse(a, body)
+  if a.spawns == 0:
+    localError(n.info, "'parallel' section without 'spawn'")
+  checkSlicesAreDisjoint(a)
+  # NOTE(review): 'checkArgs' and 'generateAliasChecks' (below) are not
+  # defined in the part of this module that is visible here -- confirm.
+  checkArgs(a, body)
+
+  var varSection = newNodeI(nkVarSection, n.info)
+  var temp = newSym(skTemp, getIdent"barrier", owner, n.info)
+  temp.typ = magicsys.getCompilerProc("Barrier").typ
+  incl(temp.flags, sfFromGeneric)
+
+  var vpart = newNodeI(nkIdentDefs, n.info, 3)
+  vpart.sons[0] = newSymNode(temp)
+  vpart.sons[1] = ast.emptyNode
+  # the barrier has no initializer; it is set up by 'openBarrier' at runtime
+  vpart.sons[2] = ast.emptyNode
+  varSection.add vpart
+
+  let barrier = genAddrOf(vpart[0])
+
+  result = newNodeI(nkStmtList, n.info)
+  generateAliasChecks(a, result)
+  result.add varSection
+  result.add callCodeGenProc("openBarrier", barrier)
+  result.add transformSpawn(owner, body, barrier)
+  result.add callCodeGenProc("closeBarrier", barrier)
--- a/compiler/sempass2.nim
+++ b/compiler/sempass2.nim
@@ -89,7 +89,7 @@ proc initVarViaNew(a: PEffects, n: PNode) =
  if n.kind != nkSym: return
  let s = n.sym
  if {tfNeedsInit, tfNotNil} * s.typ.flags <= {tfNotNil}:
-    # 'x' is not nil, but that doesn't mean it's not nil children
+    # 'x' is not nil, but that doesn't mean its "not nil" children
    # are initialized:
    initVar(a, n)

@@ -478,7 +478,7 @@ proc trackBlock(tracked: PEffects, n: PNode) =
  else:
    track(tracked, n)

-proc isTrue(n: PNode): bool =
+proc isTrue*(n: PNode): bool =
  n.kind == nkSym and n.sym.kind == skEnumField and n.sym.position != 0 or
    n.kind == nkIntLit and n.intVal != 0

--- a/compiler/vm.nim
+++ b/compiler/vm.nim
@@ -131,8 +131,9 @@ proc createStrKeepNode(x: var TFullReg) =
      nfAllConst in x.node.flags:
    # XXX this is hacky; tests/txmlgen triggers it:
    x.node = newNode(nkStrLit)
-    #  debug x.node
-    #assert x.node.kind in {nkStrLit..nkTripleStrLit}
+    # It is not only hacky, it is also wrong for tgentemplate. The primary
+    # cause of bugs like these is that the VM does not properly distinguish
+    # between variable definitions (var foo = e) and variable updates (foo = e).

 template createStr(x) =
  x.node = newNode(nkStrLit)
--- a/config/nimrod.cfg
+++ b/config/nimrod.cfg
@@ -16,6 +16,7 @@ arm.linux.gcc.linkerexe = "arm-linux-gcc"
 path="$lib/core"
 path="$lib/pure"
 path="$lib/pure/collections"
+path="$lib/pure/concurrency"
 path="$lib/impure"
 path="$lib/wrappers"
 # path="$lib/wrappers/cairo"
--- a/lib/pure/concurrency/cpuinfo.nim
+++ b/lib/pure/concurrency/cpuinfo.nim
@@ -0,0 +1,58 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2014 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module implements procs to determine the number of CPUs / cores.
+
+include "system/inclrtl"
+
+import strutils, os
+
+when not defined(windows):
+  import posix
+
+when defined(linux):
+  import linux
+
+when defined(macosx) or defined(bsd):
+  const
+    CTL_HW = 6
+    HW_AVAILCPU = 25
+    HW_NCPU = 3
+  proc sysctl(x: ptr array[0..3, cint], y: cint, z: pointer,
+              a: var csize, b: pointer, c: int): cint {.
+             importc: "sysctl", header: "<sys/sysctl.h>".}
+
+proc countProcessors*(): int {.rtl, extern: "ncpi$1".} =
+  ## Returns the number of processors/cores the machine has.
+  ## Returns 1 if it cannot be detected.
+  when defined(windows):
+    var x = getEnv("NUMBER_OF_PROCESSORS")
+    if x.len > 0: result = parseInt(x.string)
+  elif defined(macosx) or defined(bsd):
+    var
+      mib: array[0..3, cint]
+      numCPU: int
+      len: csize
+    mib[0] = CTL_HW
+    mib[1] = HW_AVAILCPU
+    len = sizeof(numCPU)
+    discard sysctl(addr(mib), 2, addr(numCPU), len, nil, 0)
+    # fall back to HW_NCPU if HW_AVAILCPU did not yield a usable value
+    if numCPU < 1:
+      mib[1] = HW_NCPU
+      discard sysctl(addr(mib), 2, addr(numCPU), len, nil, 0)
+    result = numCPU
+  elif defined(hpux):
+    result = mpctl(MPC_GETNUMSPUS, nil, nil)
+  elif defined(irix):
+    var SC_NPROC_ONLN {.importc: "_SC_NPROC_ONLN", header: "<unistd.h>".}: cint
+    result = sysconf(SC_NPROC_ONLN)
+  else:
+    result = sysconf(SC_NPROCESSORS_ONLN)
+  # a failed detection (0 or a negative value) is reported as a single CPU
+  if result <= 0: result = 1
+
--- a/lib/pure/concurrency/cpuload.nim
+++ b/lib/pure/concurrency/cpuload.nim
@@ -0,0 +1,96 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2014 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module implements a helper for a thread pool to determine whether
+## creating a thread is a good idea.
+
+when defined(windows):
+  import winlean, os, strutils, math
+
+  proc `-`(a, b: TFILETIME): int64 = a.rdFileTime - b.rdFileTime
+elif defined(linux):
+  from cpuinfo import countProcessors
+
+type
+  ThreadPoolAdvice* = enum  # result of 'advice'
+    doNothing,
+    doCreateThread,  # create additional thread for throughput
+    doShutdownThread # too many threads are busy, shutdown one
+
+  ThreadPoolState* = object
+    when defined(windows):
+      # the times measured by the previous call of 'advice'
+      prevSysKernel, prevSysUser, prevProcKernel, prevProcUser: TFILETIME
+    calls*: int             # how often 'advice' has been called
+
+proc advice*(s: var ThreadPoolState): ThreadPoolAdvice =
+  ## Returns an advice for a thread pool whether it should create another
+  ## thread, shut one down or do nothing. ``s`` carries the state between
+  ## calls and counts the calls. On Windows the advice is based on the CPU
+  ## times consumed since the previous call, on Linux on the contents of
+  ## ``/proc/loadavg``; on every other OS the result is ``doNothing``.
+  when defined(windows):
+    var
+      sysIdle, sysKernel, sysUser,
+        procCreation, procExit, procKernel, procUser: TFILETIME
+    if getSystemTimes(sysIdle, sysKernel, sysUser) == 0 or
+        getProcessTimes(THandle(-1), procCreation, procExit,
+                        procKernel, procUser) == 0:
+      return doNothing
+    # the very first call only records the times; there is no previous
+    # measurement to compare against yet
+    if s.calls > 0:
+      let
+        sysKernelDiff = sysKernel - s.prevSysKernel
+        sysUserDiff = sysUser - s.prevSysUser
+
+        procKernelDiff = procKernel - s.prevProcKernel
+        procUserDiff = procUser - s.prevProcUser
+
+        sysTotal = int(sysKernelDiff + sysUserDiff)
+        procTotal = int(procKernelDiff + procUserDiff)
+      # total CPU usage < 85% --> create a new worker thread.
+      # Measurements show that 100% and often even 90% is not reached even
+      # if all my cores are busy.
+      if sysTotal == 0 or procTotal / sysTotal < 0.85:
+        result = doCreateThread
+    s.prevSysKernel = sysKernel
+    s.prevSysUser = sysUser
+    s.prevProcKernel = procKernel
+    s.prevProcUser = procUser
+  elif defined(linux):
+    proc fscanf(c: TFile, frmt: cstring) {.varargs, importc,
+      header: "<stdio.h>".}
+
+    var f = open("/proc/loadavg")
+    var b: float
+    var busy, total: int
+    # the three leading floats are read into 'b' and not used; only the
+    # 'busy/total' pair is of interest
+    fscanf(f,"%lf %lf %lf %ld/%ld",
+           addr b, addr b, addr b, addr busy, addr total)
+    f.close()
+    let cpus = countProcessors()
+    if busy-1 < cpus:
+      result = doCreateThread
+    elif busy-1 >= cpus*2:
+      result = doShutdownThread
+    else:
+      result = doNothing
+  else:
+    # XXX implement this for other OSes
+    result = doNothing
+  inc s.calls
+
+when isMainModule:
+  # Manual smoke test: starts four 'busyLoop' tasks and prints the advice
+  # once per second, 70 times.
+  # NOTE(review): 'spawn' is not imported by this module and 'os'/'math' are
+  # only imported in the Windows branch at the top -- confirm this compiles.
+  proc busyLoop() =
+    while true:
+      discard random(80)
+      os.sleep(100)
+
+  spawn busyLoop()
+  spawn busyLoop()
+  spawn busyLoop()
+  spawn busyLoop()
+
+  var s: ThreadPoolState
+
+  for i in 1 .. 70:
+    echo advice(s)
+    os.sleep(1000)
--- a/lib/pure/concurrency/threadpool.nim
+++ b/lib/pure/concurrency/threadpool.nim
@@ -0,0 +1,210 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2014 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## Implements Nimrod's 'spawn'.
+
+import cpuinfo, cpuload, locks
+
+{.push stackTrace:off.}
+
+type
+  CondVar = object  # counting condition variable used by await/signal
+    c: TCond  # condition that await blocks on
+    L: TLock  # held while counter is read or modified
+    counter: int  # pending signals; await blocks while this is <= 0
+
+proc createCondVar(): CondVar =  # counter is left at its default value
+  initCond(result.c)
+  initLock(result.L)
+
+proc destroyCondVar(cv: var CondVar) {.inline.} =  # undoes createCondVar
+  deinitCond(cv.c)
+  deinitLock(cv.L)
+
+proc await(cv: var CondVar) =  # blocks until a signal is pending, then takes it
+  acquire(cv.L)
+  while cv.counter <= 0:  # re-check the counter after every wake-up
+    wait(cv.c, cv.L)
+  dec cv.counter  # consume exactly one pending signal
+  release(cv.L)
+
+proc signal(cv: var CondVar) =  # records one pending signal for await
+  acquire(cv.L)
+  inc cv.counter
+  release(cv.L)
+  signal(cv.c)  # the condition itself is signalled after the lock is released
+
+type
+  Barrier* {.compilerProc.} = object
+    counter: int  # raised by barrierEnter, lowered by barrierLeave
+    cv: CondVar  # signalled by barrierLeave, awaited by closeBarrier
+
+proc barrierEnter*(b: ptr Barrier) {.compilerProc.} =  # counts one entry
+  atomicInc b.counter
+
+proc barrierLeave*(b: ptr Barrier) {.compilerProc.} =
+  # test the value atomicDec returns; re-reading b.counter races with others
+  if atomicDec(b.counter) <= 0: signal(b.cv)
+
+proc openBarrier*(b: ptr Barrier) {.compilerProc.} =  # initializes b in place
+  b.counter = 0
+  b.cv = createCondVar()
+
+proc closeBarrier*(b: ptr Barrier) {.compilerProc.} =
+  await(b.cv)  # blocks until barrierLeave has signalled b.cv
+  destroyCondVar(b.cv)  # b must be reopened with openBarrier before reuse
+
+{.pop.}
+
+# ----------------------------------------------------------------------------
+
+type
+  WorkerProc = proc (thread, args: pointer) {.nimcall, gcsafe.}
+  Worker = object
+    taskArrived: CondVar  # signalled by selectWorker, awaited by slave
+    taskStarted: CondVar #\
+    # task data:
+    f: WorkerProc  # invoked by slave as f(worker, data)
+    data: pointer  # opaque argument passed through to f
+    ready: bool # put it here for correct alignment!
+    initialized: bool # whether it has even been initialized
+
+proc nimArgsPassingDone(p: pointer) {.compilerProc.} =
+  let w = cast[ptr Worker](p)  # p is treated as the address of a Worker
+  signal(w.taskStarted)  # lets the await(w.taskStarted) in selectWorker return
+
+var
+  gSomeReady = createCondVar()  # signalled by slave whenever it becomes ready
+  readyWorker: ptr Worker  # written by slave without locking; tried first by nimSpawn
+
+proc slave(w: ptr Worker) {.thread.} =  # thread body; the loop never exits
+  while true:
+    w.ready = true  # advertise this worker as idle
+    readyWorker = w
+    signal(gSomeReady)
+    await(w.taskArrived)  # sleep until selectWorker has stored f and data
+    assert(not w.ready)  # selectWorker's cas flips ready to false first
+    w.f(w, w.data)
+
+const
+  MaxThreadPoolSize* = 256 ## maximal size of the thread pool. 256 threads
+                           ## should be good enough for anybody ;-)
+
+var
+  currentPoolSize: int  # number of leading workersData slots scanned for work
+  maxPoolSize = MaxThreadPoolSize  # growth limit checked by nimSpawn
+  minPoolSize = 4  # shrink limit checked by nimSpawn
+
+proc setMinPoolSize*(size: range[1..MaxThreadPoolSize]) =
+  ## sets the minimal thread pool size. The default value of this is 4.
+  minPoolSize = size  # not checked against maxPoolSize
+
+proc setMaxPoolSize*(size: range[1..MaxThreadPoolSize]) =
+  ## sets the maximal thread pool size. The default value of this
+  ## is ``MaxThreadPoolSize``.
+  maxPoolSize = size
+
+var
+  workers: array[MaxThreadPoolSize, TThread[ptr Worker]]  # thread handles
+  workersData: array[MaxThreadPoolSize, Worker]  # state for workers[i] at index i
+
+proc activateThread(i: int) {.noinline.} =  # prepares slot i and starts its thread
+  workersData[i].taskArrived = createCondVar()
+  workersData[i].taskStarted = createCondVar()
+  workersData[i].initialized = true  # nimSpawn checks this flag before activating
+  createThread(workers[i], slave, addr(workersData[i]))
+
+proc setup() =  # starts the initial workers; called once at the end of the module
+  currentPoolSize = min(countProcessors(), MaxThreadPoolSize)  # NOTE(review): min/maxPoolSize are not consulted here
+  readyWorker = addr(workersData[0])  # initial value until a slave overwrites it
+  for i in 0.. <currentPoolSize: activateThread(i)
+
+proc preferSpawn*(): bool =
+  ## Use this proc to determine quickly if a 'spawn' or a direct call is
+  ## preferable. If it returns 'true' a 'spawn' may make sense. In general
+  ## it is not necessary to call this directly; use 'spawnX' instead.
+  result = gSomeReady.counter > 0  # read without acquiring gSomeReady.L
+
+proc spawn*(call: stmt) {.magic: "Spawn".}
+  ## always spawns a new task, so that the 'call' is never executed on
+  ## the calling thread. 'call' has to be a proc call 'p(...)' where 'p'
+  ## is gcsafe and has 'void' as the return type.
+
+template spawnX*(call: stmt) =
+  ## spawns a new task if a CPU core is ready, otherwise executes the
+  ## call in the calling thread. Usually it is advised to
+  ## use 'spawn' in order to not block the producer for an unknown
+  ## amount of time. 'call' has to be a proc call 'p(...)' where 'p'
+  ## is gcsafe and has 'void' as the return type.
+  if preferSpawn(): spawn call
+  else: call
+
+proc parallel*(body: stmt) {.magic: "Parallel".}
+  ## a parallel section can be used to execute a block in parallel. ``body``
+  ## has to be in a DSL that is a particular subset of the language. Please
+  ## refer to the manual for further information.
+
+var
+  state: ThreadPoolState  # passed to advice by nimSpawn
+  stateLock: TLock  # taken with tryAcquire around the advice call in nimSpawn
+
+initLock stateLock
+
+proc selectWorker(w: ptr Worker; fn: WorkerProc; data: pointer): bool =
+  if cas(addr w.ready, true, false):  # claim w only if it is currently ready
+    w.data = data
+    w.f = fn
+    signal(w.taskArrived)  # releases the slave waiting in await(w.taskArrived)
+    await(w.taskStarted)  # returns once nimArgsPassingDone has signalled
+    result = true  # when the cas fails, result keeps its default (false)
+
+proc nimSpawn(fn: WorkerProc; data: pointer) {.compilerProc.} =
+  # implementation of 'spawn' that is used by the code generator.
+  while true:
+    if selectWorker(readyWorker, fn, data): return  # try the announced worker first
+    for i in 0.. <currentPoolSize:  # then every slot currently counted as active
+      if selectWorker(addr(workersData[i]), fn, data): return
+    # determine what to do, but keep in mind this is expensive too:
+    # state.calls < maxPoolSize: warmup phase; (state.calls and 127) == 0:
+    # periodic check. NOTE(review): this proc never advances state.calls — confirm advice does.
+    if state.calls < maxPoolSize or (state.calls and 127) == 0:
+      # ensure the call to 'advice' is atomic:
+      if tryAcquire(stateLock):
+        case advice(state)
+        of doNothing: discard
+        of doCreateThread:
+          if currentPoolSize < maxPoolSize:
+            if not workersData[currentPoolSize].initialized:
+              activateThread(currentPoolSize)
+            let w = addr(workersData[currentPoolSize])
+            inc currentPoolSize  # the slot becomes visible to the scan loop above
+            if selectWorker(w, fn, data):
+              release(stateLock)
+              return
+            # else we didn't succeed but some other thread, so do nothing.
+        of doShutdownThread:
+          if currentPoolSize > minPoolSize: dec currentPoolSize
+          # we don't free anything here. Too dangerous.
+        release(stateLock)
+      # else the acquire failed, but this means some
+      # other thread succeeded, so we don't need to do anything here.
+    await(gSomeReady)  # block until some slave reports ready, then retry
+
+proc sync*() =
+  ## a simple barrier to wait for all spawn'ed tasks. If you need more elaborate
+  ## waiting, you have to use an explicit barrier.
+  while true:
+    var allReady = true
+    for i in 0 .. <currentPoolSize:
+      if not allReady: break  # stop scanning at the first worker that is not ready
+      allReady = allReady and workersData[i].ready
+    if allReady: break
+    await(gSomeReady)  # wait for a slave to report ready, then rescan
+
+setup()
--- a/lib/pure/osproc.nim
+++ b/lib/pure/osproc.nim
@@ -1,7 +1,7 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2013 Andreas Rumpf
+#        (c) Copyright 2014 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -13,7 +13,7 @@
 include "system/inclrtl"

 import
-  strutils, os, strtabs, streams
+  strutils, os, strtabs, streams, cpuinfo

 when defined(windows):
  import winlean
@@ -225,42 +225,10 @@ proc errorHandle*(p: PProcess): TFileHandle {.rtl, extern: "nosp$1",
  ## it is closed when closing the PProcess ``p``.
  result = p.errHandle

-when defined(macosx) or defined(bsd):
-  const
-    CTL_HW = 6
-    HW_AVAILCPU = 25
-    HW_NCPU = 3
-  proc sysctl(x: ptr array[0..3, cint], y: cint, z: pointer,
-              a: var csize, b: pointer, c: int): cint {.
-             importc: "sysctl", header: "<sys/sysctl.h>".}
-
 proc countProcessors*(): int {.rtl, extern: "nosp$1".} =
  ## returns the numer of the processors/cores the machine has.
  ## Returns 0 if it cannot be detected.
-  when defined(windows):
-    var x = getEnv("NUMBER_OF_PROCESSORS")
-    if x.len > 0: result = parseInt(x.string)
-  elif defined(macosx) or defined(bsd):
-    var
-      mib: array[0..3, cint]
-      numCPU: int
-      len: csize
-    mib[0] = CTL_HW
-    mib[1] = HW_AVAILCPU
-    len = sizeof(numCPU)
-    discard sysctl(addr(mib), 2, addr(numCPU), len, nil, 0)
-    if numCPU < 1:
-      mib[1] = HW_NCPU
-      discard sysctl(addr(mib), 2, addr(numCPU), len, nil, 0)
-    result = numCPU
-  elif defined(hpux):
-    result = mpctl(MPC_GETNUMSPUS, nil, nil)
-  elif defined(irix):
-    var SC_NPROC_ONLN {.importc: "_SC_NPROC_ONLN", header: "<unistd.h>".}: cint
-    result = sysconf(SC_NPROC_ONLN)
-  else:
-    result = sysconf(SC_NPROCESSORS_ONLN)
-  if result <= 0: result = 1
+  result = cpuinfo.countProcessors()

 proc execProcesses*(cmds: openArray[string],
                    options = {poStdErrToStdOut, poParentStreams},
--- a/lib/system.nim
+++ b/lib/system.nim
@@ -2934,6 +2934,3 @@ when not defined(booting):

  template isStatic*(x): expr = compiles(static(x))
    # checks whether `x` is a value known at compile-time
-
-when hasThreadSupport:
-  when hostOS != "standalone": include "system/sysspawn"
--- a/lib/system/atomics.nim
+++ b/lib/system/atomics.nim
@@ -1,13 +1,14 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2012 Andreas Rumpf
+#        (c) Copyright 2014 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
 #

 ## Atomic operations for Nimrod.
+{.push stackTrace:off.}

 when (defined(gcc) or defined(llvm_gcc)) and hasThreadSupport:
  type 
@@ -203,3 +204,31 @@ proc atomicDec*(memLoc: var int, x: int = 1): int =
  else:
    dec(memLoc, x)
    result = memLoc
+
+when defined(windows) and not defined(gcc):
+  proc interlockedCompareExchange(p: pointer; exchange, comparand: int32): int32
+    {.importc: "InterlockedCompareExchange", header: "<windows.h>", cdecl.}
+  # the API returns the previous value; the swap happened iff it equals oldValue
+  proc cas*[T: bool|int](p: ptr T; oldValue, newValue: T): bool =
+    interlockedCompareExchange(p, newValue.int32, oldValue.int32) ==
+      oldValue.int32
+else:
+  # this is valid for GCC and Intel C++
+  proc cas*[T: bool|int](p: ptr T; oldValue, newValue: T): bool
+    {.importc: "__sync_bool_compare_and_swap", nodecl.}
+  # XXX is this valid for 'int'?
+
+
+when (defined(x86) or defined(amd64)) and defined(gcc):
+  proc cpuRelax {.inline.} =  # emits the "pause" instruction via inline asm
+    {.emit: """asm volatile("pause" ::: "memory");""".}
+elif (defined(x86) or defined(amd64)) and defined(vcc):
+  proc cpuRelax {.importc: "YieldProcessor", header: "<windows.h>".}
+elif defined(intelc):
+  proc cpuRelax {.importc: "_mm_pause", header: "xmmintrin.h".}
+elif false:  # never compiled; this `when` chain defines no cpuRelax for other targets
+  from os import sleep
+
+  proc cpuRelax {.inline.} = os.sleep(1)
+
+{.pop.}
--- a/lib/system/sysspawn.nim
+++ b/lib/system/sysspawn.nim
@@ -14,30 +14,6 @@ when not defined(NimString):

 {.push stackTrace:off.}

-when (defined(x86) or defined(amd64)) and defined(gcc):
-  proc cpuRelax {.inline.} =
-    {.emit: """asm volatile("pause" ::: "memory");""".}
-elif (defined(x86) or defined(amd64)) and defined(vcc):
-  proc cpuRelax {.importc: "YieldProcessor", header: "<windows.h>".}
-elif defined(intelc):
-  proc cpuRelax {.importc: "_mm_pause", header: "xmmintrin.h".}
-elif false:
-  from os import sleep
-
-  proc cpuRelax {.inline.} = os.sleep(1)
-
-when defined(windows) and not defined(gcc):
-  proc interlockedCompareExchange(p: pointer; exchange, comparand: int32): int32
-    {.importc: "InterlockedCompareExchange", header: "<windows.h>", cdecl.}
-
-  proc cas(p: ptr bool; oldValue, newValue: bool): bool =
-    interlockedCompareExchange(p, newValue.int32, oldValue.int32) != 0
-
-else:
-  # this is valid for GCC and Intel C++
-  proc cas(p: ptr bool; oldValue, newValue: bool): bool
-    {.importc: "__sync_bool_compare_and_swap", nodecl.}
-
 # We declare our own condition variables here to get rid of the dummy lock
 # on Windows:

@@ -54,6 +30,9 @@ proc createCondVar(): CondVar =
    initSysLock(result.stupidLock)
    #acquireSys(result.stupidLock)

+proc destroyCondVar(c: var CondVar) {.inline.} =  # NOTE(review): only c.c is torn down;
+  deinitSysCond(c.c)  # stupidLock (set up in createCondVar) is left as is — confirm
+
 proc await(cv: var CondVar) =
  when defined(posix):
    acquireSys(cv.stupidLock)
@@ -100,6 +79,26 @@ proc signal(cv: var FastCondVar) =
  #if cas(addr cv.slowPath, true, false):
  signal(cv.slow)

+type
+  Barrier* {.compilerProc.} = object
+    counter: int  # raised by barrierEnter, lowered by barrierLeave
+    cv: CondVar  # signalled by barrierLeave, awaited by closeBarrier
+
+proc barrierEnter*(b: ptr Barrier) {.compilerProc.} =  # counts one entry
+  atomicInc b.counter
+
+proc barrierLeave*(b: ptr Barrier) {.compilerProc.} =
+  # test the value atomicDec returns; re-reading b.counter races with others
+  if atomicDec(b.counter) <= 0: signal(b.cv)
+
+proc openBarrier*(b: ptr Barrier) {.compilerProc.} =  # initializes b in place
+  b.counter = 0
+  b.cv = createCondVar()
+
+proc closeBarrier*(b: ptr Barrier) {.compilerProc.} =
+  await(b.cv)  # blocks until barrierLeave has signalled b.cv
+  destroyCondVar(b.cv)  # b must be reopened with openBarrier before reuse
+
 {.pop.}

 # ----------------------------------------------------------------------------
--- a/tests/system/tsysspawn.nim
+++ b/tests/system/tsysspawn.nim
@@ -4,20 +4,22 @@ discard """
  cmd: "nimrod $target --threads:on $options $file"
 """

+import threadpool
+
 var
  x, y = 0

 proc p1 =
-  for i in 0 .. 1_000_000:
+  for i in 0 .. 10_000:
    discard

-  inc x
+  atomicInc x

 proc p2 =
-  for i in 0 .. 1_000_000:
+  for i in 0 .. 10_000:
    discard

-  inc y, 2
+  atomicInc y, 2

 for i in 0.. 3:
  spawn(p1())
--- a/tests/system/tsysspawnbadarg.nim
+++ b/tests/system/tsysspawnbadarg.nim
@@ -4,4 +4,6 @@ discard """
  cmd: "nimrod $target --threads:on $options $file"
 """

+import threadpool
+
 spawn(1)
--- a/web/news.txt
+++ b/web/news.txt
@@ -2,6 +2,23 @@
 News
 ====

+..
+  2014-06-29 Version 0.9.6 released
+  =================================
+
+  Changes affecting backwards compatibility
+  -----------------------------------------
+
+  - ``spawn`` now uses an elaborate self-adapting thread pool and as such
+    has been moved into its own module. So to use it, you now have to import
+    ``threadpool``.
+
+
+  Library Additions
+  -----------------
+
+  - Added module ``cpuinfo``.
+  - Added module ``threadpool``.


 2014-04-21 Version 0.9.4 released