From 6195dbe491ccd864c5dcb59f87826291ac1f1ff4 Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Mon, 12 May 2014 11:12:37 +0200
Subject: [PATCH 01/13] initial non-compiling version of 'parallel'

---
 compiler/guards.nim                 | 191 +++++++++++--
 compiler/lowerings.nim              |  22 +-
 compiler/semparallel.nim            | 414 ++++++++++++++++++++++++++++
 compiler/sempass2.nim               |   4 +-
 compiler/vm.nim                     |   5 +-
 config/nimrod.cfg                   |   1 +
 lib/pure/concurrency/cpuinfo.nim    |  58 ++++
 lib/pure/concurrency/cpuload.nim    |  96 +++++++
 lib/pure/concurrency/threadpool.nim | 210 ++++++++++++++
 lib/pure/osproc.nim                 |  38 +--
 lib/system.nim                      |   3 -
 lib/system/atomics.nim              |  31 ++-
 lib/system/sysspawn.nim             |  47 ++--
 tests/system/tsysspawn.nim          |  10 +-
 tests/system/tsysspawnbadarg.nim    |   2 +
 web/news.txt                        |  17 ++
 16 files changed, 1058 insertions(+), 91 deletions(-)
 create mode 100644 compiler/semparallel.nim
 create mode 100644 lib/pure/concurrency/cpuinfo.nim
 create mode 100644 lib/pure/concurrency/cpuload.nim
 create mode 100644 lib/pure/concurrency/threadpool.nim

diff --git a/compiler/guards.nim b/compiler/guards.nim
index f475f50680..57cd73b11a 100644
--- a/compiler/guards.nim
+++ b/compiler/guards.nim
@@ -9,7 +9,8 @@
 
 ## This module implements the 'implies' relation for guards.
 
-import ast, astalgo, msgs, magicsys, nimsets, trees, types, renderer, idents
+import ast, astalgo, msgs, magicsys, nimsets, trees, types, renderer, idents,
+  saturate
 
 const
   someEq = {mEqI, mEqI64, mEqF64, mEqEnum, mEqCh, mEqB, mEqRef, mEqProc,
@@ -25,6 +26,17 @@ const
 
   someIn = {mInRange, mInSet}
 
+  someHigh = {mHigh}
+  # we don't list unsigned here because wrap around semantics suck for
+  # proving anything:
+  someAdd = {mAddI, mAddI64, mAddF64, mSucc}
+  someSub = {mSubI, mSubI64, mSubF64, mPred}
+  someMul = {mMulI, mMulI64, mMulF64}
+  someDiv = {mDivI, mDivI64, mDivF64}
+  someMod = {mModI, mModI64}
+  someMax = {mMaxI, mMaxI64, mMaxF64}
+  someMin = {mMinI, mMinI64, mMinF64}
+
 proc isValue(n: PNode): bool = n.kind in {nkCharLit..nkNilLit}
 proc isLocation(n: PNode): bool = not n.isValue
 
@@ -69,19 +81,24 @@ proc isLetLocation(m: PNode, isApprox: bool): bool =
 
 proc interestingCaseExpr*(m: PNode): bool = isLetLocation(m, true)
 
-proc getMagicOp(name: string, m: TMagic): PSym =
+proc createMagic*(name: string, m: TMagic): PSym =
   result = newSym(skProc, getIdent(name), nil, unknownLineInfo())
   result.magic = m
 
 let
-  opLe = getMagicOp("<=", mLeI)
-  opLt = getMagicOp("<", mLtI)
-  opAnd = getMagicOp("and", mAnd)
-  opOr = getMagicOp("or", mOr)
-  opNot = getMagicOp("not", mNot)
-  opIsNil = getMagicOp("isnil", mIsNil)
-  opContains = getMagicOp("contains", mInSet)
-  opEq = getMagicOp("==", mEqI)
+  opLe = createMagic("<=", mLeI)
+  opLt = createMagic("<", mLtI)
+  opAnd = createMagic("and", mAnd)
+  opOr = createMagic("or", mOr)
+  opNot = createMagic("not", mNot)
+  opIsNil = createMagic("isnil", mIsNil)
+  opContains = createMagic("contains", mInSet)
+  opEq = createMagic("==", mEqI)
+  opAdd = createMagic("+", mAddI)
+  opSub = createMagic("-", mSubI)
+  opMul = createMagic("*", mMulI)
+  opDiv = createMagic("div", mDivI)
+  opLen = createMagic("len", mLengthSeq)
 
 proc swapArgs(fact: PNode, newOp: PSym): PNode =
   result = newNodeI(nkCall, fact.info, 3)
@@ -137,17 +154,118 @@ proc neg(n: PNode): PNode =
     result.sons[0] = newSymNode(opNot)
     result.sons[1] = n
 
-proc buildIsNil(arg: PNode): PNode =
-  result = newNodeI(nkCall, arg.info, 2)
-  result.sons[0] = newSymNode(opIsNil)
-  result.sons[1] = arg
+proc buildCall(op: PSym; a: PNode): PNode =
+  result = newNodeI(nkCall, a.info, 2)
+  result.sons[0] = newSymNode(op)
+  result.sons[1] = a
+
+proc buildCall(op: PSym; a, b: PNode): PNode =
+  result = newNodeI(nkCall, a.info, 3)
+  result.sons[0] = newSymNode(op)
+  result.sons[1] = a
+  result.sons[2] = b
+
+proc `+@`*(a: PNode; b: BiggestInt): PNode =
+  opAdd.buildCall(a, nkIntLit.newIntNode(b))
+
+proc `|+|`(a, b: PNode): PNode =
+  result = copyNode(a)
+  if a.kind in {nkCharLit..nkUInt64Lit}: result.intVal = a.intVal |+| b.intVal
+  else: result.floatVal = a.floatVal + b.floatVal
+
+proc `|*|`(a, b: PNode): PNode =
+  result = copyNode(a)
+  if a.kind in {nkCharLit..nkUInt64Lit}: result.intVal = a.intVal |*| b.intVal
+  else: result.floatVal = a.floatVal * b.floatVal
+
+proc zero(): PNode = nkIntLit.newIntNode(0)
+proc one(): PNode = nkIntLit.newIntNode(1)
+proc minusOne(): PNode = nkIntLit.newIntNode(-1)
+
+proc lowBound*(x: PNode): PNode = nkIntLit.newIntNode(firstOrd(x.typ))
+proc highBound*(x: PNode): PNode =
+  if x.typ.skipTypes(abstractInst).kind == tyArray:
+    nkIntLit.newIntNode(lastOrd(x.typ))
+  else:
+    opAdd.buildCall(opLen.buildCall(x), minusOne())
+
+proc canon*(n: PNode): PNode =
+  # Normalizes 'n' for the prover. XXX for now only 'semparallel' uses this
+  if n.safeLen >= 1:
+    result = newNodeI(n.kind, n.info, n.len)
+    for i in 0 .. < n.safeLen:
+      result.sons[i] = canon(n.sons[i])
+  else:
+    result = n
+  case result.getMagic
+  of someEq, someAdd, someMul, someMin, someMax:
+    # these are symmetric; put value as last:
+    if result.sons[1].isValue and not result.sons[2].isValue:
+      result = swapArgs(result, result.sons[0].sym)
+      # (4 + foo) + 2 --> (foo + 4) + 2
+  of someHigh:
+    # high == len+(-1)
+    result = opAdd.buildCall(opLen.buildCall(result[1]), minusOne())
+  of mUnaryMinusI, mUnaryMinusI64:
+    result = buildCall(opMul, result[1], minusOne())  # -x  -->  x * (-1)
+  of someSub:
+    # x - 4  -->  x + (-4)
+    var b = result[2]
+    if b.kind in {nkCharLit..nkUInt64Lit} and b.intVal != low(BiggestInt):
+      b = copyNode(b)
+      b.intVal = -b.intVal
+      result = buildCall(opAdd, result[1], b)
+    elif b.kind in {nkFloatLit..nkFloat64Lit}:
+      b = copyNode(b)
+      b.floatVal = -b.floatVal
+      result = buildCall(opAdd, result[1], b)
+  of someLen:
+    result.sons[0] = opLen.newSymNode
+  else: discard
+
+  # re-association:
+  # (foo+5)+5 --> foo+10;  (foo*5)*5 --> foo*25
+  case result.getMagic
+  of someAdd:
+    if result[2].isValue and
+        result[1].getMagic in someAdd and result[1][2].isValue:
+      result = opAdd.buildCall(result[1][1], result[1][2] |+| result[2])
+  of someMul:
+    if result[2].isValue and
+        result[1].getMagic in someMul and result[1][2].isValue:
+      result = opMul.buildCall(result[1][1], result[1][2] |*| result[2])
+  else: discard
+
+  # most important rule: (x-4) < a.len -->  x < a.len+4
+  case result.getMagic
+  of someLe, someLt:
+    let x = result[1]
+    let y = result[2]
+    if x.kind in nkCallKinds and x.len == 3 and x[2].isValue and
+        isLetLocation(x[1], true):
+      case x.getMagic
+      of someSub:
+        result = buildCall(result[0].sym, x[1], opAdd.buildCall(y, x[2]))
+      of someAdd:
+        result = buildCall(result[0].sym, x[1], opSub.buildCall(y, x[2]))
+      else: discard
+    elif y.kind in nkCallKinds and y.len == 3 and y[2].isValue and
+        isLetLocation(y[1], true):
+      # a.len < x-3  -->  a.len+3 < x   (operand order must be kept!)
+      case y.getMagic
+      of someSub:
+        result = buildCall(result[0].sym, opAdd.buildCall(x, y[2]), y[1])
+      of someAdd:
+        result = buildCall(result[0].sym, opSub.buildCall(x, y[2]), y[1])
+      else: discard
+  else: discard
 
 proc usefulFact(n: PNode): PNode =
   case n.getMagic
   of someEq:
     if skipConv(n.sons[2]).kind == nkNilLit and (
         isLetLocation(n.sons[1], false) or isVar(n.sons[1])):
-      result = buildIsNil(n.sons[1])
+      result = opIsNil.buildCall(n.sons[1])
     else:
       if isLetLocation(n.sons[1], true) or isLetLocation(n.sons[2], true):
         # XXX algebraic simplifications!  'i-1 < a.len' --> 'i < a.len+1'
@@ -217,7 +335,7 @@ proc addFactNeg*(m: var TModel, n: PNode) =
   let n = n.neg
   if n != nil: addFact(m, n)
 
-proc sameTree(a, b: PNode): bool = 
+proc sameTree*(a, b: PNode): bool = 
   result = false
   if a == b:
     result = true
@@ -519,7 +637,46 @@ proc doesImply*(facts: TModel, prop: PNode): TImplication =
       if result != impUnknown: return
 
 proc impliesNotNil*(facts: TModel, arg: PNode): TImplication =
-  result = doesImply(facts, buildIsNil(arg).neg)
+  result = doesImply(facts, opIsNil.buildCall(arg).neg)
+
+proc proveLe*(m: TModel; a, b: PNode): TImplication =
+  let res = canon(opLe.buildCall(a, b))
+  # we hardcode lots of axioms here; each one is tried on the canonical form:
+  let a = res[1]
+  let b = res[2]
+  #   0 <= 3
+  if a.isValue and b.isValue:
+    return if leValue(a, b): impYes else: impNo
+
+  # use type information too:  x <= 4  if  high(x) <= 4
+  if b.isValue and a.typ != nil and a.typ.isOrdinalType:
+    if lastOrd(a.typ) <= b.intVal: return impYes
+  # 3 <= x   if  3 <= low(x)   (every value of x is then at least 3)
+  if a.isValue and b.typ != nil and b.typ.isOrdinalType:
+    if a.intVal <= firstOrd(b.typ): return impYes
+
+  # x <= x
+  if sameTree(a, b): return impYes
+
+  #   x <= x+c  iff 0 <= c
+  if b.getMagic in someAdd and sameTree(a, b[1]):
+    return proveLe(m, zero(), b[2])
+
+  #   x <= x*c  if  1 <= c and 0 <= x:
+  if b.getMagic in someMul and sameTree(a, b[1]):
+    if proveLe(m, one(), b[2]) == impYes and proveLe(m, zero(), a) == impYes:
+      return impYes
+
+  #   x div c <= x   if   1 <= c  and  0 <= x:
+  if a.getMagic in someDiv and sameTree(a[1], b):
+    if proveLe(m, one(), a[2]) == impYes and proveLe(m, zero(), b) == impYes:
+      return impYes
+
+  # use the knowledge base:
+  return doesImply(m, res)
+
+proc addFactLe*(m: var TModel; a, b: PNode) =
+  m.add canon(opLe.buildCall(a, b))
 
 proc settype(n: PNode): PType =
   result = newType(tySet, n.typ.owner)
diff --git a/compiler/lowerings.nim b/compiler/lowerings.nim
index 1b9e5fe0f4..93bfd84257 100644
--- a/compiler/lowerings.nim
+++ b/compiler/lowerings.nim
@@ -114,11 +114,15 @@ proc callCodegenProc*(name: string, arg1: PNode;
     if arg3 != nil: result.add arg3
 
 proc createWrapperProc(f: PNode; threadParam, argsParam: PSym;
-                       varSection, call: PNode): PSym =
+                       varSection, call, barrier: PNode): PSym =
   var body = newNodeI(nkStmtList, f.info)
   body.add varSection
+  if barrier != nil:
+    body.add callCodeGenProc("barrierEnter", barrier)
   body.add callCodeGenProc("nimArgsPassingDone", newSymNode(threadParam))
   body.add call
+  if barrier != nil:
+    body.add callCodeGenProc("barrierLeave", barrier)
 
   var params = newNodeI(nkFormalParams, f.info)
   params.add emptyNode
@@ -146,7 +150,7 @@ proc createCastExpr(argsParam: PSym; objType: PType): PNode =
   result.typ = newType(tyPtr, objType.owner)
   result.typ.rawAddSon(objType)
 
-proc wrapProcForSpawn*(owner: PSym; n: PNode): PNode =
+proc wrapProcForSpawn*(owner: PSym; n: PNode; barrier: PNode = nil): PNode =
   result = newNodeI(nkStmtList, n.info)
   if n.kind notin nkCallKinds or not n.typ.isEmptyType:
     localError(n.info, "'spawn' takes a call expression of type void")
@@ -162,6 +166,7 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode): PNode =
     threadParam.typ = ptrType
     argsParam.typ = ptrType
     argsParam.position = 1
+
   var objType = createObj(owner, n.info)
   incl(objType.flags, tfFinal)
   let castExpr = createCastExpr(argsParam, objType)
@@ -223,6 +228,17 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode): PNode =
 
     call.add(newSymNode(temp))
 
-  let wrapper = createWrapperProc(fn, threadParam, argsParam, varSection, call)
+  var barrierAsExpr: PNode = nil
+  if barrier != nil:
+    let typ = newType(tyPtr, owner)
+    typ.rawAddSon(magicsys.getCompilerProc("Barrier").typ)
+    var field = newSym(skField, getIdent"barrier", owner, n.info)
+    field.typ = typ
+    objType.addField(field)
+    result.add newFastAsgnStmt(newDotExpr(scratchObj, field), barrier)
+    barrierAsExpr = indirectAccess(castExpr, field, n.info)
+
+  let wrapper = createWrapperProc(fn, threadParam, argsParam, varSection, call,
+                                  barrierAsExpr)
   result.add callCodeGenProc("nimSpawn", wrapper.newSymNode,
                              genAddrOf(scratchObj.newSymNode))
diff --git a/compiler/semparallel.nim b/compiler/semparallel.nim
new file mode 100644
index 0000000000..34a1f3af82
--- /dev/null
+++ b/compiler/semparallel.nim
@@ -0,0 +1,414 @@
+#
+#
+#           The Nimrod Compiler
+#        (c) Copyright 2014 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## Semantic checking for 'parallel'.
+
+# - slices should become "nocopy" to openArray (+)
+#   - need to perform bound checks (+)
+#
+# - parallel needs to insert a barrier (+)
+# - passed arguments need to be ensured to be "const"
+#   - what about 'f(a)'? --> f shouldn't have side effects anyway
+# - passed arrays need to be ensured not to alias
+# - passed slices need to be ensured to be disjoint (+)
+# - output slices need special logic
+
+import lowerings, guards, sempass2
+
+discard """
+
+one major problem:
+  spawn f(a[i])
+  inc i
+  spawn f(a[i])
+is valid, but
+  spawn f(a[i])
+  spawn f(a[i])
+  inc i
+is not! However, 
+  spawn f(a[i])
+  if guard: inc i
+  spawn f(a[i])
+is not valid either! --> We need a flow dependent analysis here.
+
+However:
+  while foo:
+    spawn f(a[i])
+    inc i
+    spawn f(a[i])
+
+Is not valid either! --> We should really restrict 'inc' to loop endings?
+
+The heuristic that we implement here (that has no false positives) is: Usage
+of 'i' in a slice *after* we determined the stride is invalid!
+"""
+
+type
+  TDirection = enum
+    ascending, descending
+  MonotonicVar = object
+    v: PSym
+    lower, upper, stride: PNode
+    dir: TDirection
+    blacklisted: bool     # blacklisted variables that are not monotonic
+  AnalysisCtx = object
+    locals: seq[MonotonicVar]
+    slices: seq[tuple[x,a,b: PNode, spawnId: int, inLoop: bool]]
+    guards: TModel      # nested guards
+    args: seq[PSym]     # args must be deeply immutable
+    spawns: int         # we can check that at least 1 spawn is used in
+                        # the 'parallel' section
+    currentSpawnId: int
+    inLoop: int
+
+let opSlice = createMagic("slice", mSlice)
+
+proc initAnalysisCtx(): AnalysisCtx =
+  result.locals = @[]
+  result.slices = @[]
+  result.args = @[]
+  result.guards = @[]
+
+proc getSlot(c: var AnalysisCtx; s: PSym): ptr MonotonicVar =
+  var L = c.locals.len
+  for i in 0.. <L:
+    if c.locals[i].v == s: return addr(c.locals[i])
+  c.locals.setLen(L+1)
+  c.locals[L].v = s
+  return addr(c.locals[L])
+
+proc getRoot(n: PNode): PSym =
+  ## ``getRoot`` takes a *path* ``n``. A path is an lvalue expression
+  ## like ``obj.x[i].y``. The *root* of a path is the symbol that can be
+  ## determined as the owner; ``obj`` in the example.
+  case n.kind
+  of nkSym:
+    if n.sym.kind in {skVar, skResult, skTemp, skLet, skForVar}:
+      result = n.sym
+  of nkDotExpr, nkBracketExpr, nkHiddenDeref, nkDerefExpr,
+      nkObjUpConv, nkObjDownConv, nkCheckedFieldExpr:
+    result = getRoot(n.sons[0])
+  of nkHiddenStdConv, nkHiddenSubConv, nkConv:
+    result = getRoot(n.sons[1])
+  of nkCallKinds:
+    if getMagic(n) == mSlice: result = getRoot(n.sons[1])
+  else: discard
+
+proc gatherArgs(c: var AnalysisCtx; n: PNode) =
+  for i in 0.. <n.safeLen:
+    let root = getRoot n[i]
+    if root != nil:
+      block addRoot:
+        for r in items(c.args):
+          if r == root: break addRoot
+        c.args.add root
+    gatherArgs(c, n[i])
+
+proc isLocal(s: PSym): bool = 
+  s.kind in {skResult, skTemp, skForVar, skVar, skLet} and
+        {sfAddrTaken, sfGlobal} * s.flags == {}
+
+proc checkLocal(c: var AnalysisCtx; n: PNode) =
+  if n.kind == nkSym and isLocal(n.sym):
+    let slot = c.getSlot(n.sym)  # a symbol node has no sons; use its sym
+    if slot.stride != nil:
+      localError(n.info, "invalid usage of counter after increment")
+  else:
+    for i in 0 .. <n.safeLen: checkLocal(c, n.sons[i])
+
+proc checkLe(c: AnalysisCtx; a, b: PNode) =
+  case proveLe(c.guards, a, b)  # errors are reported at the position of 'a'
+  of impUnknown:
+    localError(a.info, "cannot prove: " & a.renderTree & " <= " & b.renderTree)
+  of impYes: discard
+  of impNo:
+    localError(a.info, "can prove: " & a.renderTree & " > " & b.renderTree)
+
+proc checkBounds(c: AnalysisCtx; arr, idx: PNode) =
+  checkLe(c, arr.lowBound, idx)
+  checkLe(c, idx, arr.highBound)
+
+proc addLowerBoundAsFacts(c: var AnalysisCtx) =
+  for v in c.locals:
+    if not v.blacklisted:
+      c.guards.addFactLe(v.lower, newSymNode(v.v))
+
+proc addSlice(c: var AnalysisCtx; n: PNode; x, le, ri: int) =
+  checkLocal(c, n)
+  let le = n.sons[le]
+  let ri = n.sons[ri]
+  let x = n.sons[x]
+  # perform static bounds checking here; and not later!
+  let oldState = c.guards.len
+  addLowerBoundAsFacts(c)
+  c.checkBounds(x, le)
+  c.checkBounds(x, ri)
+  c.guards.setLen(oldState)
+  c.slices.add((x, le, ri, c.currentSpawnId, c.inLoop > 0))
+
+template `?`(x): expr = x.renderTree
+
+proc overlap(m: TModel; x,y,c,d: PNode) =
+  #  X..Y and C..D overlap iff (X <= D and C <= Y)
+  case proveLe(m, x, d)
+  of impUnknown:
+    localError(x.info,
+      "cannot prove: $# > $#; required for $#..$# disjoint from $#..$#" %
+        [?x, ?d, ?x, ?y, ?c, ?d])
+  of impYes:
+    case proveLe(m, c, y)
+    of impUnknown:
+      localError(x.info,
+        "cannot prove: $# > $#; required for $#..$# disjoint from $#..$#" %
+          [?c, ?y, ?x, ?y, ?c, ?d])
+    of impYes:
+      localError(x.info, "$#..$# not disjoint from $#..$#" % [?x, ?y, ?c, ?d])
+    of impNo: discard
+  of impNo: discard
+
+proc stride(c: AnalysisCtx; n: PNode): BiggestInt =
+  # note: 0 if it cannot be determined is just right because then
+  # we analyse 'i..i' and 'i+0 .. i+0' and these are not disjoint!
+  if n.kind == nkSym and isLocal(n.sym):
+    # read-only lookup: 'c' is immutable here, so 'getSlot' cannot be used
+    for slot in c.locals:
+      if slot.v == n.sym and slot.stride != nil: result = slot.stride.intVal
+  else:
+    for i in 0 .. <n.safeLen: inc(result, stride(c, n.sons[i]))
+
+proc checkSlicesAreDisjoint(c: var AnalysisCtx) =
+  # this is the only thing that we need to perform after we have traversed
+  # the whole tree so that the strides are available.
+  # First we need to add all the computed lower bounds:
+  addLowerBoundAsFacts(c)
+  # Every slice used in a loop needs to be disjoint with itself:
+  for x,a,b,id,inLoop in items(c.slices):
+    if inLoop: overlap(c.guards, a,b, a+@c.stride(a), b+@c.stride(b))
+  # Another tricky example is:
+  #   while true:
+  #     spawn f(a[i])
+  #     spawn f(a[i+1])
+  #     inc i  # inc i, 2  would be correct here
+  #
+  # Or even worse:
+  #   while true:
+  #     spawn f(a[i+1 .. i+3])
+  #     spawn f(a[i+4 .. i+5])
+  #     inc i, 4
+  # Prove that i*k*stride + 3 != i*k'*stride + 5
+  # For the correct example this amounts to
+  #   i*k*2 != i*k'*2 + 1
+  # which is true.
+  # For now, we don't try to prove things like that at all, even though it'd
+  # be feasible for many useful examples. Instead we attach the slice to
+  # a spawn and if the attached spawns differ, we bail out:
+  for i in 0 .. high(c.slices):
+    for j in 0 .. high(c.slices):
+      let x = c.slices[i]
+      let y = c.slices[j]
+      if i != j and x.spawnId != y.spawnId and guards.sameTree(x.x, y.x):
+        if not x.inLoop and not y.inLoop:
+          overlap(c.guards, x.a, x.b, y.a, y.b)
+        else:
+          # ah I cannot resist the temptation and add another sweet heuristic:
+          # if both slices have the form (i+c)..(i+c)  and (i+d)..(i+d) we
+          # check they are disjoint and c <= stride and d <= stride:
+          # XXX
+          localError(x.x.info, "cannot prove $#..$# disjoint from $#..$#" %
+            [?x.a, ?x.b, ?y.a, ?y.b])
+
+proc analyse(c: var AnalysisCtx; n: PNode)
+
+proc analyseSons(c: var AnalysisCtx; n: PNode) =
+  for i in 0 .. <safeLen(n): analyse(c, n[i])
+
+proc min(a, b: PNode): PNode =
+  if a.isNil: result = b
+  elif a.intVal < b.intVal: result = a
+  else: result = b
+
+proc analyseCall(c: var AnalysisCtx; n: PNode; op: PSym) =
+  if op.magic == mSpawn:
+    inc c.spawns
+    let oldSpawnId = c.currentSpawnId
+    c.currentSpawnId = c.spawns
+    gatherArgs(c, n[1])
+    analyseSons(c, n)
+    c.currentSpawnId = oldSpawnId
+  elif op.magic == mInc or (op.name.s == "+=" and sfSystemModule in op.owner.flags):
+    if n.len > 2 and n[1].kind == nkSym and n[1].sym.isLocal:
+      let incr = n[2].skipConv  # n[1] is the counter, n[2] the increment
+      if incr.kind in {nkCharLit..nkUInt32Lit} and incr.intVal > 0:
+        let slot = c.getSlot(n[1].sym)
+        slot.stride = min(slot.stride, incr)
+    analyseSons(c, n)
+  elif op.name.s == "[]" and sfSystemModule in op.owner.flags:
+    c.addSlice(n, 1, 2, 3)
+    analyseSons(c, n)
+  elif op.name.s == "[]=" and sfSystemModule in op.owner.flags:
+    c.addSlice(n, 1, 2, 3)
+    analyseSons(c, n)
+  else:
+    analyseSons(c, n)
+
+proc analyseCase(c: var AnalysisCtx; n: PNode) =
+  analyse(c, n.sons[0])
+  #let oldState = c.locals.len
+  let oldFacts = c.guards.len
+  for i in 1.. <n.len:
+    let branch = n.sons[i]
+    #setLen(c.locals, oldState)
+    setLen(c.guards, oldFacts)
+    addCaseBranchFacts(c.guards, n, i)
+    for i in 0 .. <branch.len:
+      analyse(c, branch.sons[i])
+  #setLen(c.locals, oldState)
+  setLen(c.guards, oldFacts)
+
+proc analyseIf(c: var AnalysisCtx; n: PNode) =
+  analyse(c, n.sons[0].sons[0])
+  let oldFacts = c.guards.len
+  addFact(c.guards, n.sons[0].sons[0])
+  #let oldState = c.locals.len
+
+  analyse(c, n.sons[0].sons[1])
+  for i in 1.. <n.len:
+    let branch = n.sons[i]
+    setLen(c.guards, oldFacts)
+    for j in 0..i-1:
+      addFactNeg(c.guards, n.sons[j].sons[0])
+    if branch.len > 1:
+      addFact(c.guards, branch.sons[0])
+    #setLen(c.locals, oldState)
+    for i in 0 .. <branch.len:
+      analyse(c, branch.sons[i])
+  #setLen(c.locals, oldState)
+  setLen(c.guards, oldFacts)
+
+proc analyse(c: var AnalysisCtx; n: PNode) =
+  case n.kind
+  of nkAsgn, nkFastAsgn:
+    # since we already ensure sfAddrTaken is not in s.flags, we only need to
+    # prevent direct assignments to the monotonic variable:
+    if n[0].kind == nkSym and n[0].sym.isLocal:
+      let slot = c.getSlot(n[0].sym)
+      slot.blacklisted = true
+    invalidateFacts(c.guards, n.sons[0])
+    analyseSons(c, n)
+    addAsgnFact(c.guards, n.sons[0], n.sons[1])
+  of nkCallKinds:
+    # direct call:
+    if n[0].kind == nkSym: analyseCall(c, n, n[0].sym)
+    else: analyseSons(c, n)
+  of nkBracketExpr:
+    c.addSlice(n, 0, 1, 1)  # 'a[i]' is recorded as the slice 'a[i..i]'
+    analyseSons(c, n)
+  of nkReturnStmt, nkRaiseStmt, nkTryStmt:
+    localError(n.info, "invalid control flow for 'parallel'")
+    # 'break' that leaves the 'parallel' section is not valid either
+    # or maybe we should generate a 'try' XXX
+  of nkVarSection:
+    for it in n:
+      if it.sons[it.len-1].kind != nkEmpty:
+        for j in 0 .. it.len-3:
+          if it[j].kind == nkSym and it[j].sym.isLocal:
+            let slot = c.getSlot(it[j].sym)
+            if slot.lower.isNil: slot.lower = it.sons[it.len-1]
+            else: internalError(it.info, "slot already has a lower bound")
+    analyseSons(c, n)
+
+  of nkCaseStmt: analyseCase(c, n)
+  of nkIfStmt, nkIfExpr: analyseIf(c, n)
+  of nkWhileStmt:
+    analyse(c, n.sons[0])
+    # 'while true' loop?
+    inc c.inLoop
+    if isTrue(n.sons[0]):
+      analyseSons(c, n.sons[1])
+    else:
+      # loop may never execute:
+      let oldState = c.locals.len
+      let oldFacts = c.guards.len
+      addFact(c.guards, n.sons[0])
+      analyse(c, n.sons[1])
+      setLen(c.locals, oldState)
+      setLen(c.guards, oldFacts)
+      # we know after the loop the negation holds:
+      if not containsNode(n.sons[1], nkBreakStmt):
+        addFactNeg(c.guards, n.sons[0])
+    dec c.inLoop
+  of nkTypeSection, nkProcDef, nkConverterDef, nkMethodDef, nkIteratorDef,
+      nkMacroDef, nkTemplateDef, nkConstSection, nkPragma:
+    discard
+  else:
+    analyseSons(c, n)
+
+proc transformSlices(n: PNode): PNode =
+  if n.kind in nkCallKinds and n[0].kind == nkSym:
+    let op = n[0].sym
+    if op.name.s == "[]" and sfSystemModule in op.owner.flags:
+      result = copyTree(n)
+      result.sons[0] = opSlice.newSymNode  # the callee slot holds a node
+      return result
+  if n.safeLen > 0:
+    result = newNodeI(n.kind, n.info, n.len)
+    for i in 0 .. < n.len:
+      result.sons[i] = transformSlices(n.sons[i])
+  else:
+    result = n
+
+proc transformSpawn(owner: PSym; n, barrier: PNode): PNode =
+  # A 'spawn' call is lowered via 'wrapProcForSpawn'. Every other node,
+  # including ordinary calls, is rebuilt with its sons transformed so that
+  # spawns nested inside them are still found.
+  if n.kind in nkCallKinds and n[0].kind == nkSym and
+      n[0].sym.magic == mSpawn:
+    return wrapProcForSpawn(owner, transformSlices(n), barrier)
+  if n.safeLen > 0:
+    result = newNodeI(n.kind, n.info, n.len)
+    for i in 0 .. < n.len:
+      result.sons[i] = transformSpawn(owner, n.sons[i], barrier)
+  else:
+    result = n
+
+proc liftParallel*(owner: PSym; n: PNode): PNode =
+  # this needs to be called after the 'for' loop elimination
+
+  # first pass:
+  # - detect monotonic local integer variables
+  # - detect used slices
+  # - detect used arguments
+  
+  var a = initAnalysisCtx()
+  let body = n.lastSon
+  analyse(a, body)
+  if a.spawns == 0:
+    localError(n.info, "'parallel' section without 'spawn'")
+  checkSlices(a)
+  checkArgs(a, body)
+
+  var varSection = newNodeI(nkVarSection, n.info)
+  var temp = newSym(skTemp, "barrier", owner, n.info)
+  temp.typ = magicsys.getCompilerProc("Barrier").typ
+  incl(temp.flags, sfFromGeneric)
+
+  var vpart = newNodeI(nkIdentDefs, n.info, 3)
+  vpart.sons[0] = newSymNode(temp)
+  vpart.sons[1] = ast.emptyNode
+  vpart.sons[2] = indirectAccess(castExpr, field, n.info)
+  varSection.add vpart
+
+  barrier = genAddrOf(vpart[0])
+
+  result = newNodeI(nkStmtList, n.info)
+  generateAliasChecks(a, result)
+  result.add varSection
+  result.add callCodeGenProc("openBarrier", barrier)
+  result.add transformSpawn(owner, body, barrier)
+  result.add callCodeGenProc("closeBarrier", barrier)
diff --git a/compiler/sempass2.nim b/compiler/sempass2.nim
index 6afde5f059..c8ce5e7875 100644
--- a/compiler/sempass2.nim
+++ b/compiler/sempass2.nim
@@ -89,7 +89,7 @@ proc initVarViaNew(a: PEffects, n: PNode) =
   if n.kind != nkSym: return
   let s = n.sym
   if {tfNeedsInit, tfNotNil} * s.typ.flags <= {tfNotNil}:
-    # 'x' is not nil, but that doesn't mean it's not nil children
+    # 'x' is not nil, but that doesn't mean its "not nil" children
     # are initialized:
     initVar(a, n)
 
@@ -478,7 +478,7 @@ proc trackBlock(tracked: PEffects, n: PNode) =
   else:
     track(tracked, n)
 
-proc isTrue(n: PNode): bool =
+proc isTrue*(n: PNode): bool =
   n.kind == nkSym and n.sym.kind == skEnumField and n.sym.position != 0 or
     n.kind == nkIntLit and n.intVal != 0
 
diff --git a/compiler/vm.nim b/compiler/vm.nim
index 218369fa1b..0c2c23987b 100644
--- a/compiler/vm.nim
+++ b/compiler/vm.nim
@@ -131,8 +131,9 @@ proc createStrKeepNode(x: var TFullReg) =
       nfAllConst in x.node.flags:
     # XXX this is hacky; tests/txmlgen triggers it:
     x.node = newNode(nkStrLit)
-    #  debug x.node
-    #assert x.node.kind in {nkStrLit..nkTripleStrLit}
+    # It is not only hacky, it is also wrong for tgentemplate. The primary
+    # cause of bugs like these is that the VM does not properly distinguish
+    # between variable definitions (var foo = e) and variable updates (foo = e).
 
 template createStr(x) =
   x.node = newNode(nkStrLit)
diff --git a/config/nimrod.cfg b/config/nimrod.cfg
index 2817eac559..df3835ace7 100644
--- a/config/nimrod.cfg
+++ b/config/nimrod.cfg
@@ -16,6 +16,7 @@ arm.linux.gcc.linkerexe = "arm-linux-gcc"
 path="$lib/core"
 path="$lib/pure"
 path="$lib/pure/collections"
+path="$lib/pure/concurrency"
 path="$lib/impure"
 path="$lib/wrappers"
 # path="$lib/wrappers/cairo"
diff --git a/lib/pure/concurrency/cpuinfo.nim b/lib/pure/concurrency/cpuinfo.nim
new file mode 100644
index 0000000000..dfa819f646
--- /dev/null
+++ b/lib/pure/concurrency/cpuinfo.nim
@@ -0,0 +1,58 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2014 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module implements procs to determine the number of CPUs / cores.
+
+include "system/inclrtl"
+
+import strutils, os
+
+when not defined(windows):
+  import posix
+
+when defined(linux):
+  import linux
+
+when defined(macosx) or defined(bsd):
+  const
+    CTL_HW = 6
+    HW_AVAILCPU = 25
+    HW_NCPU = 3
+  proc sysctl(x: ptr array[0..3, cint], y: cint, z: pointer,
+              a: var csize, b: pointer, c: int): cint {.
+             importc: "sysctl", header: "<sys/sysctl.h>".}
+
+proc countProcessors*(): int {.rtl, extern: "ncpi$1".} =
+  ## returns the number of the processors/cores the machine has.
+  ## Returns 0 if it cannot be detected.
+  when defined(windows):
+    var x = getEnv("NUMBER_OF_PROCESSORS")
+    if x.len > 0: result = parseInt(x.string)
+  elif defined(macosx) or defined(bsd):
+    var
+      mib: array[0..3, cint]
+      numCPU: int
+      len: csize
+    mib[0] = CTL_HW
+    mib[1] = HW_AVAILCPU
+    len = sizeof(numCPU)
+    discard sysctl(addr(mib), 2, addr(numCPU), len, nil, 0)
+    if numCPU < 1:
+      mib[1] = HW_NCPU
+      discard sysctl(addr(mib), 2, addr(numCPU), len, nil, 0)
+    result = numCPU
+  elif defined(hpux):
+    result = mpctl(MPC_GETNUMSPUS, nil, nil)
+  elif defined(irix):
+    var SC_NPROC_ONLN {.importc: "_SC_NPROC_ONLN", header: "<unistd.h>".}: cint
+    result = sysconf(SC_NPROC_ONLN)
+  else:
+    result = sysconf(SC_NPROCESSORS_ONLN)
+  if result <= 0: result = 1
+
diff --git a/lib/pure/concurrency/cpuload.nim b/lib/pure/concurrency/cpuload.nim
new file mode 100644
index 0000000000..3cf6a73920
--- /dev/null
+++ b/lib/pure/concurrency/cpuload.nim
@@ -0,0 +1,96 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2014 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module implements a helper for a thread pool to determine whether
+## creating a thread is a good idea.
+
+when defined(windows):
+  import winlean, os, strutils, math
+
+  proc `-`(a, b: TFILETIME): int64 = a.rdFileTime - b.rdFileTime
+elif defined(linux):
+  from cpuinfo import countProcessors
+
+type
+  ThreadPoolAdvice* = enum
+    doNothing,
+    doCreateThread,  # create additional thread for throughput
+    doShutdownThread # too many threads are busy, shutdown one
+
+  ThreadPoolState* = object
+    when defined(windows):
+      prevSysKernel, prevSysUser, prevProcKernel, prevProcUser: TFILETIME
+    calls*: int
+
+proc advice*(s: var ThreadPoolState): ThreadPoolAdvice =
+  when defined(windows):
+    var
+      sysIdle, sysKernel, sysUser,
+        procCreation, procExit, procKernel, procUser: TFILETIME
+    if getSystemTimes(sysIdle, sysKernel, sysUser) == 0 or
+        getProcessTimes(THandle(-1), procCreation, procExit, 
+                        procKernel, procUser) == 0:
+      return doNothing
+    if s.calls > 0:
+      let
+        sysKernelDiff = sysKernel - s.prevSysKernel
+        sysUserDiff = sysUser - s.prevSysUser
+
+        procKernelDiff = procKernel - s.prevProcKernel
+        procUserDiff = procUser - s.prevProcUser
+
+        sysTotal = int(sysKernelDiff + sysUserDiff)
+        procTotal = int(procKernelDiff + procUserDiff)
+      # total CPU usage < 85% --> create a new worker thread.
+      # Measurements show that 100% and often even 90% is not reached even
+      # if all my cores are busy.
+      if sysTotal == 0 or procTotal / sysTotal < 0.85:
+        result = doCreateThread
+    s.prevSysKernel = sysKernel
+    s.prevSysUser = sysUser
+    s.prevProcKernel = procKernel
+    s.prevProcUser = procUser
+  elif defined(linux):
+    proc fscanf(c: TFile, frmt: cstring) {.varargs, importc, 
+      header: "<stdio.h>".}
+
+    var f = open("/proc/loadavg")
+    var b: float
+    var busy, total: int
+    fscanf(f,"%lf %lf %lf %ld/%ld",
+           addr b, addr b, addr b, addr busy, addr total)
+    f.close()
+    let cpus = countProcessors()
+    if busy-1 < cpus:
+      result = doCreateThread
+    elif busy-1 >= cpus*2:
+      result = doShutdownThread
+    else:
+      result = doNothing
+  else:
+    # XXX implement this for other OSes
+    result = doNothing
+  inc s.calls
+
+when isMainModule:
+  proc busyLoop() =
+    while true:
+      discard random(80)
+      os.sleep(100)
+
+  spawn busyLoop()
+  spawn busyLoop()
+  spawn busyLoop()
+  spawn busyLoop()
+
+  var s: ThreadPoolState
+
+  for i in 1 .. 70:
+    echo advice(s)
+    os.sleep(1000)
diff --git a/lib/pure/concurrency/threadpool.nim b/lib/pure/concurrency/threadpool.nim
new file mode 100644
index 0000000000..856820c6e0
--- /dev/null
+++ b/lib/pure/concurrency/threadpool.nim
@@ -0,0 +1,210 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2014 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## Implements Nimrod's 'spawn'.
+
+import cpuinfo, cpuload, locks
+
+{.push stackTrace:off.}
+
+type
+  CondVar = object
+    c: TCond
+    L: TLock
+    counter: int
+
+proc createCondVar(): CondVar =
+  initCond(result.c)
+  initLock(result.L)
+
+proc destroyCondVar(cv: var CondVar) {.inline.} =
+  deinitCond(cv.c)
+  deinitLock(cv.L)
+
+proc await(cv: var CondVar) =
+  acquire(cv.L)
+  while cv.counter <= 0:
+    wait(cv.c, cv.L)
+  dec cv.counter
+  release(cv.L)
+
+proc signal(cv: var CondVar) =
+  acquire(cv.L)
+  inc cv.counter
+  release(cv.L)
+  signal(cv.c)
+
+type
+  Barrier* {.compilerProc.} = object
+    counter: int
+    cv: CondVar
+
+proc barrierEnter*(b: ptr Barrier) {.compilerProc.} =
+  atomicInc b.counter
+
+proc barrierLeave*(b: ptr Barrier) {.compilerProc.} =
+  # use atomicDec's result; re-reading b.counter races with other leavers
+  if atomicDec(b.counter) <= 0: signal(b.cv)
+
+proc openBarrier*(b: ptr Barrier) {.compilerProc.} =
+  b.counter = 0
+  b.cv = createCondVar()
+
+proc closeBarrier*(b: ptr Barrier) {.compilerProc.} =
+  await(b.cv)
+  destroyCondVar(b.cv)
+
+{.pop.}
+
+# ----------------------------------------------------------------------------
+
+type
+  WorkerProc = proc (thread, args: pointer) {.nimcall, gcsafe.}
+  Worker = object
+    taskArrived: CondVar
+    taskStarted: CondVar #\
+    # task data:
+    f: WorkerProc
+    data: pointer
+    ready: bool # put it here for correct alignment!
+    initialized: bool # whether it has even been initialized
+
+proc nimArgsPassingDone(p: pointer) {.compilerProc.} =
+  let w = cast[ptr Worker](p)
+  signal(w.taskStarted)
+
+var
+  gSomeReady = createCondVar()
+  readyWorker: ptr Worker
+
+proc slave(w: ptr Worker) {.thread.} =
+  while true:
+    w.ready = true
+    readyWorker = w
+    signal(gSomeReady)
+    await(w.taskArrived)
+    assert(not w.ready)
+    w.f(w, w.data)
+
+const
+  MaxThreadPoolSize* = 256 ## maximal size of the thread pool. 256 threads
+                           ## should be good enough for anybody ;-)
+
+var
+  currentPoolSize: int
+  maxPoolSize = MaxThreadPoolSize
+  minPoolSize = 4
+
+proc setMinPoolSize*(size: range[1..MaxThreadPoolSize]) =
+  ## sets the minimal thread pool size. The default value of this is 4.
+  minPoolSize = size
+
+proc setMaxPoolSize*(size: range[1..MaxThreadPoolSize]) =
+  ## sets the maximal thread pool size. The default value of this
+  ## is ``MaxThreadPoolSize``.
+  maxPoolSize = size
+
+var
+  workers: array[MaxThreadPoolSize, TThread[ptr Worker]]
+  workersData: array[MaxThreadPoolSize, Worker]
+
+proc activateThread(i: int) {.noinline.} =
+  workersData[i].taskArrived = createCondVar()
+  workersData[i].taskStarted = createCondVar()
+  workersData[i].initialized = true
+  createThread(workers[i], slave, addr(workersData[i]))
+
+proc setup() =
+  currentPoolSize = min(countProcessors(), MaxThreadPoolSize)
+  readyWorker = addr(workersData[0])
+  for i in 0.. <currentPoolSize: activateThread(i)
+
+proc preferSpawn*(): bool =
+  ## Use this proc to determine quickly if a 'spawn' or a direct call is
+  ## preferable. If it returns 'true' a 'spawn' may make sense. In general
+  ## it is not necessary to call this directly; use 'spawnX' instead.
+  result = gSomeReady.counter > 0
+
+proc spawn*(call: stmt) {.magic: "Spawn".}
+  ## always spawns a new task, so that the 'call' is never executed on
+  ## the calling thread. 'call' has to be a proc call 'p(...)' where 'p'
+  ## is gcsafe and has 'void' as the return type.
+
+template spawnX*(call: stmt) =
+  ## spawns a new task if a CPU core is ready, otherwise executes the
+  ## call in the calling thread. Usually it is advised to
+  ## use 'spawn' in order to not block the producer for an unknown
+  ## amount of time. 'call' has to be a proc call 'p(...)' where 'p'
+  ## is gcsafe and has 'void' as the return type.
+  if preferSpawn(): spawn call
+  else: call
+
+proc parallel*(body: stmt) {.magic: "Parallel".}
+  ## a parallel section can be used to execute a block in parallel. ``body``
+  ## has to be in a DSL that is a particular subset of the language. Please
+  ## refer to the manual for further information.
+
+var
+  state: ThreadPoolState
+  stateLock: TLock
+
+initLock stateLock
+
+proc selectWorker(w: ptr Worker; fn: WorkerProc; data: pointer): bool =
+  if cas(addr w.ready, true, false):
+    w.data = data
+    w.f = fn
+    signal(w.taskArrived)
+    await(w.taskStarted)
+    result = true
+
+proc nimSpawn(fn: WorkerProc; data: pointer) {.compilerProc.} =
+  # implementation of 'spawn' that is used by the code generator.
+  while true:
+    if selectWorker(readyWorker, fn, data): return
+    for i in 0.. <currentPoolSize:
+      if selectWorker(addr(workersData[i]), fn, data): return
+    # determine what to do, but keep in mind this is expensive too:
+    # state.calls < maxPoolSize: warmup phase
+    # (state.calls and 127) == 0: periodic check
+    if state.calls < maxPoolSize or (state.calls and 127) == 0:
+      # ensure the call to 'advice' is atomic:
+      if tryAcquire(stateLock):
+        case advice(state)
+        of doNothing: discard
+        of doCreateThread:
+          if currentPoolSize < maxPoolSize:
+            if not workersData[currentPoolSize].initialized:
+              activateThread(currentPoolSize)
+            let w = addr(workersData[currentPoolSize])
+            inc currentPoolSize
+            if selectWorker(w, fn, data):
+              release(stateLock)
+              return
+            # else we didn't succeed but some other thread, so do nothing.
+        of doShutdownThread:
+          if currentPoolSize > minPoolSize: dec currentPoolSize
+          # we don't free anything here. Too dangerous.
+        release(stateLock)
+      # else the acquire failed, but this means some
+      # other thread succeeded, so we don't need to do anything here.
+    await(gSomeReady)
+
+proc sync*() =
+  ## a simple barrier to wait for all spawn'ed tasks. If you need more elaborate
+  ## waiting, you have to use an explicit barrier.
+  while true:
+    var allReady = true
+    for i in 0 .. <currentPoolSize:
+      if not allReady: break
+      allReady = allReady and workersData[i].ready
+    if allReady: break
+    await(gSomeReady)
+
+setup()
diff --git a/lib/pure/osproc.nim b/lib/pure/osproc.nim
index 6e250f9d54..6250ce2577 100644
--- a/lib/pure/osproc.nim
+++ b/lib/pure/osproc.nim
@@ -1,7 +1,7 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2013 Andreas Rumpf
+#        (c) Copyright 2014 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -13,7 +13,7 @@
 include "system/inclrtl"
 
 import
-  strutils, os, strtabs, streams
+  strutils, os, strtabs, streams, cpuinfo
 
 when defined(windows):
   import winlean
@@ -225,42 +225,10 @@ proc errorHandle*(p: PProcess): TFileHandle {.rtl, extern: "nosp$1",
   ## it is closed when closing the PProcess ``p``.
   result = p.errHandle
 
-when defined(macosx) or defined(bsd):
-  const
-    CTL_HW = 6
-    HW_AVAILCPU = 25
-    HW_NCPU = 3
-  proc sysctl(x: ptr array[0..3, cint], y: cint, z: pointer,
-              a: var csize, b: pointer, c: int): cint {.
-             importc: "sysctl", header: "<sys/sysctl.h>".}
-
 proc countProcessors*(): int {.rtl, extern: "nosp$1".} =
   ## returns the numer of the processors/cores the machine has.
   ## Returns 0 if it cannot be detected.
-  when defined(windows):
-    var x = getEnv("NUMBER_OF_PROCESSORS")
-    if x.len > 0: result = parseInt(x.string)
-  elif defined(macosx) or defined(bsd):
-    var
-      mib: array[0..3, cint]
-      numCPU: int
-      len: csize
-    mib[0] = CTL_HW
-    mib[1] = HW_AVAILCPU
-    len = sizeof(numCPU)
-    discard sysctl(addr(mib), 2, addr(numCPU), len, nil, 0)
-    if numCPU < 1:
-      mib[1] = HW_NCPU
-      discard sysctl(addr(mib), 2, addr(numCPU), len, nil, 0)
-    result = numCPU
-  elif defined(hpux):
-    result = mpctl(MPC_GETNUMSPUS, nil, nil)
-  elif defined(irix):
-    var SC_NPROC_ONLN {.importc: "_SC_NPROC_ONLN", header: "<unistd.h>".}: cint
-    result = sysconf(SC_NPROC_ONLN)
-  else:
-    result = sysconf(SC_NPROCESSORS_ONLN)
-  if result <= 0: result = 1
+  result = cpuinfo.countProcessors()
 
 proc execProcesses*(cmds: openArray[string],
                     options = {poStdErrToStdOut, poParentStreams},
diff --git a/lib/system.nim b/lib/system.nim
index ad98540a7c..fbd905afab 100644
--- a/lib/system.nim
+++ b/lib/system.nim
@@ -2934,6 +2934,3 @@ when not defined(booting):
 
   template isStatic*(x): expr = compiles(static(x))
     # checks whether `x` is a value known at compile-time
-
-when hasThreadSupport:
-  when hostOS != "standalone": include "system/sysspawn"
diff --git a/lib/system/atomics.nim b/lib/system/atomics.nim
index b1a96b2097..c6c603b19e 100644
--- a/lib/system/atomics.nim
+++ b/lib/system/atomics.nim
@@ -1,13 +1,14 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2012 Andreas Rumpf
+#        (c) Copyright 2014 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
 #
 
 ## Atomic operations for Nimrod.
+{.push stackTrace:off.}
 
 when (defined(gcc) or defined(llvm_gcc)) and hasThreadSupport:
   type 
@@ -203,3 +204,31 @@ proc atomicDec*(memLoc: var int, x: int = 1): int =
   else:
     dec(memLoc, x)
     result = memLoc
+
+when defined(windows) and not defined(gcc):
+  proc interlockedCompareExchange(p: pointer; exchange, comparand: int32): int32
+    {.importc: "InterlockedCompareExchange", header: "<windows.h>", cdecl.}
+
+  proc cas*[T: bool|int](p: ptr T; oldValue, newValue: T): bool =
+    interlockedCompareExchange(p, newValue.int32, oldValue.int32) == oldValue.int32 # API returns the previous value
+
+else:
+  # this is valid for GCC and Intel C++
+  proc cas*[T: bool|int](p: ptr T; oldValue, newValue: T): bool
+    {.importc: "__sync_bool_compare_and_swap", nodecl.}
+  # XXX is this valid for 'int'?
+
+
+when (defined(x86) or defined(amd64)) and defined(gcc):
+  proc cpuRelax {.inline.} =
+    {.emit: """asm volatile("pause" ::: "memory");""".}
+elif (defined(x86) or defined(amd64)) and defined(vcc):
+  proc cpuRelax {.importc: "YieldProcessor", header: "<windows.h>".}
+elif defined(intelc):
+  proc cpuRelax {.importc: "_mm_pause", header: "xmmintrin.h".}
+elif false:
+  from os import sleep
+
+  proc cpuRelax {.inline.} = os.sleep(1)
+
+{.pop.}
diff --git a/lib/system/sysspawn.nim b/lib/system/sysspawn.nim
index dabf35a3e5..95cdba65d1 100644
--- a/lib/system/sysspawn.nim
+++ b/lib/system/sysspawn.nim
@@ -14,30 +14,6 @@ when not defined(NimString):
 
 {.push stackTrace:off.}
 
-when (defined(x86) or defined(amd64)) and defined(gcc):
-  proc cpuRelax {.inline.} =
-    {.emit: """asm volatile("pause" ::: "memory");""".}
-elif (defined(x86) or defined(amd64)) and defined(vcc):
-  proc cpuRelax {.importc: "YieldProcessor", header: "<windows.h>".}
-elif defined(intelc):
-  proc cpuRelax {.importc: "_mm_pause", header: "xmmintrin.h".}
-elif false:
-  from os import sleep
-
-  proc cpuRelax {.inline.} = os.sleep(1)
-
-when defined(windows) and not defined(gcc):
-  proc interlockedCompareExchange(p: pointer; exchange, comparand: int32): int32
-    {.importc: "InterlockedCompareExchange", header: "<windows.h>", cdecl.}
-
-  proc cas(p: ptr bool; oldValue, newValue: bool): bool =
-    interlockedCompareExchange(p, newValue.int32, oldValue.int32) != 0
-
-else:
-  # this is valid for GCC and Intel C++
-  proc cas(p: ptr bool; oldValue, newValue: bool): bool
-    {.importc: "__sync_bool_compare_and_swap", nodecl.}
-
 # We declare our own condition variables here to get rid of the dummy lock
 # on Windows:
 
@@ -54,6 +30,9 @@ proc createCondVar(): CondVar =
     initSysLock(result.stupidLock)
     #acquireSys(result.stupidLock)
 
+proc destroyCondVar(c: var CondVar) {.inline.} =
+  deinitSysCond(c.c)
+
 proc await(cv: var CondVar) =
   when defined(posix):
     acquireSys(cv.stupidLock)
@@ -100,6 +79,26 @@ proc signal(cv: var FastCondVar) =
   #if cas(addr cv.slowPath, true, false):
   signal(cv.slow)
 
+type
+  Barrier* {.compilerProc.} = object
+    counter: int
+    cv: CondVar
+
+proc barrierEnter*(b: ptr Barrier) {.compilerProc.} =
+  atomicInc b.counter
+
+proc barrierLeave*(b: ptr Barrier) {.compilerProc.} =
+  # use atomicDec's result; re-reading b.counter races with other leavers
+  if atomicDec(b.counter) <= 0: signal(b.cv)
+
+proc openBarrier*(b: ptr Barrier) {.compilerProc.} =
+  b.counter = 0
+  b.cv = createCondVar()
+
+proc closeBarrier*(b: ptr Barrier) {.compilerProc.} =
+  await(b.cv)
+  destroyCondVar(b.cv)
+
 {.pop.}
 
 # ----------------------------------------------------------------------------
diff --git a/tests/system/tsysspawn.nim b/tests/system/tsysspawn.nim
index 0388918aa8..fc7921b0e1 100644
--- a/tests/system/tsysspawn.nim
+++ b/tests/system/tsysspawn.nim
@@ -4,20 +4,22 @@ discard """
   cmd: "nimrod $target --threads:on $options $file"
 """
 
+import threadpool
+
 var
   x, y = 0
 
 proc p1 =
-  for i in 0 .. 1_000_000:
+  for i in 0 .. 10_000:
     discard
 
-  inc x
+  atomicInc x
 
 proc p2 =
-  for i in 0 .. 1_000_000:
+  for i in 0 .. 10_000:
     discard
 
-  inc y, 2
+  atomicInc y, 2
 
 for i in 0.. 3:
   spawn(p1())
diff --git a/tests/system/tsysspawnbadarg.nim b/tests/system/tsysspawnbadarg.nim
index ace074602c..ce3c5611b5 100644
--- a/tests/system/tsysspawnbadarg.nim
+++ b/tests/system/tsysspawnbadarg.nim
@@ -4,4 +4,6 @@ discard """
   cmd: "nimrod $target --threads:on $options $file"
 """
 
+import threadpool
+
 spawn(1)
diff --git a/web/news.txt b/web/news.txt
index 0bbae7b7b9..b7403a3c72 100644
--- a/web/news.txt
+++ b/web/news.txt
@@ -2,6 +2,23 @@
 News
 ====
 
+..
+  2014-06-29 Version 0.9.6 released
+  =================================
+
+  Changes affecting backwards compatibility
+  -----------------------------------------
+
+  - ``spawn`` now uses an elaborate self-adapting thread pool and as such
+    has been moved into its own module. So to use it, you now have to import
+    ``threadpool``.
+
+
+  Library Additions
+  -----------------
+
+  - Added module ``cpuinfo``.
+  - Added module ``threadpool``.
 
 
 2014-04-21 Version 0.9.4 released

From c43e8df90cc5d52c6c57452a28f433075bf66236 Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Wed, 14 May 2014 01:51:44 +0200
Subject: [PATCH 02/13] progress for the 'parallel' statement

---
 compiler/ast.nim                    |   4 +-
 compiler/ccgexprs.nim               |   3 +
 compiler/cgen.nim                   |   3 +-
 compiler/guards.nim                 |   4 +-
 compiler/lowerings.nim              |   1 +
 compiler/sem.nim                    |   3 +-
 compiler/semexprs.nim               |  15 ++-
 compiler/semmagic.nim               |   2 +-
 compiler/semparallel.nim            | 155 +++++++++++++++-------------
 lib/pure/concurrency/threadpool.nim |  26 +++--
 10 files changed, 122 insertions(+), 94 deletions(-)

diff --git a/compiler/ast.nim b/compiler/ast.nim
index 80b9e9bb27..58b01d5e8a 100644
--- a/compiler/ast.nim
+++ b/compiler/ast.nim
@@ -605,9 +605,9 @@ const
   # thus cannot be overloaded (also documented in the spec!):
   SpecialSemMagics* = {
     mDefined, mDefinedInScope, mCompiles, mLow, mHigh, mSizeOf, mIs, mOf, 
-    mEcho, mShallowCopy, mExpandToAst}
+    mEcho, mShallowCopy, mExpandToAst, mParallel}
 
-type 
+type
   PNode* = ref TNode
   TNodeSeq* = seq[PNode]
   PType* = ref TType
diff --git a/compiler/ccgexprs.nim b/compiler/ccgexprs.nim
index 94a6f4781b..7fb6af8965 100644
--- a/compiler/ccgexprs.nim
+++ b/compiler/ccgexprs.nim
@@ -1638,6 +1638,9 @@ proc genMagicExpr(p: BProc, e: PNode, d: var TLoc, op: TMagic) =
   of mSpawn:
     let n = lowerings.wrapProcForSpawn(p.module.module, e.sons[1])
     expr(p, n, d)
+  of mParallel:
+    let n = semparallel.liftParallel(p.module.module, e)
+    expr(p, n, d)
   else: internalError(e.info, "genMagicExpr: " & $op)
 
 proc genConstExpr(p: BProc, n: PNode): PRope
diff --git a/compiler/cgen.nim b/compiler/cgen.nim
index 8d66d7a3b1..3e5ac485b9 100644
--- a/compiler/cgen.nim
+++ b/compiler/cgen.nim
@@ -14,7 +14,8 @@ import
   options, intsets,
   nversion, nimsets, msgs, crc, bitsets, idents, lists, types, ccgutils, os,
   times, ropes, math, passes, rodread, wordrecg, treetab, cgmeth,
-  rodutils, renderer, idgen, cgendata, ccgmerge, semfold, aliases, lowerings
+  rodutils, renderer, idgen, cgendata, ccgmerge, semfold, aliases, lowerings,
+  semparallel
 
 when options.hasTinyCBackend:
   import tccgen
diff --git a/compiler/guards.nim b/compiler/guards.nim
index 57cd73b11a..551a112565 100644
--- a/compiler/guards.nim
+++ b/compiler/guards.nim
@@ -160,13 +160,13 @@ proc buildCall(op: PSym; a: PNode): PNode =
   result.sons[1] = a
 
 proc buildCall(op: PSym; a, b: PNode): PNode =
-  result = newNodeI(nkCall, a.info, 3)
+  result = newNodeI(nkInfix, a.info, 3)
   result.sons[0] = newSymNode(op)
   result.sons[1] = a
   result.sons[2] = b
 
 proc `+@`*(a: PNode; b: BiggestInt): PNode =
-  opAdd.buildCall(a, nkIntLit.newIntNode(b))
+  (if b != 0: opAdd.buildCall(a, nkIntLit.newIntNode(b)) else: a)
 
 proc `|+|`(a, b: PNode): PNode =
   result = copyNode(a)
diff --git a/compiler/lowerings.nim b/compiler/lowerings.nim
index 93bfd84257..704cfbcdd0 100644
--- a/compiler/lowerings.nim
+++ b/compiler/lowerings.nim
@@ -68,6 +68,7 @@ proc addField*(obj: PType; s: PSym) =
   var field = newSym(skField, getIdent(s.name.s & $s.id), s.owner, s.info)
   let t = skipIntLit(s.typ)
   field.typ = t
+  assert t.kind != tyStmt
   field.position = sonsLen(obj.n)
   addSon(obj.n, newSymNode(field))
 
diff --git a/compiler/sem.nim b/compiler/sem.nim
index 7d129caf44..cf52d1cf0c 100644
--- a/compiler/sem.nim
+++ b/compiler/sem.nim
@@ -15,7 +15,8 @@ import
   magicsys, parser, nversion, nimsets, semfold, importer,
   procfind, lookups, rodread, pragmas, passes, semdata, semtypinst, sigmatch,
   intsets, transf, vmdef, vm, idgen, aliases, cgmeth, lambdalifting,
-  evaltempl, patterns, parampatterns, sempass2, pretty, semmacrosanity
+  evaltempl, patterns, parampatterns, sempass2, pretty, semmacrosanity,
+  semparallel
 
 # implementation
 
diff --git a/compiler/semexprs.nim b/compiler/semexprs.nim
index 9ea93a15e9..505c289ea9 100644
--- a/compiler/semexprs.nim
+++ b/compiler/semexprs.nim
@@ -1388,11 +1388,6 @@ proc semDefined(c: PContext, n: PNode, onlyCurrentScope: bool): PNode =
   result.info = n.info
   result.typ = getSysType(tyBool)
 
-proc setMs(n: PNode, s: PSym): PNode = 
-  result = n
-  n.sons[0] = newSymNode(s)
-  n.sons[0].info = n.info
-
 proc expectMacroOrTemplateCall(c: PContext, n: PNode): PSym =
   ## The argument to the proc should be nkCall(...) or similar
   ## Returns the macro/template symbol
@@ -1584,6 +1579,11 @@ proc semShallowCopy(c: PContext, n: PNode, flags: TExprFlags): PNode =
   else:
     result = semDirectOp(c, n, flags)
 
+proc setMs(n: PNode, s: PSym): PNode = 
+  result = n
+  n.sons[0] = newSymNode(s)
+  n.sons[0].info = n.info
+
 proc semMagic(c: PContext, n: PNode, s: PSym, flags: TExprFlags): PNode = 
   # this is a hotspot in the compiler!
   # DON'T forget to update ast.SpecialSemMagics if you add a magic here!
@@ -1605,6 +1605,11 @@ proc semMagic(c: PContext, n: PNode, s: PSym, flags: TExprFlags): PNode =
     checkSonsLen(n, 2)
     result = newStrNodeT(renderTree(n[1], {renderNoComments}), n)
     result.typ = getSysType(tyString)
+  of mParallel:
+    result = setMs(n, s)
+    var x = n.lastSon
+    if x.kind == nkDo: x = x.sons[bodyPos]
+    result.sons[1] = semStmt(c, x)
   else: result = semDirectOp(c, n, flags)
 
 proc semWhen(c: PContext, n: PNode, semCheck = true): PNode =
diff --git a/compiler/semmagic.nim b/compiler/semmagic.nim
index 4caf1fb8e2..80e70b8c0a 100644
--- a/compiler/semmagic.nim
+++ b/compiler/semmagic.nim
@@ -1,7 +1,7 @@
 #
 #
 #           The Nimrod Compiler
-#        (c) Copyright 2013 Andreas Rumpf
+#        (c) Copyright 2014 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
diff --git a/compiler/semparallel.nim b/compiler/semparallel.nim
index 34a1f3af82..dd1584e7d0 100644
--- a/compiler/semparallel.nim
+++ b/compiler/semparallel.nim
@@ -19,7 +19,11 @@
 # - passed slices need to be ensured to be disjoint (+)
 # - output slices need special logic
 
-import lowerings, guards, sempass2
+import
+  ast, astalgo, idents, lowerings, magicsys, guards, sempass2, msgs,
+  renderer
+from trees import getMagic
+from strutils import `%`
 
 discard """
 
@@ -75,12 +79,17 @@ proc initAnalysisCtx(): AnalysisCtx =
   result.args = @[]
   result.guards = @[]
 
-proc getSlot(c: var AnalysisCtx; s: PSym): ptr MonotonicVar =
-  var L = c.locals.len
-  for i in 0.. <L:
-    if c.locals[i].v == s: return addr(c.locals[i])
+proc lookupSlot(c: AnalysisCtx; s: PSym): int =
+  for i in 0.. <c.locals.len:
+    if c.locals[i].v == s: return i
+  return -1
+
+proc getSlot(c: var AnalysisCtx; v: PSym): ptr MonotonicVar =
+  let s = lookupSlot(c, v)
+  if s >= 0: return addr(c.locals[s])
+  let L = c.locals.len
   c.locals.setLen(L+1)
-  c.locals[L].v = s
+  c.locals[L].v = v
   return addr(c.locals[L])
 
 proc getRoot(n: PNode): PSym =
@@ -110,25 +119,28 @@ proc gatherArgs(c: var AnalysisCtx; n: PNode) =
         c.args.add root
     gatherArgs(c, n[i])
 
-proc isLocal(s: PSym): bool = 
-  s.kind in {skResult, skTemp, skForVar, skVar, skLet} and
-        {sfAddrTaken, sfGlobal} * s.flags == {}
+proc isLocal(n: PNode): bool =
+  n.kind == nkSym and (let s = n.sym;
+    s.kind in {skResult, skTemp, skForVar, skVar, skLet} and
+          {sfAddrTaken, sfGlobal} * s.flags == {})
 
-proc checkLocal(c: var AnalysisCtx; n: PNode) =
-  if n.kind == nkSym and isLocal(n.sym):
-    let slot = c.getSlot(n[1].sym)
-    if slot.stride != nil:
+proc checkLocal(c: AnalysisCtx; n: PNode) =
+  if isLocal(n):
+    let s = c.lookupSlot(n.sym)
+    if s >= 0 and c.locals[s].stride != nil:
       localError(n.info, "invalid usage of counter after increment")
   else:
     for i in 0 .. <n.safeLen: checkLocal(c, n.sons[i])
 
+template `?`(x): expr = x.renderTree
+
 proc checkLe(c: AnalysisCtx; a, b: PNode) =
   case proveLe(c.guards, a, b)
-  of impUnkown:
-    localError(n.info, "cannot prove: " & a.renderTree & " <= " & b.renderTree)
+  of impUnknown:
+    localError(a.info, "cannot prove: " & ?a & " <= " & ?b)
   of impYes: discard
   of impNo:
-    localError(n.info, "can prove: " & a.renderTree & " > " & b.renderTree)
+    localError(a.info, "can prove: " & ?a & " > " & ?b)
 
 proc checkBounds(c: AnalysisCtx; arr, idx: PNode) =
   checkLe(c, arr.lowBound, idx)
@@ -139,11 +151,8 @@ proc addLowerBoundAsFacts(c: var AnalysisCtx) =
     if not v.blacklisted:
       c.guards.addFactLe(v.lower, newSymNode(v.v))
 
-proc addSlice(c: var AnalysisCtx; n: PNode; x, le, ri: int) =
+proc addSlice(c: var AnalysisCtx; n: PNode; x, le, ri: PNode) =
   checkLocal(c, n)
-  let le = n.sons[le]
-  let ri = n.sons[ri]
-  let x = n.sons[x]
   # perform static bounds checking here; and not later!
   let oldState = c.guards.len
   addLowerBoundAsFacts(c)
@@ -152,17 +161,15 @@ proc addSlice(c: var AnalysisCtx; n: PNode; x, le, ri: int) =
   c.guards.setLen(oldState)
   c.slices.add((x, le, ri, c.currentSpawnId, c.inLoop > 0))
 
-template `?`(x): expr = x.renderTree
-
 proc overlap(m: TModel; x,y,c,d: PNode) =
-  #  X..Y and C..D overlap iff (X <= D and Y >= C)
+  #  X..Y and C..D overlap iff (X <= D and C <= Y)
   case proveLe(m, x, d)
-  of impUnkown:
+  of impUnknown:
     localError(x.info,
       "cannot prove: $# > $#; required for $#..$# disjoint from $#..$#" %
         [?x, ?d, ?x, ?y, ?c, ?d])
   of impYes:
-    case proveLe(m, y, c)
+    case proveLe(m, c, y)
     of impUnknown:
       localError(x.info,
         "cannot prove: $# > $#; required for $#..$# disjoint from $#..$#" %
@@ -175,12 +182,12 @@ proc overlap(m: TModel; x,y,c,d: PNode) =
 proc stride(c: AnalysisCtx; n: PNode): BiggestInt =
   # note: 0 if it cannot be determined is just right because then
   # we analyse 'i..i' and 'i+0 .. i+0' and these are not disjoint!
-  if n.kind == nkSym and isLocal(n.sym):
-    let slot = c.getSlot(n[1].sym)
-    if slot.stride != nil:
-      result = slot.stride.intVal
+  if isLocal(n):
+    let s = c.lookupSlot(n.sym)
+    if s >= 0 and c.locals[s].stride != nil:
+      result = c.locals[s].stride.intVal
   else:
-    for i in 0 .. <n.safeLen: inc(result, stride(c, n.sons[i]))
+    for i in 0 .. <n.safeLen: result += stride(c, n.sons[i])
 
 proc checkSlicesAreDisjoint(c: var AnalysisCtx) =
   # this is the only thing that we need to perform after we have traversed
@@ -209,10 +216,10 @@ proc checkSlicesAreDisjoint(c: var AnalysisCtx) =
   # be feasible for many useful examples. Instead we attach the slice to
   # a spawn and if the attached spawns differ, we bail out:
   for i in 0 .. high(c.slices):
-    for j in 0 .. high(c.slices):
+    for j in i+1 .. high(c.slices):
       let x = c.slices[i]
       let y = c.slices[j]
-      if i != j and x.spawnId != y.spawnId and guards.sameTree(x.x, y.x):
+      if x.spawnId != y.spawnId and guards.sameTree(x.x, y.x):
         if not x.inLoop and not y.inLoop:
           overlap(c.guards, x.a, x.b, y.a, y.b)
         else:
@@ -233,6 +240,8 @@ proc min(a, b: PNode): PNode =
   elif a.intVal < b.intVal: result = a
   else: result = b
 
+proc fromSystem(op: PSym): bool = sfSystemModule in getModule(op).flags
+
 proc analyseCall(c: var AnalysisCtx; n: PNode; op: PSym) =
   if op.magic == mSpawn:
     inc c.spawns
@@ -241,18 +250,18 @@ proc analyseCall(c: var AnalysisCtx; n: PNode; op: PSym) =
     gatherArgs(c, n[1])
     analyseSons(c, n)
     c.currentSpawnId = oldSpawnId
-  elif op.magic == mInc or (op.name.s == "+=" and sfSystemModule in op.owner.flags):
-    if n[1].kind == nkSym and n[1].isLocal:
-      let incr = n[1].skipConv
+  elif op.magic == mInc or (op.name.s == "+=" and op.fromSystem):
+    if n[1].isLocal:
+      let incr = n[2].skipConv
       if incr.kind in {nkCharLit..nkUInt32Lit} and incr.intVal > 0:
         let slot = c.getSlot(n[1].sym)
         slot.stride = min(slot.stride, incr)
     analyseSons(c, n)
-  elif op.name.s == "[]" and sfSystemModule in op.owner.flags:
-    c.addSlice(n, 1, 2, 3)
+  elif op.name.s == "[]" and op.fromSystem:
+    c.addSlice(n, n[1], n[2][1], n[2][2])
     analyseSons(c, n)
-  elif op.name.s == "[]=" and sfSystemModule in op.owner.flags:
-    c.addSlice(n, 1, 2, 3)
+  elif op.name.s == "[]=" and op.fromSystem:
+    c.addSlice(n, n[1], n[2][1], n[2][2])
     analyseSons(c, n)
   else:
     analyseSons(c, n)
@@ -296,18 +305,18 @@ proc analyse(c: var AnalysisCtx; n: PNode) =
   of nkAsgn, nkFastAsgn:
     # since we already ensure sfAddrTaken is not in s.flags, we only need to
     # prevent direct assignments to the monotonic variable:
-    if n[0].kind == nkSym and n[0].isLocal:
-      let slot = c.getSlot(it[j].sym)
+    if n[0].isLocal:
+      let slot = c.getSlot(n[0].sym)
       slot.blackListed = true
-    invalidateFacts(c.guards, n.sons[0])
+    invalidateFacts(c.guards, n[0])
     analyseSons(c, n)
-    addAsgnFact(c.guards, n.sons[0], n.sons[1])
+    addAsgnFact(c.guards, n[0], n[1])
   of nkCallKinds:
     # direct call:
     if n[0].kind == nkSym: analyseCall(c, n, n[0].sym)
     else: analyseSons(c, n)
-  of nkBracket:
-    c.addSlice(n, 0, 1, 1)
+  of nkBracketExpr:
+    c.addSlice(n, n[0], n[1], n[1])
     analyseSons(c, n)
   of nkReturnStmt, nkRaiseStmt, nkTryStmt:
     localError(n.info, "invalid control flow for 'parallel'")
@@ -315,14 +324,14 @@ proc analyse(c: var AnalysisCtx; n: PNode) =
     # or maybe we should generate a 'try' XXX
   of nkVarSection:
     for it in n:
-      if it.sons[it.len-1].kind != nkEmpty:
+      let value = it.lastSon
+      if value.kind != nkEmpty:
         for j in 0 .. it.len-3:
-          if it[j].kind == nkSym and it[j].isLocal:
+          if it[j].isLocal:
             let slot = c.getSlot(it[j].sym)
-            if slot.lower.isNil: slot.lower = it.sons[it.len-1]
+            if slot.lower.isNil: slot.lower = value
             else: internalError(it.info, "slot already has a lower bound")
-    analyseSons(c, n)
-
+        analyse(c, value)
   of nkCaseStmt: analyseCase(c, n)
   of nkIfStmt, nkIfExpr: analyseIf(c, n)
   of nkWhileStmt:
@@ -340,7 +349,7 @@ proc analyse(c: var AnalysisCtx; n: PNode) =
       setLen(c.locals, oldState)
       setLen(c.guards, oldFacts)
       # we know after the loop the negation holds:
-      if not containsNode(n.sons[1], nkBreakStmt):
+      if not hasSubnodeWith(n.sons[1], nkBreakStmt):
         addFactNeg(c.guards, n.sons[0])
     dec c.inLoop
   of nkTypeSection, nkProcDef, nkConverterDef, nkMethodDef, nkIteratorDef,
@@ -350,33 +359,42 @@ proc analyse(c: var AnalysisCtx; n: PNode) =
     analyseSons(c, n)
 
 proc transformSlices(n: PNode): PNode =
-  if n.kind in nkCalls and n[0].kind == nkSym:
+  if n.kind in nkCallKinds and n[0].kind == nkSym:
     let op = n[0].sym
-    if op.name.s == "[]" and sfSystemModule in op.owner.flags:
-      result = copyTree(n)
-      result.sons[0] = opSlice
+    if op.name.s == "[]" and op.fromSystem:
+      result = copyNode(n)
+      result.add opSlice.newSymNode
+      result.add n[1]
+      result.add n[2][1]
+      result.add n[2][2]
       return result
   if n.safeLen > 0:
-    result = copyNode(n.kind, n.info, n.len)
+    result = copyNode(n)
     for i in 0 .. < n.len:
-      result.sons[i] = transformSlices(n.sons[i])
+      result.add transformSlices(n.sons[i])
   else:
     result = n
 
 proc transformSpawn(owner: PSym; n, barrier: PNode): PNode =
-  if n.kind in nkCalls:
+  if n.kind in nkCallKinds:
     if n[0].kind == nkSym:
       let op = n[0].sym
       if op.magic == mSpawn:
         result = transformSlices(n)
-        return wrapProcForSpawn(owner, result, barrier)
+        return wrapProcForSpawn(owner, result[1], barrier)
   elif n.safeLen > 0:
-    result = copyNode(n.kind, n.info, n.len)
+    result = copyNode(n)
     for i in 0 .. < n.len:
-      result.sons[i] = transformSpawn(owner, n.sons[i], barrier)
+      result.add transformSpawn(owner, n.sons[i], barrier)
   else:
     result = n
 
+proc checkArgs(a: var AnalysisCtx; n: PNode) =
+  discard "too implement"
+
+proc generateAliasChecks(a: AnalysisCtx; result: PNode) =
+  discard "too implement"
+
 proc liftParallel*(owner: PSym; n: PNode): PNode =
   # this needs to be called after the 'for' loop elimination
 
@@ -390,22 +408,17 @@ proc liftParallel*(owner: PSym; n: PNode): PNode =
   analyse(a, body)
   if a.spawns == 0:
     localError(n.info, "'parallel' section without 'spawn'")
-  checkSlices(a)
+  checkSlicesAreDisjoint(a)
   checkArgs(a, body)
 
   var varSection = newNodeI(nkVarSection, n.info)
-  var temp = newSym(skTemp, "barrier", owner, n.info)
+  var temp = newSym(skTemp, getIdent"barrier", owner, n.info)
   temp.typ = magicsys.getCompilerProc("Barrier").typ
   incl(temp.flags, sfFromGeneric)
+  let tempNode = newSymNode(temp)
+  varSection.addVar tempNode
 
-  var vpart = newNodeI(nkIdentDefs, n.info, 3)
-  vpart.sons[0] = newSymNode(temp)
-  vpart.sons[1] = ast.emptyNode
-  vpart.sons[2] = indirectAccess(castExpr, field, n.info)
-  varSection.add vpart
-
-  barrier = genAddrOf(vpart[0])
-
+  let barrier = genAddrOf(tempNode)
   result = newNodeI(nkStmtList, n.info)
   generateAliasChecks(a, result)
   result.add varSection
diff --git a/lib/pure/concurrency/threadpool.nim b/lib/pure/concurrency/threadpool.nim
index 856820c6e0..86819d25a7 100644
--- a/lib/pure/concurrency/threadpool.nim
+++ b/lib/pure/concurrency/threadpool.nim
@@ -74,12 +74,20 @@ type
     data: pointer
     ready: bool # put it here for correct alignment!
     initialized: bool # whether it has even been initialized
+    shutdown: bool # the pool requests to shut down this worker thread
 
 proc nimArgsPassingDone(p: pointer) {.compilerProc.} =
   let w = cast[ptr Worker](p)
   signal(w.taskStarted)
 
+const
+  MaxThreadPoolSize* = 256 ## maximal size of the thread pool. 256 threads
+                           ## should be good enough for anybody ;-)
+
 var
+  currentPoolSize: int
+  maxPoolSize = MaxThreadPoolSize
+  minPoolSize = 4
   gSomeReady = createCondVar()
   readyWorker: ptr Worker
 
@@ -91,15 +99,9 @@ proc slave(w: ptr Worker) {.thread.} =
     await(w.taskArrived)
     assert(not w.ready)
     w.f(w, w.data)
-
-const
-  MaxThreadPoolSize* = 256 ## maximal size of the thread pool. 256 threads
-                           ## should be good enough for anybody ;-)
-
-var
-  currentPoolSize: int
-  maxPoolSize = MaxThreadPoolSize
-  minPoolSize = 4
+    if w.shutdown:
+      w.shutdown = false
+      atomicDec currentPoolSize
 
 proc setMinPoolSize*(size: range[1..MaxThreadPoolSize]) =
   ## sets the minimal thread pool size. The default value of this is 4.
@@ -183,13 +185,15 @@ proc nimSpawn(fn: WorkerProc; data: pointer) {.compilerProc.} =
             if not workersData[currentPoolSize].initialized:
               activateThread(currentPoolSize)
             let w = addr(workersData[currentPoolSize])
-            inc currentPoolSize
+            atomicInc currentPoolSize
             if selectWorker(w, fn, data):
               release(stateLock)
               return
             # else we didn't succeed but some other thread, so do nothing.
         of doShutdownThread:
-          if currentPoolSize > minPoolSize: dec currentPoolSize
+          if currentPoolSize > minPoolSize:
+            let w = addr(workersData[currentPoolSize-1])
+            w.shutdown = true
           # we don't free anything here. Too dangerous.
         release(stateLock)
       # else the acquire failed, but this means some

From 31b8fd66b1bd54b665e52855909538a50d33d7c3 Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Wed, 14 May 2014 23:36:28 +0200
Subject: [PATCH 03/13] 'parallel' statement: next steps

---
 compiler/guards.nim                       | 112 +++++++++++++++-------
 compiler/semparallel.nim                  |  41 +++++---
 tests/parallel/tdisjoint_slice1.nim       |  21 ++++
 tests/parallel/tdisjoint_slice2.nim       |  21 ++++
 tests/parallel/tinvalid_array_bounds.nim  |  25 +++++
 tests/parallel/tinvalid_counter_usage.nim |  26 +++++
 tests/parallel/tnon_disjoint_slice1.nim   |  25 +++++
 7 files changed, 221 insertions(+), 50 deletions(-)
 create mode 100644 tests/parallel/tdisjoint_slice1.nim
 create mode 100644 tests/parallel/tdisjoint_slice2.nim
 create mode 100644 tests/parallel/tinvalid_array_bounds.nim
 create mode 100644 tests/parallel/tinvalid_counter_usage.nim
 create mode 100644 tests/parallel/tnon_disjoint_slice1.nim

diff --git a/compiler/guards.nim b/compiler/guards.nim
index 551a112565..de0ce1dcc9 100644
--- a/compiler/guards.nim
+++ b/compiler/guards.nim
@@ -1,7 +1,7 @@
 #
 #
 #           The Nimrod Compiler
-#        (c) Copyright 2013 Andreas Rumpf
+#        (c) Copyright 2014 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -165,9 +165,6 @@ proc buildCall(op: PSym; a, b: PNode): PNode =
   result.sons[1] = a
   result.sons[2] = b
 
-proc `+@`*(a: PNode; b: BiggestInt): PNode =
-  (if b != 0: opAdd.buildCall(a, nkIntLit.newIntNode(b)) else: a)
-
 proc `|+|`(a, b: PNode): PNode =
   result = copyNode(a)
   if a.kind in {nkCharLit..nkUInt64Lit}: result.intVal = a.intVal |+| b.intVal
@@ -178,22 +175,56 @@ proc `|*|`(a, b: PNode): PNode =
   if a.kind in {nkCharLit..nkUInt64Lit}: result.intVal = a.intVal |*| b.intVal
   else: result.floatVal = a.floatVal * b.floatVal
 
+proc negate(a, b, res: PNode): PNode =
+  if b.kind in {nkCharLit..nkUInt64Lit} and b.intVal != low(BiggestInt):
+    var b = copyNode(b)
+    b.intVal = -b.intVal
+    if a.kind in {nkCharLit..nkUInt64Lit}:
+      b.intVal = b.intVal |+| a.intVal
+      result = b
+    else:
+      result = buildCall(opAdd, a, b)
+  elif b.kind in {nkFloatLit..nkFloat64Lit}:
+    var b = copyNode(b)
+    b.floatVal = -b.floatVal
+    result = buildCall(opAdd, a, b)
+  else:
+    result = res
+
 proc zero(): PNode = nkIntLit.newIntNode(0)
 proc one(): PNode = nkIntLit.newIntNode(1)
 proc minusOne(): PNode = nkIntLit.newIntNode(-1)
 
-proc lowBound*(x: PNode): PNode = nkIntLit.newIntNode(firstOrd(x.typ))
+proc lowBound*(x: PNode): PNode = 
+  result = nkIntLit.newIntNode(firstOrd(x.typ))
+  result.info = x.info
+
 proc highBound*(x: PNode): PNode =
-  if x.typ.skipTypes(abstractInst).kind == tyArray:
-    nkIntLit.newIntNode(lastOrd(x.typ))
-  else:
-    opAdd.buildCall(opLen.buildCall(x), minusOne())
+  result = if x.typ.skipTypes(abstractInst).kind == tyArray:
+             nkIntLit.newIntNode(lastOrd(x.typ))
+           else:
+             opAdd.buildCall(opLen.buildCall(x), minusOne())
+  result.info = x.info
+
+proc reassociation(n: PNode): PNode =
+  result = n
+  # fold nested constants: (foo+5)+5 --> foo+10  and  (foo*5)*5 --> foo*25
+  case result.getMagic
+  of someAdd:
+    if result[2].isValue and 
+        result[1].getMagic in someAdd and result[1][2].isValue:
+      result = opAdd.buildCall(result[1][1], result[1][2] |+| result[2])
+  of someMul:
+    if result[2].isValue and 
+        result[1].getMagic in someMul and result[1][2].isValue:
+      result = opMul.buildCall(result[1][1], result[1][2] |*| result[2])
+  else: discard
 
 proc canon*(n: PNode): PNode =
   # XXX for now only the new code in 'semparallel' uses this
   if n.safeLen >= 1:
-    result = newNodeI(n.kind, n.info, n.len)
-    for i in 0 .. < n.safeLen:
+    result = shallowCopy(n)
+    for i in 0 .. < n.len:
       result.sons[i] = canon(n.sons[i])
   else:
     result = n
@@ -210,32 +241,12 @@ proc canon*(n: PNode): PNode =
     result = buildCall(opAdd, result[1], newIntNode(nkIntLit, -1))
   of someSub:
     # x - 4  -->  x + (-4)
-    var b = result[2]
-    if b.kind in {nkCharLit..nkUInt64Lit} and b.intVal != low(BiggestInt):
-      b = copyNode(b)
-      b.intVal = -b.intVal
-      result = buildCall(opAdd, result[1], b)
-    elif b.kind in {nkFloatLit..nkFloat64Lit}:
-      b = copyNode(b)
-      b.floatVal = -b.floatVal
-      result = buildCall(opAdd, result[1], b)    
+    result = negate(result[1], result[2], result)
   of someLen:
     result.sons[0] = opLen.newSymNode
   else: discard
 
-  # re-association:
-  # (foo+5)+5 --> foo+10;  same for '*'
-  case result.getMagic
-  of someAdd:
-    if result[2].isValue and 
-        result[1].getMagic in someAdd and result[1][2].isValue:
-      result = opAdd.buildCall(result[1][1], result[1][2] |+| result[2])
-  of someMul:
-    if result[2].isValue and 
-        result[1].getMagic in someMul and result[1][2].isValue:
-      result = opAdd.buildCall(result[1][1], result[1][2] |*| result[2])
-  else: discard
-
+  result = reassociation(result)
   # most important rule: (x-4) < a.len -->  x < a.len+4
   case result.getMagic
   of someLe, someLt:
@@ -245,21 +256,32 @@ proc canon*(n: PNode): PNode =
         isLetLocation(x[1], true):
       case x.getMagic
       of someSub:
-        result = buildCall(result[0].sym, x[1], opAdd.buildCall(y, x[2]))
+        result = buildCall(result[0].sym, x[1], 
+                           reassociation(opAdd.buildCall(y, x[2])))
       of someAdd:
-        result = buildCall(result[0].sym, x[1], opSub.buildCall(y, x[2]))
+        # Rule A:
+        let plus = negate(y, x[2], nil).reassociation
+        if plus != nil: result = buildCall(result[0].sym, x[1], plus)
       else: discard
     elif y.kind in nkCallKinds and y.len == 3 and y[2].isValue and 
         isLetLocation(y[1], true):
       # a.len < x-3
       case y.getMagic
       of someSub:
-        result = buildCall(result[0].sym, y[1], opAdd.buildCall(x, y[2]))
+        result = buildCall(result[0].sym, y[1],
+                           reassociation(opAdd.buildCall(x, y[2])))
       of someAdd:
-        result = buildCall(result[0].sym, y[1], opSub.buildCall(x, y[2]))
+        let plus = negate(x, y[2], nil).reassociation
+        # ensure that Rule A will not trigger afterwards with the
+        # additional 'not isLetLocation' constraint:
+        if plus != nil and not isLetLocation(x, true):
+          result = buildCall(result[0].sym, plus, y[1])
       else: discard
   else: discard
 
+proc `+@`*(a: PNode; b: BiggestInt): PNode =
+  canon(if b != 0: opAdd.buildCall(a, nkIntLit.newIntNode(b)) else: a)
+
 proc usefulFact(n: PNode): PNode =
   case n.getMagic
   of someEq:
@@ -639,8 +661,20 @@ proc doesImply*(facts: TModel, prop: PNode): TImplication =
 proc impliesNotNil*(facts: TModel, arg: PNode): TImplication =
   result = doesImply(facts, opIsNil.buildCall(arg).neg)
 
+proc simpleSlice*(a, b: PNode): BiggestInt =
+  # returns 'c' if a..b matches (i+c)..(i+c), -1 otherwise. (i)..(i) is matched
+  # as if it is (i+0)..(i+0).
+  if guards.sameTree(a, b):
+    if a.getMagic in someAdd and a[2].kind in {nkCharLit..nkUInt64Lit}:
+      result = a[2].intVal
+    else:
+      result = 0
+  else:
+    result = -1
+
 proc proveLe*(m: TModel; a, b: PNode): TImplication =
   let res = canon(opLe.buildCall(a, b))
+  #echo renderTree(res)
   # we hardcode lots of axioms here:
   let a = res[1]
   let b = res[2]
@@ -662,6 +696,10 @@ proc proveLe*(m: TModel; a, b: PNode): TImplication =
   if b.getMagic in someAdd and sameTree(a, b[1]):
     return proveLe(m, zero(), b[2])
 
+  #   x+c <= x  iff c <= 0
+  if a.getMagic in someAdd and sameTree(b, a[1]):
+    return proveLe(m, a[2], zero())
+
   #   x <= x*c  if  1 <= c and 0 <= x:
   if b.getMagic in someMul and sameTree(a, b[1]):
     if proveLe(m, one(), b[2]) == impYes and proveLe(m, zero(), a) == impYes:
diff --git a/compiler/semparallel.nim b/compiler/semparallel.nim
index dd1584e7d0..7917cab908 100644
--- a/compiler/semparallel.nim
+++ b/compiler/semparallel.nim
@@ -9,6 +9,8 @@
 
 ## Semantic checking for 'parallel'.
 
+# - codegen needs to support mSlice
+# - lowerings must not perform unnecessary copies
 # - slices should become "nocopy" to openArray (+)
 #   - need to perform bound checks (+)
 #
@@ -153,6 +155,8 @@ proc addLowerBoundAsFacts(c: var AnalysisCtx) =
 
 proc addSlice(c: var AnalysisCtx; n: PNode; x, le, ri: PNode) =
   checkLocal(c, n)
+  let le = le.canon
+  let ri = ri.canon
   # perform static bounds checking here; and not later!
   let oldState = c.guards.len
   addLowerBoundAsFacts(c)
@@ -166,16 +170,16 @@ proc overlap(m: TModel; x,y,c,d: PNode) =
   case proveLe(m, x, d)
   of impUnknown:
     localError(x.info,
-      "cannot prove: $# > $#; required for $#..$# disjoint from $#..$#" %
+      "cannot prove: $# > $#; required for ($#)..($#) disjoint from ($#)..($#)" %
         [?x, ?d, ?x, ?y, ?c, ?d])
   of impYes:
     case proveLe(m, c, y)
     of impUnknown:
       localError(x.info,
-        "cannot prove: $# > $#; required for $#..$# disjoint from $#..$#" %
+        "cannot prove: $# > $#; required for ($#)..($#) disjoint from ($#)..($#)" %
           [?y, ?d, ?x, ?y, ?c, ?d])
     of impYes:
-      localError(x.info, "$#..$# not disjoint from $#..$#" % [?x, ?y, ?c, ?d])
+      localError(x.info, "($#)..($#) not disjoint from ($#)..($#)" % [?x, ?y, ?c, ?d])
     of impNo: discard
   of impNo: discard
 
@@ -220,14 +224,25 @@ proc checkSlicesAreDisjoint(c: var AnalysisCtx) =
       let x = c.slices[i]
       let y = c.slices[j]
       if x.spawnId != y.spawnId and guards.sameTree(x.x, y.x):
-        if not x.inLoop and not y.inLoop:
+        if not x.inLoop or not y.inLoop:
+          # XXX strictly speaking, 'or' is not correct here and it needs to
+          # be 'and'. However this prevents too many obviously correct programs
+          # like f(a[0..x]); for i in x+1 .. a.high: f(a[i])
           overlap(c.guards, x.a, x.b, y.a, y.b)
+        elif (let k = simpleSlice(x.a, x.b); let m = simpleSlice(y.a, y.b);
+              k >= 0 and m >= 0):
+          # ah I cannot resist the temptation and add another sweet heuristic:
+          # if both slices have the form (i+k)..(i+k)  and (i+m)..(i+m) we
+          # check they are disjoint and k < stride and m < stride:
+          overlap(c.guards, x.a, x.b, y.a, y.b)
+          let stride = min(c.stride(x.a), c.stride(y.a))
+          if k < stride and m < stride:
+            discard
+          else:
+            localError(x.x.info, "cannot prove ($#)..($#) disjoint from ($#)..($#)" %
+              [?x.a, ?x.b, ?y.a, ?y.b])
         else:
-          # ah I cannot resists the temptation and add another sweet heuristic:
-          # if both slices have the form (i+c)..(i+c)  and (i+d)..(i+d) we
-          # check they are disjoint and c <= stride and d <= stride:
-          # XXX
-          localError(x.x.info, "cannot prove $#..$# disjoint from $#..$#" %
+          localError(x.x.info, "cannot prove ($#)..($#) disjoint from ($#)..($#)" %
             [?x.a, ?x.b, ?y.a, ?y.b])
 
 proc analyse(c: var AnalysisCtx; n: PNode)
@@ -369,9 +384,9 @@ proc transformSlices(n: PNode): PNode =
       result.add n[2][2]
       return result
   if n.safeLen > 0:
-    result = copyNode(n)
+    result = shallowCopy(n)
     for i in 0 .. < n.len:
-      result.add transformSlices(n.sons[i])
+      result.sons[i] = transformSlices(n.sons[i])
   else:
     result = n
 
@@ -383,9 +398,9 @@ proc transformSpawn(owner: PSym; n, barrier: PNode): PNode =
         result = transformSlices(n)
         return wrapProcForSpawn(owner, result[1], barrier)
   elif n.safeLen > 0:
-    result = copyNode(n)
+    result = shallowCopy(n)
     for i in 0 .. < n.len:
-      result.add transformSpawn(owner, n.sons[i], barrier)
+      result.sons[i] = transformSpawn(owner, n.sons[i], barrier)
   else:
     result = n
 
diff --git a/tests/parallel/tdisjoint_slice1.nim b/tests/parallel/tdisjoint_slice1.nim
new file mode 100644
index 0000000000..2ca96d6ae2
--- /dev/null
+++ b/tests/parallel/tdisjoint_slice1.nim
@@ -0,0 +1,21 @@
+
+import threadpool
+
+proc f(a: openArray[int]) =
+  for x in a: echo x
+
+proc f(a: int) = echo a
+
+proc main() =
+  var a: array[0..30, int]
+  parallel:
+    #spawn f(a[0..15])
+    #spawn f(a[16..30])
+    var i = 0
+    while i <= 29:
+      spawn f(a[i])
+      spawn f(a[i+1])
+      inc i, 2
+      # is correct here
+
+main()
diff --git a/tests/parallel/tdisjoint_slice2.nim b/tests/parallel/tdisjoint_slice2.nim
new file mode 100644
index 0000000000..b26559fc21
--- /dev/null
+++ b/tests/parallel/tdisjoint_slice2.nim
@@ -0,0 +1,21 @@
+
+import threadpool
+
+proc f(a: openArray[int]) =
+  for x in a: echo x
+
+proc f(a: int) = echo a
+
+proc main() =
+  var a: array[0..30, int]
+  parallel:
+    spawn f(a[0..15])
+    #spawn f(a[16..30])
+    var i = 16
+    while i <= 29:
+      spawn f(a[i])
+      spawn f(a[i+1])
+      inc i, 2
+      # is correct here
+
+main()
diff --git a/tests/parallel/tinvalid_array_bounds.nim b/tests/parallel/tinvalid_array_bounds.nim
new file mode 100644
index 0000000000..337fae7291
--- /dev/null
+++ b/tests/parallel/tinvalid_array_bounds.nim
@@ -0,0 +1,25 @@
+discard """
+  errormsg: "cannot prove: i + 1 <= 30"
+  line: 21
+"""
+
+import threadpool
+
+proc f(a: openArray[int]) =
+  for x in a: echo x
+
+proc f(a: int) = echo a
+
+proc main() =
+  var a: array[0..30, int]
+  parallel:
+    spawn f(a[0..15])
+    spawn f(a[16..30])
+    var i = 0
+    while i <= 30:
+      spawn f(a[i])
+      spawn f(a[i+1])
+      inc i
+      #inc i  # inc i, 2  would be correct here
+
+main()
diff --git a/tests/parallel/tinvalid_counter_usage.nim b/tests/parallel/tinvalid_counter_usage.nim
new file mode 100644
index 0000000000..c6303c6517
--- /dev/null
+++ b/tests/parallel/tinvalid_counter_usage.nim
@@ -0,0 +1,26 @@
+discard """
+  errormsg: "invalid usage of counter after increment"
+  line: 21
+"""
+
+import threadpool
+
+proc f(a: openArray[int]) =
+  for x in a: echo x
+
+proc f(a: int) = echo a
+
+proc main() =
+  var a: array[0..30, int]
+  parallel:
+    spawn f(a[0..15])
+    spawn f(a[16..30])
+    var i = 0
+    while i <= 30:
+      inc i
+      spawn f(a[i])
+      inc i
+      #spawn f(a[i+1])
+      #inc i  # inc i, 2  would be correct here
+
+main()
diff --git a/tests/parallel/tnon_disjoint_slice1.nim b/tests/parallel/tnon_disjoint_slice1.nim
new file mode 100644
index 0000000000..72d008bbdb
--- /dev/null
+++ b/tests/parallel/tnon_disjoint_slice1.nim
@@ -0,0 +1,25 @@
+discard """
+  errormsg: "cannot prove (i)..(i) disjoint from (i + 1)..(i + 1)"
+  line: 20
+"""
+
+import threadpool
+
+proc f(a: openArray[int]) =
+  for x in a: echo x
+
+proc f(a: int) = echo a
+
+proc main() =
+  var a: array[0..30, int]
+  parallel:
+    #spawn f(a[0..15])
+    #spawn f(a[16..30])
+    var i = 0
+    while i <= 29:
+      spawn f(a[i])
+      spawn f(a[i+1])
+      inc i
+      #inc i  # inc i, 2  would be correct here
+
+main()

From 417b9f5a1d13f26842b1337395a0f5b57827cc12 Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Thu, 22 May 2014 08:41:50 +0200
Subject: [PATCH 04/13] 'parallel' statement almost working

---
 compiler/ccgexprs.nim                    |   2 +-
 compiler/guards.nim                      |  58 +++--
 compiler/lowerings.nim                   | 286 +++++++++++++++++++----
 compiler/semmagic.nim                    |  12 +-
 compiler/semparallel.nim                 |  89 ++++---
 doc/manual.txt                           |   2 +-
 lib/pure/concurrency/threadpool.nim      | 112 +++++++++
 lib/system/atomics.nim                   |   6 +-
 tests/parallel/tdisjoint_slice1.nim      |  16 +-
 tests/parallel/tinvalid_array_bounds.nim |   2 +-
 10 files changed, 470 insertions(+), 115 deletions(-)

diff --git a/compiler/ccgexprs.nim b/compiler/ccgexprs.nim
index 7fb6af8965..34fdf5bf17 100644
--- a/compiler/ccgexprs.nim
+++ b/compiler/ccgexprs.nim
@@ -1636,7 +1636,7 @@ proc genMagicExpr(p: BProc, e: PNode, d: var TLoc, op: TMagic) =
   of mSlurp..mQuoteAst:
     localError(e.info, errXMustBeCompileTime, e.sons[0].sym.name.s)
   of mSpawn:
-    let n = lowerings.wrapProcForSpawn(p.module.module, e.sons[1])
+    let n = lowerings.wrapProcForSpawn(p.module.module, e[1], e.typ, nil, nil)
     expr(p, n, d)
   of mParallel:
     let n = semparallel.liftParallel(p.module.module, e)
diff --git a/compiler/guards.nim b/compiler/guards.nim
index de0ce1dcc9..3df3bd1a81 100644
--- a/compiler/guards.nim
+++ b/compiler/guards.nim
@@ -672,12 +672,8 @@ proc simpleSlice*(a, b: PNode): BiggestInt =
   else:
     result = -1
 
-proc proveLe*(m: TModel; a, b: PNode): TImplication =
-  let res = canon(opLe.buildCall(a, b))
-  #echo renderTree(res)
-  # we hardcode lots of axioms here:
-  let a = res[1]
-  let b = res[2]
+proc ple(m: TModel; a, b: PNode): TImplication =  
+  template `<=?`(a,b): expr = ple(m,a,b) == impYes
   #   0 <= 3
   if a.isValue and b.isValue:
     return if leValue(a, b): impYes else: impNo
@@ -692,26 +688,46 @@ proc proveLe*(m: TModel; a, b: PNode): TImplication =
   # x <= x
   if sameTree(a, b): return impYes
 
-  #   x <= x+c  iff 0 <= c
-  if b.getMagic in someAdd and sameTree(a, b[1]):
-    return proveLe(m, zero(), b[2])
+  # 0 <= x.len
+  if b.getMagic in someLen and a.isValue:
+    if a.intVal <= 0: return impYes
 
-  #   x+c <= x  iff c <= 0
-  if a.getMagic in someAdd and sameTree(b, a[1]):
-    return proveLe(m, a[2], zero())
+  #   x <= y+c  if 0 <= c and x <= y
+  if b.getMagic in someAdd and zero() <=? b[2] and a <=? b[1]: return impYes
 
-  #   x <= x*c  if  1 <= c and 0 <= x:
-  if b.getMagic in someMul and sameTree(a, b[1]):
-    if proveLe(m, one(), b[2]) == impYes and proveLe(m, zero(), a) == impYes:
-      return impYes
+  #   x+c <= y  if c <= 0 and x <= y
+  if a.getMagic in someAdd and a[2] <=? zero() and a[1] <=? b: return impYes
 
-  #   x div c <= x   if   1 <= c  and  0 <= x:
-  if a.getMagic in someDiv and sameTree(a[1], b):
-    if proveLe(m, one(), a[2]) == impYes and proveLe(m, zero(), b) == impYes:
-      return impYes
+  #   x <= y*c  if  1 <= c and x <= y  and 0 <= y
+  if b.getMagic in someMul:
+    if a <=? b[1] and one() <=? b[2] and zero() <=? b[1]: return impYes
+
+  #   x div c <= y   if   1 <= c  and  0 <= y  and x <= y:
+  if a.getMagic in someDiv:
+    if one() <=? a[2] and zero() <=? b and a[1] <=? b: return impYes
+
+  # slightly subtle:
+  # x <= max(y, z)  iff x <= y or x <= z
+  # note that 'x <= max(x, z)' is a special case of the above rule
+  if b.getMagic in someMax:
+    if a <=? b[1] or a <=? b[2]: return impYes
+
+  # min(x, y) <= z  iff x <= z or y <= z
+  if a.getMagic in someMin:
+    if a[1] <=? b or a[2] <=? b: return impYes
 
   # use the knowledge base:
-  return doesImply(m, res)
+  return doesImply(m, opLe.buildCall(a, b))
+
+proc proveLe*(m: TModel; a, b: PNode): TImplication =
+  #echo "ROOT ", renderTree(a), " <=? ", b.rendertree
+  let x = canon(opLe.buildCall(a, b))
+  #echo renderTree(x)
+  result = ple(m, x[1], x[2])
+  if result == impUnknown:
+    # try an alternative:  a <= b  iff  not (b < a)  iff  not (b+1 <= a):
+    let y = canon(opLe.buildCall(opAdd.buildCall(b, one()), a))
+    result = ~ple(m, y[1], y[2])
 
 proc addFactLe*(m: var TModel; a, b: PNode) =
   m.add canon(opLe.buildCall(a, b))
diff --git a/compiler/lowerings.nim b/compiler/lowerings.nim
index 704cfbcdd0..2a1a8e577a 100644
--- a/compiler/lowerings.nim
+++ b/compiler/lowerings.nim
@@ -13,6 +13,8 @@ const
   genPrefix* = ":tmp"         # prefix for generated names
 
 import ast, astalgo, types, idents, magicsys, msgs, options
+from guards import createMagic
+from trees import getMagic
 
 proc newTupleAccess*(tup: PNode, i: int): PNode =
   result = newNodeIT(nkBracketExpr, tup.info, tup.typ.skipTypes(
@@ -80,19 +82,23 @@ proc newDotExpr(obj, b: PSym): PNode =
   addSon(result, newSymNode(field))
   result.typ = field.typ
 
-proc indirectAccess*(a: PNode, b: PSym, info: TLineInfo): PNode = 
+proc indirectAccess*(a: PNode, b: string, info: TLineInfo): PNode = 
   # returns a[].b as a node
   var deref = newNodeI(nkHiddenDeref, info)
-  deref.typ = a.typ.sons[0]
+  deref.typ = a.typ.skipTypes(abstractInst).sons[0]
   assert deref.typ.kind == tyObject
-  let field = getSymFromList(deref.typ.n, getIdent(b.name.s & $b.id))
-  assert field != nil, b.name.s
+  let field = getSymFromList(deref.typ.n, getIdent(b))
+  assert field != nil, b
   addSon(deref, a)
   result = newNodeI(nkDotExpr, info)
   addSon(result, deref)
   addSon(result, newSymNode(field))
   result.typ = field.typ
 
+proc indirectAccess*(a: PNode, b: PSym, info: TLineInfo): PNode = 
+  # returns a[].b as a node
+  result = indirectAccess(a, b.name.s & $b.id, info)
+
 proc indirectAccess*(a, b: PSym, info: TLineInfo): PNode =
   result = indirectAccess(newSymNode(a), b, info)
 
@@ -102,6 +108,11 @@ proc genAddrOf*(n: PNode): PNode =
   result.typ = newType(tyPtr, n.typ.owner)
   result.typ.rawAddSon(n.typ)
 
+proc genDeref*(n: PNode): PNode =
+  result = newNodeIT(nkHiddenDeref, n.info, 
+                     n.typ.skipTypes(abstractInst).sons[0])
+  result.add n
+
 proc callCodegenProc*(name: string, arg1: PNode; 
                       arg2, arg3: PNode = nil): PNode =
   result = newNodeI(nkCall, arg1.info)
@@ -114,14 +125,83 @@ proc callCodegenProc*(name: string, arg1: PNode;
     if arg2 != nil: result.add arg2
     if arg3 != nil: result.add arg3
 
+# we have 4 cases to consider:
+# - a void proc --> nothing to do
+# - a proc returning GC'ed memory --> requires a future
+# - a proc returning non GC'ed memory --> pass as hidden 'var' parameter
+# - not in a parallel environment --> requires a future for memory safety
+type
+  TSpawnResult = enum
+    srVoid, srFuture, srByVar
+  TFutureKind = enum
+    futInvalid # invalid type T for 'Future[T]'
+    futGC      # Future of a GC'ed type
+    futBlob    # Future of a blob type
+
+proc spawnResult(t: PType; inParallel: bool): TSpawnResult =
+  if t.isEmptyType: srVoid
+  elif inParallel and not containsGarbageCollectedRef(t): srByVar
+  else: srFuture
+
+proc futureKind(t: PType): TFutureKind =
+  if t.skipTypes(abstractInst).kind in {tyRef, tyString, tySequence}: futGC
+  elif containsGarbageCollectedRef(t): futInvalid
+  else: futBlob
+
+discard """
+We generate roughly this:
+
+proc f_wrapper(args) =
+  var a = args.a # copy strings/seqs; thread transfer; not generated for
+                 # the 'parallel' statement
+  var b = args.b
+
+  args.fut = createFuture(thread, sizeof(T)) # optional
+  nimArgsPassingDone() # signal parent that argument passing is done
+  args.fut.blob = f(a, b, ...)
+  # - or -
+  f(a, b, ...)
+
+stmtList:
+  var scratchObj
+  scratchObj.a = a
+  scratchObj.b = b
+
+  nimSpawn(f_wrapper, addr scratchObj)
+  scratchObj.fut # optional
+
+"""
+
+proc createNimCreateFutureCall(fut, threadParam: PNode): PNode =
+  let size = newNodeIT(nkCall, fut.info, getSysType(tyInt))
+  size.add newSymNode(createMagic("sizeof", mSizeOf))
+  assert fut.typ.kind == tyGenericInst
+  size.add newNodeIT(nkType, fut.info, fut.typ.sons[1])
+
+  let castExpr = newNodeIT(nkCast, fut.info, fut.typ)
+  castExpr.add emptyNode
+  castExpr.add callCodeGenProc("nimCreateFuture", threadParam, size)
+  result = newFastAsgnStmt(fut, castExpr)
+
 proc createWrapperProc(f: PNode; threadParam, argsParam: PSym;
-                       varSection, call, barrier: PNode): PSym =
+                       varSection, call, barrier, fut: PNode): PSym =
   var body = newNodeI(nkStmtList, f.info)
   body.add varSection
   if barrier != nil:
     body.add callCodeGenProc("barrierEnter", barrier)
-  body.add callCodeGenProc("nimArgsPassingDone", newSymNode(threadParam))
-  body.add call
+  if fut != nil:
+    body.add createNimCreateFutureCall(fut, threadParam.newSymNode)
+    if barrier == nil:
+      body.add callCodeGenProc("nimFutureCreateCondVar", fut)
+
+  body.add callCodeGenProc("nimArgsPassingDone", threadParam.newSymNode)
+  if fut != nil:
+    body.add newAsgnStmt(indirectAccess(fut, 
+        if fut.typ.futureKind==futGC: "data" else: "blob", fut.info), call)
+    if barrier == nil:
+      body.add callCodeGenProc("nimFutureSignal", fut)
+  else:
+    body.add call
   if barrier != nil:
     body.add callCodeGenProc("barrierLeave", barrier)
 
@@ -151,10 +231,148 @@ proc createCastExpr(argsParam: PSym; objType: PType): PNode =
   result.typ = newType(tyPtr, objType.owner)
   result.typ.rawAddSon(objType)
 
-proc wrapProcForSpawn*(owner: PSym; n: PNode; barrier: PNode = nil): PNode =
-  result = newNodeI(nkStmtList, n.info)
-  if n.kind notin nkCallKinds or not n.typ.isEmptyType:
-    localError(n.info, "'spawn' takes a call expression of type void")
+proc setupArgsForConcurrency(n: PNode; objType: PType; scratchObj: PSym, 
+                             castExpr, call, varSection, result: PNode) =
+  let formals = n[0].typ.n
+  let tmpName = getIdent(genPrefix)
+  for i in 1 .. <n.len:
+    # we pick n's type here, which hopefully is 'tyArray' and not
+    # 'tyOpenArray':
+    var argType = n[i].typ.skipTypes(abstractInst)
+    if i < formals.len and formals[i].typ.kind == tyVar:
+      localError(n[i].info, "'spawn'ed function cannot have a 'var' parameter")
+    elif containsTyRef(argType):
+      localError(n[i].info, "'spawn'ed function cannot refer to 'ref'/closure")
+
+    let fieldname = if i < formals.len: formals[i].sym.name else: tmpName
+    var field = newSym(skField, fieldname, objType.owner, n.info)
+    field.typ = argType
+    objType.addField(field)
+    result.add newFastAsgnStmt(newDotExpr(scratchObj, field), n[i])
+
+    var temp = newSym(skTemp, tmpName, objType.owner, n.info)
+    temp.typ = argType
+    incl(temp.flags, sfFromGeneric)
+
+    var vpart = newNodeI(nkIdentDefs, n.info, 3)
+    vpart.sons[0] = newSymNode(temp)
+    vpart.sons[1] = ast.emptyNode
+    vpart.sons[2] = indirectAccess(castExpr, field, n.info)
+    varSection.add vpart
+    
+    call.add(newSymNode(temp))
+
+proc getRoot*(n: PNode): PSym =
+  ## ``getRoot`` takes a *path* ``n``. A path is an lvalue expression
+  ## like ``obj.x[i].y``. The *root* of a path is the symbol that can be
+  ## determined as the owner; ``obj`` in the example.
+  case n.kind
+  of nkSym:
+    if n.sym.kind in {skVar, skResult, skTemp, skLet, skForVar}:
+      result = n.sym
+  of nkDotExpr, nkBracketExpr, nkHiddenDeref, nkDerefExpr,
+      nkObjUpConv, nkObjDownConv, nkCheckedFieldExpr:
+    result = getRoot(n.sons[0])
+  of nkHiddenStdConv, nkHiddenSubConv, nkConv:
+    result = getRoot(n.sons[1])
+  of nkCallKinds:
+    if getMagic(n) == mSlice: result = getRoot(n.sons[1])
+  else: discard
+
+proc newIntLit(value: BiggestInt): PNode =
+  result = nkIntLit.newIntNode(value)
+  result.typ = getSysType(tyInt)
+
+proc genHigh(n: PNode): PNode =
+  if skipTypes(n.typ, abstractVar).kind in {tyArrayConstr, tyArray}:
+    result = newIntLit(lastOrd(skipTypes(n.typ, abstractVar)))
+  else:
+    result = newNodeI(nkCall, n.info, 2)
+    result.typ = getSysType(tyInt)
+    result.sons[0] = newSymNode(createMagic("high", mHigh))
+    result.sons[1] = n
+
+proc setupArgsForParallelism(n: PNode; objType: PType; scratchObj: PSym;
+                             castExpr, call, result: PNode) =
+  let formals = n[0].typ.n
+  let tmpName = getIdent(genPrefix)
+  for i in 1 .. <n.len:
+    let n = n[i]
+    let argType = skipTypes(if i < formals.len: formals[i].typ else: n.typ,
+                            abstractInst)
+    if containsTyRef(argType):
+      localError(n.info, "'spawn'ed function cannot refer to 'ref'/closure")
+
+    let fieldname = if i < formals.len: formals[i].sym.name else: tmpName
+    var field = newSym(skField, fieldname, objType.owner, n.info)
+
+    if argType.kind in {tyVarargs, tyOpenArray}:
+      # important special case: we always create a zero-copy slice:
+      let slice = newNodeI(nkCall, n.info, 4)
+      slice.typ = n.typ
+      slice.sons[0] = newSymNode(createMagic("slice", mSlice))
+      var fieldB = newSym(skField, tmpName, objType.owner, n.info)
+      fieldB.typ = getSysType(tyInt)
+      objType.addField(fieldB)
+      
+      if getMagic(n) == mSlice:
+        let a = genAddrOf(n[1]) # n[1] is the sliced container; n[0] is the callee
+        field.typ = a.typ
+        objType.addField(field)
+        result.add newFastAsgnStmt(newDotExpr(scratchObj, field), a)
+
+        var fieldA = newSym(skField, tmpName, objType.owner, n.info)
+        fieldA.typ = getSysType(tyInt)
+        objType.addField(fieldA)
+        result.add newFastAsgnStmt(newDotExpr(scratchObj, fieldA), n[2])
+        result.add newFastAsgnStmt(newDotExpr(scratchObj, fieldB), n[3])
+
+        slice.sons[2] = indirectAccess(castExpr, fieldA, n.info)
+      else:
+        let a = genAddrOf(n)
+        field.typ = a.typ
+        objType.addField(field)
+        result.add newFastAsgnStmt(newDotExpr(scratchObj, field), a)
+        result.add newFastAsgnStmt(newDotExpr(scratchObj, fieldB), genHigh(n))
+
+        slice.sons[2] = newIntLit(0)
+        
+      slice.sons[1] = genDeref(indirectAccess(castExpr, field, n.info))
+      slice.sons[3] = indirectAccess(castExpr, fieldB, n.info)
+      call.add slice
+    elif (let size = computeSize(argType); size < 0 or size > 16) and
+        n.getRoot != nil:
+      # it is more efficient to pass a pointer instead:
+      let a = genAddrOf(n)
+      field.typ = a.typ
+      objType.addField(field)
+      result.add newFastAsgnStmt(newDotExpr(scratchObj, field), a)
+      call.add(genDeref(indirectAccess(castExpr, field, n.info)))
+    else:
+      # boring case
+      field.typ = argType
+      objType.addField(field)
+      result.add newFastAsgnStmt(newDotExpr(scratchObj, field), n)
+      call.add(indirectAccess(castExpr, field, n.info))
+
+proc wrapProcForSpawn*(owner: PSym; n: PNode; retType: PType; 
+                       barrier, dest: PNode = nil): PNode =
+  # if 'barrier' != nil, then it is in a 'parallel' section and we
+  # generate quite different code
+  let spawnKind = spawnResult(retType, barrier!=nil)
+  case spawnKind
+  of srVoid:
+    internalAssert dest == nil
+    result = newNodeI(nkStmtList, n.info)
+  of srFuture:
+    internalAssert dest == nil
+    result = newNodeIT(nkStmtListExpr, n.info, retType)
+  of srByVar:
+    if dest == nil: localError(n.info, "'spawn' must not be discarded")
+    result = newNodeI(nkStmtList, n.info)
+  
+  if n.kind notin nkCallKinds:
+    localError(n.info, "'spawn' takes a call expression")
     return
   if optThreadAnalysis in gGlobalOptions:
     if {tfThread, tfNoSideEffect} * n[0].typ.flags == {}:
@@ -180,7 +398,7 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode; barrier: PNode = nil): PNode =
     varSectionB.addVar(scratchObj.newSymNode)
     result.add varSectionB
 
-  var call = newNodeI(nkCall, n.info)
+  var call = newNodeIT(nkCall, n.info, n.typ)
   var fn = n.sons[0]
   # templates and macros are in fact valid here due to the nature of
   # the transformation:
@@ -200,34 +418,10 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode; barrier: PNode = nil): PNode =
 
   call.add(fn)
   var varSection = newNodeI(nkVarSection, n.info)
-  let formals = n[0].typ.n
-  let tmpName = getIdent(genPrefix)
-  for i in 1 .. <n.len:
-    # we pick n's type here, which hopefully is 'tyArray' and not
-    # 'tyOpenArray':
-    var argType = n[i].typ.skipTypes(abstractInst)
-    if i < formals.len and formals[i].typ.kind == tyVar:
-      localError(n[i].info, "'spawn'ed function cannot have a 'var' parameter")
-    elif containsTyRef(argType):
-      localError(n[i].info, "'spawn'ed function cannot refer to 'ref'/closure")
-
-    let fieldname = if i < formals.len: formals[i].sym.name else: tmpName
-    var field = newSym(skField, fieldname, owner, n.info)
-    field.typ = argType
-    objType.addField(field)
-    result.add newFastAsgnStmt(newDotExpr(scratchObj, field), n[i])
-
-    var temp = newSym(skTemp, tmpName, owner, n.info)
-    temp.typ = argType
-    incl(temp.flags, sfFromGeneric)
-
-    var vpart = newNodeI(nkIdentDefs, n.info, 3)
-    vpart.sons[0] = newSymNode(temp)
-    vpart.sons[1] = ast.emptyNode
-    vpart.sons[2] = indirectAccess(castExpr, field, n.info)
-    varSection.add vpart
-
-    call.add(newSymNode(temp))
+  if barrier.isNil:
+    setupArgsForConcurrency(n, objType, scratchObj, castExpr, call, varSection, result)
+  else: 
+    setupArgsForParallelism(n, objType, scratchObj, castExpr, call, result)
 
   var barrierAsExpr: PNode = nil
   if barrier != nil:
@@ -239,7 +433,17 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode; barrier: PNode = nil): PNode =
     result.add newFastAsgnStmt(newDotExpr(scratchObj, field), barrier)
     barrierAsExpr = indirectAccess(castExpr, field, n.info)
 
+  var futField, futAsExpr: PNode = nil
+  if spawnKind == srFuture:
+    var field = newSym(skField, getIdent"fut", owner, n.info)
+    field.typ = retType
+    objType.addField(field)
+    futField = newDotExpr(scratchObj, field)
+    futAsExpr = indirectAccess(castExpr, field, n.info)
+
   let wrapper = createWrapperProc(fn, threadParam, argsParam, varSection, call,
-                                  barrierAsExpr)
+                                  barrierAsExpr, futAsExpr)
   result.add callCodeGenProc("nimSpawn", wrapper.newSymNode,
                              genAddrOf(scratchObj.newSymNode))
+
+  if spawnKind == srFuture: result.add futField
diff --git a/compiler/semmagic.nim b/compiler/semmagic.nim
index 80e70b8c0a..3a6bfcf676 100644
--- a/compiler/semmagic.nim
+++ b/compiler/semmagic.nim
@@ -115,6 +115,12 @@ proc semLocals(c: PContext, n: PNode): PNode =
         if it.typ.skipTypes({tyGenericInst}).kind == tyVar: a = newDeref(a)
         result.add(a)
 
+proc createFuture(c: PContext; t: PType; info: TLineInfo): PType =
+  result = newType(tyGenericInvokation, c.module)
+  addSonSkipIntLit(result, magicsys.getCompilerProc("Future").typ)
+  addSonSkipIntLit(result, t)
+  result = instGenericContainer(c, info, result, allowMetaTypes = false)
+
 proc semShallowCopy(c: PContext, n: PNode, flags: TExprFlags): PNode
 proc magicsAfterOverloadResolution(c: PContext, n: PNode, 
                                    flags: TExprFlags): PNode =
@@ -130,5 +136,9 @@ proc magicsAfterOverloadResolution(c: PContext, n: PNode,
   of mShallowCopy: result = semShallowCopy(c, n, flags)
   of mNBindSym: result = semBindSym(c, n)
   of mLocals: result = semLocals(c, n)
+  of mSpawn:
+    result = n
+    # later passes may transform the type 'Future[T]' back into 'T'
+    if not n[1].typ.isEmptyType:
+      result.typ = createFuture(c, n[1].typ, n.info)
   else: result = n
-
diff --git a/compiler/semparallel.nim b/compiler/semparallel.nim
index 7917cab908..b135420382 100644
--- a/compiler/semparallel.nim
+++ b/compiler/semparallel.nim
@@ -9,8 +9,8 @@
 
 ## Semantic checking for 'parallel'.
 
-# - codegen needs to support mSlice
-# - lowerings must not perform unnecessary copies
+# - codegen needs to support mSlice (+)
+# - lowerings must not perform unnecessary copies (+)
 # - slices should become "nocopy" to openArray (+)
 #   - need to perform bound checks (+)
 #
@@ -19,7 +19,7 @@
 #   - what about 'f(a)'? --> f shouldn't have side effects anyway
 # - passed arrays need to be ensured not to alias
 # - passed slices need to be ensured to be disjoint (+)
-# - output slices need special logic
+# - output slices need special logic (+)
 
 import
   ast, astalgo, idents, lowerings, magicsys, guards, sempass2, msgs,
@@ -94,23 +94,6 @@ proc getSlot(c: var AnalysisCtx; v: PSym): ptr MonotonicVar =
   c.locals[L].v = v
   return addr(c.locals[L])
 
-proc getRoot(n: PNode): PSym =
-  ## ``getRoot`` takes a *path* ``n``. A path is an lvalue expression
-  ## like ``obj.x[i].y``. The *root* of a path is the symbol that can be
-  ## determined as the owner; ``obj`` in the example.
-  case n.kind
-  of nkSym:
-    if n.sym.kind in {skVar, skResult, skTemp, skLet, skForVar}:
-      result = n.sym
-  of nkDotExpr, nkBracketExpr, nkHiddenDeref, nkDerefExpr,
-      nkObjUpConv, nkObjDownConv, nkCheckedFieldExpr:
-    result = getRoot(n.sons[0])
-  of nkHiddenStdConv, nkHiddenSubConv, nkConv:
-    result = getRoot(n.sons[1])
-  of nkCallKinds:
-    if getMagic(n) == mSlice: result = getRoot(n.sons[1])
-  else: discard
-
 proc gatherArgs(c: var AnalysisCtx; n: PNode) =
   for i in 0.. <n.safeLen:
     let root = getRoot n[i]
@@ -184,8 +167,6 @@ proc overlap(m: TModel; x,y,c,d: PNode) =
   of impNo: discard
 
 proc stride(c: AnalysisCtx; n: PNode): BiggestInt =
-  # note: 0 if it cannot be determined is just right because then
-  # we analyse 'i..i' and 'i+0 .. i+0' and these are not disjoint!
   if isLocal(n):
     let s = c.lookupSlot(n.sym)
     if s >= 0 and c.locals[s].stride != nil:
@@ -193,6 +174,20 @@ proc stride(c: AnalysisCtx; n: PNode): BiggestInt =
   else:
     for i in 0 .. <n.safeLen: result += stride(c, n.sons[i])
 
+proc subStride(c: AnalysisCtx; n: PNode): PNode =
+  # substitute with stride:
+  if isLocal(n):
+    let s = c.lookupSlot(n.sym)
+    if s >= 0 and c.locals[s].stride != nil:
+      result = n +@ c.locals[s].stride.intVal
+    else:
+      result = n
+  elif n.safeLen > 0:
+    result = shallowCopy(n)
+    for i in 0 .. <n.len: result.sons[i] = subStride(c, n.sons[i])
+  else:
+    result = n
+
 proc checkSlicesAreDisjoint(c: var AnalysisCtx) =
   # this is the only thing that we need to perform after we have traversed
   # the whole tree so that the strides are available.
@@ -200,7 +195,7 @@ proc checkSlicesAreDisjoint(c: var AnalysisCtx) =
   addLowerBoundAsFacts(c)
   # Every slice used in a loop needs to be disjoint with itself:
   for x,a,b,id,inLoop in items(c.slices):
-    if inLoop: overlap(c.guards, a,b, a+@c.stride(a), b+@c.stride(b))
+    if inLoop: overlap(c.guards, a,b, c.subStride(a), c.subStride(b))
   # Another tricky example is:
   #   while true:
   #     spawn f(a[i])
@@ -283,23 +278,19 @@ proc analyseCall(c: var AnalysisCtx; n: PNode; op: PSym) =
 
 proc analyseCase(c: var AnalysisCtx; n: PNode) =
   analyse(c, n.sons[0])
-  #let oldState = c.locals.len
   let oldFacts = c.guards.len
   for i in 1.. <n.len:
     let branch = n.sons[i]
-    #setLen(c.locals, oldState)
     setLen(c.guards, oldFacts)
     addCaseBranchFacts(c.guards, n, i)
     for i in 0 .. <branch.len:
       analyse(c, branch.sons[i])
-  #setLen(c.locals, oldState)
   setLen(c.guards, oldFacts)
 
 proc analyseIf(c: var AnalysisCtx; n: PNode) =
   analyse(c, n.sons[0].sons[0])
   let oldFacts = c.guards.len
   addFact(c.guards, n.sons[0].sons[0])
-  #let oldState = c.locals.len
 
   analyse(c, n.sons[0].sons[1])
   for i in 1.. <n.len:
@@ -309,10 +300,8 @@ proc analyseIf(c: var AnalysisCtx; n: PNode) =
       addFactNeg(c.guards, n.sons[j].sons[0])
     if branch.len > 1:
       addFact(c.guards, branch.sons[0])
-    #setLen(c.locals, oldState)
     for i in 0 .. <branch.len:
       analyse(c, branch.sons[i])
-  #setLen(c.locals, oldState)
   setLen(c.guards, oldFacts)
 
 proc analyse(c: var AnalysisCtx; n: PNode) =
@@ -390,17 +379,40 @@ proc transformSlices(n: PNode): PNode =
   else:
     result = n
 
+proc transformSpawn(owner: PSym; n, barrier: PNode): PNode
+proc transformSpawnSons(owner: PSym; n, barrier: PNode): PNode =
+  result = shallowCopy(n)
+  for i in 0 .. < n.len:
+    result.sons[i] = transformSpawn(owner, n.sons[i], barrier)
+
 proc transformSpawn(owner: PSym; n, barrier: PNode): PNode =
-  if n.kind in nkCallKinds:
-    if n[0].kind == nkSym:
-      let op = n[0].sym
-      if op.magic == mSpawn:
-        result = transformSlices(n)
-        return wrapProcForSpawn(owner, result[1], barrier)
+  case n.kind
+  of nkVarSection:
+    result = nil
+    for it in n:
+      let b = it.lastSon
+      if getMagic(b) == mSpawn:
+        if it.len != 3: localError(it.info, "invalid context for 'spawn'")
+        let m = transformSlices(b)
+        if result.isNil:
+          result = newNodeI(nkStmtList, n.info)
+          result.add n
+        result.add wrapProcForSpawn(owner, m[1], b.typ, barrier, it[0])
+        it.sons[it.len-1] = emptyNode
+    if result.isNil: result = n
+  of nkAsgn, nkFastAsgn:
+    let b = n[1]
+    if getMagic(b) == mSpawn:
+      let m = transformSlices(b)
+      return wrapProcForSpawn(owner, m[1], b.typ, barrier, n[0])
+    result = transformSpawnSons(owner, n, barrier)
+  of nkCallKinds:
+    if getMagic(n) == mSpawn:
+      result = transformSlices(n)
+      return wrapProcForSpawn(owner, result[1], n.typ, barrier, nil)
+    result = transformSpawnSons(owner, n, barrier)
   elif n.safeLen > 0:
-    result = shallowCopy(n)
-    for i in 0 .. < n.len:
-      result.sons[i] = transformSpawn(owner, n.sons[i], barrier)
+    result = transformSpawnSons(owner, n, barrier)
   else:
     result = n
 
@@ -440,3 +452,4 @@ proc liftParallel*(owner: PSym; n: PNode): PNode =
   result.add callCodeGenProc("openBarrier", barrier)
   result.add transformSpawn(owner, body, barrier)
   result.add callCodeGenProc("closeBarrier", barrier)
+
diff --git a/doc/manual.txt b/doc/manual.txt
index 39e2bad2aa..b2e0089693 100644
--- a/doc/manual.txt
+++ b/doc/manual.txt
@@ -2748,7 +2748,7 @@ The following builtin procs cannot be overloaded for reasons of implementation
 simplicity (they require specialized semantic checking)::
 
   defined, definedInScope, compiles, low, high, sizeOf, 
-  is, of, echo, shallowCopy, getAst
+  is, of, echo, shallowCopy, getAst, spawn
 
 Thus they act more like keywords than like ordinary identifiers; unlike a 
 keyword however, a redefinition may `shadow`:idx: the definition in 
diff --git a/lib/pure/concurrency/threadpool.nim b/lib/pure/concurrency/threadpool.nim
index 86819d25a7..583c60c66d 100644
--- a/lib/pure/concurrency/threadpool.nim
+++ b/lib/pure/concurrency/threadpool.nim
@@ -65,6 +65,30 @@ proc closeBarrier*(b: ptr Barrier) {.compilerProc.} =
 # ----------------------------------------------------------------------------
 
 type
+  AwaitInfo = object
+    cv: CondVar
+    idx: int
+
+  RawFuture* = ptr RawFutureObj ## untyped base class for 'Future[T]'
+  RawFutureObj {.inheritable.} = object # \
+    # we allocate this with the thread local allocator; this
+    # is possible since we already need to do the GC_unref
+    # on the owning thread
+    ready, usesCondVar: bool
+    cv: CondVar #\
+    # for 'awaitAny' support
+    ai: ptr AwaitInfo
+    idx: int
+    data: PObject  # we incRef and unref it to keep it alive
+    owner: ptr Worker
+    next: RawFuture
+    align: float64 # a float for proper alignment
+
+  Future* {.compilerProc.} [T] = ptr object of RawFutureObj
+    blob: T  ## the underlying value, if available. Note that usually
+             ## you should not access this field directly! However it can
+             ## sometimes be more efficient than getting the value via ``^``.
+
   WorkerProc = proc (thread, args: pointer) {.nimcall, gcsafe.}
   Worker = object
     taskArrived: CondVar
@@ -75,6 +99,92 @@ type
     ready: bool # put it here for correct alignment!
     initialized: bool # whether it has even been initialized
     shutdown: bool # the pool requests to shut down this worker thread
+    futureLock: TLock
+    head: RawFuture
+
+proc finished*(fut: RawFuture) =
+  ## This MUST be called for every created future to free its associated
+  ## resources. Note that the default reading operation ``^`` is destructive
+  ## and calls ``finished``.
+  doAssert fut.ai.isNil, "future is still attached to an 'awaitAny'"
+  assert fut.next == nil
+  let w = fut.owner
+  acquire(w.futureLock)
+  fut.next = w.head
+  w.head = fut
+  release(w.futureLock)
+
+proc cleanFutures(w: ptr Worker) =
+  var it = w.head
+  acquire(w.futureLock)
+  while it != nil:
+    let nxt = it.next
+    if it.usesCondVar: destroyCondVar(it.cv)
+    if it.data != nil: GC_unref(it.data)
+    dealloc(it)
+    it = nxt
+  w.head = nil
+  release(w.futureLock)
+
+proc nimCreateFuture(owner: pointer; blobSize: int): RawFuture {.
+                     compilerProc.} =
+  result = cast[RawFuture](alloc0(RawFutureObj.sizeof + blobSize))
+  result.owner = cast[ptr Worker](owner)
+
+proc nimFutureCreateCondVar(fut: RawFuture) {.compilerProc.} =
+  fut.cv = createCondVar()
+  fut.usesCondVar = true
+
+proc nimFutureSignal(fut: RawFuture) {.compilerProc.} =
+  assert fut.usesCondVar
+  signal(fut.cv)
+
+proc await*[T](fut: Future[T]) =
+  ## waits until the value for the future arrives.
+  if fut.usesCondVar: await(fut.cv)
+
+proc `^`*[T](fut: Future[T]): T =
+  ## blocks until the value is available and then returns this value. Note
+  ## this reading is destructive for reasons of efficiency and convenience.
+  ## This calls ``finished(fut)``.
+  await(fut)
+  when T is string or T is seq or T is ref:
+    result = cast[T](fut.data)
+  else:
+    result = fut.payload
+  finished(fut)
+
+proc notify*(fut: RawFuture) {.compilerproc.} =
+  if fut.ai != nil:
+    acquire(fut.ai.cv.L)
+    fut.ai.idx = fut.idx
+    inc fut.ai.cv.counter
+    release(fut.ai.cv.L)
+    signal(fut.ai.cv.c)
+  if fut.usesCondVar: signal(fut.cv)
+
+proc awaitAny*(futures: openArray[RawFuture]): int =
+  # awaits any of the given futures. Returns the index of one future for which
+  ## a value arrived. A future only supports one call to 'awaitAny' at the
+  ## same time. That means if you await([a,b]) and await([b,c]) the second
+  ## call will only await 'c'. If there is no future left to be able to wait
+  ## on, -1 is returned.
+  var ai: AwaitInfo
+  ai.cv = createCondVar()
+  var conflicts = 0
+  for i in 0 .. futures.high:
+    if cas(addr futures[i].ai, nil, addr ai):
+      futures[i].idx = i
+    else:
+      inc conflicts
+  if conflicts < futures.len:
+    await(ai.cv)
+    result = ai.idx
+    for i in 0 .. futures.high:
+      discard cas(addr futures[i].ai, addr ai, nil)
+  else:
+    result = -1
+  destroyCondVar(ai.cv)
 
 proc nimArgsPassingDone(p: pointer) {.compilerProc.} =
   let w = cast[ptr Worker](p)
@@ -99,6 +209,7 @@ proc slave(w: ptr Worker) {.thread.} =
     await(w.taskArrived)
     assert(not w.ready)
     w.f(w, w.data)
+    if w.head != nil: w.cleanFutures
     if w.shutdown:
       w.shutdown = false
       atomicDec currentPoolSize
@@ -119,6 +230,7 @@ var
 proc activateThread(i: int) {.noinline.} =
   workersData[i].taskArrived = createCondVar()
   workersData[i].taskStarted = createCondVar()
+  initLock workersData[i].futureLock
   workersData[i].initialized = true
   createThread(workers[i], slave, addr(workersData[i]))
 
diff --git a/lib/system/atomics.nim b/lib/system/atomics.nim
index c6c603b19e..96246ba01d 100644
--- a/lib/system/atomics.nim
+++ b/lib/system/atomics.nim
@@ -209,12 +209,12 @@ when defined(windows) and not defined(gcc):
   proc interlockedCompareExchange(p: pointer; exchange, comparand: int32): int32
     {.importc: "InterlockedCompareExchange", header: "<windows.h>", cdecl.}
 
-  proc cas*[T: bool|int](p: ptr T; oldValue, newValue: T): bool =
+  proc cas*[T: bool|int|ptr](p: ptr T; oldValue, newValue: T): bool =
     interlockedCompareExchange(p, newValue.int32, oldValue.int32) != 0
-
+  # XXX fix for 64 bit build
 else:
   # this is valid for GCC and Intel C++
-  proc cas*[T: bool|int](p: ptr T; oldValue, newValue: T): bool
+  proc cas*[T: bool|int|ptr](p: ptr T; oldValue, newValue: T): bool
     {.importc: "__sync_bool_compare_and_swap", nodecl.}
   # XXX is this valid for 'int'?
 
diff --git a/tests/parallel/tdisjoint_slice1.nim b/tests/parallel/tdisjoint_slice1.nim
index 2ca96d6ae2..c1d0e52f8f 100644
--- a/tests/parallel/tdisjoint_slice1.nim
+++ b/tests/parallel/tdisjoint_slice1.nim
@@ -1,20 +1,20 @@
+discard """
+  outputsub: "EVEN 28"
+"""
 
 import threadpool
 
-proc f(a: openArray[int]) =
-  for x in a: echo x
-
-proc f(a: int) = echo a
+proc odd(a: int) =  echo "ODD  ", a
+proc even(a: int) = echo "EVEN ", a
 
 proc main() =
   var a: array[0..30, int]
+  for i in low(a)..high(a): a[i] = i
   parallel:
-    #spawn f(a[0..15])
-    #spawn f(a[16..30])
     var i = 0
     while i <= 29:
-      spawn f(a[i])
-      spawn f(a[i+1])
+      spawn even(a[i])
+      spawn odd(a[i+1])
       inc i, 2
       # is correct here
 
diff --git a/tests/parallel/tinvalid_array_bounds.nim b/tests/parallel/tinvalid_array_bounds.nim
index 337fae7291..4c6065fd6f 100644
--- a/tests/parallel/tinvalid_array_bounds.nim
+++ b/tests/parallel/tinvalid_array_bounds.nim
@@ -1,5 +1,5 @@
 discard """
-  errormsg: "cannot prove: i + 1 <= 30"
+  errormsg: "can prove: i + 1 > 30"
   line: 21
 """
 

From d2dbcf2fa44aa76c6c7ed2c07641560640e6bc6b Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Fri, 23 May 2014 08:57:16 +0200
Subject: [PATCH 05/13] progress with futures

---
 compiler/ast.nim                              |  2 +-
 compiler/lowerings.nim                        | 27 ++++++++++++---
 compiler/semexprs.nim                         | 12 +++++++
 compiler/semmagic.nim                         | 11 -------
 lib/pure/concurrency/threadpool.nim           | 33 +++++++++----------
 tests/parallel/tflowvar.nim                   | 17 ++++++++++
 tests/{system => parallel}/tsysspawn.nim      |  0
 .../{system => parallel}/tsysspawnbadarg.nim  |  2 +-
 8 files changed, 68 insertions(+), 36 deletions(-)
 create mode 100644 tests/parallel/tflowvar.nim
 rename tests/{system => parallel}/tsysspawn.nim (100%)
 rename tests/{system => parallel}/tsysspawnbadarg.nim (64%)

diff --git a/compiler/ast.nim b/compiler/ast.nim
index 58b01d5e8a..c47407ee2b 100644
--- a/compiler/ast.nim
+++ b/compiler/ast.nim
@@ -605,7 +605,7 @@ const
   # thus cannot be overloaded (also documented in the spec!):
   SpecialSemMagics* = {
     mDefined, mDefinedInScope, mCompiles, mLow, mHigh, mSizeOf, mIs, mOf, 
-    mEcho, mShallowCopy, mExpandToAst, mParallel}
+    mEcho, mShallowCopy, mExpandToAst, mParallel, mSpawn}
 
 type
   PNode* = ref TNode
diff --git a/compiler/lowerings.nim b/compiler/lowerings.nim
index 2a1a8e577a..047bdf832c 100644
--- a/compiler/lowerings.nim
+++ b/compiler/lowerings.nim
@@ -86,8 +86,14 @@ proc indirectAccess*(a: PNode, b: string, info: TLineInfo): PNode =
   # returns a[].b as a node
   var deref = newNodeI(nkHiddenDeref, info)
   deref.typ = a.typ.skipTypes(abstractInst).sons[0]
-  assert deref.typ.kind == tyObject
-  let field = getSymFromList(deref.typ.n, getIdent(b))
+  var t = deref.typ
+  var field: PSym
+  while true:
+    assert t.kind == tyObject
+    field = getSymFromList(t.n, getIdent(b))
+    if field != nil: break
+    t = t.sons[0]
+    if t == nil: break
   assert field != nil, b
   addSon(deref, a)
   result = newNodeI(nkDotExpr, info)
@@ -124,6 +130,7 @@ proc callCodegenProc*(name: string, arg1: PNode;
     result.add arg1
     if arg2 != nil: result.add arg2
     if arg3 != nil: result.add arg3
+    result.typ = sym.typ.sons[0]
 
 # we have 4 cases to consider:
 # - a void proc --> nothing to do
@@ -152,15 +159,21 @@ discard """
 We generate roughly this:
 
 proc f_wrapper(args) =
+  barrierEnter(args.barrier)  # for parallel statement
   var a = args.a # copy strings/seqs; thread transfer; not generated for
                  # the 'parallel' statement
   var b = args.b
 
-  args.fut = createFuture(thread, sizeof(T)) # optional
+  args.fut = nimCreateFuture(thread, sizeof(T)) # optional
+  nimFutureCreateCondVar(args.fut)  # optional
   nimArgsPassingDone() # signal parent that the work is done
+  # 
   args.fut.blob = f(a, b, ...)
+  nimFutureSignal(args.fut)
+  
   # - or -
   f(a, b, ...)
+  barrierLeave(args.barrier)  # for parallel statement
 
 stmtList:
   var scratchObj
@@ -196,8 +209,12 @@ proc createWrapperProc(f: PNode; threadParam, argsParam: PSym;
 
   body.add callCodeGenProc("nimArgsPassingDone", threadParam.newSymNode)
   if fut != nil:
-    body.add newAsgnStmt(indirectAccess(fut, 
-        if fut.typ.futureKind==futGC: "data" else: "blob", fut.info), call)
+    let fk = fut.typ.sons[1].futureKind
+    if fk == futInvalid:
+      localError(f.info, "cannot create a future of type: " & 
+        typeToString(fut.typ.sons[1]))
+    body.add newAsgnStmt(indirectAccess(fut,
+      if fk == futGC: "data" else: "blob", fut.info), call)
     if barrier == nil:
       body.add callCodeGenProc("nimFutureSignal", fut)
   else:
diff --git a/compiler/semexprs.nim b/compiler/semexprs.nim
index 505c289ea9..4e3d2f3cef 100644
--- a/compiler/semexprs.nim
+++ b/compiler/semexprs.nim
@@ -1579,6 +1579,12 @@ proc semShallowCopy(c: PContext, n: PNode, flags: TExprFlags): PNode =
   else:
     result = semDirectOp(c, n, flags)
 
+proc createFuture(c: PContext; t: PType; info: TLineInfo): PType =
+  result = newType(tyGenericInvokation, c.module)
+  addSonSkipIntLit(result, magicsys.getCompilerProc("Future").typ)
+  addSonSkipIntLit(result, t)
+  result = instGenericContainer(c, info, result, allowMetaTypes = false)
+
 proc setMs(n: PNode, s: PSym): PNode = 
   result = n
   n.sons[0] = newSymNode(s)
@@ -1610,6 +1616,12 @@ proc semMagic(c: PContext, n: PNode, s: PSym, flags: TExprFlags): PNode =
     var x = n.lastSon
     if x.kind == nkDo: x = x.sons[bodyPos]
     result.sons[1] = semStmt(c, x)
+  of mSpawn:
+    result = setMs(n, s)
+    result.sons[1] = semExpr(c, n.sons[1])
+    # later passes may transform the type 'Future[T]' back into 'T'
+    if not result[1].typ.isEmptyType:
+      result.typ = createFuture(c, result[1].typ, n.info)
   else: result = semDirectOp(c, n, flags)
 
 proc semWhen(c: PContext, n: PNode, semCheck = true): PNode =
diff --git a/compiler/semmagic.nim b/compiler/semmagic.nim
index 3a6bfcf676..f943e70061 100644
--- a/compiler/semmagic.nim
+++ b/compiler/semmagic.nim
@@ -115,12 +115,6 @@ proc semLocals(c: PContext, n: PNode): PNode =
         if it.typ.skipTypes({tyGenericInst}).kind == tyVar: a = newDeref(a)
         result.add(a)
 
-proc createFuture(c: PContext; t: PType; info: TLineInfo): PType =
-  result = newType(tyGenericInvokation, c.module)
-  addSonSkipIntLit(result, magicsys.getCompilerProc("Future").typ)
-  addSonSkipIntLit(result, t)
-  result = instGenericContainer(c, info, result, allowMetaTypes = false)
-
 proc semShallowCopy(c: PContext, n: PNode, flags: TExprFlags): PNode
 proc magicsAfterOverloadResolution(c: PContext, n: PNode, 
                                    flags: TExprFlags): PNode =
@@ -136,9 +130,4 @@ proc magicsAfterOverloadResolution(c: PContext, n: PNode,
   of mShallowCopy: result = semShallowCopy(c, n, flags)
   of mNBindSym: result = semBindSym(c, n)
   of mLocals: result = semLocals(c, n)
-  of mSpawn:
-    result = n
-    # later passes may transform the type 'Future[T]' back into 'T'
-    if not n[1].typ.isEmptyType:
-      result.typ = createFuture(c, n[1].typ, n.info)
   else: result = n
diff --git a/lib/pure/concurrency/threadpool.nim b/lib/pure/concurrency/threadpool.nim
index 583c60c66d..41c1adca03 100644
--- a/lib/pure/concurrency/threadpool.nim
+++ b/lib/pure/concurrency/threadpool.nim
@@ -57,7 +57,7 @@ proc openBarrier*(b: ptr Barrier) {.compilerProc.} =
   b.cv = createCondVar()
 
 proc closeBarrier*(b: ptr Barrier) {.compilerProc.} =
-  await(b.cv)
+  while b.counter > 0: await(b.cv)
   destroyCondVar(b.cv)
 
 {.pop.}
@@ -136,8 +136,13 @@ proc nimFutureCreateCondVar(fut: RawFuture) {.compilerProc.} =
   fut.usesCondVar = true
 
 proc nimFutureSignal(fut: RawFuture) {.compilerProc.} =
-  assert fut.usesCondVar
-  signal(fut.cv)
+  if fut.ai != nil:
+    acquire(fut.ai.cv.L)
+    fut.ai.idx = fut.idx
+    inc fut.ai.cv.counter
+    release(fut.ai.cv.L)
+    signal(fut.ai.cv.c)
+  if fut.usesCondVar: signal(fut.cv)
 
 proc await*[T](fut: Future[T]) =
   ## waits until the value for the future arrives.
@@ -147,28 +152,21 @@ proc `^`*[T](fut: Future[T]): T =
   ## blocks until the value is available and then returns this value. Note
   ## this reading is destructive for reasons of efficiency and convenience.
   ## This calls ``finished(fut)``.
-  await(fut)
+  if fut.usesCondVar: await(fut)
   when T is string or T is seq or T is ref:
     result = cast[T](fut.data)
   else:
-    result = fut.payload
+    result = fut.blob
   finished(fut)
 
-proc notify*(fut: RawFuture) {.compilerproc.} =
-  if fut.ai != nil:
-    acquire(fut.ai.cv.L)
-    fut.ai.idx = fut.idx
-    inc fut.ai.cv.counter
-    release(fut.ai.cv.L)
-    signal(fut.ai.cv.c)
-  if fut.usesCondVar: signal(fut.cv)
-
 proc awaitAny*(futures: openArray[RawFuture]): int =
   # awaits any of the given futures. Returns the index of one future for which
   ## a value arrived. A future only supports one call to 'awaitAny' at the
   ## same time. That means if you await([a,b]) and await([b,c]) the second
   ## call will only await 'c'. If there is no future left to be able to wait
   ## on, -1 is returned.
+  ## **Note**: This results in non-deterministic behaviour and so should be
+  ## avoided.
   var ai: AwaitInfo
   ai.cv = createCondVar()
   var conflicts = 0
@@ -245,19 +243,18 @@ proc preferSpawn*(): bool =
   ## it is not necessary to call this directly; use 'spawnX' instead.
   result = gSomeReady.counter > 0
 
-proc spawn*(call: stmt) {.magic: "Spawn".}
+proc spawn*(call: expr): expr {.magic: "Spawn".}
   ## always spawns a new task, so that the 'call' is never executed on
   ## the calling thread. 'call' has to be proc call 'p(...)' where 'p'
   ## is gcsafe and has 'void' as the return type.
 
-template spawnX*(call: stmt) =
+template spawnX*(call: expr): expr =
   ## spawns a new task if a CPU core is ready, otherwise executes the
   ## call in the calling thread. Usually it is advised to
   ## use 'spawn' in order to not block the producer for an unknown
   ## amount of time. 'call' has to be proc call 'p(...)' where 'p'
   ## is gcsafe and has 'void' as the return type.
-  if preferSpawn(): spawn call
-  else: call
+  (if preferSpawn(): spawn call else: call)
 
 proc parallel*(body: stmt) {.magic: "Parallel".}
   ## a parallel section can be used to execute a block in parallel. ``body``
diff --git a/tests/parallel/tflowvar.nim b/tests/parallel/tflowvar.nim
new file mode 100644
index 0000000000..77fab14b5c
--- /dev/null
+++ b/tests/parallel/tflowvar.nim
@@ -0,0 +1,17 @@
+discard """
+  output: '''foobarfoobarbazbearbazbear'''
+  cmd: "nimrod $target --threads:on $options $file"
+"""
+
+import threadpool
+
+proc computeSomething(a, b: string): string = a & b & a & b
+
+proc main =
+  let fvA = spawn computeSomething("foo", "bar")
+  let fvB = spawn computeSomething("baz", "bear")
+
+  echo(^fvA, ^fvB)
+
+main()
+sync()
diff --git a/tests/system/tsysspawn.nim b/tests/parallel/tsysspawn.nim
similarity index 100%
rename from tests/system/tsysspawn.nim
rename to tests/parallel/tsysspawn.nim
diff --git a/tests/system/tsysspawnbadarg.nim b/tests/parallel/tsysspawnbadarg.nim
similarity index 64%
rename from tests/system/tsysspawnbadarg.nim
rename to tests/parallel/tsysspawnbadarg.nim
index ce3c5611b5..120975ed54 100644
--- a/tests/system/tsysspawnbadarg.nim
+++ b/tests/parallel/tsysspawnbadarg.nim
@@ -1,6 +1,6 @@
 discard """
   line: 7
-  errormsg: "'spawn' takes a call expression of type void"
+  errormsg: "'spawn' takes a call expression"
   cmd: "nimrod $target --threads:on $options $file"
 """
 

From 030eac86c05427792d3c3c00b56fbe764d783a40 Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Sun, 25 May 2014 15:19:46 +0200
Subject: [PATCH 06/13] bugfix: regionized pointers in a generic context;
 renamed 'Future' to 'Promise'

---
 compiler/ast.nim                    |   2 +
 compiler/lowerings.nim              |  88 +++++++++----------
 compiler/semexprs.nim               |   8 +-
 compiler/semtypes.nim               |   8 +-
 lib/pure/concurrency/threadpool.nim | 132 ++++++++++++++++------------
 lib/system.nim                      |   4 +-
 lib/system/assign.nim               |   3 +-
 7 files changed, 137 insertions(+), 108 deletions(-)

diff --git a/compiler/ast.nim b/compiler/ast.nim
index c47407ee2b..c3cb63df46 100644
--- a/compiler/ast.nim
+++ b/compiler/ast.nim
@@ -885,6 +885,8 @@ const
 
   nkCallKinds* = {nkCall, nkInfix, nkPrefix, nkPostfix,
                   nkCommand, nkCallStrLit, nkHiddenCallConv}
+  nkIdentKinds* = {nkIdent, nkSym, nkAccQuoted, nkOpenSymChoice,
+                   nkClosedSymChoice}
 
   nkLiterals* = {nkCharLit..nkTripleStrLit}
   nkLambdaKinds* = {nkLambda, nkDo}
diff --git a/compiler/lowerings.nim b/compiler/lowerings.nim
index 047bdf832c..13d4bf60ec 100644
--- a/compiler/lowerings.nim
+++ b/compiler/lowerings.nim
@@ -134,26 +134,26 @@ proc callCodegenProc*(name: string, arg1: PNode;
 
 # we have 4 cases to consider:
 # - a void proc --> nothing to do
-# - a proc returning GC'ed memory --> requires a future
+# - a proc returning GC'ed memory --> requires a promise
 # - a proc returning non GC'ed memory --> pass as hidden 'var' parameter
-# - not in a parallel environment --> requires a future for memory safety
+# - not in a parallel environment --> requires a promise for memory safety
 type
   TSpawnResult = enum
-    srVoid, srFuture, srByVar
-  TFutureKind = enum
-    futInvalid # invalid type T for 'Future[T]'
-    futGC      # Future of a GC'ed type
-    futBlob    # Future of a blob type
+    srVoid, srPromise, srByVar
+  TPromiseKind = enum
+    promInvalid # invalid type T for 'Promise[T]'
+    promGC      # Promise of a GC'ed type
+    promBlob    # Promise of a blob type
 
 proc spawnResult(t: PType; inParallel: bool): TSpawnResult =
   if t.isEmptyType: srVoid
   elif inParallel and not containsGarbageCollectedRef(t): srByVar
-  else: srFuture
+  else: srPromise
 
-proc futureKind(t: PType): TFutureKind =
-  if t.skipTypes(abstractInst).kind in {tyRef, tyString, tySequence}: futGC
-  elif containsGarbageCollectedRef(t): futInvalid
-  else: futBlob
+proc promiseKind(t: PType): TPromiseKind =
+  if t.skipTypes(abstractInst).kind in {tyRef, tyString, tySequence}: promGC
+  elif containsGarbageCollectedRef(t): promInvalid
+  else: promBlob
 
 discard """
 We generate roughly this:
@@ -164,12 +164,12 @@ proc f_wrapper(args) =
                  # the 'parallel' statement
   var b = args.b
 
-  args.fut = nimCreateFuture(thread, sizeof(T)) # optional
-  nimFutureCreateCondVar(args.fut)  # optional
+  args.prom = nimCreatePromise(thread, sizeof(T)) # optional
+  nimPromiseCreateCondVar(args.prom)  # optional
   nimArgsPassingDone() # signal parent that the work is done
   # 
-  args.fut.blob = f(a, b, ...)
-  nimFutureSignal(args.fut)
+  args.prom.blob = f(a, b, ...)
+  nimPromiseSignal(args.prom)
   
   # - or -
   f(a, b, ...)
@@ -181,42 +181,42 @@ stmtList:
   scratchObj.b = b
 
   nimSpawn(f_wrapper, addr scratchObj)
-  scratchObj.fut # optional
+  scratchObj.prom # optional
 
 """
 
-proc createNimCreateFutureCall(fut, threadParam: PNode): PNode =
-  let size = newNodeIT(nkCall, fut.info, getSysType(tyInt))
+proc createNimCreatePromiseCall(prom, threadParam: PNode): PNode =
+  let size = newNodeIT(nkCall, prom.info, getSysType(tyInt))
   size.add newSymNode(createMagic("sizeof", mSizeOf))
-  assert fut.typ.kind == tyGenericInst
-  size.add newNodeIT(nkType, fut.info, fut.typ.sons[1])
+  assert prom.typ.kind == tyGenericInst
+  size.add newNodeIT(nkType, prom.info, prom.typ.sons[1])
 
-  let castExpr = newNodeIT(nkCast, fut.info, fut.typ)
+  let castExpr = newNodeIT(nkCast, prom.info, prom.typ)
   castExpr.add emptyNode
-  castExpr.add callCodeGenProc("nimCreateFuture", threadParam, size)
-  result = newFastAsgnStmt(fut, castExpr)
+  castExpr.add callCodeGenProc("nimCreatePromise", threadParam, size)
+  result = newFastAsgnStmt(prom, castExpr)
 
 proc createWrapperProc(f: PNode; threadParam, argsParam: PSym;
-                       varSection, call, barrier, fut: PNode): PSym =
+                       varSection, call, barrier, prom: PNode): PSym =
   var body = newNodeI(nkStmtList, f.info)
   body.add varSection
   if barrier != nil:
     body.add callCodeGenProc("barrierEnter", barrier)
-  if fut != nil:
-    body.add createNimCreateFutureCall(fut, threadParam.newSymNode)
+  if prom != nil:
+    body.add createNimCreatePromiseCall(prom, threadParam.newSymNode)
     if barrier == nil:
-      body.add callCodeGenProc("nimFutureCreateCondVar", fut)
+      body.add callCodeGenProc("nimPromiseCreateCondVar", prom)
 
   body.add callCodeGenProc("nimArgsPassingDone", threadParam.newSymNode)
-  if fut != nil:
-    let fk = fut.typ.sons[1].futureKind
-    if fk == futInvalid:
-      localError(f.info, "cannot create a future of type: " & 
-        typeToString(fut.typ.sons[1]))
-    body.add newAsgnStmt(indirectAccess(fut,
-      if fk == futGC: "data" else: "blob", fut.info), call)
+  if prom != nil:
+    let fk = prom.typ.sons[1].promiseKind
+    if fk == promInvalid:
+      localError(f.info, "cannot create a promise of type: " & 
+        typeToString(prom.typ.sons[1]))
+    body.add newAsgnStmt(indirectAccess(prom,
+      if fk == promGC: "data" else: "blob", prom.info), call)
     if barrier == nil:
-      body.add callCodeGenProc("nimFutureSignal", fut)
+      body.add callCodeGenProc("nimPromiseSignal", prom)
   else:
     body.add call
   if barrier != nil:
@@ -381,7 +381,7 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode; retType: PType;
   of srVoid:
     internalAssert dest == nil
     result = newNodeI(nkStmtList, n.info)
-  of srFuture:
+  of srPromise:
     internalAssert dest == nil
     result = newNodeIT(nkStmtListExpr, n.info, retType)
   of srByVar:
@@ -450,17 +450,17 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode; retType: PType;
     result.add newFastAsgnStmt(newDotExpr(scratchObj, field), barrier)
     barrierAsExpr = indirectAccess(castExpr, field, n.info)
 
-  var futField, futAsExpr: PNode = nil
-  if spawnKind == srFuture:
-    var field = newSym(skField, getIdent"fut", owner, n.info)
+  var promField, promAsExpr: PNode = nil
+  if spawnKind == srPromise:
+    var field = newSym(skField, getIdent"prom", owner, n.info)
     field.typ = retType
     objType.addField(field)
-    futField = newDotExpr(scratchObj, field)
-    futAsExpr = indirectAccess(castExpr, field, n.info)
+    promField = newDotExpr(scratchObj, field)
+    promAsExpr = indirectAccess(castExpr, field, n.info)
 
   let wrapper = createWrapperProc(fn, threadParam, argsParam, varSection, call,
-                                  barrierAsExpr, futAsExpr)
+                                  barrierAsExpr, promAsExpr)
   result.add callCodeGenProc("nimSpawn", wrapper.newSymNode,
                              genAddrOf(scratchObj.newSymNode))
 
-  if spawnKind == srFuture: result.add futField
+  if spawnKind == srPromise: result.add promField
diff --git a/compiler/semexprs.nim b/compiler/semexprs.nim
index 4e3d2f3cef..8f4cce547a 100644
--- a/compiler/semexprs.nim
+++ b/compiler/semexprs.nim
@@ -1579,9 +1579,9 @@ proc semShallowCopy(c: PContext, n: PNode, flags: TExprFlags): PNode =
   else:
     result = semDirectOp(c, n, flags)
 
-proc createFuture(c: PContext; t: PType; info: TLineInfo): PType =
+proc createPromise(c: PContext; t: PType; info: TLineInfo): PType =
   result = newType(tyGenericInvokation, c.module)
-  addSonSkipIntLit(result, magicsys.getCompilerProc("Future").typ)
+  addSonSkipIntLit(result, magicsys.getCompilerProc("Promise").typ)
   addSonSkipIntLit(result, t)
   result = instGenericContainer(c, info, result, allowMetaTypes = false)
 
@@ -1619,9 +1619,9 @@ proc semMagic(c: PContext, n: PNode, s: PSym, flags: TExprFlags): PNode =
   of mSpawn:
     result = setMs(n, s)
     result.sons[1] = semExpr(c, n.sons[1])
-    # later passes may transform the type 'Future[T]' back into 'T'
+    # later passes may transform the type 'Promise[T]' back into 'T'
     if not result[1].typ.isEmptyType:
-      result.typ = createFuture(c, result[1].typ, n.info)
+      result.typ = createPromise(c, result[1].typ, n.info)
   else: result = semDirectOp(c, n, flags)
 
 proc semWhen(c: PContext, n: PNode, semCheck = true): PNode =
diff --git a/compiler/semtypes.nim b/compiler/semtypes.nim
index 8fcb6ea997..bb81cbe749 100644
--- a/compiler/semtypes.nim
+++ b/compiler/semtypes.nim
@@ -1084,8 +1084,10 @@ proc semTypeNode(c: PContext, n: PNode, prev: PType): PType =
   of nkCallKinds:
     if isRange(n):
       result = semRangeAux(c, n, prev)
-    elif n[0].kind == nkIdent:
-      let op = n.sons[0].ident
+    elif n[0].kind notin nkIdentKinds:
+      result = semTypeExpr(c, n)
+    else:
+      let op = considerAcc(n.sons[0])
       if op.id in {ord(wAnd), ord(wOr)} or op.s == "|":
         checkSonsLen(n, 3)
         var
@@ -1120,8 +1122,6 @@ proc semTypeNode(c: PContext, n: PNode, prev: PType): PType =
         result = semAnyRef(c, n, tyRef, prev)
       else:
         result = semTypeExpr(c, n)
-    else:
-      result = semTypeExpr(c, n)
   of nkWhenStmt:
     var whenResult = semWhen(c, n, false)
     if whenResult.kind == nkStmtList: whenResult.kind = nkStmtListType
diff --git a/lib/pure/concurrency/threadpool.nim b/lib/pure/concurrency/threadpool.nim
index 41c1adca03..24cb9ccdd2 100644
--- a/lib/pure/concurrency/threadpool.nim
+++ b/lib/pure/concurrency/threadpool.nim
@@ -65,12 +65,14 @@ proc closeBarrier*(b: ptr Barrier) {.compilerProc.} =
 # ----------------------------------------------------------------------------
 
 type
+  foreign* = object ## a region that indicates the pointer comes from a
+                    ## foreign thread heap.
   AwaitInfo = object
     cv: CondVar
     idx: int
 
-  RawFuture* = ptr RawFutureObj ## untyped base class for 'Future[T]'
-  RawFutureObj {.inheritable.} = object # \
+  RawPromise* = ptr RawPromiseObj ## untyped base class for 'Promise[T]'
+  RawPromiseObj {.inheritable.} = object # \
     # we allocate this with the thread local allocator; this
     # is possible since we already need to do the GC_unref
     # on the owning thread
@@ -81,10 +83,10 @@ type
     idx: int
     data: PObject  # we incRef and unref it to keep it alive
     owner: ptr Worker
-    next: RawFuture
+    next: RawPromise
     align: float64 # a float for proper alignment
 
-  Future* {.compilerProc.} [T] = ptr object of RawFutureObj
+  Promise* {.compilerProc.} [T] = ptr object of RawPromiseObj
     blob: T  ## the underlying value, if available. Note that usually
              ## you should not access this field directly! However it can
              ## sometimes be more efficient than getting the value via ``^``.
@@ -99,24 +101,24 @@ type
     ready: bool # put it here for correct alignment!
     initialized: bool # whether it has even been initialized
     shutdown: bool # the pool requests to shut down this worker thread
-    futureLock: TLock
-    head: RawFuture
+    promiseLock: TLock
+    head: RawPromise
 
-proc finished*(fut: RawFuture) =
-  ## This MUST be called for every created future to free its associated
+proc finished*(prom: RawPromise) =
+  ## This MUST be called for every created promise to free its associated
   ## resources. Note that the default reading operation ``^`` is destructive
   ## and calls ``finished``.
-  doAssert fut.ai.isNil, "future is still attached to an 'awaitAny'"
-  assert fut.next == nil
-  let w = fut.owner
-  acquire(w.futureLock)
-  fut.next = w.head
-  w.head = fut
-  release(w.futureLock)
+  doAssert prom.ai.isNil, "promise is still attached to an 'awaitAny'"
+  assert prom.next == nil
+  let w = prom.owner
+  acquire(w.promiseLock)
+  prom.next = w.head
+  w.head = prom
+  release(w.promiseLock)
 
-proc cleanFutures(w: ptr Worker) =
+proc cleanPromises(w: ptr Worker) =
   var it = w.head
-  acquire(w.futureLock)
+  acquire(w.promiseLock)
   while it != nil:
     let nxt = it.next
     if it.usesCondVar: destroyCondVar(it.cv)
@@ -124,62 +126,84 @@ proc cleanFutures(w: ptr Worker) =
     dealloc(it)
     it = nxt
   w.head = nil
-  release(w.futureLock)
+  release(w.promiseLock)
 
-proc nimCreateFuture(owner: pointer; blobSize: int): RawFuture {.
+proc nimCreatePromise(owner: pointer; blobSize: int): RawPromise {.
                      compilerProc.} =
-  result = cast[RawFuture](alloc0(RawFutureObj.sizeof + blobSize))
+  result = cast[RawPromise](alloc0(RawPromiseObj.sizeof + blobSize))
   result.owner = cast[ptr Worker](owner)
 
-proc nimFutureCreateCondVar(fut: RawFuture) {.compilerProc.} =
-  fut.cv = createCondVar()
-  fut.usesCondVar = true
+proc nimPromiseCreateCondVar(prom: RawPromise) {.compilerProc.} =
+  prom.cv = createCondVar()
+  prom.usesCondVar = true
 
-proc nimFutureSignal(fut: RawFuture) {.compilerProc.} =
-  if fut.ai != nil:
-    acquire(fut.ai.cv.L)
-    fut.ai.idx = fut.idx
-    inc fut.ai.cv.counter
-    release(fut.ai.cv.L)
-    signal(fut.ai.cv.c)
-  if fut.usesCondVar: signal(fut.cv)
+proc nimPromiseSignal(prom: RawPromise) {.compilerProc.} =
+  if prom.ai != nil:
+    acquire(prom.ai.cv.L)
+    prom.ai.idx = prom.idx
+    inc prom.ai.cv.counter
+    release(prom.ai.cv.L)
+    signal(prom.ai.cv.c)
+  if prom.usesCondVar: signal(prom.cv)
 
-proc await*[T](fut: Future[T]) =
-  ## waits until the value for the future arrives.
-  if fut.usesCondVar: await(fut.cv)
+proc await*[T](prom: Promise[T]) =
+  ## waits until the value for the promise arrives.
+  if prom.usesCondVar: await(prom.cv)
 
-proc `^`*[T](fut: Future[T]): T =
+proc awaitAndThen*[T](prom: Promise[T]; action: proc (x: T) {.closure.}) =
+  ## blocks until the value is available and then passes this value
+  ## to ``action``. Note that due to Nimrod's parameter passing semantics this
+  ## means that ``T`` doesn't need to be copied and so ``awaitAndThen`` can
+  ## sometimes be more efficient than ``^``.
+  if prom.usesCondVar: await(prom)
+  when T is string or T is seq:
+    action(cast[T](prom.data))
+  elif T is ref:
+    {.error: "'awaitAndThen' not available for Promise[ref]".}
+  else:
+    action(prom.blob)
+  finished(prom)
+
+proc `^`*[T](prom: Promise[ref T]): foreign ptr T =
   ## blocks until the value is available and then returns this value. Note
   ## this reading is destructive for reasons of efficiency and convenience.
-  ## This calls ``finished(fut)``.
-  if fut.usesCondVar: await(fut)
-  when T is string or T is seq or T is ref:
-    result = cast[T](fut.data)
-  else:
-    result = fut.blob
-  finished(fut)
+  ## This calls ``finished(prom)``.
+  if prom.usesCondVar: await(prom)
+  result = cast[foreign ptr T](prom.data)
+  finished(prom)
 
-proc awaitAny*(futures: openArray[RawFuture]): int =
-  # awaits any of the given futures. Returns the index of one future for which
-  ## a value arrived. A future only supports one call to 'awaitAny' at the
+proc `^`*[T](prom: Promise[T]): T =
+  ## blocks until the value is available and then returns this value. Note
+  ## this reading is destructive for reasons of efficiency and convenience.
+  ## This calls ``finished(prom)``.
+  if prom.usesCondVar: await(prom)
+  when T is string or T is seq:
+    result = cast[T](prom.data)
+  else:
+    result = prom.blob
+  finished(prom)
+
+proc awaitAny*(promises: openArray[RawPromise]): int =
+  ## awaits any of the given promises. Returns the index of one promise for which
+  ## a value arrived. A promise only supports one call to 'awaitAny' at the
   ## same time. That means if you await([a,b]) and await([b,c]) the second
-  ## call will only await 'c'. If there is no future left to be able to wait
+  ## call will only await 'c'. If there is no promise left to be able to wait
   ## on, -1 is returned.
   ## **Note**: This results in non-deterministic behaviour and so should be
   ## avoided.
   var ai: AwaitInfo
   ai.cv = createCondVar()
   var conflicts = 0
-  for i in 0 .. futures.high:
-    if cas(addr futures[i].ai, nil, addr ai):
-      futures[i].idx = i
+  for i in 0 .. promises.high:
+    if cas(addr promises[i].ai, nil, addr ai):
+      promises[i].idx = i
     else:
       inc conflicts
-  if conflicts < futures.len:
+  if conflicts < promises.len:
     await(ai.cv)
     result = ai.idx
-    for i in 0 .. futures.high:
-      discard cas(addr futures[i].ai, addr ai, nil)
+    for i in 0 .. promises.high:
+      discard cas(addr promises[i].ai, addr ai, nil)
   else:
     result = -1
   destroyCondVar(ai.cv)
@@ -207,7 +231,7 @@ proc slave(w: ptr Worker) {.thread.} =
     await(w.taskArrived)
     assert(not w.ready)
     w.f(w, w.data)
-    if w.head != nil: w.cleanFutures
+    if w.head != nil: w.cleanPromises
     if w.shutdown:
       w.shutdown = false
       atomicDec currentPoolSize
@@ -228,7 +252,7 @@ var
 proc activateThread(i: int) {.noinline.} =
   workersData[i].taskArrived = createCondVar()
   workersData[i].taskStarted = createCondVar()
-  initLock workersData[i].futureLock
+  initLock workersData[i].promiseLock
   workersData[i].initialized = true
   createThread(workers[i], slave, addr(workersData[i]))
 
diff --git a/lib/system.nim b/lib/system.nim
index fbd905afab..fc6f617a59 100644
--- a/lib/system.nim
+++ b/lib/system.nim
@@ -42,7 +42,6 @@ type
   cstring* {.magic: Cstring.} ## built-in cstring (*compatible string*) type
   pointer* {.magic: Pointer.} ## built-in pointer type, use the ``addr``
                               ## operator to get a pointer to a variable
-
 const
   on* = true    ## alias for ``true``
   off* = false  ## alias for ``false``
@@ -51,6 +50,9 @@ const
 
 type
   Ordinal* {.magic: Ordinal.}[T]
+  `ptr`* {.magic: Pointer.}[T] ## built-in generic untraced pointer type
+  `ref`* {.magic: Pointer.}[T] ## built-in generic traced pointer type
+
   `nil` {.magic: "Nil".}
   expr* {.magic: Expr.} ## meta type to denote an expression (for templates)
   stmt* {.magic: Stmt.} ## meta type to denote a statement (for templates)
diff --git a/lib/system/assign.nim b/lib/system/assign.nim
index 75c7496331..2ae945fb1c 100644
--- a/lib/system/assign.nim
+++ b/lib/system/assign.nim
@@ -179,7 +179,8 @@ when not defined(nimmixin):
     # internal proc used for destroying sequences and arrays
     for i in countup(0, r.len - 1): destroy(r[i])
 else:
-  # XXX Why is this exported and no compilerproc?
+  # XXX Why is this exported and no compilerproc? -> compilerprocs cannot be
+  # generic for now
   proc nimDestroyRange*[T](r: T) =
     # internal proc used for destroying sequences and arrays
     mixin destroy

From f12a0820e0e7e5c32378bb56b8d0d2591fc71ae5 Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Thu, 29 May 2014 13:19:26 +0200
Subject: [PATCH 07/13] added 'sortoutput' option to make output deterministic
 for threading tests

---
 tests/testament/specs.nim  |  4 +++-
 tests/testament/tester.nim | 12 ++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/testament/specs.nim b/tests/testament/specs.nim
index 225ea18910..6e72f4b5e3 100644
--- a/tests/testament/specs.nim
+++ b/tests/testament/specs.nim
@@ -46,7 +46,7 @@ type
     msg*: string
     ccodeCheck*: string
     err*: TResultEnum
-    substr*: bool
+    substr*, sortoutput*: bool
     targets*: set[TTarget]
 
 const
@@ -113,6 +113,8 @@ proc parseSpec*(filename: string): TSpec =
       result.action = actionRun
       result.outp = e.value
       result.substr = true
+    of "sortoutput":
+      result.sortoutput = parseCfgBool(e.value)
     of "exitcode": 
       discard parseInt(e.value, result.exitCode)
     of "msg":
diff --git a/tests/testament/tester.nim b/tests/testament/tester.nim
index 50d0e6eac9..adf9785e0b 100644
--- a/tests/testament/tester.nim
+++ b/tests/testament/tester.nim
@@ -11,7 +11,8 @@
 
 import
   parseutils, strutils, pegs, os, osproc, streams, parsecfg, json,
-  marshal, backend, parseopt, specs, htmlgen, browsers, terminal
+  marshal, backend, parseopt, specs, htmlgen, browsers, terminal, sequtils,
+  algorithm
 
 const
   resultsFile = "testresults.html"
@@ -150,6 +151,11 @@ proc codegenCheck(test: TTest, check: string, given: var TSpec) =
     except EIO:
       given.err = reCodeNotFound
 
+proc makeDeterministic(s: string): string =
+  var x = toSeq(s.lines)
+  sort(x, system.cmp)
+  result = join(x, "\n")
+
 proc testSpec(r: var TResults, test: TTest) =
   # major entry point for a single test
   let tname = test.name.addFileExt(".nim")
@@ -191,7 +197,9 @@ proc testSpec(r: var TResults, test: TTest) =
             r.addResult(test, "exitcode: " & $expected.exitCode,
                               "exitcode: " & $exitCode, reExitCodesDiffer)
           else:
-            if strip(buf.string) != strip(expected.outp):
+            var bufB = strip(buf.string)
+            if expected.sortoutput: bufB = makeDeterministic(bufB)
+            if bufB != strip(expected.outp):
               if not (expected.substr and expected.outp in buf.string):
                 given.err = reOutputsDiffer
             if given.err == reSuccess:

From 6470bd8f87b860c555556a2a965f6c8077e993ad Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Thu, 29 May 2014 13:27:45 +0200
Subject: [PATCH 08/13] 'parallel' proves array bounds

---
 compiler/guards.nim         | 67 ++++++++++++++++++++++++++++++++++---
 compiler/semparallel.nim    | 30 +++++++++++------
 tests/parallel/tforstmt.nim | 24 +++++++++++++
 3 files changed, 107 insertions(+), 14 deletions(-)
 create mode 100644 tests/parallel/tforstmt.nim

diff --git a/compiler/guards.nim b/compiler/guards.nim
index 3df3bd1a81..ec5adb4da4 100644
--- a/compiler/guards.nim
+++ b/compiler/guards.nim
@@ -246,6 +246,7 @@ proc canon*(n: PNode): PNode =
     result.sons[0] = opLen.newSymNode
   else: discard
 
+  result = skipConv(result)
   result = reassociation(result)
   # most important rule: (x-4) < a.len -->  x < a.len+4
   case result.getMagic
@@ -672,7 +673,9 @@ proc simpleSlice*(a, b: PNode): BiggestInt =
   else:
     result = -1
 
-proc ple(m: TModel; a, b: PNode): TImplication =  
+proc pleViaModel(model: TModel; aa, bb: PNode): TImplication
+
+proc ple(m: TModel; a, b: PNode): TImplication =
   template `<=?`(a,b): expr = ple(m,a,b) == impYes
   #   0 <= 3
   if a.isValue and b.isValue:
@@ -717,12 +720,68 @@ proc ple(m: TModel; a, b: PNode): TImplication =
     if a[1] <=? b or a[2] <=? b: return impYes
 
   # use the knowledge base:
-  return doesImply(m, opLe.buildCall(a, b))
+  return pleViaModel(m, a, b)
+  #return doesImply(m, opLe.buildCall(a, b))
+
+type TReplacements = seq[tuple[a,b: PNode]]
+
+proc replaceSubTree(n, x, by: PNode): PNode =
+  if sameTree(n, x):
+    result = by
+  elif hasSubTree(n, x):
+    result = shallowCopy(n)
+    for i in 0 .. safeLen(n)-1:
+      result.sons[i] = replaceSubTree(n.sons[i], x, by)
+  else:
+    result = n
+
+proc applyReplacements(n: PNode; rep: TReplacements): PNode =
+  result = n
+  for x in rep: result = result.replaceSubTree(x.a, x.b)
+
+proc pleViaModelRec(m: var TModel; a, b: PNode): TImplication =
+  # now check for inferrable facts: a <= b and b <= c  implies a <= c
+  for i in 0..m.high:
+    let fact = m[i]
+    if fact != nil and fact.getMagic in someLe:
+      # x <= y implies a <= b  if  a <= x and y <= b
+      let x = fact[1]
+      let y = fact[2]
+      # mark as used:
+      m[i] = nil
+      if ple(m, a, x) == impYes:
+        if ple(m, y, b) == impYes: return impYes
+        #if pleViaModelRec(m, y, b): return impYes
+
+proc pleViaModel(model: TModel; aa, bb: PNode): TImplication =
+  # compute replacements:
+  var replacements: TReplacements = @[]
+  for fact in model:
+    if fact != nil and fact.getMagic in someEq:
+      let a = fact[1]
+      let b = fact[2]
+      if a.kind == nkSym: replacements.add((a,b))
+      else: replacements.add((b,a))
+  var m: TModel
+  var a = aa
+  var b = bb
+  if replacements.len > 0:
+    m = @[]
+    # make the other facts consistent:
+    for fact in model:
+      if fact != nil and fact.getMagic notin someEq:
+        # XXX 'canon' should not be necessary here, but it is
+        m.add applyReplacements(fact, replacements).canon
+    a = applyReplacements(aa, replacements)
+    b = applyReplacements(bb, replacements)
+  else:
+    # we have to make a copy here, because the model will be modified:
+    m = model
+  result = pleViaModelRec(m, a, b)
 
 proc proveLe*(m: TModel; a, b: PNode): TImplication =
-  #echo "ROOT ", renderTree(a), " <=? ", b.rendertree
   let x = canon(opLe.buildCall(a, b))
-  #echo renderTree(res)
+  #echo "ROOT ", renderTree(x[1]), " <=? ", renderTree(x[2])
   result = ple(m, x[1], x[2])
   if result == impUnknown:
     # try an alternative:  a <= b  iff  not (b < a)  iff  not (b+1 <= a):
diff --git a/compiler/semparallel.nim b/compiler/semparallel.nim
index b135420382..678ffd2fbb 100644
--- a/compiler/semparallel.nim
+++ b/compiler/semparallel.nim
@@ -59,7 +59,8 @@ type
   TDirection = enum
     ascending, descending
   MonotonicVar = object
-    v: PSym
+    v, alias: PSym        # to support the ordinary 'countup' iterator
+                          # we need to detect aliases
     lower, upper, stride: PNode
     dir: TDirection
     blacklisted: bool     # blacklisted variables that are not monotonic
@@ -83,7 +84,7 @@ proc initAnalysisCtx(): AnalysisCtx =
 
 proc lookupSlot(c: AnalysisCtx; s: PSym): int =
   for i in 0.. <c.locals.len:
-    if c.locals[i].v == s: return i
+    if c.locals[i].v == s or c.locals[i].alias == s: return i
   return -1
 
 proc getSlot(c: var AnalysisCtx; v: PSym): ptr MonotonicVar =
@@ -104,6 +105,11 @@ proc gatherArgs(c: var AnalysisCtx; n: PNode) =
         c.args.add root
     gatherArgs(c, n[i])
 
+proc isSingleAssignable(n: PNode): bool =
+  n.kind == nkSym and (let s = n.sym;
+    s.kind in {skTemp, skForVar, skLet} and
+          {sfAddrTaken, sfGlobal} * s.flags == {})
+
 proc isLocal(n: PNode): bool =
   n.kind == nkSym and (let s = n.sym;
     s.kind in {skResult, skTemp, skForVar, skVar, skLet} and
@@ -290,16 +296,16 @@ proc analyseCase(c: var AnalysisCtx; n: PNode) =
 proc analyseIf(c: var AnalysisCtx; n: PNode) =
   analyse(c, n.sons[0].sons[0])
   let oldFacts = c.guards.len
-  addFact(c.guards, n.sons[0].sons[0])
+  addFact(c.guards, canon(n.sons[0].sons[0]))
 
   analyse(c, n.sons[0].sons[1])
   for i in 1.. <n.len:
     let branch = n.sons[i]
     setLen(c.guards, oldFacts)
     for j in 0..i-1:
-      addFactNeg(c.guards, n.sons[j].sons[0])
+      addFactNeg(c.guards, canon(n.sons[j].sons[0]))
     if branch.len > 1:
-      addFact(c.guards, branch.sons[0])
+      addFact(c.guards, canon(branch.sons[0]))
     for i in 0 .. <branch.len:
       analyse(c, branch.sons[i])
   setLen(c.guards, oldFacts)
@@ -307,9 +313,12 @@ proc analyseIf(c: var AnalysisCtx; n: PNode) =
 proc analyse(c: var AnalysisCtx; n: PNode) =
   case n.kind
   of nkAsgn, nkFastAsgn:
-    # since we already ensure sfAddrTaken is not in s.flags, we only need to
-    # prevent direct assignments to the monotonic variable:
-    if n[0].isLocal:
+    if n[0].isSingleAssignable and n[1].isLocal:
+      let slot = c.getSlot(n[1].sym)
+      slot.alias = n[0].sym
+    elif n[0].isLocal:
+      # since we already ensure sfAddrTaken is not in s.flags, we only need to
+      # prevent direct assignments to the monotonic variable:
       let slot = c.getSlot(n[0].sym)
       slot.blackListed = true
     invalidateFacts(c.guards, n[0])
@@ -348,13 +357,13 @@ proc analyse(c: var AnalysisCtx; n: PNode) =
       # loop may never execute:
       let oldState = c.locals.len
       let oldFacts = c.guards.len
-      addFact(c.guards, n.sons[0])
+      addFact(c.guards, canon(n.sons[0]))
       analyse(c, n.sons[1])
       setLen(c.locals, oldState)
       setLen(c.guards, oldFacts)
       # we know after the loop the negation holds:
       if not hasSubnodeWith(n.sons[1], nkBreakStmt):
-        addFactNeg(c.guards, n.sons[0])
+        addFactNeg(c.guards, canon(n.sons[0]))
     dec c.inLoop
   of nkTypeSection, nkProcDef, nkConverterDef, nkMethodDef, nkIteratorDef,
       nkMacroDef, nkTemplateDef, nkConstSection, nkPragma:
@@ -429,6 +438,7 @@ proc liftParallel*(owner: PSym; n: PNode): PNode =
   # - detect monotonic local integer variables
   # - detect used slices
   # - detect used arguments
+  #echo "PAR ", renderTree(n)
   
   var a = initAnalysisCtx()
   let body = n.lastSon
diff --git a/tests/parallel/tforstmt.nim b/tests/parallel/tforstmt.nim
new file mode 100644
index 0000000000..35f28759e4
--- /dev/null
+++ b/tests/parallel/tforstmt.nim
@@ -0,0 +1,24 @@
+discard """
+  output: '''3
+4
+5
+6
+7'''
+  sortoutput: true
+"""
+
+import threadpool, math
+
+proc p(x: int) =
+  echo x
+
+proc testFor(a, b: int; foo: var openArray[int]) =
+  parallel:
+    for i in max(a, 0) .. min(b, foo.len-1):
+      spawn p(foo[i])
+
+var arr = [0, 1, 2, 3, 4, 5, 6, 7]
+
+testFor(3, 10, arr)
+
+

From ea16aca09ec47e3d4393437dea4f398922acaba0 Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Fri, 30 May 2014 13:15:54 +0200
Subject: [PATCH 09/13] correct code generation for tforstmt

---
 compiler/lowerings.nim      | 54 +++++++++++++++++++++++--------------
 tests/parallel/tforstmt.nim |  5 ++--
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/compiler/lowerings.nim b/compiler/lowerings.nim
index 13d4bf60ec..5636d423f2 100644
--- a/compiler/lowerings.nim
+++ b/compiler/lowerings.nim
@@ -160,8 +160,8 @@ We generate roughly this:
 
 proc f_wrapper(args) =
   barrierEnter(args.barrier)  # for parallel statement
-  var a = args.a # copy strings/seqs; thread transfer; not generated for
-                 # the 'parallel' statement
+  var a = args.a # thread transfer; deepCopy or shallowCopy or no copy
+                 # depending on whether we're in a 'parallel' statement
   var b = args.b
 
   args.prom = nimCreatePromise(thread, sizeof(T)) # optional
@@ -199,9 +199,9 @@ proc createNimCreatePromiseCall(prom, threadParam: PNode): PNode =
 proc createWrapperProc(f: PNode; threadParam, argsParam: PSym;
                        varSection, call, barrier, prom: PNode): PSym =
   var body = newNodeI(nkStmtList, f.info)
-  body.add varSection
   if barrier != nil:
     body.add callCodeGenProc("barrierEnter", barrier)
+  body.add varSection
   if prom != nil:
     body.add createNimCreatePromiseCall(prom, threadParam.newSymNode)
     if barrier == nil:
@@ -248,6 +248,17 @@ proc createCastExpr(argsParam: PSym; objType: PType): PNode =
   result.typ = newType(tyPtr, objType.owner)
   result.typ.rawAddSon(objType)
 
+proc addLocalVar(varSection: PNode; owner: PSym; typ: PType; v: PNode): PSym =
+  result = newSym(skTemp, getIdent(genPrefix), owner, varSection.info)
+  result.typ = typ
+  incl(result.flags, sfFromGeneric)
+
+  var vpart = newNodeI(nkIdentDefs, varSection.info, 3)
+  vpart.sons[0] = newSymNode(result)
+  vpart.sons[1] = ast.emptyNode
+  vpart.sons[2] = v
+  varSection.add vpart
+
 proc setupArgsForConcurrency(n: PNode; objType: PType; scratchObj: PSym, 
                              castExpr, call, varSection, result: PNode) =
   let formals = n[0].typ.n
@@ -267,16 +278,8 @@ proc setupArgsForConcurrency(n: PNode; objType: PType; scratchObj: PSym,
     objType.addField(field)
     result.add newFastAsgnStmt(newDotExpr(scratchObj, field), n[i])
 
-    var temp = newSym(skTemp, tmpName, objType.owner, n.info)
-    temp.typ = argType
-    incl(temp.flags, sfFromGeneric)
-
-    var vpart = newNodeI(nkIdentDefs, n.info, 3)
-    vpart.sons[0] = newSymNode(temp)
-    vpart.sons[1] = ast.emptyNode
-    vpart.sons[2] = indirectAccess(castExpr, field, n.info)
-    varSection.add vpart
-    
+    let temp = addLocalVar(varSection, objType.owner, argType,
+                           indirectAccess(castExpr, field, n.info))    
     call.add(newSymNode(temp))
 
 proc getRoot*(n: PNode): PSym =
@@ -310,9 +313,11 @@ proc genHigh(n: PNode): PNode =
     result.sons[1] = n
 
 proc setupArgsForParallelism(n: PNode; objType: PType; scratchObj: PSym;
-                             castExpr, call, result: PNode) =
+                             castExpr, call, varSection, result: PNode) =
   let formals = n[0].typ.n
   let tmpName = getIdent(genPrefix)
+  # we need to copy the foreign scratch object fields into local variables
+  # for correctness: These are called 'threadLocal' here.
   for i in 1 .. <n.len:
     let n = n[i]
     let argType = skipTypes(if i < formals.len: formals[i].typ else: n.typ,
@@ -344,7 +349,9 @@ proc setupArgsForParallelism(n: PNode; objType: PType; scratchObj: PSym;
         result.add newFastAsgnStmt(newDotExpr(scratchObj, fieldA), n[2])
         result.add newFastAsgnStmt(newDotExpr(scratchObj, fieldB), n[3])
 
-        slice.sons[2] = indirectAccess(castExpr, fieldA, n.info)
+        let threadLocal = addLocalVar(varSection, objType.owner, fieldA.typ,
+                                      indirectAccess(castExpr, fieldA, n.info))
+        slice.sons[2] = threadLocal.newSymNode
       else:
         let a = genAddrOf(n)
         field.typ = a.typ
@@ -353,9 +360,12 @@ proc setupArgsForParallelism(n: PNode; objType: PType; scratchObj: PSym;
         result.add newFastAsgnStmt(newDotExpr(scratchObj, fieldB), genHigh(n))
 
         slice.sons[2] = newIntLit(0)
-        
+      # the array itself does not need to go through a thread local variable:
       slice.sons[1] = genDeref(indirectAccess(castExpr, field, n.info))
-      slice.sons[3] = indirectAccess(castExpr, fieldB, n.info)
+
+      let threadLocal = addLocalVar(varSection, objType.owner, fieldB.typ,
+                                    indirectAccess(castExpr, fieldB, n.info))
+      slice.sons[3] = threadLocal.newSymNode
       call.add slice
     elif (let size = computeSize(argType); size < 0 or size > 16) and
         n.getRoot != nil:
@@ -364,13 +374,17 @@ proc setupArgsForParallelism(n: PNode; objType: PType; scratchObj: PSym;
       field.typ = a.typ
       objType.addField(field)
       result.add newFastAsgnStmt(newDotExpr(scratchObj, field), a)
-      call.add(genDeref(indirectAccess(castExpr, field, n.info)))
+      let threadLocal = addLocalVar(varSection, objType.owner, field.typ,
+                                    indirectAccess(castExpr, field, n.info))
+      call.add(genDeref(threadLocal.newSymNode))
     else:
       # boring case
       field.typ = argType
       objType.addField(field)
       result.add newFastAsgnStmt(newDotExpr(scratchObj, field), n)
-      call.add(indirectAccess(castExpr, field, n.info))
+      let threadLocal = addLocalVar(varSection, objType.owner, field.typ,
+                                    indirectAccess(castExpr, field, n.info))
+      call.add(threadLocal.newSymNode)
 
 proc wrapProcForSpawn*(owner: PSym; n: PNode; retType: PType; 
                        barrier, dest: PNode = nil): PNode =
@@ -438,7 +452,7 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode; retType: PType;
   if barrier.isNil:
     setupArgsForConcurrency(n, objType, scratchObj, castExpr, call, varSection, result)
   else: 
-    setupArgsForParallelism(n, objType, scratchObj, castExpr, call, result)
+    setupArgsForParallelism(n, objType, scratchObj, castExpr, call, varSection, result)
 
   var barrierAsExpr: PNode = nil
   if barrier != nil:
diff --git a/tests/parallel/tforstmt.nim b/tests/parallel/tforstmt.nim
index 35f28759e4..58de833f3e 100644
--- a/tests/parallel/tforstmt.nim
+++ b/tests/parallel/tforstmt.nim
@@ -7,14 +7,15 @@ discard """
   sortoutput: true
 """
 
-import threadpool, math
+import threadpool, os
 
 proc p(x: int) =
+  os.sleep(100 - x*10)
   echo x
 
 proc testFor(a, b: int; foo: var openArray[int]) =
   parallel:
-    for i in max(a, 0) .. min(b, foo.len-1):
+    for i in max(a, 0) .. min(b, foo.high):
       spawn p(foo[i])
 
 var arr = [0, 1, 2, 3, 4, 5, 6, 7]

From bea1761da1195acb883b34105ec9a834f2a10c2e Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Fri, 30 May 2014 17:04:39 +0200
Subject: [PATCH 10/13] tester works again

---
 tests/parallel/tsysspawnbadarg.nim | 4 ++--
 tests/testament/tester.nim         | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/parallel/tsysspawnbadarg.nim b/tests/parallel/tsysspawnbadarg.nim
index 120975ed54..ad798a7d35 100644
--- a/tests/parallel/tsysspawnbadarg.nim
+++ b/tests/parallel/tsysspawnbadarg.nim
@@ -1,9 +1,9 @@
 discard """
-  line: 7
+  line: 9
   errormsg: "'spawn' takes a call expression"
   cmd: "nimrod $target --threads:on $options $file"
 """
 
 import threadpool
 
-spawn(1)
+let foo = spawn(1)
diff --git a/tests/testament/tester.nim b/tests/testament/tester.nim
index adf9785e0b..fc6b4ff95c 100644
--- a/tests/testament/tester.nim
+++ b/tests/testament/tester.nim
@@ -11,7 +11,7 @@
 
 import
   parseutils, strutils, pegs, os, osproc, streams, parsecfg, json,
-  marshal, backend, parseopt, specs, htmlgen, browsers, terminal, sequtils,
+  marshal, backend, parseopt, specs, htmlgen, browsers, terminal,
   algorithm
 
 const
@@ -152,7 +152,7 @@ proc codegenCheck(test: TTest, check: string, given: var TSpec) =
       given.err = reCodeNotFound
 
 proc makeDeterministic(s: string): string =
-  var x = toSeq(s.lines)
+  var x = splitLines(s)
   sort(x, system.cmp)
   result = join(x, "\n")
 
@@ -200,7 +200,7 @@ proc testSpec(r: var TResults, test: TTest) =
             var bufB = strip(buf.string)
             if expected.sortoutput: bufB = makeDeterministic(bufB)
             if bufB != strip(expected.outp):
-              if not (expected.substr and expected.outp in buf.string):
+              if not (expected.substr and expected.outp in bufB):
                 given.err = reOutputsDiffer
             if given.err == reSuccess:
               codeGenCheck(test, expected.ccodeCheck, given)

From 9953e0bbca92d81e41a5ca39981b02596027f236 Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Sat, 31 May 2014 01:16:16 +0200
Subject: [PATCH 11/13] tdisjoint_slice2 works

---
 compiler/ccgcalls.nim               |  2 +-
 compiler/guards.nim                 |  8 ++++++++
 compiler/lowerings.nim              |  2 +-
 compiler/semparallel.nim            |  2 +-
 tests/parallel/tdisjoint_slice2.nim | 20 ++++++++++++++++----
 5 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/compiler/ccgcalls.nim b/compiler/ccgcalls.nim
index a7840305dd..71e23aa1dd 100644
--- a/compiler/ccgcalls.nim
+++ b/compiler/ccgcalls.nim
@@ -86,7 +86,7 @@ proc openArrayLoc(p: BProc, n: PNode): PRope =
     initLocExpr(p, q[2], b)
     initLocExpr(p, q[3], c)
     let fmt =
-      case skipTypes(a.t, abstractVar).kind
+      case skipTypes(a.t, abstractVar+{tyPtr}).kind
       of tyOpenArray, tyVarargs, tyArray, tyArrayConstr:
         "($1)+($2), ($3)-($2)+1"
       of tyString, tySequence:
diff --git a/compiler/guards.nim b/compiler/guards.nim
index ec5adb4da4..813a300146 100644
--- a/compiler/guards.nim
+++ b/compiler/guards.nim
@@ -752,6 +752,14 @@ proc pleViaModelRec(m: var TModel; a, b: PNode): TImplication =
       if ple(m, a, x) == impYes:
         if ple(m, y, b) == impYes: return impYes
         #if pleViaModelRec(m, y, b): return impYes
+      # fact:  16 <= i
+      #         x    y
+      # question: i <= 15? no!
+      result = impliesLe(fact, a, b)
+      if result != impUnknown: return result
+      # x <= a only refutes 'a <= b' (when x > b); 'x <= b' proves nothing:
+      if sameTree(y, a) and ple(m, x, b) == impNo:
+        return impNo
 
 proc pleViaModel(model: TModel; aa, bb: PNode): TImplication =
   # compute replacements:
diff --git a/compiler/lowerings.nim b/compiler/lowerings.nim
index 5636d423f2..b159502dc6 100644
--- a/compiler/lowerings.nim
+++ b/compiler/lowerings.nim
@@ -338,7 +338,7 @@ proc setupArgsForParallelism(n: PNode; objType: PType; scratchObj: PSym;
       objType.addField(fieldB)
       
       if getMagic(n) == mSlice:
-        let a = genAddrOf(n[0])
+        let a = genAddrOf(n[1])
         field.typ = a.typ
         objType.addField(field)
         result.add newFastAsgnStmt(newDotExpr(scratchObj, field), a)
diff --git a/compiler/semparallel.nim b/compiler/semparallel.nim
index 678ffd2fbb..72def1137b 100644
--- a/compiler/semparallel.nim
+++ b/compiler/semparallel.nim
@@ -166,7 +166,7 @@ proc overlap(m: TModel; x,y,c,d: PNode) =
     of impUnknown:
       localError(x.info,
         "cannot prove: $# > $#; required for ($#)..($#) disjoint from ($#)..($#)" %
-          [?y, ?d, ?x, ?y, ?c, ?d])
+          [?c, ?y, ?x, ?y, ?c, ?d])
     of impYes:
       localError(x.info, "($#)..($#) not disjoint from ($#)..($#)" % [?x, ?y, ?c, ?d])
     of impNo: discard
diff --git a/tests/parallel/tdisjoint_slice2.nim b/tests/parallel/tdisjoint_slice2.nim
index b26559fc21..1e86ea644a 100644
--- a/tests/parallel/tdisjoint_slice2.nim
+++ b/tests/parallel/tdisjoint_slice2.nim
@@ -1,3 +1,15 @@
+discard """
+  output: '''0
+1
+2
+3
+4
+5
+6
+7
+8'''
+  sortoutput: true
+"""
 
 import threadpool
 
@@ -7,12 +19,12 @@ proc f(a: openArray[int]) =
 proc f(a: int) = echo a
 
 proc main() =
-  var a: array[0..30, int]
+  var a: array[0..9, int] = [0,1,2,3,4,5,6,7,8,9]
   parallel:
-    spawn f(a[0..15])
+    spawn f(a[0..2])
     #spawn f(a[16..30])
-    var i = 16
-    while i <= 29:
+    var i = 3
+    while i <= 8:
       spawn f(a[i])
       spawn f(a[i+1])
       inc i, 2

From 40baebebfe425f03fbdd41da4c6d2e4c6778d241 Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Sun, 1 Jun 2014 01:45:44 +0200
Subject: [PATCH 12/13] pi test compiles, but crashes randomly

---
 compiler/lowerings.nim | 18 ++++++++++++++----
 compiler/semdata.nim   |  1 +
 compiler/semexprs.nim  |  8 ++++++--
 tests/parallel/tpi.nim | 22 ++++++++++++++++++++++
 4 files changed, 43 insertions(+), 6 deletions(-)
 create mode 100644 tests/parallel/tpi.nim

diff --git a/compiler/lowerings.nim b/compiler/lowerings.nim
index b159502dc6..d370f21f05 100644
--- a/compiler/lowerings.nim
+++ b/compiler/lowerings.nim
@@ -197,18 +197,21 @@ proc createNimCreatePromiseCall(prom, threadParam: PNode): PNode =
   result = newFastAsgnStmt(prom, castExpr)
 
 proc createWrapperProc(f: PNode; threadParam, argsParam: PSym;
-                       varSection, call, barrier, prom: PNode): PSym =
+                       varSection, call, barrier, prom: PNode;
+                       spawnKind: TSpawnResult): PSym =
   var body = newNodeI(nkStmtList, f.info)
   if barrier != nil:
     body.add callCodeGenProc("barrierEnter", barrier)
   body.add varSection
-  if prom != nil:
+  if prom != nil and spawnKind != srByVar:
     body.add createNimCreatePromiseCall(prom, threadParam.newSymNode)
     if barrier == nil:
       body.add callCodeGenProc("nimPromiseCreateCondVar", prom)
 
   body.add callCodeGenProc("nimArgsPassingDone", threadParam.newSymNode)
-  if prom != nil:
+  if spawnKind == srByVar:
+    body.add newAsgnStmt(genDeref(prom), call)
+  elif prom != nil:
     let fk = prom.typ.sons[1].promiseKind
     if fk == promInvalid:
       localError(f.info, "cannot create a promise of type: " & 
@@ -471,9 +474,16 @@ proc wrapProcForSpawn*(owner: PSym; n: PNode; retType: PType;
     objType.addField(field)
     promField = newDotExpr(scratchObj, field)
     promAsExpr = indirectAccess(castExpr, field, n.info)
+  elif spawnKind == srByVar:
+    var field = newSym(skField, getIdent"prom", owner, n.info)
+    field.typ = newType(tyPtr, objType.owner)
+    field.typ.rawAddSon(retType)
+    objType.addField(field)
+    promAsExpr = indirectAccess(castExpr, field, n.info)
+    result.add newFastAsgnStmt(newDotExpr(scratchObj, field), genAddrOf(dest))
 
   let wrapper = createWrapperProc(fn, threadParam, argsParam, varSection, call,
-                                  barrierAsExpr, promAsExpr)
+                                  barrierAsExpr, promAsExpr, spawnKind)
   result.add callCodeGenProc("nimSpawn", wrapper.newSymNode,
                              genAddrOf(scratchObj.newSymNode))
 
diff --git a/compiler/semdata.nim b/compiler/semdata.nim
index 987a70a419..19181d98e0 100644
--- a/compiler/semdata.nim
+++ b/compiler/semdata.nim
@@ -91,6 +91,7 @@ type
     generics*: seq[TInstantiationPair] # pending list of instantiated generics to compile
     lastGenericIdx*: int      # used for the generics stack
     hloLoopDetector*: int     # used to prevent endless loops in the HLO
+    inParallelStmt*: int
    
 proc makeInstPair*(s: PSym, inst: PInstantiation): TInstantiationPair =
   result.genericSym = s
diff --git a/compiler/semexprs.nim b/compiler/semexprs.nim
index 8f4cce547a..e507e711f3 100644
--- a/compiler/semexprs.nim
+++ b/compiler/semexprs.nim
@@ -1615,13 +1615,17 @@ proc semMagic(c: PContext, n: PNode, s: PSym, flags: TExprFlags): PNode =
     result = setMs(n, s)
     var x = n.lastSon
     if x.kind == nkDo: x = x.sons[bodyPos]
+    inc c.inParallelStmt
     result.sons[1] = semStmt(c, x)
+    dec c.inParallelStmt
   of mSpawn:
     result = setMs(n, s)
     result.sons[1] = semExpr(c, n.sons[1])
-    # later passes may transform the type 'Promise[T]' back into 'T'
     if not result[1].typ.isEmptyType:
-      result.typ = createPromise(c, result[1].typ, n.info)
+      if c.inParallelStmt > 0:
+        result.typ = result[1].typ
+      else:
+        result.typ = createPromise(c, result[1].typ, n.info)
   else: result = semDirectOp(c, n, flags)
 
 proc semWhen(c: PContext, n: PNode, semCheck = true): PNode =
diff --git a/tests/parallel/tpi.nim b/tests/parallel/tpi.nim
new file mode 100644
index 0000000000..de5aa9a514
--- /dev/null
+++ b/tests/parallel/tpi.nim
@@ -0,0 +1,22 @@
+
+import strutils, math, threadpool
+
+proc term(k: float): float = 4 * math.pow(-1, k) / (2*k + 1)
+
+proc piU(n: int): float =
+  var ch = newSeq[Promise[float]](n+1)
+  for k in 0..n:
+    ch[k] = spawn term(float(k))
+  for k in 0..n:
+    result += ^ch[k]
+
+proc piS(n: int): float =
+  var ch = newSeq[float](n+1)
+  parallel:
+    for k in 0..ch.high:
+      ch[k] = spawn term(float(k))
+  for k in 0..ch.high:
+    result += ch[k]
+
+echo formatFloat(piU(5000))
+echo formatFloat(piS(5000))

From e6d12f3f6ee933f295dd83a64f5f0e6eba77e1d1 Mon Sep 17 00:00:00 2001
From: Araq <rumpf_a@web.de>
Date: Sun, 1 Jun 2014 15:02:13 +0200
Subject: [PATCH 13/13] fixed codegen for return values

---
 compiler/lowerings.nim | 27 +++++++++++++++------------
 tests/parallel/tpi.nim |  4 ++++
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/compiler/lowerings.nim b/compiler/lowerings.nim
index d370f21f05..df2816a0e0 100644
--- a/compiler/lowerings.nim
+++ b/compiler/lowerings.nim
@@ -155,6 +155,17 @@ proc promiseKind(t: PType): TPromiseKind =
   elif containsGarbageCollectedRef(t): promInvalid
   else: promBlob
 
+proc addLocalVar(varSection: PNode; owner: PSym; typ: PType; v: PNode): PSym =
+  result = newSym(skTemp, getIdent(genPrefix), owner, varSection.info)
+  result.typ = typ
+  incl(result.flags, sfFromGeneric)
+
+  var vpart = newNodeI(nkIdentDefs, varSection.info, 3)
+  vpart.sons[0] = newSymNode(result)
+  vpart.sons[1] = ast.emptyNode
+  vpart.sons[2] = v
+  varSection.add vpart
+
 discard """
 We generate roughly this:
 
@@ -202,6 +213,9 @@ proc createWrapperProc(f: PNode; threadParam, argsParam: PSym;
   var body = newNodeI(nkStmtList, f.info)
   if barrier != nil:
     body.add callCodeGenProc("barrierEnter", barrier)
+  var threadLocalProm: PSym
+  if spawnKind == srByVar:
+    threadLocalProm = addLocalVar(varSection, argsParam.owner, prom.typ, prom)
   body.add varSection
   if prom != nil and spawnKind != srByVar:
     body.add createNimCreatePromiseCall(prom, threadParam.newSymNode)
@@ -210,7 +224,7 @@ proc createWrapperProc(f: PNode; threadParam, argsParam: PSym;
 
   body.add callCodeGenProc("nimArgsPassingDone", threadParam.newSymNode)
   if spawnKind == srByVar:
-    body.add newAsgnStmt(genDeref(prom), call)
+    body.add newAsgnStmt(genDeref(threadLocalProm.newSymNode), call)
   elif prom != nil:
     let fk = prom.typ.sons[1].promiseKind
     if fk == promInvalid:
@@ -251,17 +265,6 @@ proc createCastExpr(argsParam: PSym; objType: PType): PNode =
   result.typ = newType(tyPtr, objType.owner)
   result.typ.rawAddSon(objType)
 
-proc addLocalVar(varSection: PNode; owner: PSym; typ: PType; v: PNode): PSym =
-  result = newSym(skTemp, getIdent(genPrefix), owner, varSection.info)
-  result.typ = typ
-  incl(result.flags, sfFromGeneric)
-
-  var vpart = newNodeI(nkIdentDefs, varSection.info, 3)
-  vpart.sons[0] = newSymNode(result)
-  vpart.sons[1] = ast.emptyNode
-  vpart.sons[2] = v
-  varSection.add vpart
-
 proc setupArgsForConcurrency(n: PNode; objType: PType; scratchObj: PSym, 
                              castExpr, call, varSection, result: PNode) =
   let formals = n[0].typ.n
diff --git a/tests/parallel/tpi.nim b/tests/parallel/tpi.nim
index de5aa9a514..1ef5c6aea0 100644
--- a/tests/parallel/tpi.nim
+++ b/tests/parallel/tpi.nim
@@ -1,3 +1,7 @@
+discard """
+  output: '''3.141792613595791
+3.141792613595791'''
+"""
 
 import strutils, math, threadpool