threaded alloc (#20492)
* allocator: catch up with multi-threading techniques
* removed the global thread lock
* more atomics for fun and profit
* added important sysAssert
* stats remain thread local and don't have to be atomic
* undo split chunk optimizations in the hope it makes the CI happy
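The gist of the scheme, as a hedged sketch rather than the allocator's actual code: every chunk now records its owning MemRegion; a free performed by the owning thread takes the ordinary non-atomic path, while a free from any other thread only pushes the cell onto a lock-free shared list that the owner drains on a later allocation. The snippet below illustrates that idea with std/atomics and made-up names (Cell, Region, freeLocal, freeForeign, drain); it is not taken from the diff.

# Hypothetical sketch only -- names are illustrative, not the allocator's identifiers.
import std/atomics

type
  Cell = object
    next: ptr Cell                     # intrusive singly linked free list
  Region = object
    freeList: ptr Cell                 # touched only by the owning thread
    sharedFreeList: Atomic[ptr Cell]   # foreign threads push here

proc freeLocal(r: var Region; c: ptr Cell) =
  # owning thread: plain, non-atomic push
  c.next = r.freeList
  r.freeList = c

proc freeForeign(r: var Region; c: ptr Cell) =
  # foreign thread: lock-free push onto the shared list (Treiber-stack style)
  var old = r.sharedFreeList.load(moRelaxed)
  while true:
    c.next = old
    if r.sharedFreeList.compareExchangeWeak(old, c, moRelease, moRelaxed):
      break

proc drain(r: var Region) =
  # owning thread: detach the whole shared list at once, then merge it
  var it = r.sharedFreeList.exchange(nil, moAcquire)
  while it != nil:
    let nxt = it.next
    freeLocal(r, it)
    it = nxt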
@@ -44,44 +44,6 @@ type
   IntSet = object
     data: TrunkBuckets

-type
-  FreeCell {.final, pure.} = object
-    next: ptr FreeCell  # next free cell in chunk (overlaid with refcount)
-    when not defined(gcDestructors):
-      zeroField: int # 0 means cell is not used (overlaid with typ field)
-                     # 1 means cell is manually managed pointer
-                     # otherwise a PNimType is stored in there
-    else:
-      alignment: int
-
-  PChunk = ptr BaseChunk
-  PBigChunk = ptr BigChunk
-  PSmallChunk = ptr SmallChunk
-  BaseChunk {.pure, inheritable.} = object
-    prevSize: int      # size of previous chunk; for coalescing
-                       # 0th bit == 1 if 'used'
-    size: int          # if < PageSize it is a small chunk
-
-  SmallChunk = object of BaseChunk
-    next, prev: PSmallChunk  # chunks of the same size
-    freeList: ptr FreeCell
-    free: int            # how many bytes remain
-    acc: int             # accumulator for small object allocation
-    when defined(nimAlignPragma):
-      data {.align: MemAlign.}: UncheckedArray[byte]  # start of usable memory
-    else:
-      data: UncheckedArray[byte]
-
-  BigChunk = object of BaseChunk  # not necessarily > PageSize!
-    next, prev: PBigChunk    # chunks of the same (or bigger) size
-    when defined(nimAlignPragma):
-      data {.align: MemAlign.}: UncheckedArray[byte]  # start of usable memory
-    else:
-      data: UncheckedArray[byte]
-
-template smallChunkOverhead(): untyped = sizeof(SmallChunk)
-template bigChunkOverhead(): untyped = sizeof(BigChunk)
-
 # ------------- chunk table ---------------------------------------------------
 # We use a PtrSet of chunk starts and a table[Page, chunksize] for chunk
 # endings of big chunks. This is needed by the merging operation. The only
@@ -102,6 +64,47 @@ type
     key, upperBound: int
     level: int

+const
+  RegionHasLock = false # hasThreadSupport and defined(gcDestructors)
+
+type
+  FreeCell {.final, pure.} = object
+    next: ptr FreeCell  # next free cell in chunk (overlaid with refcount)
+    when not defined(gcDestructors):
+      zeroField: int # 0 means cell is not used (overlaid with typ field)
+                     # 1 means cell is manually managed pointer
+                     # otherwise a PNimType is stored in there
+    else:
+      alignment: int
+
+  PChunk = ptr BaseChunk
+  PBigChunk = ptr BigChunk
+  PSmallChunk = ptr SmallChunk
+  BaseChunk {.pure, inheritable.} = object
+    prevSize: int      # size of previous chunk; for coalescing
+                       # 0th bit == 1 if 'used'
+    size: int          # if < PageSize it is a small chunk
+    owner: ptr MemRegion
+
+  SmallChunk = object of BaseChunk
+    next, prev: PSmallChunk  # chunks of the same size
+    freeList: ptr FreeCell
+    free: int            # how many bytes remain
+    acc: int             # accumulator for small object allocation
+    when defined(gcDestructors):
+      sharedFreeList: ptr FreeCell # make no attempt at avoiding false sharing for now for this object field
+    when defined(nimAlignPragma):
+      data {.align: MemAlign.}: UncheckedArray[byte]  # start of usable memory
+    else:
+      data: UncheckedArray[byte]
+
+  BigChunk = object of BaseChunk  # not necessarily > PageSize!
+    next, prev: PBigChunk    # chunks of the same (or bigger) size
+    when defined(nimAlignPragma):
+      data {.align: MemAlign.}: UncheckedArray[byte]  # start of usable memory
+    else:
+      data: UncheckedArray[byte]
+
   HeapLinks = object
     len: int
     chunks: array[30, (PBigChunk, int)]
@@ -117,10 +120,15 @@ type
     llmem: PLLChunk
     currMem, maxMem, freeMem, occ: int # memory sizes (allocated from OS)
     lastSize: int # needed for the case that OS gives us pages linearly
+    when RegionHasLock:
+      lock: SysLock
+    when defined(gcDestructors):
+      sharedFreeListBigChunks: PBigChunk # make no attempt at avoiding false sharing for now for this object field
+
     chunkStarts: IntSet
     when not defined(gcDestructors):
       root, deleted, last, freeAvlNodes: PAvlNode
-    locked, blockChunkSizeIncrease: bool # if locked, we cannot free pages.
+    lockActive, locked, blockChunkSizeIncrease: bool # if locked, we cannot free pages.
     nextChunkSize: int
     when not defined(gcDestructors):
       bottomData: AvlNode
@@ -128,6 +136,24 @@ type
     when defined(nimTypeNames):
       allocCounter, deallocCounter: int

+template smallChunkOverhead(): untyped = sizeof(SmallChunk)
+template bigChunkOverhead(): untyped = sizeof(BigChunk)
+
+when hasThreadSupport:
+  template loada(x: untyped): untyped = atomicLoadN(unsafeAddr x, ATOMIC_RELAXED)
+  template storea(x, y: untyped) = atomicStoreN(unsafeAddr x, y, ATOMIC_RELAXED)
+
+  when false:
+    # not yet required
+    template atomicStatDec(x, diff: untyped) = discard atomicSubFetch(unsafeAddr x, diff, ATOMIC_RELAXED)
+    template atomicStatInc(x, diff: untyped) = discard atomicAddFetch(unsafeAddr x, diff, ATOMIC_RELAXED)
+else:
+  template loada(x: untyped): untyped = x
+  template storea(x, y: untyped) = x = y
+
+template atomicStatDec(x, diff: untyped) = dec x, diff
+template atomicStatInc(x, diff: untyped) = inc x, diff
+
 const
   fsLookupTable: array[byte, int8] = [
     -1'i8, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
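For reference, the loada/storea templates above wrap GCC's __atomic_load_n/__atomic_store_n with relaxed ordering and fall back to plain reads and writes in single-threaded builds; the statistics counters keep using ordinary inc/dec because, as the commit message notes, they stay thread-local. Roughly the same relaxed load/store idea expressed with std/atomics, purely for illustration and not part of the diff:

# Illustrative only -- the diff calls the GCC builtins directly.
import std/atomics

template loada(x: untyped): untyped = x.load(moRelaxed)
template storea(x, y: untyped) = x.store(y, moRelaxed)

var head: Atomic[int]
storea(head, 42)
echo loada(head)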
@@ -234,11 +260,11 @@ proc addChunkToMatrix(a: var MemRegion; b: PBigChunk) =
   setBit(fl, a.flBitmap)

 proc incCurrMem(a: var MemRegion, bytes: int) {.inline.} =
-  inc(a.currMem, bytes)
+  atomicStatInc(a.currMem, bytes)

 proc decCurrMem(a: var MemRegion, bytes: int) {.inline.} =
   a.maxMem = max(a.maxMem, a.currMem)
-  dec(a.currMem, bytes)
+  atomicStatDec(a.currMem, bytes)

 proc getMaxMem(a: var MemRegion): int =
   # Since we update maxPagesCount only when freeing pages,
@@ -536,12 +562,13 @@ proc splitChunk2(a: var MemRegion, c: PBigChunk, size: int): PBigChunk =
   result = cast[PBigChunk](cast[ByteAddress](c) +% size)
   result.size = c.size - size
   track("result.size", addr result.size, sizeof(int))
-  # XXX check if these two nil assignments are dead code given
-  # addChunkToMatrix's implementation:
-  result.next = nil
-  result.prev = nil
+  when not defined(nimOptimizedSplitChunk):
+    # still active because of weird codegen issue on some of our CIs:
+    result.next = nil
+    result.prev = nil
   # size and not used:
   result.prevSize = size
+  result.owner = addr a
   sysAssert((size and 1) == 0, "splitChunk 2")
   sysAssert((size and PageMask) == 0,
       "splitChunk: size is not a multiple of the PageSize")
@@ -572,6 +599,9 @@ proc freeBigChunk(a: var MemRegion, c: PBigChunk) =
       c = cast[PBigChunk](le)
       if c.size > MaxBigChunkSize:
         let rest = splitChunk2(a, c, MaxBigChunkSize)
+        when defined(nimOptimizedSplitChunk):
+          rest.next = nil
+          rest.prev = nil
         addChunkToMatrix(a, c)
         c = rest
   when coalescRight:
@@ -596,6 +626,13 @@ proc getBigChunk(a: var MemRegion, size: int): PBigChunk =
   mappingSearch(size, fl, sl)
   sysAssert((size and PageMask) == 0, "getBigChunk: unaligned chunk")
   result = findSuitableBlock(a, fl, sl)
+
+  when RegionHasLock:
+    if not a.lockActive:
+      a.lockActive = true
+      initSysLock(a.lock)
+    acquireSys a.lock
+
   if result == nil:
     if size < nimMinHeapPages * PageSize:
       result = requestOsChunks(a, nimMinHeapPages * PageSize)
@@ -605,6 +642,7 @@ proc getBigChunk(a: var MemRegion, size: int): PBigChunk =
     # if we over allocated split the chunk:
     if result.size > size:
       splitChunk(a, result, size)
+    result.owner = addr a
   else:
     removeChunkFromMatrix2(a, result, fl, sl)
     if result.size >= size + PageSize:
@@ -612,12 +650,20 @@ proc getBigChunk(a: var MemRegion, size: int): PBigChunk =
   # set 'used' to true:
   result.prevSize = 1
   track("setUsedToFalse", addr result.size, sizeof(int))
+  sysAssert result.owner == addr a, "getBigChunk: No owner set!"

   incl(a, a.chunkStarts, pageIndex(result))
   dec(a.freeMem, size)
+  when RegionHasLock:
+    releaseSys a.lock

 proc getHugeChunk(a: var MemRegion; size: int): PBigChunk =
   result = cast[PBigChunk](osAllocPages(size))
+  when RegionHasLock:
+    if not a.lockActive:
+      a.lockActive = true
+      initSysLock(a.lock)
+    acquireSys a.lock
   incCurrMem(a, size)
   # XXX add this to the heap links. But also remove it from it later.
   when false: a.addHeapLink(result, size)
@@ -627,7 +673,10 @@ proc getHugeChunk(a: var MemRegion; size: int): PBigChunk =
   result.size = size
   # set 'used' to true:
   result.prevSize = 1
+  result.owner = addr a
   incl(a, a.chunkStarts, pageIndex(result))
+  when RegionHasLock:
+    releaseSys a.lock

 proc freeHugeChunk(a: var MemRegion; c: PBigChunk) =
   let size = c.size
@@ -691,6 +740,69 @@ else:
   template trackSize(x) = discard
   template untrackSize(x) = discard

+proc deallocBigChunk(a: var MemRegion, c: PBigChunk) =
+  when RegionHasLock:
+    acquireSys a.lock
+  dec a.occ, c.size
+  untrackSize(c.size)
+  sysAssert a.occ >= 0, "rawDealloc: negative occupied memory (case B)"
+  when not defined(gcDestructors):
+    a.deleted = getBottom(a)
+    del(a, a.root, cast[int](addr(c.data)))
+  if c.size >= HugeChunkSize: freeHugeChunk(a, c)
+  else: freeBigChunk(a, c)
+  when RegionHasLock:
+    releaseSys a.lock
+
+when defined(gcDestructors):
+  template atomicPrepend(head, elem: untyped) =
+    # see also https://en.cppreference.com/w/cpp/atomic/atomic_compare_exchange
+    while true:
+      elem.next.storea head.loada
+      if atomicCompareExchangeN(addr head, addr elem.next, elem, weak = true, ATOMIC_RELEASE, ATOMIC_RELAXED):
+        break
+
+  proc addToSharedFreeListBigChunks(a: var MemRegion; c: PBigChunk) {.inline.} =
+    sysAssert c.next == nil, "c.next pointer must be nil"
+    atomicPrepend a.sharedFreeListBigChunks, c
+
+  proc addToSharedFreeList(c: PSmallChunk; f: ptr FreeCell) {.inline.} =
+    atomicPrepend c.sharedFreeList, f
+
+  proc compensateCounters(a: var MemRegion; c: PSmallChunk; size: int) =
+    # rawDealloc did NOT do the usual:
+    # `inc(c.free, size); dec(a.occ, size)` because it wasn't the owner of these
+    # memory locations. We have to compensate here for these for the entire list.
+    # Well, not for the entire list, but for `max` elements of the list because
+    # we split the list in order to achieve bounded response times.
+    var it = c.freeList
+    var x = 0
+    var maxIters = 20 # make it time-bounded
+    while it != nil:
+      if maxIters == 0:
+        let rest = it.next.loada
+        it.next.storea nil
+        addToSharedFreeList(c, rest)
+        break
+      inc x, size
+      it = it.next.loada
+      dec maxIters
+    inc(c.free, x)
+    dec(a.occ, x)
+
+  proc freeDeferredObjects(a: var MemRegion; root: PBigChunk) =
+    var it = root
+    var maxIters = 20 # make it time-bounded
+    while true:
+      if maxIters == 0:
+        let rest = it.next.loada
+        it.next.storea nil
+        addToSharedFreeListBigChunks(a, rest)
+        break
+      it = it.next.loada
+      dec maxIters
+      if it == nil: break
+
 proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
   when defined(nimTypeNames):
     inc(a.allocCounter)
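The maxIters pattern in compensateCounters and freeDeferredObjects above keeps each drain time-bounded: only a fixed prefix of the deferred list is handled per call and the remainder is pushed back onto the shared list. A simplified, stand-alone sketch of that pattern follows; the names (Node, shared, drainBounded) are hypothetical and the diff's own bookkeeping differs in detail.

# Hypothetical sketch of a time-bounded drain of a lock-free deferred list.
import std/atomics

type Node = object
  next: ptr Node

var shared: Atomic[ptr Node]   # stand-in for a region's shared free list

proc drainBounded(maxIters: int) =
  # detach the whole list, recycle at most maxIters cells, splice the rest back
  var it = shared.exchange(nil, moAcquire)
  var budget = maxIters
  while it != nil and budget > 0:
    let nxt = it.next
    # ... recycle `it` here (return it to the local free list, fix counters) ...
    it = nxt
    dec budget
  if it != nil:
    # `it` heads the unprocessed tail; push the whole chain back with one CAS
    var tail = it
    while tail.next != nil: tail = tail.next
    var old = shared.load(moRelaxed)
    while true:
      tail.next = old
      if shared.compareExchangeWeak(old, it, moRelease, moRelaxed):
        break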
@@ -703,7 +815,7 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
   #c_fprintf(stdout, "alloc; size: %ld; %ld\n", requestedSize, size)
   if size <= SmallChunkSize-smallChunkOverhead():
     # allocate a small block: for small chunks, we use only its next pointer
-    var s = size div MemAlign
+    let s = size div MemAlign
     var c = a.freeSmallChunks[s]
     if c == nil:
       c = getSmallChunk(a)
@@ -711,7 +823,10 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
       sysAssert c.size == PageSize, "rawAlloc 3"
       c.size = size
       c.acc = size
+      when defined(gcDestructors):
+        c.sharedFreeList = nil
       c.free = SmallChunkSize - smallChunkOverhead() - size
+      sysAssert c.owner == addr(a), "rawAlloc: No owner set!"
       c.next = nil
       c.prev = nil
       listAdd(a.freeSmallChunks[s], c)
@@ -723,6 +838,10 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
       #if c.size != size:
       #  c_fprintf(stdout, "csize: %lld; size %lld\n", c.size, size)
       sysAssert c.size == size, "rawAlloc 6"
+      when defined(gcDestructors):
+        if c.freeList == nil:
+          c.freeList = atomicExchangeN(addr c.sharedFreeList, nil, ATOMIC_RELAXED)
+          compensateCounters(a, c, size)
       if c.freeList == nil:
         sysAssert(c.acc + smallChunkOverhead() + size <= SmallChunkSize,
                   "rawAlloc 7")
@@ -747,6 +866,11 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
     inc a.occ, size
     trackSize(c.size)
   else:
+    when defined(gcDestructors):
+      let deferredFrees = atomicExchangeN(addr a.sharedFreeListBigChunks, nil, ATOMIC_RELAXED)
+      if deferredFrees != nil:
+        freeDeferredObjects(a, deferredFrees)
+
     size = requestedSize + bigChunkOverhead() #  roundup(requestedSize+bigChunkOverhead(), PageSize)
     # allocate a large block
     var c = if size >= HugeChunkSize: getHugeChunk(a, size)
@@ -779,48 +903,52 @@ proc rawDealloc(a: var MemRegion, p: pointer) =
     # `p` is within a small chunk:
     var c = cast[PSmallChunk](c)
     var s = c.size
-    dec a.occ, s
-    untrackSize(s)
-    sysAssert a.occ >= 0, "rawDealloc: negative occupied memory (case A)"
-    sysAssert(((cast[ByteAddress](p) and PageMask) - smallChunkOverhead()) %%
-               s == 0, "rawDealloc 3")
+    #       ^ We might access thread foreign storage here.
+    # The other thread cannot possibly free this block as it's still alive.
     var f = cast[ptr FreeCell](p)
-    when not defined(gcDestructors):
-      #echo("setting to nil: ", $cast[ByteAddress](addr(f.zeroField)))
-      sysAssert(f.zeroField != 0, "rawDealloc 1")
-      f.zeroField = 0
-    f.next = c.freeList
-    c.freeList = f
-    when overwriteFree:
-      # set to 0xff to check for usage after free bugs:
-      nimSetMem(cast[pointer](cast[int](p) +% sizeof(FreeCell)), -1'i32,
-                s -% sizeof(FreeCell))
-    # check if it is not in the freeSmallChunks[s] list:
-    if c.free < s:
-      # add it to the freeSmallChunks[s] array:
-      listAdd(a.freeSmallChunks[s div MemAlign], c)
-      inc(c.free, s)
+    if c.owner == addr(a):
+      # We own the block, there is no foreign thread involved.
+      dec a.occ, s
+      untrackSize(s)
+      sysAssert a.occ >= 0, "rawDealloc: negative occupied memory (case A)"
+      sysAssert(((cast[ByteAddress](p) and PageMask) - smallChunkOverhead()) %%
+                 s == 0, "rawDealloc 3")
+      when not defined(gcDestructors):
+        #echo("setting to nil: ", $cast[ByteAddress](addr(f.zeroField)))
+        sysAssert(f.zeroField != 0, "rawDealloc 1")
+        f.zeroField = 0
+      f.next = c.freeList
+      c.freeList = f
+      when overwriteFree:
+        # set to 0xff to check for usage after free bugs:
+        nimSetMem(cast[pointer](cast[int](p) +% sizeof(FreeCell)), -1'i32,
+                  s -% sizeof(FreeCell))
+      # check if it is not in the freeSmallChunks[s] list:
+      if c.free < s:
+        # add it to the freeSmallChunks[s] array:
+        listAdd(a.freeSmallChunks[s div MemAlign], c)
+        inc(c.free, s)
+      else:
+        inc(c.free, s)
+        if c.free == SmallChunkSize-smallChunkOverhead():
+          listRemove(a.freeSmallChunks[s div MemAlign], c)
+          c.size = SmallChunkSize
+          freeBigChunk(a, cast[PBigChunk](c))
     else:
-      inc(c.free, s)
-      if c.free == SmallChunkSize-smallChunkOverhead():
-        listRemove(a.freeSmallChunks[s div MemAlign], c)
-        c.size = SmallChunkSize
-        freeBigChunk(a, cast[PBigChunk](c))
+      when defined(gcDestructors):
+        addToSharedFreeList(c, f)
     sysAssert(((cast[ByteAddress](p) and PageMask) - smallChunkOverhead()) %%
                s == 0, "rawDealloc 2")
   else:
     # set to 0xff to check for usage after free bugs:
     when overwriteFree: nimSetMem(p, -1'i32, c.size -% bigChunkOverhead())
     # free big chunk
     var c = cast[PBigChunk](c)
-    dec a.occ, c.size
-    untrackSize(c.size)
-    sysAssert a.occ >= 0, "rawDealloc: negative occupied memory (case B)"
-    when not defined(gcDestructors):
-      a.deleted = getBottom(a)
-      del(a, a.root, cast[int](addr(c.data)))
-    if c.size >= HugeChunkSize: freeHugeChunk(a, c)
-    else: freeBigChunk(a, c)
+    when defined(gcDestructors):
+      if c.owner == addr(a):
+        deallocBigChunk(a, cast[PBigChunk](c))
+      else:
+        addToSharedFreeListBigChunks(c.owner[], cast[PBigChunk](c))
+    else:
+      deallocBigChunk(a, cast[PBigChunk](c))
   sysAssert(allocInv(a), "rawDealloc: end")
   when logAlloc: cprintf("dealloc(pointer_%p)\n", p)

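What the owner check above buys in practice: a block may now be freed from a thread other than the one that allocated it, with the foreign free merely deferred onto the owner's shared list. A hedged usage sketch, assuming --mm:arc or --mm:orc with threads enabled (which is what gcDestructors corresponds to); it is not taken from the commit.

# Hedged usage sketch; relies on the cross-thread dealloc behaviour described above.
var p: pointer

proc worker() {.thread.} =
  # foreign thread: this free is pushed onto the owning region's shared free list
  dealloc(p)

proc main() =
  p = alloc(64)            # owned by the main thread's MemRegion
  var t: Thread[void]
  createThread(t, worker)
  joinThread(t)
  let q = alloc(8)         # a later allocation lets the owner drain deferred frees
  dealloc(q)

main()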
@@ -1002,7 +1130,7 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
       inc(result, it.size)
       it = it.next

-  when hasThreadSupport:
+  when hasThreadSupport and not defined(gcDestructors):
     proc addSysExitProc(quitProc: proc() {.noconv.}) {.importc: "atexit", header: "<stdlib.h>".}

     var sharedHeap: MemRegion
@@ -1012,36 +1140,16 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =

   proc getFreeMem(): int =
     #sysAssert(result == countFreeMem())
-    when hasThreadSupport and defined(gcDestructors):
-      acquireSys(heapLock)
-      result = sharedHeap.freeMem
-      releaseSys(heapLock)
-    else:
-      result = allocator.freeMem
+    result = allocator.freeMem

   proc getTotalMem(): int =
-    when hasThreadSupport and defined(gcDestructors):
-      acquireSys(heapLock)
-      result = sharedHeap.currMem
-      releaseSys(heapLock)
-    else:
-      result = allocator.currMem
+    result = allocator.currMem

   proc getOccupiedMem(): int =
-    when hasThreadSupport and defined(gcDestructors):
-      acquireSys(heapLock)
-      result = sharedHeap.occ
-      releaseSys(heapLock)
-    else:
-      result = allocator.occ #getTotalMem() - getFreeMem()
+    result = allocator.occ #getTotalMem() - getFreeMem()

   proc getMaxMem*(): int =
-    when hasThreadSupport and defined(gcDestructors):
-      acquireSys(heapLock)
-      result = getMaxMem(sharedHeap)
-      releaseSys(heapLock)
-    else:
-      result = getMaxMem(allocator)
+    result = getMaxMem(allocator)

   when defined(nimTypeNames):
     proc getMemCounters*(): (int, int) = getMemCounters(allocator)
@@ -1049,7 +1157,7 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
   # -------------------- shared heap region ----------------------------------

   proc allocSharedImpl(size: Natural): pointer =
-    when hasThreadSupport:
+    when hasThreadSupport and not defined(gcDestructors):
       acquireSys(heapLock)
       result = alloc(sharedHeap, size)
       releaseSys(heapLock)
@@ -1061,7 +1169,7 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
     zeroMem(result, size)

   proc deallocSharedImpl(p: pointer) =
-    when hasThreadSupport:
+    when hasThreadSupport and not defined(gcDestructors):
       acquireSys(heapLock)
       dealloc(sharedHeap, p)
       releaseSys(heapLock)
@@ -1069,7 +1177,7 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
       deallocImpl(p)

   proc reallocSharedImpl(p: pointer, newSize: Natural): pointer =
-    when hasThreadSupport:
+    when hasThreadSupport and not defined(gcDestructors):
       acquireSys(heapLock)
       result = realloc(sharedHeap, p, newSize)
       releaseSys(heapLock)
@@ -1077,7 +1185,7 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
       result = reallocImpl(p, newSize)

   proc reallocShared0Impl(p: pointer, oldSize, newSize: Natural): pointer =
-    when hasThreadSupport:
+    when hasThreadSupport and not defined(gcDestructors):
       acquireSys(heapLock)
       result = realloc0(sharedHeap, p, oldSize, newSize)
       releaseSys(heapLock)
@@ -1085,20 +1193,31 @@ template instantiateForRegion(allocator: untyped) {.dirty.} =
       result = realloc0Impl(p, oldSize, newSize)

   when hasThreadSupport:
-    template sharedMemStatsShared(v: int) =
-      acquireSys(heapLock)
-      result = v
-      releaseSys(heapLock)
+    when defined(gcDestructors):
+      proc getFreeSharedMem(): int =
+        allocator.freeMem

-    proc getFreeSharedMem(): int =
-      sharedMemStatsShared(sharedHeap.freeMem)
+      proc getTotalSharedMem(): int =
+        allocator.currMem

-    proc getTotalSharedMem(): int =
-      sharedMemStatsShared(sharedHeap.currMem)
+      proc getOccupiedSharedMem(): int =
+        allocator.occ

-    proc getOccupiedSharedMem(): int =
-      sharedMemStatsShared(sharedHeap.occ)
-      #sharedMemStatsShared(sharedHeap.currMem - sharedHeap.freeMem)
+    else:
+      template sharedMemStatsShared(v: int) =
+        acquireSys(heapLock)
+        result = v
+        releaseSys(heapLock)
+
+      proc getFreeSharedMem(): int =
+        sharedMemStatsShared(sharedHeap.freeMem)
+
+      proc getTotalSharedMem(): int =
+        sharedMemStatsShared(sharedHeap.currMem)
+
+      proc getOccupiedSharedMem(): int =
+        sharedMemStatsShared(sharedHeap.occ)
+        #sharedMemStatsShared(sharedHeap.currMem - sharedHeap.freeMem)
   {.pop.}

 {.pop.}
@@ -17,7 +17,7 @@ type
   AtomType* = SomeNumber|pointer|ptr|char|bool
     ## Type Class representing valid types for use with atomic procs

-when someGcc and hasThreadSupport:
+when someGcc:
   type AtomMemModel* = distinct cint

   var ATOMIC_RELAXED* {.importc: "__ATOMIC_RELAXED", nodecl.}: AtomMemModel
@@ -164,7 +164,7 @@ when someGcc and hasThreadSupport:
     ## ignore this parameter.

   template fence*() = atomicThreadFence(ATOMIC_SEQ_CST)
-elif someVcc and hasThreadSupport:
+elif someVcc:
   type AtomMemModel* = distinct cint

   const

@@ -53,13 +53,23 @@ when defined(nimPreviewSlimSystem):
 const
   hasAllocStack = defined(zephyr) # maybe freertos too?

+when defined(gcDestructors):
+  proc allocThreadStorage(size: int): pointer =
+    result = c_malloc(csize_t size)
+    zeroMem(result, size)
+
+  proc deallocThreadStorage(p: pointer) = c_free(p)
+else:
+  template allocThreadStorage(size: untyped): untyped = allocShared0(size)
+  template deallocThreadStorage(p: pointer) = deallocShared(p)
+
 when hasAllocStack or defined(zephyr) or defined(freertos):
   const
     nimThreadStackSize {.intdefine.} = 8192
     nimThreadStackGuard {.intdefine.} = 128

     StackGuardSize = nimThreadStackGuard
     ThreadStackSize = nimThreadStackSize - nimThreadStackGuard
 else:
   const
     StackGuardSize = 4096
@@ -176,7 +186,7 @@ else:
     finally:
       afterThreadRuns()
     when hasAllocStack:
-      deallocShared(thrd.rawStack)
+      deallocThreadStorage(thrd.rawStack)

 proc threadProcWrapStackFrame[TArg](thrd: ptr Thread[TArg]) {.raises: [].} =
   when defined(boehmgc):
@@ -207,7 +217,7 @@ template threadProcWrapperBody(closure: untyped): untyped =
   # mark as not running anymore:
   thrd.core = nil
   thrd.dataFn = nil
-  deallocShared(cast[pointer](core))
+  deallocThreadStorage(cast[pointer](core))

 {.push stack_trace:off.}
 when defined(windows):
@@ -278,7 +288,7 @@ when false:
     t.dataFn = nil
     ## if thread `t` already exited, `t.core` will be `null`.
     if not isNil(t.core):
-      deallocShared(t.core)
+      deallocThreadStorage(t.core)
       t.core = nil

 when hostOS == "windows":
@@ -290,7 +300,7 @@ when hostOS == "windows":
     ## Entry point is the proc `tp`.
     ## `param` is passed to `tp`. `TArg` can be `void` if you
     ## don't need to pass any data to the thread.
-    t.core = cast[PGcThread](allocShared0(sizeof(GcThread)))
+    t.core = cast[PGcThread](allocThreadStorage(sizeof(GcThread)))

     when TArg isnot void: t.data = param
     t.dataFn = tp
@@ -315,7 +325,7 @@ elif defined(genode):
   proc createThread*[TArg](t: var Thread[TArg],
                            tp: proc (arg: TArg) {.thread, nimcall.},
                            param: TArg) =
-    t.core = cast[PGcThread](allocShared0(sizeof(GcThread)))
+    t.core = cast[PGcThread](allocThreadStorage(sizeof(GcThread)))

     when TArg isnot void: t.data = param
     t.dataFn = tp
@@ -339,7 +349,7 @@ else:
     ## Entry point is the proc `tp`. `param` is passed to `tp`.
     ## `TArg` can be `void` if you
     ## don't need to pass any data to the thread.
-    t.core = cast[PGcThread](allocShared0(sizeof(GcThread)))
+    t.core = cast[PGcThread](allocThreadStorage(sizeof(GcThread)))

     when TArg isnot void: t.data = param
     t.dataFn = tp
@@ -348,7 +358,7 @@ else:
     doAssert pthread_attr_init(a) == 0
     when hasAllocStack:
       var
-        rawstk = allocShared0(ThreadStackSize + StackGuardSize)
+        rawstk = allocThreadStorage(ThreadStackSize + StackGuardSize)
         stk = cast[pointer](cast[uint](rawstk) + StackGuardSize)
       let setstacksizeResult = pthread_attr_setstack(addr a, stk, ThreadStackSize)
       t.rawStack = rawstk