From 6621d643988cd0c6b22b3ad9b9dc9cfb2d8d443d Mon Sep 17 00:00:00 2001 From: Andreas Rumpf Date: Tue, 7 Apr 2026 18:07:34 +0200 Subject: [PATCH] fixes #25577 (#25691) --- lib/system/alloc.nim | 67 +++++++++++++++++++++++++++---------- lib/system/arc.nim | 38 ++++++++++++++++----- tests/align/talign_heap.nim | 23 +++++++++++++ 3 files changed, 101 insertions(+), 27 deletions(-) create mode 100644 tests/align/talign_heap.nim diff --git a/lib/system/alloc.nim b/lib/system/alloc.nim index c40f808b88..925f20d906 100644 --- a/lib/system/alloc.nim +++ b/lib/system/alloc.nim @@ -127,11 +127,12 @@ type # reaches dealloc while the source chunk is active. # Instead, the receiving chunk gains the capacity and thus reserves space in the foreign chunk. acc: uint32 # Offset from data, used when there are no free cells available but the chunk is considered free. - foreignCells: int # When a free cell is given to a chunk that is not its origin, + foreignCells: int32 # When a free cell is given to a chunk that is not its origin, # both the cell and the source chunk are considered foreign. # Receiving a foreign cell can happen both when deallocating from another thread or when # the active chunk in `a.freeSmallChunks` is not the current chunk. # Freeing a chunk while `foreignCells > 0` leaks memory as all references to it become lost. + chunkAlignOff: int32 # Byte offset from `data` where cells begin. Non-zero for alignment > MemAlign. data {.align: MemAlign.}: UncheckedArray[byte] # start of usable memory BigChunk = object of BaseChunk # not necessarily > PageSize! @@ -472,8 +473,8 @@ iterator allObjects(m: var MemRegion): pointer {.inline.} = var c = cast[PSmallChunk](c) let size = c.size - var a = cast[int](addr(c.data)) - let limit = a + c.acc.int + var a = cast[int](addr(c.data)) + c.chunkAlignOff.int + let limit = cast[int](addr(c.data)) + c.acc.int while a <% limit: yield cast[pointer](a) a = a +% size @@ -851,6 +852,15 @@ when defined(heaptrack): proc heaptrack_malloc(a: pointer, size: int) {.cdecl, importc, dynlib: heaptrackLib.} proc heaptrack_free(a: pointer) {.cdecl, importc, dynlib: heaptrackLib.} +proc smallChunkAlignOffset(alignment: int): int {.inline.} = + ## Compute the initial data offset so that data + result + sizeof(FreeCell) + ## is alignment-aligned within a page-aligned small chunk. + if alignment <= MemAlign: + result = 0 + else: + result = align(smallChunkOverhead() + sizeof(FreeCell), alignment) - + smallChunkOverhead() - sizeof(FreeCell) + proc bigChunkAlignOffset(alignment: int): int {.inline.} = ## Compute the alignment offset for big chunk data. if alignment == 0: @@ -863,14 +873,13 @@ proc rawAlloc(a: var MemRegion, requestedSize: int, alignment: int = 0): pointer inc(a.allocCounter) sysAssert(allocInv(a), "rawAlloc: begin") sysAssert(roundup(65, 8) == 72, "rawAlloc: roundup broken") - var size = roundup(requestedSize, MemAlign) + var size = roundup(requestedSize, max(MemAlign, alignment)) + let alignOff = smallChunkAlignOffset(alignment) sysAssert(size >= sizeof(FreeCell), "rawAlloc: requested size too small") sysAssert(size >= requestedSize, "insufficient allocated size!") #c_fprintf(stdout, "alloc; size: %ld; %ld\n", requestedSize, size) - # For custom alignments > MemAlign, force big chunk allocation - # Small chunks cannot handle arbitrary alignments due to fixed cell boundaries - if size <= SmallChunkSize-smallChunkOverhead() and alignment == 0: + if size + alignOff <= SmallChunkSize-smallChunkOverhead(): template fetchSharedCells(tc: PSmallChunk) = # Consumes cells from (potentially) foreign threads from `a.sharedFreeLists[s]` when defined(gcDestructors): @@ -888,16 +897,19 @@ proc rawAlloc(a: var MemRegion, requestedSize: int, alignment: int = 0): pointer # allocate a small block: for small chunks, we use only its next pointer let s = size div MemAlign var c = a.freeSmallChunks[s] + if c != nil and c.chunkAlignOff != alignOff.int32: + c = nil if c == nil: # There is no free chunk of the requested size available, we need a new one. c = getSmallChunk(a) # init all fields in case memory didn't get zeroed c.freeList = nil c.foreignCells = 0 + c.chunkAlignOff = alignOff.int32 sysAssert c.size == PageSize, "rawAlloc 3" c.size = size - c.acc = size.uint32 - c.free = SmallChunkSize - smallChunkOverhead() - size.int32 + c.acc = (alignOff + size).uint32 + c.free = SmallChunkSize - smallChunkOverhead() - alignOff.int32 - size.int32 sysAssert c.owner == addr(a), "rawAlloc: No owner set!" c.next = nil c.prev = nil @@ -908,7 +920,7 @@ proc rawAlloc(a: var MemRegion, requestedSize: int, alignment: int = 0): pointer # Because removals from `a.freeSmallChunks[s]` only happen in the other alloc branch and during dealloc, # we must not add it to the list if it cannot be used the next time a pointer of `size` bytes is needed. listAdd(a.freeSmallChunks[s], c) - result = addr(c.data) + result = addr(c.data) +! alignOff sysAssert((cast[int](result) and (MemAlign-1)) == 0, "rawAlloc 4") else: # There is a free chunk of the requested size available, use it. @@ -950,7 +962,7 @@ proc rawAlloc(a: var MemRegion, requestedSize: int, alignment: int = 0): pointer sysAssert(allocInv(a), "rawAlloc: before listRemove test") listRemove(a.freeSmallChunks[s], c) sysAssert(allocInv(a), "rawAlloc: end listRemove test") - sysAssert(((cast[int](result) and PageMask) - smallChunkOverhead()) %% + sysAssert(((cast[int](result) and PageMask) - smallChunkOverhead() - c.chunkAlignOff) %% size == 0, "rawAlloc 21") sysAssert(allocInv(a), "rawAlloc: end small size") inc a.occ, size @@ -1018,7 +1030,7 @@ proc rawDealloc(a: var MemRegion, p: pointer) = dec a.occ, s untrackSize(s) sysAssert a.occ >= 0, "rawDealloc: negative occupied memory (case A)" - sysAssert(((cast[int](p) and PageMask) - smallChunkOverhead()) %% + sysAssert(((cast[int](p) and PageMask) - smallChunkOverhead() - c.chunkAlignOff) %% s == 0, "rawDealloc 3") when not defined(gcDestructors): #echo("setting to nil: ", $cast[int](addr(f.zeroField))) @@ -1029,7 +1041,8 @@ proc rawDealloc(a: var MemRegion, p: pointer) = nimSetMem(cast[pointer](cast[int](p) +% sizeof(FreeCell)), -1'i32, s -% sizeof(FreeCell)) let activeChunk = a.freeSmallChunks[s div MemAlign] - if activeChunk != nil and c != activeChunk: + if activeChunk != nil and c != activeChunk and + activeChunk.chunkAlignOff == c.chunkAlignOff: # This pointer is not part of the active chunk, lend it out # and do not adjust the current chunk (same logic as compensateCounters.) # Put the cell into the active chunk, @@ -1076,7 +1089,7 @@ proc rawDealloc(a: var MemRegion, p: pointer) = when defined(gcDestructors): addToSharedFreeList(c, f, s div MemAlign) - sysAssert(((cast[int](p) and PageMask) - smallChunkOverhead()) %% + sysAssert(((cast[int](p) and PageMask) - smallChunkOverhead() - c.chunkAlignOff) %% s == 0, "rawDealloc 2") else: # set to 0xff to check for usage after free bugs: @@ -1102,8 +1115,11 @@ when not defined(gcDestructors): var c = cast[PSmallChunk](c) var offset = (cast[int](p) and (PageSize-1)) -% smallChunkOverhead() - result = (c.acc.int >% offset) and (offset %% c.size == 0) and - (cast[ptr FreeCell](p).zeroField >% 1) + if c.acc.int >% offset: + let ao = c.chunkAlignOff.int + result = (offset >= ao) and + ((offset -% ao) %% c.size == 0) and + (cast[ptr FreeCell](p).zeroField >% 1) else: var c = cast[PBigChunk](c) # prev stores the aligned data pointer set during rawAlloc @@ -1122,11 +1138,12 @@ when not defined(gcDestructors): var c = cast[PSmallChunk](c) var offset = (cast[int](p) and (PageSize-1)) -% smallChunkOverhead() - if c.acc.int >% offset: + let ao = c.chunkAlignOff.int + if c.acc.int >% offset and offset >= ao: sysAssert(cast[int](addr(c.data)) +% offset == cast[int](p), "offset is not what you think it is") var d = cast[ptr FreeCell](cast[int](addr(c.data)) +% - offset -% (offset %% c.size)) + ao +% ((offset -% ao) -% ((offset -% ao) %% c.size))) if d.zeroField >% 1: result = d sysAssert isAllocatedPtr(a, result), " result wrong pointer!" @@ -1257,6 +1274,20 @@ template instantiateForRegion(allocator: untyped) {.dirty.} = proc alloc0Impl(size: Natural): pointer = result = alloc0(allocator, size) + when defined(gcOrc) or defined(gcYrc): + proc nimAlignedAlloc0(size: Natural, alignment: int): pointer = + incStat(allocCount) + result = rawAlloc(allocator, size, alignment) + zeroMem(result, size) + + proc nimAlignedAlloc(size: Natural, alignment: int): pointer = + incStat(allocCount) + result = rawAlloc(allocator, size, alignment) + + proc nimAlignedDealloc(p: pointer) = + incStat(deallocCount) + rawDealloc(allocator, p) + proc deallocImpl(p: pointer) = dealloc(allocator, p) diff --git a/lib/system/arc.nim b/lib/system/arc.nim index 3ac84be3bb..eea0468e82 100644 --- a/lib/system/arc.nim +++ b/lib/system/arc.nim @@ -92,12 +92,27 @@ else: when not defined(nimHasQuirky): {.pragma: quirky.} +# Forward declarations for native allocator alignment (implemented in alloc.nim). +# rawAlloc's contract: result + sizeof(FreeCell) is alignment-aligned. +# For ORC/YRC, sizeof(FreeCell) == sizeof(RefHeader). +const useNativeAlignedAlloc = (defined(gcOrc) or defined(gcYrc)) and + not defined(useMalloc) and not defined(nimscript) and + not defined(nimdoc) and not defined(useNimRtl) + +when useNativeAlignedAlloc: + proc nimAlignedAlloc0(size: Natural, alignment: int): pointer {.gcsafe, raises: [].} + proc nimAlignedAlloc(size: Natural, alignment: int): pointer {.gcsafe, raises: [].} + proc nimAlignedDealloc(p: pointer) {.gcsafe, raises: [].} + proc nimNewObj(size, alignment: int): pointer {.compilerRtl.} = - let hdrSize = align(sizeof(RefHeader), alignment) - let s = size +% hdrSize - when defined(nimscript): + when defined(nimscript) or defined(nimdoc): discard + elif useNativeAlignedAlloc: + let s = size +% sizeof(RefHeader) + result = nimAlignedAlloc0(s, alignment) +! sizeof(RefHeader) else: + let hdrSize = align(sizeof(RefHeader), alignment) + let s = size +% hdrSize result = alignedAlloc0(s, alignment) +! hdrSize when defined(nimArcDebug) or defined(nimArcIds): head(result).refId = gRefId @@ -111,12 +126,14 @@ proc nimNewObj(size, alignment: int): pointer {.compilerRtl.} = proc nimNewObjUninit(size, alignment: int): pointer {.compilerRtl.} = # Same as 'newNewObj' but do not initialize the memory to zero. - # The codegen proved for us that this is not necessary. - let hdrSize = align(sizeof(RefHeader), alignment) - let s = size + hdrSize - when defined(nimscript): + when defined(nimscript) or defined(nimdoc): discard + elif useNativeAlignedAlloc: + let s = size + sizeof(RefHeader) + result = cast[ptr RefHeader](nimAlignedAlloc(s, alignment) +! sizeof(RefHeader)) else: + let hdrSize = align(sizeof(RefHeader), alignment) + let s = size + hdrSize result = cast[ptr RefHeader](alignedAlloc(s, alignment) +! hdrSize) head(result).rc = 0 when defined(gcOrc) or defined(gcYrc): @@ -189,8 +206,11 @@ proc nimRawDispose(p: pointer, alignment: int) {.compilerRtl.} = if freedCells.data == nil: init(freedCells) freedCells.incl head(p) else: - let hdrSize = align(sizeof(RefHeader), alignment) - alignedDealloc(p -! hdrSize, alignment) + when useNativeAlignedAlloc: + nimAlignedDealloc(p -! sizeof(RefHeader)) + else: + let hdrSize = align(sizeof(RefHeader), alignment) + alignedDealloc(p -! hdrSize, alignment) template `=dispose`*[T](x: owned(ref T)) = nimRawDispose(cast[pointer](x), T.alignOf) #proc dispose*(x: pointer) = nimRawDispose(x) diff --git a/tests/align/talign_heap.nim b/tests/align/talign_heap.nim new file mode 100644 index 0000000000..65abf728ac --- /dev/null +++ b/tests/align/talign_heap.nim @@ -0,0 +1,23 @@ +discard """ + matrix: "--mm:refc; --mm:orc; --mm:arc" + targets: "c cpp" + output: "ok" +""" + +# Test that heap-allocated objects with .align use small chunks, +# not a big chunk per object (regression test for #25577). +type U = object + d {.align: 32.}: int8 + +var e: seq[ref U] +for _ in 0 ..< 10000: e.add(new U) + +# Without small-chunk alignment, each object gets its own page (~46 MB). +# With the fix, 10000 objects fit in ~1-3 MB depending on the GC. +doAssert getTotalMem() < 8 * 1024 * 1024, "align:32 heap objects use too much memory" + +# Verify alignment is actually correct +for i in 0 ..< e.len: + doAssert (cast[int](addr e[i].d) and 31) == 0, "field not 32-byte aligned" + +echo "ok"