Allocator: Always place free cells into the active chunk and add documentation (#23871)

Lets single-threaded applications benefit from tracking foreign cells as
well.
After this, `SmallChunk` technically no longer needs to act as a linked
list, I think; that needs further investigation though.
The likelihood of overflowing `chunk.free` also rises. To work around
that, it might make sense to check `foreignCells` instead of adjusting the
free space, or to replace `free` with a counter for the local capacity.

When compiling Nim I can observe a ~10 MB reduction, and smaller ones for
other projects.

(cherry picked from commit 881fbb8f81)
This commit is contained in:
SirOlaf
2024-07-22 16:36:46 +02:00
committed by narimiran
parent 179ae267e9
commit 7b834b94da

View File

@@ -20,6 +20,37 @@ template track(op, address, size) =
# We manage *chunks* of memory. Each chunk is a multiple of the page size.
# Each chunk starts at an address that is divisible by the page size.
# Small chunks may be divided into smaller cells of reusable pointers to reduce the number of page allocations.
# An allocation of a small pointer looks approximately like this
#[
alloc -> rawAlloc -> No free chunk available -> Request a new page from tlsf -> result = chunk.data ------------+
| |
v |
Free chunk available |
| |
v v
Fetch shared cells -> No free cells available -> Advance acc -> result = chunk.data + chunk.acc -------> return
(may not add new cells) ^
| |
v |
Free cells available -> result = chunk.freeList -> Advance chunk.freeList -----------------------------------+
]#
# so it is split into 3 paths, where the last path is preferred to prevent unnecessary allocations.
#
#
# A deallocation of a small pointer then looks like this
#[
dealloc -> rawDealloc -> chunk.owner == addr(a) --------------> This thread owns the chunk ------> The current chunk is active -> Chunk is completely unused -----> Chunk references no foreign cells
| | (Add cell into the current chunk) | Return the current chunk back to tlsf
| | | |
v v v v
A different thread owns this chunk. The current chunk is not active. chunk.free was < size Chunk references foreign cells, noop
Add the cell to a.sharedFreeLists Add the cell into the active chunk Activate the chunk (end)
(end) (end) (end)
]#
# So "true" deallocation is delayed for as long as possible in favor of reusing cells.
const
nimMinHeapPages {.intdefine.} = 128 # 0.5 MB
@@ -71,6 +102,8 @@ const
type
FreeCell {.final, pure.} = object
# A free cell is a pointer that has been freed, meaning it became available for reuse.
# It may become foreign if it is lent to a chunk that did not create it; doing so reduces the number of pages needed.
next: ptr FreeCell # next free cell in chunk (overlaid with refcount)
when not defined(gcDestructors):
zeroField: int # 0 means cell is not used (overlaid with typ field)
@@ -90,11 +123,18 @@ type
SmallChunk = object of BaseChunk
next, prev: PSmallChunk # chunks of the same size
freeList: ptr FreeCell
free: int32 # how many bytes remain
acc: uint32 # accumulator for small object allocation
foreignCells: int # Number of deferred free cells from other threads this chunk stole from sharedFreeLists.
# Freeing the chunk before this is zero means the stolen cells become inaccessible permanently.
freeList: ptr FreeCell # Singly linked list of cells. They may be from foreign chunks or from the current chunk.
# Should be `nil` when the chunk isn't active in `a.freeSmallChunks`.
free: int32 # Bytes this chunk is able to provide using both the accumulator and free cells.
# When a cell is considered foreign, its source chunk's free field is NOT adjusted until it
# reaches dealloc while the source chunk is active.
# Instead, the receiving chunk gains the capacity and thus reserves space in the foreign chunk.
acc: uint32 # Offset from data, used when there are no free cells available but the chunk is considered free.
foreignCells: int # When a free cell is given to a chunk that is not its origin,
# both the cell and the source chunk are considered foreign.
# Receiving a foreign cell can happen both when deallocating from another thread or when
# the active chunk in `a.freeSmallChunks` is not the current chunk.
# Freeing a chunk while `foreignCells > 0` leaks memory, as all references to its foreign cells are lost.
data {.align: MemAlign.}: UncheckedArray[byte] # start of usable memory
BigChunk = object of BaseChunk # not necessarily > PageSize!
@@ -110,8 +150,11 @@ type
when not defined(gcDestructors):
minLargeObj, maxLargeObj: int
freeSmallChunks: array[0..max(1, SmallChunkSize div MemAlign-1), PSmallChunk]
# List of available chunks per size class. Only one is expected to be active per class.
when defined(gcDestructors):
sharedFreeLists: array[0..max(1, SmallChunkSize div MemAlign-1), ptr FreeCell]
# When a thread frees a pointer it did not create, it must not adjust the counters.
# Instead, the cell is placed here and deferred until the next allocation.
flBitmap: uint32
slBitmap: array[RealFli, uint32]
matrix: array[RealFli, array[MaxSli, PBigChunk]]
@@ -794,6 +837,8 @@ when defined(gcDestructors):
inc total, size
let chunk = cast[PSmallChunk](pageAddr(it))
if c != chunk:
# The cell is foreign, potentially even from a foreign thread.
# It must block the current chunk from being freed, as doing so would leak memory.
inc c.foreignCells
it = it.next
# By not adjusting the foreign chunk we reserve space in it to prevent deallocation
@@ -828,6 +873,7 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
if size <= SmallChunkSize-smallChunkOverhead():
template fetchSharedCells(tc: PSmallChunk) =
# Consumes cells from (potentially) foreign threads from `a.sharedFreeLists[s]`
when defined(gcDestructors):
if tc.freeList == nil:
when hasThreadSupport:
@@ -836,13 +882,17 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
else:
tc.freeList = a.sharedFreeLists[s]
a.sharedFreeLists[s] = nil
# if `tc.freeList` isn't nil, `tc` will gain capacity.
# We must calculate how much it gained and how many foreign cells are included.
compensateCounters(a, tc, size)
# allocate a small block: for small chunks, we use only its next pointer
let s = size div MemAlign
var c = a.freeSmallChunks[s]
if c == nil:
# There is no free chunk of the requested size available, we need a new one.
c = getSmallChunk(a)
# init all fields in case memory didn't get zeroed
c.freeList = nil
c.foreignCells = 0
sysAssert c.size == PageSize, "rawAlloc 3"
@@ -852,12 +902,17 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
sysAssert c.owner == addr(a), "rawAlloc: No owner set!"
c.next = nil
c.prev = nil
# Shared cells are fetched here in case `c.size * 2 >= SmallChunkSize - smallChunkOverhead()`.
# For those single cell chunks, we would otherwise have to allocate a new one almost every time.
fetchSharedCells(c)
if c.free >= size:
# Because removals from `a.freeSmallChunks[s]` only happen in the other alloc branch and during dealloc,
# we must not add it to the list if it cannot be used the next time a pointer of `size` bytes is needed.
listAdd(a.freeSmallChunks[s], c)
result = addr(c.data)
sysAssert((cast[int](result) and (MemAlign-1)) == 0, "rawAlloc 4")
else:
# There is a free chunk of the requested size available, use it.
sysAssert(allocInv(a), "rawAlloc: begin c != nil")
sysAssert c.next != c, "rawAlloc 5"
#if c.size != size:
@@ -869,24 +924,30 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer =
result = cast[pointer](cast[int](addr(c.data)) +% c.acc.int)
inc(c.acc, size)
else:
# There are free cells available, prefer them over the accumulator
result = c.freeList
when not defined(gcDestructors):
sysAssert(c.freeList.zeroField == 0, "rawAlloc 8")
c.freeList = c.freeList.next
if cast[PSmallChunk](pageAddr(result)) != c:
# This cell isn't a blocker for the current chunk anymore
# This cell isn't a blocker for the current chunk's deallocation anymore
dec(c.foreignCells)
else:
sysAssert(c == cast[PSmallChunk](pageAddr(result)), "Bad cell")
sysAssert(c == cast[PSmallChunk](pageAddr(result)), "rawAlloc: Bad cell")
# Even if the cell we return is foreign, the local chunk's capacity decreases.
# The capacity was previously reserved in the source chunk (when it first got allocated),
# then added into the current chunk during dealloc,
# so the source chunk will not be freed or leak memory because of this.
dec(c.free, size)
sysAssert((cast[int](result) and (MemAlign-1)) == 0, "rawAlloc 9")
sysAssert(allocInv(a), "rawAlloc: end c != nil")
# We fetch deferred cells *after* advancing c.freeList/acc to adjust c.free.
# We fetch deferred cells *after* advancing `c.freeList`/`acc` to adjust `c.free`.
# If after the adjustment it turns out there's free cells available,
# the chunk stays in a.freeSmallChunks[s] and the need for a new chunk is delayed.
# the chunk stays in `a.freeSmallChunks[s]` and the need for a new chunk is delayed.
fetchSharedCells(c)
sysAssert(allocInv(a), "rawAlloc: before c.free < size")
if c.free < size:
# Even after fetching shared cells the chunk has no usable memory left. It is no longer the active chunk
sysAssert(allocInv(a), "rawAlloc: before listRemove test")
listRemove(a.freeSmallChunks[s], c)
sysAssert(allocInv(a), "rawAlloc: end listRemove test")
@@ -952,23 +1013,37 @@ proc rawDealloc(a: var MemRegion, p: pointer) =
#echo("setting to nil: ", $cast[int](addr(f.zeroField)))
sysAssert(f.zeroField != 0, "rawDealloc 1")
f.zeroField = 0
f.next = c.freeList
c.freeList = f
when overwriteFree:
# set to 0xff to check for usage after free bugs:
nimSetMem(cast[pointer](cast[int](p) +% sizeof(FreeCell)), -1'i32,
s -% sizeof(FreeCell))
# check if it is not in the freeSmallChunks[s] list:
if c.free < s:
# add it to the freeSmallChunks[s] array:
listAdd(a.freeSmallChunks[s div MemAlign], c)
inc(c.free, s)
let activeChunk = a.freeSmallChunks[s div MemAlign]
if activeChunk != nil and c != activeChunk:
# This pointer is not part of the active chunk, lend it out
# and do not adjust the current chunk (same logic as compensateCounters.)
# Putting the cell into the active chunk
# may prevent a queue of available chunks from forming in `a.freeSmallChunks[s div MemAlign]`.
# This queue would otherwise waste memory in the form of free cells until we return to those chunks.
f.next = activeChunk.freeList
activeChunk.freeList = f # lend the cell
inc(activeChunk.free, s) # By not adjusting the current chunk's capacity it is prevented from being freed
inc(activeChunk.foreignCells) # The cell is now considered foreign from the perspective of the active chunk
else:
inc(c.free, s)
if c.free == SmallChunkSize-smallChunkOverhead() and c.foreignCells == 0:
listRemove(a.freeSmallChunks[s div MemAlign], c)
c.size = SmallChunkSize
freeBigChunk(a, cast[PBigChunk](c))
f.next = c.freeList
c.freeList = f
if c.free < s:
# The chunk could not have been active as it didn't have enough space to give
listAdd(a.freeSmallChunks[s div MemAlign], c)
inc(c.free, s)
else:
inc(c.free, s)
# Free only if the entire chunk is unused and there are no borrowed cells.
# If the chunk were freed while it still references foreign cells,
# the foreign chunks would leak memory and could never be freed.
if c.free == SmallChunkSize-smallChunkOverhead() and c.foreignCells == 0:
listRemove(a.freeSmallChunks[s div MemAlign], c)
c.size = SmallChunkSize
freeBigChunk(a, cast[PBigChunk](c))
else:
when logAlloc: cprintf("dealloc(pointer_%p) # SMALL FROM %p CALLER %p\n", p, c.owner, addr(a))