diff --git a/lib/system/alloc.nim b/lib/system/alloc.nim index 4db74bae29..3de6d87137 100644 --- a/lib/system/alloc.nim +++ b/lib/system/alloc.nim @@ -20,6 +20,37 @@ template track(op, address, size) = # We manage *chunks* of memory. Each chunk is a multiple of the page size. # Each chunk starts at an address that is divisible by the page size. +# Small chunks may be divided into smaller cells of reusable pointers to reduce the number of page allocations. + +# An allocation of a small pointer looks approximately like this +#[ + + alloc -> rawAlloc -> No free chunk available > Request a new page from tslf -> result = chunk.data -------------+ + | | + v | + Free chunk available | + | | + v v + Fetch shared cells -> No free cells available -> Advance acc -> result = chunk.data + chunk.acc -------> return + (may not add new cells) ^ + | | + v | + Free cells available -> result = chunk.freeList -> Advance chunk.freeList -----------------------------------+ +]# +# so it is split into 3 paths, where the last path is preferred to prevent unnecessary allocations. +# +# +# A deallocation of a small pointer then looks like this +#[ + dealloc -> rawDealloc -> chunk.owner == addr(a) --------------> This thread owns the chunk ------> The current chunk is active -> Chunk is completely unused -----> Chunk references no foreign cells + | | (Add cell into the current chunk) | Return the current chunk back to tlsf + | | | | + v v v v + A different thread owns this chunk. The current chunk is not active. chunk.free was < size Chunk references foreign cells, noop + Add the cell to a.sharedFreeLists Add the cell into the active chunk Activate the chunk (end) + (end) (end) (end) +]# +# So "true" deallocation is delayed for as long as possible in favor of reusing cells. const nimMinHeapPages {.intdefine.} = 128 # 0.5 MB @@ -71,6 +102,8 @@ const type FreeCell {.final, pure.} = object + # A free cell is a pointer that has been freed, meaning it became available for reuse. + # It may become foreign if it is lent to a chunk that did not create it, doing so reduces the amount of needed pages. next: ptr FreeCell # next free cell in chunk (overlaid with refcount) when not defined(gcDestructors): zeroField: int # 0 means cell is not used (overlaid with typ field) @@ -90,11 +123,18 @@ type SmallChunk = object of BaseChunk next, prev: PSmallChunk # chunks of the same size - freeList: ptr FreeCell - free: int32 # how many bytes remain - acc: uint32 # accumulator for small object allocation - foreignCells: int # Number of deferred free cells from other threads this chunk stole from sharedFreeLists. - # Freeing the chunk before this is zero means the stolen cells become inaccessible permanently. + freeList: ptr FreeCell # Singly linked list of cells. They may be from foreign chunks or from the current chunk. + # Should be `nil` when the chunk isn't active in `a.freeSmallChunks`. + free: int32 # Bytes this chunk is able to provide using both the accumulator and free cells. + # When a cell is considered foreign, its source chunk's free field is NOT adjusted until it + # reaches dealloc while the source chunk is active. + # Instead, the receiving chunk gains the capacity and thus reserves space in the foreign chunk. + acc: uint32 # Offset from data, used when there are no free cells available but the chunk is considered free. + foreignCells: int # When a free cell is given to a chunk that is not its origin, + # both the cell and the source chunk are considered foreign. + # Receiving a foreign cell can happen both when deallocating from another thread or when + # the active chunk in `a.freeSmallChunks` is not the current chunk. + # Freeing a chunk while `foreignCells > 0` leaks memory as all references to it become lost. data {.align: MemAlign.}: UncheckedArray[byte] # start of usable memory BigChunk = object of BaseChunk # not necessarily > PageSize! @@ -110,8 +150,11 @@ type when not defined(gcDestructors): minLargeObj, maxLargeObj: int freeSmallChunks: array[0..max(1, SmallChunkSize div MemAlign-1), PSmallChunk] + # List of available chunks per size class. Only one is expected to be active per class. when defined(gcDestructors): sharedFreeLists: array[0..max(1, SmallChunkSize div MemAlign-1), ptr FreeCell] + # When a thread frees a pointer it did not create, it must not adjust the counters. + # Instead, the cell is placed here and deferred until the next allocation. flBitmap: uint32 slBitmap: array[RealFli, uint32] matrix: array[RealFli, array[MaxSli, PBigChunk]] @@ -794,6 +837,8 @@ when defined(gcDestructors): inc total, size let chunk = cast[PSmallChunk](pageAddr(it)) if c != chunk: + # The cell is foreign, potentially even from a foreign thread. + # It must block the current chunk from being freed, as doing so would leak memory. inc c.foreignCells it = it.next # By not adjusting the foreign chunk we reserve space in it to prevent deallocation @@ -828,6 +873,7 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer = if size <= SmallChunkSize-smallChunkOverhead(): template fetchSharedCells(tc: PSmallChunk) = + # Consumes cells from (potentially) foreign threads from `a.sharedFreeLists[s]` when defined(gcDestructors): if tc.freeList == nil: when hasThreadSupport: @@ -836,13 +882,17 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer = else: tc.freeList = a.sharedFreeLists[s] a.sharedFreeLists[s] = nil + # if `tc.freeList` isn't nil, `tc` will gain capacity. + # We must calculate how much it gained and how many foreign cells are included. compensateCounters(a, tc, size) # allocate a small block: for small chunks, we use only its next pointer let s = size div MemAlign var c = a.freeSmallChunks[s] if c == nil: + # There is no free chunk of the requested size available, we need a new one. c = getSmallChunk(a) + # init all fields in case memory didn't get zeroed c.freeList = nil c.foreignCells = 0 sysAssert c.size == PageSize, "rawAlloc 3" @@ -852,12 +902,17 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer = sysAssert c.owner == addr(a), "rawAlloc: No owner set!" c.next = nil c.prev = nil + # Shared cells are fetched here in case `c.size * 2 >= SmallChunkSize - smallChunkOverhead()`. + # For those single cell chunks, we would otherwise have to allocate a new one almost every time. fetchSharedCells(c) if c.free >= size: + # Because removals from `a.freeSmallChunks[s]` only happen in the other alloc branch and during dealloc, + # we must not add it to the list if it cannot be used the next time a pointer of `size` bytes is needed. listAdd(a.freeSmallChunks[s], c) result = addr(c.data) sysAssert((cast[int](result) and (MemAlign-1)) == 0, "rawAlloc 4") else: + # There is a free chunk of the requested size available, use it. sysAssert(allocInv(a), "rawAlloc: begin c != nil") sysAssert c.next != c, "rawAlloc 5" #if c.size != size: @@ -869,24 +924,30 @@ proc rawAlloc(a: var MemRegion, requestedSize: int): pointer = result = cast[pointer](cast[int](addr(c.data)) +% c.acc.int) inc(c.acc, size) else: + # There are free cells available, prefer them over the accumulator result = c.freeList when not defined(gcDestructors): sysAssert(c.freeList.zeroField == 0, "rawAlloc 8") c.freeList = c.freeList.next if cast[PSmallChunk](pageAddr(result)) != c: - # This cell isn't a blocker for the current chunk anymore + # This cell isn't a blocker for the current chunk's deallocation anymore dec(c.foreignCells) else: - sysAssert(c == cast[PSmallChunk](pageAddr(result)), "Bad cell") + sysAssert(c == cast[PSmallChunk](pageAddr(result)), "rawAlloc: Bad cell") + # Even if the cell we return is foreign, the local chunk's capacity decreases. + # The capacity was previously reserved in the source chunk (when it first got allocated), + # then added into the current chunk during dealloc, + # so the source chunk will not be freed or leak memory because of this. dec(c.free, size) sysAssert((cast[int](result) and (MemAlign-1)) == 0, "rawAlloc 9") sysAssert(allocInv(a), "rawAlloc: end c != nil") - # We fetch deferred cells *after* advancing c.freeList/acc to adjust c.free. + # We fetch deferred cells *after* advancing `c.freeList`/`acc` to adjust `c.free`. # If after the adjustment it turns out there's free cells available, - # the chunk stays in a.freeSmallChunks[s] and the need for a new chunk is delayed. + # the chunk stays in `a.freeSmallChunks[s]` and the need for a new chunk is delayed. fetchSharedCells(c) sysAssert(allocInv(a), "rawAlloc: before c.free < size") if c.free < size: + # Even after fetching shared cells the chunk has no usable memory left. It is no longer the active chunk sysAssert(allocInv(a), "rawAlloc: before listRemove test") listRemove(a.freeSmallChunks[s], c) sysAssert(allocInv(a), "rawAlloc: end listRemove test") @@ -952,23 +1013,37 @@ proc rawDealloc(a: var MemRegion, p: pointer) = #echo("setting to nil: ", $cast[int](addr(f.zeroField))) sysAssert(f.zeroField != 0, "rawDealloc 1") f.zeroField = 0 - f.next = c.freeList - c.freeList = f when overwriteFree: # set to 0xff to check for usage after free bugs: nimSetMem(cast[pointer](cast[int](p) +% sizeof(FreeCell)), -1'i32, s -% sizeof(FreeCell)) - # check if it is not in the freeSmallChunks[s] list: - if c.free < s: - # add it to the freeSmallChunks[s] array: - listAdd(a.freeSmallChunks[s div MemAlign], c) - inc(c.free, s) + let activeChunk = a.freeSmallChunks[s div MemAlign] + if activeChunk != nil and c != activeChunk: + # This pointer is not part of the active chunk, lend it out + # and do not adjust the current chunk (same logic as compensateCounters.) + # Put the cell into the active chunk, + # may prevent a queue of available chunks from forming in a.freeSmallChunks[s div MemAlign]. + # This queue would otherwise waste memory in the form of free cells until we return to those chunks. + f.next = activeChunk.freeList + activeChunk.freeList = f # lend the cell + inc(activeChunk.free, s) # By not adjusting the current chunk's capacity it is prevented from being freed + inc(activeChunk.foreignCells) # The cell is now considered foreign from the perspective of the active chunk else: - inc(c.free, s) - if c.free == SmallChunkSize-smallChunkOverhead() and c.foreignCells == 0: - listRemove(a.freeSmallChunks[s div MemAlign], c) - c.size = SmallChunkSize - freeBigChunk(a, cast[PBigChunk](c)) + f.next = c.freeList + c.freeList = f + if c.free < s: + # The chunk could not have been active as it didn't have enough space to give + listAdd(a.freeSmallChunks[s div MemAlign], c) + inc(c.free, s) + else: + inc(c.free, s) + # Free only if the entire chunk is unused and there are no borrowed cells. + # If the chunk were to be freed while it references foreign cells, + # the foreign chunks will leak memory and can never be freed. + if c.free == SmallChunkSize-smallChunkOverhead() and c.foreignCells == 0: + listRemove(a.freeSmallChunks[s div MemAlign], c) + c.size = SmallChunkSize + freeBigChunk(a, cast[PBigChunk](c)) else: when logAlloc: cprintf("dealloc(pointer_%p) # SMALL FROM %p CALLER %p\n", p, c.owner, addr(a))