#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2009 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

# Low level allocator for Nimrod.
# TODO: 
# - eliminate "used" field
# - make searching for block O(1)

proc raiseOutOfMem {.noinline.} =
  # Called by the page allocation procs when the operating system does not
  # hand out memory. The failing assertion comes first; after it the
  # process is terminated with exit code 1.
  assert(false)
  quit(1)

# ------------ platform specific chunk allocation code -----------------------

when defined(posix): 
  # Hard-coded protection and mapping flags that ``osAllocPages`` hands
  # to ``mmap``.
  const # XXX: make these variables for portability?
    PROT_READ  = 1             # page can be read 
    PROT_WRITE = 2             # page can be written 
    MAP_PRIVATE = 2            # Changes are private 
  
  # The numeric value of MAP_ANONYMOUS depends on the operating system:
  when defined(linux):
    const MAP_ANONYMOUS = 0x20       # don't use a file
  elif defined(macosx):
    const MAP_ANONYMOUS = 0x1000     # same meaning, different value
  else:
    const MAP_ANONYMOUS = 0 # other operating systems may not know about this

  # Bindings to the mapping functions declared in <sys/mman.h>.
  proc mmap(adr: pointer, len: int, prot, flags, fildes: cint,
            off: int): pointer {.header: "<sys/mman.h>".}

  # NOTE(review): declared without a result, so callers in this file cannot
  # observe whether unmapping succeeded.
  proc munmap(adr: pointer, len: int) {.header: "<sys/mman.h>".}
  
  proc osAllocPages(size: int): pointer {.inline.} = 
    # Ask the OS for `size` bytes of private, readable and writable memory.
    # `raiseOutOfMem` is called when no mapping could be created.
    const
      protection = PROT_READ or PROT_WRITE
      mapping = MAP_PRIVATE or MAP_ANONYMOUS
    result = mmap(nil, size, protection, mapping, -1, 0)
    # both nil and the all-ones address are treated as failure:
    if result == cast[pointer](-1) or result == nil:
      raiseOutOfMem()
      
  proc osDeallocPages(p: pointer, size: int) {.inline.} =
    # Give the `size` bytes starting at `p` back to the operating system.
    munmap(p, size)
  
elif defined(windows): 
  # Hard-coded flag values for ``VirtualAlloc`` and ``VirtualFree``.
  # NOTE(review): MEM_TOP_DOWN and MEM_DECOMMIT are not referenced by the
  # procs visible in this file.
  const
    MEM_RESERVE = 0x2000 
    MEM_COMMIT = 0x1000
    MEM_TOP_DOWN = 0x100000
    PAGE_READWRITE = 0x04

    MEM_DECOMMIT = 0x4000
    MEM_RELEASE = 0x8000

  # Bindings to the virtual memory functions declared in <windows.h>.
  proc VirtualAlloc(lpAddress: pointer, dwSize: int, flAllocationType,
                    flProtect: int32): pointer {.
                    header: "<windows.h>", stdcall.}
  
  # NOTE(review): declared without a result, so callers in this file cannot
  # observe whether the release succeeded.
  proc VirtualFree(lpAddress: pointer, dwSize: int, 
                   dwFreeType: int32) {.header: "<windows.h>", stdcall.}
  
  proc osAllocPages(size: int): pointer {.inline.} = 
    # Reserve and commit `size` bytes of read/write memory in one call.
    # `raiseOutOfMem` is called when ``VirtualAlloc`` returns nil.
    const allocKind = MEM_RESERVE or MEM_COMMIT
    result = VirtualAlloc(nil, size, allocKind, PAGE_READWRITE)
    if result == nil:
      raiseOutOfMem()

  proc osDeallocPages(p: pointer, size: int) {.inline.} = 
    # `size` is deliberately not forwarded: according to Microsoft, 0 is the
    # only correct length when a region is released with MEM_RELEASE.
    const wholeRegion = 0
    VirtualFree(p, wholeRegion, MEM_RELEASE)

else: 
  {.error: "Port memory manager to your platform".}

# --------------------- end of non-portable code -----------------------------

# We manage *chunks* of memory. Each chunk is a multiple of the page size.
# The page size may or may not be the operating system's page size. Each chunk
# starts at an address that is divisible by the page size. Chunks that are
# bigger than ``ChunkOsReturn`` are returned back to the operating system
# immediately.


# Guess the page size of the system; if it is the
# wrong value, performance may be worse (this is not
# for sure though), but GC still works; must be a power of two!
# ``PageShift`` is the base-2 logarithm of the page size the allocator
# works with; addresses are turned into page numbers by shifting with it.
when defined(linux) or defined(windows) or defined(macosx):
  const
    PageShift = 12
    PageSize = 1 shl PageShift # 4096 bytes
else:
  # NOTE(review): the message below misspells "unknown".
  {.error: "unkown page size".}

# Sizes and thresholds derived from the page size.
const
  PageMask = PageSize-1 # selects the offset of an address within its page
  
  SmallChunkSize = PageSize # * 4

  MemAlign = 8 # minimal memory block that can be allocated

  BitsPerPage = PageSize div MemAlign # one bit per MemAlign unit of a page
  UnitsPerPage = BitsPerPage div (sizeof(int)*8)
    # how many ints do we need to describe a page:
    # on 32 bit systems this is only 16 (!)

  ChunkOsReturn = 64 * PageSize # freed chunks of at least this size go to the OS
  InitialMemoryRequest = ChunkOsReturn div 2 # < ChunkOsReturn!
  
  # Compile time options:
  coalescRight = true # merge a freed big chunk with a free right neighbour
  coalescLeft = true # merge a freed big chunk with a free left neighbour

# Layout of the bit vectors ("trunks") used by ``TIntSet``: a set element is
# split into a trunk key (element shr TrunkShift) and a bit position within
# the trunk (element and TrunkMask).
const
  TrunkShift = 9
  BitsPerTrunk = 1 shl TrunkShift # needs to be a power of 2 and divisible by 64
  TrunkMask = BitsPerTrunk - 1
  IntsPerTrunk = BitsPerTrunk div (sizeof(int)*8) # ints needed per trunk
  IntShift = 5 + ord(sizeof(int) == 8) # 5 or 6, depending on int width
  IntMask = 1 shl IntShift - 1 # selects the bit position within one int

# ``TIntSet`` is a fixed-size hash table of trunks; each trunk is a bit
# vector that covers ``BitsPerTrunk`` consecutive integers.
type 
  PTrunk = ptr TTrunk
  TTrunk {.final.} = object 
    next: PTrunk         # next trunk that lives in the same bucket
    key: int             # element value shr TrunkShift
    bits: array[0..IntsPerTrunk-1, int] # a bit vector, one bit per element
  
  TTrunkBuckets = array[0..1023, PTrunk]
  TIntSet {.final.} = object 
    data: TTrunkBuckets  # bucket heads, indexed by `key and high(data)`
  
# Chunk headers. Every chunk starts with the ``TBaseChunk`` fields; the
# ``data`` field of the small and big variants marks where the usable memory
# begins, which is why the overhead templates subtract sizeof(TAlignType).
type
  TAlignType = float     # type of the first usable memory slot of a chunk
  TFreeCell {.final, pure.} = object
    next: ptr TFreeCell  # next free cell in chunk (overlaid with refcount)
    zeroField: pointer   # nil means cell is not used (overlaid with typ field)

  PChunk = ptr TBaseChunk
  PBigChunk = ptr TBigChunk
  PSmallChunk = ptr TSmallChunk
  TBaseChunk {.pure.} = object
    prevSize: int        # size of previous chunk; for coalescing
    size: int            # if < PageSize it is a small chunk
    used: bool           # later will be optimized into prevSize...
  
  TSmallChunk = object of TBaseChunk
    next, prev: PSmallChunk  # chunks of the same size
    freeList: ptr TFreeCell  # cells that have been deallocated again
    free: int            # how many bytes remain    
    acc: int             # accumulator for small object allocation
    data: TAlignType     # start of usable memory
  
  TBigChunk = object of TBaseChunk # not necessarily > PageSize!
    next: PBigChunk      # chunks of the same (or bigger) size
    prev: PBigChunk      # previous chunk in the free list
    data: TAlignType     # start of usable memory

# Bytes of a small chunk that are taken by its header, i.e. everything in
# front of the `data` field.
template smallChunkOverhead(): expr =
  sizeof(TSmallChunk) - sizeof(TAlignType)
# Bytes of a big chunk that are taken by its header, i.e. everything in
# front of the `data` field.
template bigChunkOverhead(): expr =
  sizeof(TBigChunk) - sizeof(TAlignType)

proc roundup(x, v: int): int {.inline.} =
  # Round `x` up to the next multiple of `v`; `v` has to be a power of two.
  # `(-x) and (v-1)` is the distance from `x` to that multiple.
  result = x +% ((-x) and (v - 1))

assert(roundup(14, PageSize) == PageSize)
assert(roundup(15, 8) == 16)

# ------------- chunk table ---------------------------------------------------
# We use a PtrSet of chunk starts and a table[Page, chunksize] for chunk
# endings of big chunks. This is needed by the merging operation. The only
# remaining operation is best-fit for big chunks. Since there is a size-limit
# for big chunks (because greater than the limit means they are returned back
# to the OS), a fixed size array can be used. 

# ``TLLChunk`` is the header of the region that ``llAlloc`` carves its
# results from; ``TAllocator`` bundles the complete allocator state.
type
  PLLChunk = ptr TLLChunk
  TLLChunk {.pure.} = object ## *low-level* chunk
    size: int                # remaining size
    acc: int                 # accumulator
    
  TAllocator {.final, pure.} = object
    llmem: PLLChunk       # region currently used by `llAlloc`
    currMem, maxMem: int  # currently and maximum used memory size (allocated from OS)
    freeSmallChunks: array[0..SmallChunkSize div MemAlign-1, PSmallChunk]
    freeChunksList: PBigChunk # XXX make this a datastructure with O(1) access
    chunkStarts: TIntSet  # page numbers at which a chunk begins
   
proc incCurrMem(a: var TAllocator, bytes: int) {.inline.} = 
  # Account for `bytes` of memory that are obtained from the OS.
  a.currMem = a.currMem + bytes

proc decCurrMem(a: var TAllocator, bytes: int) {.inline.} =
  # Remember the high-water mark before the counter shrinks, then account
  # for `bytes` of memory that went back to the OS.
  if a.currMem > a.maxMem: a.maxMem = a.currMem
  a.currMem = a.currMem - bytes

proc getMaxMem(a: var TAllocator): int =
  # `maxMem` is only brought up to date when memory is released (see
  # `decCurrMem`), so it can lag behind `currMem`. The larger of the two
  # values is the real maximum.
  result = a.maxMem
  if a.currMem > result: result = a.currMem
   
# The allocator instance of this module. ``IntSetPut``, ``pageAddr`` and
# ``isAccessible`` access it directly instead of taking it as a parameter.
var
  allocator: TAllocator
    
proc llAlloc(a: var TAllocator, size: int): pointer =
  # *low-level* alloc for the memory manager's own data structures. The
  # returned memory is zeroed and is never deallocated again.
  var needsRegion = a.llmem == nil
  if not needsRegion: needsRegion = size > a.llmem.size
  if needsRegion:
    # the current region (if any) is abandoned; get a fresh one that is big
    # enough for its header plus the request:
    var total = roundup(size+sizeof(TLLChunk), PageSize)
    a.llmem = cast[PLLChunk](osAllocPages(total))
    incCurrMem(a, total)
    a.llmem.acc = sizeof(TLLChunk)
    a.llmem.size = total - sizeof(TLLChunk)
  var offset = a.llmem.acc
  result = cast[pointer](cast[TAddress](a.llmem) + offset)
  inc(a.llmem.acc, size)
  dec(a.llmem.size, size)
  zeroMem(result, size)
  
proc IntSetGet(t: TIntSet, key: int): PTrunk = 
  # Walk the bucket that `key` hashes to and return the trunk that carries
  # this key, or nil if there is none.
  result = t.data[key and high(t.data)]
  while result != nil and result.key != key:
    result = result.next

proc IntSetPut(t: var TIntSet, key: int): PTrunk = 
  # Return the trunk for `key`; if it does not exist yet, a zeroed one is
  # created with `llAlloc` and put at the front of its bucket.
  result = IntSetGet(t, key)
  if result != nil: return
  var slot = key and high(t.data)
  result = cast[PTrunk](llAlloc(allocator, sizeof(TTrunk)))
  result.key = key
  result.next = t.data[slot]
  t.data[slot] = result

proc Contains(s: TIntSet, key: int): bool = 
  # Membership test: false if no trunk exists for `key`, otherwise the state
  # of the bit that represents `key` within its trunk.
  result = false
  var trunk = IntSetGet(s, key shr TrunkShift)
  if trunk != nil:
    var bit = key and TrunkMask
    var word = trunk.bits[bit shr IntShift]
    result = (word and (1 shl (bit and IntMask))) != 0
  
proc Incl(s: var TIntSet, key: int) = 
  # Add `key` to the set; the trunk is created on demand.
  var trunk = IntSetPut(s, key shr TrunkShift)
  var bit = key and TrunkMask
  var idx = bit shr IntShift
  var mask = 1 shl (bit and IntMask)
  trunk.bits[idx] = trunk.bits[idx] or mask

proc Excl(s: var TIntSet, key: int) = 
  # Remove `key` from the set; nothing happens if its trunk does not exist.
  var trunk = IntSetGet(s, key shr TrunkShift)
  if trunk == nil: return
  var bit = key and TrunkMask
  var idx = bit shr IntShift
  var mask = 1 shl (bit and IntMask)
  trunk.bits[idx] = trunk.bits[idx] and not mask

proc ContainsOrIncl(s: var TIntSet, key: int): bool = 
  # Return whether `key` already was in the set; afterwards `key` is in the
  # set in any case.
  var trunk = IntSetGet(s, key shr TrunkShift)
  if trunk == nil:
    # no trunk means the key cannot be in the set yet:
    Incl(s, key)
    return false
  var bit = key and TrunkMask
  var idx = bit shr IntShift
  var mask = 1 shl (bit and IntMask)
  result = (trunk.bits[idx] and mask) != 0
  if not result: trunk.bits[idx] = trunk.bits[idx] or mask
   
# ------------- chunk management ----------------------------------------------
proc pageIndex(c: PChunk): int {.inline.} = 
  # Number of the page in which the chunk header `c` lives.
  return cast[TAddress](c) shr PageShift

proc pageIndex(p: pointer): int {.inline.} = 
  # Number of the page that the address `p` belongs to.
  return cast[TAddress](p) shr PageShift

proc pageAddr(p: pointer): PChunk {.inline.} = 
  # Clear the page offset of `p` to get the start of its page; a chunk is
  # expected to begin there, which the assertion verifies.
  var pageStart = cast[TAddress](p) and not PageMask
  result = cast[PChunk](pageStart)
  assert(Contains(allocator.chunkStarts, pageIndex(result)))

# Size of the most recent request to the OS; `requestOsChunks` uses it to
# guess where the chunk in front of a new one starts.
var lastSize = PageSize

proc requestOsChunks(a: var TAllocator, size: int): PBigChunk = 
  # Get `size` bytes from the OS and initialize them as one unused big chunk
  # that is in no list. `size` is expected to be a multiple of the page
  # size. The chunk is not entered into `a.chunkStarts` here.
  incCurrMem(a, size)
  result = cast[PBigChunk](osAllocPages(size))
  assert((cast[TAddress](result) and PageMask) == 0)
  result.next = nil
  result.prev = nil
  result.used = false
  result.size = size
  # update next.prevSize:
  var nxt = cast[TAddress](result) +% size
  assert((nxt and PageMask) == 0)
  var next = cast[PChunk](nxt)
  if pageIndex(next) in a.chunkStarts:
    #echo("Next already allocated!")
    next.prevSize = size
  # set result.prevSize: the previous chunk is only trusted if a chunk
  # starts exactly `lastSize` bytes in front of us and has that very size
  var prv = cast[TAddress](result) -% lastSize
  # this used to check `nxt` a second time, leaving `prv` unchecked:
  assert((prv and PageMask) == 0)
  var prev = cast[PChunk](prv)
  if pageIndex(prev) in a.chunkStarts and prev.size == lastSize:
    #echo("Prev already allocated!")
    result.prevSize = lastSize
  else:
    result.prevSize = 0 # unknown
  lastSize = size # for next request

proc freeOsChunks(a: var TAllocator, p: pointer, size: int) = 
  # Return the chunk at `p` to the OS. The chunk header is read before the
  # memory is released: a chunk that starts right behind this one loses its
  # `prevSize`, then `p` is taken out of the set of chunk starts.
  var chunk = cast[PChunk](p)
  var follower = cast[PChunk](cast[TAddress](p) +% chunk.size)
  assert((cast[TAddress](follower) and PageMask) == 0)
  if pageIndex(follower) in a.chunkStarts:
    follower.prevSize = 0 # XXX used
  excl(a.chunkStarts, pageIndex(p))
  osDeallocPages(p, size)
  decCurrMem(a, size)

proc isAccessible(p: pointer): bool {.inline.} = 
  # True if a chunk starts in the page that `p` points into.
  return Contains(allocator.chunkStarts, pageIndex(p))

proc ListAdd[T](head: var T, c: T) {.inline.} = 
  # Put the unlinked node `c` in front of the doubly linked list that
  # starts at `head`.
  assert c.prev == nil
  assert c.next == nil
  var oldHead = head
  if oldHead != nil: 
    assert oldHead.prev == nil
    oldHead.prev = c
  c.next = oldHead
  head = c

proc ListRemove[T](head: var T, c: T) {.inline.} =
  # Unlink `c` from the doubly linked list that starts at `head` and reset
  # both of its links.
  if c != head:
    # inner or last node: bridge the neighbours
    assert c.prev != nil
    c.prev.next = c.next
    if c.next != nil: c.next.prev = c.prev
  else:
    # first node: its successor becomes the new head
    head = c.next
    assert c.prev == nil
    if head != nil: head.prev = nil
  c.prev = nil
  c.next = nil
  
proc isSmallChunk(c: PChunk): bool {.inline.} = 
  # A chunk counts as small if its `size` fits into the usable part of a
  # small chunk (an earlier variant was `c.size < SmallChunkSize`).
  result = c.size <= SmallChunkSize-smallChunkOverhead()
  
proc chunkUnused(c: PChunk): bool {.inline.} = 
  # True if the chunk is not marked as used.
  return not c.used
  
proc freeBigChunk(a: var TAllocator, c: PBigChunk) = 
  # Release the big chunk `c`. Depending on the compile time options it is
  # first merged with free big neighbours; the resulting chunk is either put
  # into the free list or, from `ChunkOsReturn` bytes on, returned to the OS.
  var c = c
  assert(c.size >= PageSize)
  when coalescRight:
    # `ri` is the address right behind `c`; it is only inspected if a chunk
    # is known to start there:
    var ri = cast[PChunk](cast[TAddress](c) +% c.size)
    assert((cast[TAddress](ri) and PageMask) == 0)
    if isAccessible(ri) and chunkUnused(ri):
      if not isSmallChunk(ri):
        # swallow `ri`: it is no free chunk and no chunk start anymore
        ListRemove(a.freeChunksList, cast[PBigChunk](ri))
        inc(c.size, ri.size)
        excl(a.chunkStarts, pageIndex(ri))
  when coalescLeft:
    # a `prevSize` of 0 means that the left neighbour is unknown:
    if c.prevSize != 0: 
      var le = cast[PChunk](cast[TAddress](c) -% c.prevSize)
      assert((cast[TAddress](le) and PageMask) == 0)
      if isAccessible(le) and chunkUnused(le):
        if not isSmallChunk(le):
          # `le` swallows `c` and is the chunk we continue with
          ListRemove(a.freeChunksList, cast[PBigChunk](le))
          inc(le.size, c.size)
          excl(a.chunkStarts, pageIndex(c))
          c = cast[PBigChunk](le)

  if c.size < ChunkOsReturn: 
    # keep the chunk for later requests:
    ListAdd(a.freeChunksList, c)
    c.used = false
  else:
    freeOsChunks(a, c, c.size)

proc splitChunk(a: var TAllocator, c: PBigChunk, size: int) = 
  # Shrink `c` to `size` bytes. The cut-off tail becomes an unused big chunk
  # of its own that is registered as chunk start and put into the free list.
  var tailSize = c.size - size
  var tail = cast[PBigChunk](cast[TAddress](c) +% size)
  tail.prevSize = size
  tail.size = tailSize
  tail.used = false
  tail.prev = nil
  tail.next = nil # XXX
  c.size = size
  incl(a.chunkStarts, pageIndex(tail))
  ListAdd(a.freeChunksList, tail)

proc getBigChunk(a: var TAllocator, size: int): PBigChunk = 
  # Return a used big chunk of exactly `size` bytes (a multiple of the page
  # size). The free list is searched first (first fit for now); if nothing
  # fits, new memory is requested from the OS.
  assert((size and PageMask) == 0)
  var fit = a.freeChunksList
  while fit != nil:
    assert chunkUnused(fit)
    if fit.size >= size: break
    fit = fit.next
  if fit != nil:
    # a bigger chunk is cut down to the requested size first:
    if fit.size > size: splitChunk(a, fit, size)
    ListRemove(a.freeChunksList, fit)
    result = fit
  elif size < InitialMemoryRequest: 
    # small requests get a bigger region whose rest stays in the free list:
    result = requestOsChunks(a, InitialMemoryRequest)
    splitChunk(a, result, size)
  else:
    result = requestOsChunks(a, size)
  result.prevSize = 0
  result.used = true
  incl(a.chunkStarts, pageIndex(result))

proc getSmallChunk(a: var TAllocator): PSmallChunk = 
  # A small chunk is a big chunk of one page that is viewed through the
  # small chunk header; it must not be linked into any list.
  var page = getBigChunk(a, PageSize)
  assert page.prev == nil
  assert page.next == nil
  return cast[PSmallChunk](page)

# -----------------------------------------------------------------------------

proc getCellSize(p: pointer): int {.inline.} = 
  # The `size` field of the chunk that starts at the page of `p`.
  result = pageAddr(p).size
  
proc alloc(a: var TAllocator, requestedSize: int): pointer =
  # Allocate at least `requestedSize` bytes. Requests that fit into the
  # usable part of a small chunk are served from a chunk that only holds
  # cells of this size; everything else gets a big chunk of its own. The
  # `zeroField` of the result is set to a non-nil value to mark it as used.
  var size = roundup(max(requestedSize, sizeof(TFreeCell)), MemAlign)
  if size <= SmallChunkSize-smallChunkOverhead(): 
    # allocate a small block: for small chunks, we use only its next pointer
    var s = size div MemAlign
    var c = a.freeSmallChunks[s]
    if c == nil: 
      # no chunk with room for this size: set up a fresh one and hand out
      # its first cell
      c = getSmallChunk(a)
      c.freeList = nil
      assert c.size == PageSize
      c.size = size
      c.acc = size
      c.free = SmallChunkSize - smallChunkOverhead() - size
      c.next = nil
      c.prev = nil
      ListAdd(a.freeSmallChunks[s], c)
      result = addr(c.data)
    else:
      assert c.next != c
      assert c.size == size
      if c.freeList == nil:
        # nothing to recycle: take the next never-used cell at offset `acc`
        assert(c.acc + smallChunkOverhead() + size <= SmallChunkSize) 
        result = cast[pointer](cast[TAddress](addr(c.data)) +% c.acc)
        inc(c.acc, size)      
      else:
        # reuse the cell that was freed most recently
        result = c.freeList
        assert(c.freeList.zeroField == nil)
        c.freeList = c.freeList.next
      dec(c.free, size)
    # a chunk without room for another cell leaves the list:
    if c.free < size: 
      ListRemove(a.freeSmallChunks[s], c)
  else:
    size = roundup(requestedSize+bigChunkOverhead(), PageSize)
    # allocate a large block
    var c = getBigChunk(a, size)
    assert c.prev == nil
    assert c.next == nil
    assert c.size == size
    result = addr(c.data)
  cast[ptr TFreeCell](result).zeroField = cast[ptr TFreeCell](1) # make it != nil
  #echo("setting to one: ", $cast[TAddress](addr(cast[ptr TFreeCell](result).zeroField)))
  #echo("setting to one: ", $cast[TAddress](addr(cast[ptr TFreeCell](result).zeroField)))

proc contains(list, x: PSmallChunk): bool = 
  # True if the chunk `x` is a node of the list that starts at `list`.
  var node = list
  while node != nil and node != x: node = node.next
  result = node != nil

proc dealloc(a: var TAllocator, p: pointer) = 
  # Free the block `p` that was returned by `alloc`.
  var chunk = pageAddr(p)
  if not isSmallChunk(chunk):
    # free big chunk
    freeBigChunk(a, cast[PBigChunk](chunk))
    return
  # `p` is within a small chunk: mark the cell as unused and put it in
  # front of the chunk's free list
  var sc = cast[PSmallChunk](chunk)
  var cellSize = sc.size
  var cell = cast[ptr TFreeCell](p)
  #echo("setting to nil: ", $cast[TAddress](addr(cell.zeroField)))
  assert(cell.zeroField != nil)
  cell.zeroField = nil
  cell.next = sc.freeList
  sc.freeList = cell
  var bucket = cellSize div memAlign
  if sc.free < cellSize:
    # the chunk had no room left, so it is not in freeSmallChunks[bucket];
    # now it can serve requests again:
    assert sc notin a.freeSmallChunks[bucket]
    ListAdd(a.freeSmallChunks[bucket], sc)
    inc(sc.free, cellSize)
  else:
    inc(sc.free, cellSize)
    if sc.free == SmallChunkSize-smallChunkOverhead():
      # the last cell is gone: the page is released as a big chunk
      ListRemove(a.freeSmallChunks[bucket], sc)
      sc.size = SmallChunkSize
      freeBigChunk(a, cast[PBigChunk](sc))

proc realloc(a: var TAllocator, p: pointer, size: int): pointer = 
  # Allocate a new block of `size` bytes, copy the contents of `p` into it
  # and free `p`. If `p` is nil this is a plain `alloc`.
  # could be made faster, but this is unnecessary, the GC does not use it anyway
  result = alloc(a, size)
  if p != nil:
    var c = pageAddr(p)
    # `c.size` is the cell size for small chunks, but the size of the whole
    # chunk including its header for big chunks:
    var oldSize = c.size
    if not isSmallChunk(c): dec(oldSize, bigChunkOverhead())
    # never copy more than the old block holds or the new block was
    # requested with; otherwise shrinking would write past the new block
    copyMem(result, p, min(oldSize, size))
    dealloc(a, p)

proc isAllocatedPtr(a: TAllocator, p: pointer): bool = 
  # True if `p` is the start of a block that is currently allocated: its
  # page has to start a used chunk, and `p` has to be either a cell
  # boundary with a set `zeroField` (small chunk) or the data start (big
  # chunk).
  result = false
  if not isAccessible(p): return
  var c = pageAddr(p)
  if chunkUnused(c): return
  if isSmallChunk(c):
    var off = cast[TAddress](p) -% cast[TAddress](c) -% smallChunkOverhead()
    var onCellStart = off %% c.size == 0
    result = onCellStart and cast[ptr TFreeCell](p).zeroField != nil
  else:
    result = p == addr(cast[PBigChunk](c).data)

# Self-test that only runs when this file is compiled as the main module.
when isMainModule:
  const iterations = 4000_000
  # sanity check of the int set:
  incl(allocator.chunkStarts, 11)
  assert 11 in allocator.chunkStarts
  excl(allocator.chunkStarts, 11)
  assert 11 notin allocator.chunkStarts
  var p: array [1..iterations, pointer]
  # only a block size of 7*8 = 56 bytes is exercised:
  for i in 7..7:
    var x = i * 8
    for j in 1.. iterations:
      p[j] = alloc(allocator, x)
    for j in 1..iterations:
      assert isAllocatedPtr(allocator, p[j])
    echo($i, " used memory: ", $(allocator.currMem))
    # free in reverse order of allocation:
    for j in countdown(iterations, 1):
      #echo("j: ", $j)
      dealloc(allocator, p[j])
      assert(not isAllocatedPtr(allocator, p[j]))
    echo($i, " after freeing: ", $(allocator.currMem))