/*--------------------------------------------------------------------------
 *    buddy system memory allocator
 *
 *    EXPORTS
 *       char *OsAlloc(int)
 *       OsFree(char *)
 *    IMPORTS
 *       char *OsSharedMalloc(int)
 *       OsSpinInit(SPINLOCK), OsSpinLock(SPINLOCK), OsSpinUnlock(SPINLOCK)
 *       MyPenum
 *-------------------------------------------------------------------------*/

#include "host.h"     /* spin locks and OsSharedMalloc from chare kernel */
#include "cksys.h"    /* import MyPenum from chare kernel */
#include <signal.h>   /* abort fatal errors by SIGKILL to avoid deadlock */
#include <assert.h>   /* debugging assertions */
#include <stdio.h>

/* Internal parameters.
 * The buddy system memory tree root has size 2^MAXBUDDY and the leaves have
 * size 2^MINBUDDY.  MAXSIZE must be >= SysMem.  Global chunks are numbered
 * successively from 0, and when global chunk n is allocated its base pe is
 * recorded via ProcBase[n]=MyPenum.  Then the base pe of any subbuffer is
 * obtained by hashing its address to the global chunk number and indexing
 * ProcBase.  MAXSIZE is used to declare ProcBase.  BUCKETS is used to
 * declare and to traverse LocalFree.
 */
/* MINBUDDY and MAXBUDDY are bucket orders (log2 of the block size).
 * NOTE(review): a free block must hold a whole MEMHDR, so SIZE(MINBUDDY)
 * has to be >= sizeof(MEMHDR).  With 8-byte pointers MEMHDR is likely
 * larger than 16 bytes -- confirm the target pointer size.
 */
#define	MINBUDDY 4       /* SIZE(MINBUDDY) must be >= sizeof(MEMHDR) */
#define	MAXBUDDY 15      /* order of a global chunk */
#define	MAXSIZE (1<<25)  /* upper bound on SysMem */
#define	BUCKETS	32      /* accommodates MAXSIZE up to 2**32 */

/* byte size of a block of the given order */
#define	SIZE(bucket) (1<<(bucket))
/* run statement s when the debug level is at least `level'.  wrapped in
 * do/while(0) so that TRACE(...); is a single statement and cannot capture
 * a following else.
 */
#define	TRACE(level,s) do { if (MemTrace >= (level)) { s; } } while (0)
/* value stored in MEMHDR.check while a block is allocated */
#define	CHECK 117
/* address of the calling pe's free-list sentinel for `bucket' */
#define	FreeList(bucket) (&(LocalFree[MyPenum][(bucket)].head))
/* ProcBase slot (an lvalue) of the global chunk containing address p */
#define	BaseProc(p) (ProcBase[ ((p)-MemBase)>>MAXBUDDY ])
/* push block b on the front of the calling pe's free list for `bucket'
 * and mark it free on this pe.  b is evaluated several times; `bucket'
 * is evaluated once, before b is modified.
 */
#define	AddToFreeList(b,bucket) \
	do { \
		MEMHDR *fl_head_ = &LocalFree[MyPenum][(bucket)].head; \
		(b)->next = fl_head_->next; \
		(b)->prev = fl_head_; \
		fl_head_->next = (b); \
		if ((b)->next != NULL) (b)->next->prev = (b); \
		(b)->inuse = MyPenum; \
		assert( (b)->next != fl_head_->next ); \
	} while (0)

/* MEMHDR resides at the beginning of all memory blocks used by the
 * buddy system.  a free block uses the entire MEMHDR struct, while an
 * allocated block only uses MEMHDRPREFIX bytes of it.  the free list
 * pointers are not needed in allocated memory, so their space becomes
 * part of the user area.
 */
/* number of leading MEMHDR bytes that stay valid while a block is allocated;
 * the pointer handed to the user is the block address plus this offset.
 */
#define MEMHDRPREFIX 4
typedef struct BLOCK {
	/* --- the four bytes below are the allocated-block prefix --- */
	char bucket;		/* order of block; block size is 1<<bucket */
	char inuse;			/* -1 while allocated, else the pe holding it free */
	char check;			/* CHECK while allocated, cleared when freed */
	char pad;			/* unused byte */
	/* --- the links below are meaningful only while the block is free --- */
	struct BLOCK *next;	/* next block on the free list, or NULL */
	struct BLOCK *prev;	/* previous block, or the list sentinel */
	struct BLOCK *gorf;	/* chain link used during garbage collection */
} MEMHDR;
/* A change was made to the MEMHDR structure that has not gotten extensive
 * testing.  The unused 'pad' field used to be 'freepe', which was set to
 * MyPenum on a freed buffer.  'inuse' was simply 1 or 0 indicating whether
 * a buffer was active.  Now, only one of these fields was used at a time,
 * so they were fused into 'inuse', for which -1 indicates
 * an active buffer and >=0 indicates a free buffer as well as its pe of
 * residence.  Actually there was one place where both fields were used:
 * GarbageDistribute, which set inuse to show the buffer active, but set
 * freepe to show the buffer inactive on its base pe.  It is thought that
 * setting freepe is unnecessary; inuse shows active buffer for all buffers
 * in the garbage matrix, so when a buffer is freed in GarbageFree, if it
 * is the first buddy of a pair it gets freechained and its inuse value
 * set to the base pe; if it is the second of the pair, its buddy is found
 * and merged, and inuse gets set to the base pe.  So the fused 'inuse' is
 * being used, freeing up a byte in the free header for other uses...
 */
/* One free-list anchor.  LocalFree holds one of these per pe per bucket. */
typedef struct {
	MEMHDR head;	/* sentinel; head.next is the first free block or NULL */
	SPINLOCK lock;	/* initialised by localInit; never taken in the code visible here */
} BUCKET;

typedef long FIXEDSIZE;	/* not referenced in the code visible here */


/*****************************************************************************
 *                                                                           *
 *                            global variables                               *
 *                                                                           *
 *****************************************************************************/

/* NOTE(review): `shared', LONG, MaxMem and MaxPe are not defined in this
 * file; presumably they come from host.h / cksys.h -- confirm.
 */
shared char *MemBase;                     /* first address of global pool (set by globalInit) */
shared int SysMem = MaxMem;               /* size of global pool in bytes; rescaled by globalInit */
shared BUCKET LocalFree[MaxPe][BUCKETS];  /* free lists, indexed [pe][bucket] */
shared int ProcBase[MAXSIZE>>MAXBUDDY];	/* global chunk number -> base pe (-1 once returned) */

/* global memory system variables */
shared LONG gcellmin;      /* statistic: low-water count of global cells */
shared LONG gcellcount;    /* current number of global cells */
shared char *gmfree;       /* start of global memory free chain */
shared SPINLOCK gmlock;    /* guards gmfree, gcellcount, gcellmin, globalreqs */

/* stuff for garbage collector */
shared LONG Trash = 0;                  /* flag garbage collection pending */
shared MEMHDR Garbage[MaxPe][MaxPe];    /* garbage matrix, [base pe][collecting pe]; only .next used */
shared SPINLOCK glock1, glock2;         /* glock1 guards the barrier counts; glock2 is only initialised here */
shared LONG gbgBarrier, gbgBarrier2, gbgExitBarrier;   /* barrier counts (count down to 0) */

/* statistics */
int PrintMemStat = 0;		 /* CLI variable, turns ON or OFF Statistics */
shared int MemTrace = 0;         /* debug level, for TRACE macro */
shared SPINLOCK padlock;         /* serialises MemStats output */
shared LONG mrgcount = 0;        /* count successful merges */
shared LONG mrgdenycount = 0;    /* count merges inhibited by diff basepe */
shared LONG rmtmerge = 0;        /* count remote merges */
shared LONG gbgcollections = 0;  /* count garbage collections */
shared LONG gbgtime = 0;         /* time spent garbage collecting */
shared LONG globalreqs = 0;      /* count requests from global pool */
shared LONG BytesUsed = 0;       /* estimate of bytes in use (not locked) */
shared LONG BytesReturned = 0;   /* estimate of bytes returned (not locked) */
shared LONG MaxBytesUsed = 0;    /* estimate of max allocation (not locked) */

/* Forward declarations.  globalAlloc and the other file-local helpers are
 * defined `static' further down, so they are declared static here as well:
 * declaring or implicitly using them with external linkage first and then
 * defining them static gives the same identifier both linkages.
 * memAlloc and PL are declared but not defined in the code visible here.
 */
char *memAlloc(), *PL(), *localAlloc(), *OsAlloc();
static char *globalAlloc();
static int localInit(), globalInit(), globalFree();
static int GarbageDistribute(), GarbageFree();
static int FreeMap(), DumpFreeList();

/*****************************************************************************
 *                                                                           *
 *                            external interface                             *
 *                                                                           *
 *****************************************************************************/

/*----------------------------------------------------------------- OsAlloc */

char * OsAlloc(size)
register LONG size;
{
	/* allocate at least `size' usable bytes from the buddy system.
	 * MEMHDRPREFIX header bytes are added to the request, which is then
	 * rounded up to the next bucket size (power of 2, at least
	 * SIZE(MINBUDDY)).  the block comes from localAlloc; if that fails,
	 * GarbageMan is run and the request retried once.
	 *
	 * returns a pointer MEMHDRPREFIX bytes past the block header, or NULL
	 * (after printing a message) when the request is negative or does not
	 * fit in a SIZE(MAXBUDDY) block.  if memory is exhausted even after
	 * garbage collection, prints a message, calls OsKillSys, signals the
	 * process group and exits.
	 */
	register char *p;
	register SHORT bucket;

	/* a negative size used to be rounded up to a minimum block and handed
	 * out as if valid; reject it instead.
	 */
	if( size < 0 ) {
		OsPrintf("OsAlloc: invalid request size %d\n", (int)size);
		return NULL;
	}
	size += MEMHDRPREFIX;
	if( size > SIZE(MAXBUDDY) ) {
		OsPrintf("OsAlloc: request too large %d\n", (int)size);
		return NULL;
	}
	if( size < SIZE(MINBUDDY) ) size = SIZE(MINBUDDY);

	/* smallest order that holds header + request.  size <= SIZE(MAXBUDDY)
	 * here, so falling off the end leaves bucket == MAXBUDDY.
	 */
	for( bucket = MINBUDDY; bucket < MAXBUDDY; bucket++ )
		if( size <= SIZE(bucket) ) break;
	if( (p = localAlloc(bucket)) == NULL ) {
		/* request failed.  do garbage collection and retry request.
		 * if it fails again the system is out of memory.
		 */
		GarbageMan();
		p = localAlloc(bucket);
	}
	if (p == NULL) {
		OsPrintf("Memory Overflow, quitting.\n");
		OsKillSys();
		kill(0,SIGINT);
		exit(1);
	}
	((MEMHDR *)p)->check = CHECK;	/* verified later by OsFree */
	BytesUsed += SIZE(bucket);
	MaxBytesUsed = (MaxBytesUsed < BytesUsed ? BytesUsed : MaxBytesUsed);
	/* arguments are cast because the format uses %d */
	TRACE(100,OsPrintf("OsAlloc: allocate %d bytes at %d pe %d\n", (int)size, (int)(p-MemBase), MyPenum));
	return p + MEMHDRPREFIX;
}

/*--------------------------------------------------------------- localAlloc */

char *localAlloc(bucket)
SHORT bucket;
{
	/* hand out one block of order `bucket' from the calling pe's free
	 * lists, fetching a fresh SIZE(MAXBUDDY) chunk from the global pool
	 * when no local block is large enough.  returns the block address
	 * (header included) or NULL when the global pool is empty.
	 */
	char *base = NULL;
	MEMHDR *anchor;
	register MEMHDR *blk;
	SHORT order, half;

	/* first fit upward: take the head of the first non-empty free list
	 * of this order or larger.
	 */
	for (order = bucket; order < MAXBUDDY; order++ ) {
		anchor = FreeList(order);
		blk = anchor->next;
		if (blk == NULL)
			continue;
		assert(blk->prev == anchor);
		blk->prev->next = blk->next;
		if( blk->next != NULL )
			blk->next->prev = blk->prev;
		base = (char *)blk;
		break;
	}

	/* nothing local: `order' has run up to MAXBUDDY, so a global chunk
	 * is exactly the size the split loop below expects.
	 */
	if (base == NULL) {
		base = globalAlloc();
		if (base == NULL)
			return NULL;
	}

	/* halve the block until it is the requested order.  at each step the
	 * lower half is kept and the upper half goes on the free list of
	 * order `half'.
	 */
	half = order - 1;
	while (half >= bucket) {
		blk = (MEMHDR *) (base + SIZE(half));
		AddToFreeList(blk,half);
		blk->bucket = half;
		half--;
	}

	/* stamp the surviving lower part as an allocated block */
	blk = (MEMHDR *) base;
	blk->bucket = bucket;
	blk->inuse = -1;
	return base;
}

/*----------------------------------------------------------------- OsFree */

OsFree(p)
register char *p;
{
	/* return a buffer obtained from OsAlloc.  p is the user pointer, i.e.
	 * MEMHDRPREFIX bytes past the block header.  a NULL p is ignored
	 * (OsAlloc returns NULL for requests it rejects).  otherwise the
	 * header's check byte is verified -- on mismatch a message is printed
	 * and the process exits -- the byte counters are updated and the block
	 * is handed to localFree on the calling pe.
	 */
	register MEMHDR *b;
	SHORT bucket;

	if( p != NULL ) {
		b = (MEMHDR *)(p - MEMHDRPREFIX);
		if( b->check != CHECK ) {
			/* cast: the format uses %d, the difference is not an int */
			OsPrintf("OsFree: bad check at %d\n", (int)(p-MemBase));
			exit(1);
		}
		bucket = b->bucket;
		BytesUsed -= SIZE(bucket);
		BytesReturned += SIZE(bucket);
		localFree((char *)b);
	}
}

/*----------------------------------------------------------------- localFree */

localFree(p)
char *p;
{
	/* return memory to the buddy system.  p points to the MEMHDR header.
	 * if the buddy is free and if it is on this processor, the buffers are
	 * merged and the process repeated with the buddy of the resultant buffer.
	 * note that this allows merging of buffers that originated on another pe.
	 * if the buddies are not both on this processor (or the buddy is in use)
	 * p is freechained.  The first case represents fragmentation since both
	 * buddies are free.  Extensive fragmentation causes garbage collection.
	 */
	LONG mysize;
	register MEMHDR *b, *b2;

	b = (MEMHDR *)p;
	mysize = SIZE(b->bucket);
	b->check = 0;			/* so a second OsFree of this block is caught */
	b->inuse = MyPenum;
	/* the buddy's pool offset differs from ours only in the size bit */
	b2 = (MEMHDR *) (((p - MemBase) ^ mysize) + MemBase);

	/* while b's buddy b2 is free, merge b and b2, set b to the start of
	 * the resultant block, show that b is a block of the next larger size, 
	 * recompute b's buddy and continue.
	 *
	 * the order test comes first: a block of order MAXBUDDY is a whole
	 * global chunk and has no buddy header.  the address computed for b2
	 * is then a neighbouring chunk (possibly on the global free chain) or
	 * lies past the end of the pool, so it must not be dereferenced.
	 */
	while( b->bucket < MAXBUDDY && b2->inuse == MyPenum && b2->bucket == b->bucket ) {
	/* keep some statistics for our amusement */
		if (BaseProc(p) != MyPenum) rmtmerge++;
		mrgcount++;
	/* remove buddy from its free chain */
		b2->prev->next = b2->next;
		if( b2->next != NULL ) b2->next->prev = b2->prev;
		assert( b2->prev->next == NULL || b2->prev->next != b2->prev->next->next );
		b->inuse = MyPenum;
	/* update for next pass: the merged block starts at the lower buddy */
		p = (MemBase + (( p-MemBase) & (~mysize)));
		b = (MEMHDR *)p;
		b->bucket++;
		mysize = SIZE(b->bucket);
		b2 = (MEMHDR *) (((p - MemBase) ^ mysize) + MemBase);
	}

	/* b is a block that can't be combined further.  if b has been built up
	 * to the size of a global chunk, return it to the global memory system.
	 * otherwise place it on the local free chain for its bucket
	 */
	if( b->bucket < MAXBUDDY ) {
		AddToFreeList(b,b->bucket);
		/* record number of times merges were prevented because of pe mismatch.
		 * then we can display the %age of remote merges out of total merges
	 	 * as rmtmerge / (mrgcount+mrgdenycount)
	 	 */
		if (b2->inuse != MyPenum && b2->bucket == b->bucket) mrgdenycount++;
	}
	else
		globalFree(b);
}

/*----------------------------------------------------------------- MemInit */

MemInit()
{
	/* entry point to initialize the memory allocator: globalInit sets up
	 * the global pool and its locks, then localInit empties the free
	 * lists of every pe.
	 * NOTE(review): presumably called once, before any pe allocates --
	 * confirm against the caller.
	 */
	globalInit();
	localInit();
}

/*----------------------------------------------------------------- localInit */

static localInit()
{
	/* initialize local memory system.
	 * show all buckets empty, for all processors.
	 */
	MEMHDR *b;
	SHORT bucket;
	int pe;

	/* initialize free list headers
	 */
	for( bucket = 0; bucket < BUCKETS; bucket++ ) {
		for( pe = 0; pe < MaxPe; pe++ ) {
			OsSpinInit( LocalFree[pe][bucket].lock );
			LocalFree[pe][bucket].head.next = NULL;
			LocalFree[pe][bucket].head.prev = NULL;
		}
	}
}

/*--------------------------------------------------------------- globalInit */

static globalInit()
{
	/* initialize the global memory system.  a buffer of MAXSIZE bytes is
	 * requested from unix and MemBase is set to its address.  the buffer is
	 * broken into contiguous blocks of size 2^MAXBUDDY which are chained
	 * together in a singly linked list headed by gmfree.  the forward link
	 * is stored at the beginning of each block.  this link does not need
	 * to be retained once the memory is allocated, so the usable memory in
	 * each block of the global system is exactly 2^MAXBUDDY.
	 */
	MEMHDR *b;
	char *p;
	extern int numPe;

	OsSpinInit( gmlock );
	OsSpinInit( padlock );
	OsSpinInit( glock1 );
	OsSpinInit( glock2 );

	/* ask the system for the initial block of memory.  MemBase is used
	 * by localFree to compute the buddy of a given block with block sizes
	 * being powers of 2.  SysMem is rounded down to the nearest multiple
	 * of SIZE(MAXBUDDY).
	 */
	SysMem = (SysMem >> MAXBUDDY) << MAXBUDDY;
	SysMem *= numPe;
	if( (MemBase = (char *) OsSharedMalloc(SysMem)) == NULL ) {
		printf("MemInit: memory allocation failure (%d bytes)\n", SysMem);
		exit(1);
	}

	gcellcount = 1;
	for (p = MemBase; p+(SIZE(MAXBUDDY)<<1) <= MemBase + SysMem;
		p += SIZE(MAXBUDDY) ) {
		*((char **)p) = p + SIZE(MAXBUDDY);
		gcellcount++;
	}
	TRACE(100,OsPrintf("Global chain has %d cells of %d bytes\n", gcellcount, SIZE(MAXBUDDY)));
	*((char **)p) = NULL;
	gmfree = MemBase;
	gcellmin = gcellcount;
}

/*--------------------------------------------------------------- globalFree */

static globalFree(p)
char *p;
{
	/* give a whole SIZE(MAXBUDDY) chunk back to the global pool: under
	 * gmlock, push p on the front of the gmfree chain (the link lives in
	 * the first bytes of the chunk), mark the chunk as having no base pe
	 * and bump the cell count.
	 */
	char **link;

	link = (char **)p;

	OsSpinLock( gmlock );
	*link = gmfree;
	gmfree = p;
	gcellcount++;
	BaseProc(p) = -1;
	OsSpinUnlock( gmlock );
}

/*--------------------------------------------------------------- globalAlloc */

static char *globalAlloc()
{
	/* take the first chunk off the global free chain under gmlock and
	 * record the calling pe as its base pe.  also maintains the request
	 * count and the low-water mark of free cells.  returns the chunk, or
	 * NULL when the chain is empty.
	 */
	register char *cell;

	OsSpinLock( gmlock );
	cell = gmfree;
	if( cell != NULL ) {
		gmfree = *((char **)cell);	/* successor link is stored in the chunk */
		BaseProc(cell) = MyPenum;
		globalreqs++;
		gcellcount--;
		if (gcellcount < gcellmin)
			gcellmin = gcellcount;
	}
	OsSpinUnlock( gmlock );

	/* gcellcount is read here without the lock, for tracing only */
	TRACE(100,OsPrintf("Global cell allocated PE %d, now %d left\n", MyPenum, gcellcount));
	return cell;
}

void * OsAllocInit(size)
int size;
{
	/* obtain `size' bytes directly from OsSharedMalloc, bypassing the
	 * buddy system, and return whatever it returns.
	 */
	void *mem;

	mem = (void *) OsSharedMalloc(size);
	return mem;
}

/*****************************************************************************
 *                                                                           *
 *                            garbage collection                             *
 *                                                                           *
 *****************************************************************************/

/*--------------------------------------------------------------- GarbageMan */

GarbageMan()
{
	/* perform garbage collection.  foreign buffers are returned to their
	 * base processors and freed there.  this is done in parallel by first
	 * distributing the buffers, synchronizing, then returning them.
	 * all spin barriers in this routine count down.
	 *
	 * the first pe to arrive while Trash is 0 becomes the initiator: it
	 * arms the barriers, clears the garbage matrix and raises Trash.
	 * every pe then waits at three barriers (entry, after distribution,
	 * exit).  returns early if SysDone is set on entry or while waiting
	 * at the entry barrier.
	 * NOTE(review): the other pes presumably notice Trash elsewhere and
	 * call this routine -- that code is not visible here.
	 */
	extern int numPe;
	extern int SysDone;
	/* the timing locals are only meaningful on the initiating pe, but
	 * syntime is computed from stime on every pe, so they start at 0
	 * rather than holding indeterminate values.
	 */
	int stime = 0, syntime = 0, deltagcell = 0, row, col;
	int initpe = -1;

	if (SysDone) return;
	OsSpinLock( glock1 );
	if( Trash == 0 ) {
		TRACE(10,OsPrintf("Garbage collecting..."));
		gbgcollections++;
		deltagcell = gcellcount;
		stime = OsTimer(TIMER_READ);
		initpe = MyPenum;
		gbgBarrier = numPe;
		gbgBarrier2 = numPe;
		gbgExitBarrier = 0;
		Trash = 1;
		for (row = 0 ; row < numPe; row++)
			for (col = 0; col < numPe; col++)
				Garbage[row][col].next = NULL;
	}
	gbgBarrier--;
	OsSpinUnlock( glock1 );
	/* entry barrier: wait until every pe has checked in */
	while (gbgBarrier)
		if (SysDone) {
			OsPrintf("Garbage collection aborted\n");
			return;
		}
	Trash = 0;
	gbgExitBarrier = numPe;
	syntime = OsTimer(TIMER_READ) - stime;

	GarbageDistribute();

	/* nobody may start freeing until every pe has filled its column */
	OsSpinLock(glock1); gbgBarrier2--; OsSpinUnlock(glock1);
	while( gbgBarrier2 ) ;

	GarbageFree();

	OsSpinLock(glock1); gbgExitBarrier--; OsSpinUnlock(glock1);
	while(gbgExitBarrier);

	if (MyPenum == initpe ) {
		stime = OsTimer(TIMER_READ) - stime;
		TRACE(10,OsPrintf("in %d ms, %d sync ms, %d DG\n", stime, syntime, gcellcount-deltagcell));
		gbgtime += stime;
		TRACE(90,MemStats());
	}
}

/*------------------------------------------------------- GarbageDistribute */

static GarbageDistribute()
{
	/* distribute blocks from my free chain into Garbage slots according
	 * to their base pe.  col MyPenum of Garbage[][] is reserved to me.
	 * blocks are removed from the free chain and threaded together using
	 * the *gorf* field of the free header.  inuse is set to 1 as if the
	 * block was allocated; this prevents a buffer later in the gorf chain
	 * from being merged prematurely.
	 */
	int col;
	int basepe;
	int bucket;
	MEMHDR *b;

	col = MyPenum;
	for (bucket = MINBUDDY; bucket < MAXBUDDY; bucket++) {
		for (b = LocalFree[col][bucket].head.next; b != NULL; b = b->next) {
			assert(b != b->next);
			basepe = BaseProc( (char *) b );
			b->gorf = Garbage[basepe][col].next;
			Garbage[basepe][col].next = b;
/*			b->freepe = basepe;   /* not necessary?  will be set by AddFree */
			b->inuse = -1;   /* mark buffer in use */
		}
		LocalFree[col][bucket].head.next = NULL;
		LocalFree[col][bucket].head.prev = NULL;
	}
}

/*--------------------------------------------------------- GarbageFree */

static GarbageFree()
{
	/* row MyPenum of the matrix built by GarbageDistribute contains
	 * free memory blocks that were originally allocated on my processor.
	 * walk down the row and return everything to the buddy system.
	 */
	extern int numPe;	/* declared here as in GarbageMan; no file-scope declaration is visible */
	int row;
	int pe;
	int count;
	MEMHDR *b, *following;

	count = 0;
	row = MyPenum;
	for (pe = 0; pe < numPe; pe++)
		for (b = Garbage[row][pe].next; b != NULL; b = following ) {
			assert(b != b->gorf);
			/* fetch the chain link first: once localFree has the block
			 * it may be merged or returned to the global pool, and its
			 * header should no longer be read.
			 */
			following = b->gorf;
			localFree(b);
			count++;
		}
	TRACE(10,OsPrintf("%%%d+%d ", MyPenum, count));
}

/*---------------------------------------------------------- MemAdviseHash */

MemAdviseHash()
{
	/* always returns 0.
	 * NOTE(review): looks like a stub kept for an external caller; what
	 * the value means to that caller is not visible here -- confirm.
	 */
	return 0;
}

/*---------------------------------------------  Statistics & debug output */

MemStats()
{
    if (PrintMemStat)
    {
	OsSpinLock( padlock );
	/* entry point to print statistics about the memory allocator
	 * requires memory trace at least 1.  +mt10 shows buffer counts for
	 * each of the buckets.  +mt90 shows information about each free buffer.
	 */
	TRACE(1,OsPrintf("Memory Statistics: --------------------------\n"));
	TRACE(1,OsPrintf("%d total, %d used, %d max used, %d freed\n", SysMem, BytesUsed, MaxBytesUsed, BytesReturned));
	TRACE(1,OsPrintf("%d garbage collections in %d ms, %d gbl reqs\n", gbgcollections, gbgtime, globalreqs));
	TRACE(1,OsPrintf("%d remote merges (out of %d+%d = %d%%)\n", rmtmerge, mrgcount, mrgdenycount, 100*rmtmerge/(mrgcount+mrgdenycount) ));
	TRACE(10,FreeMap());
	OsSpinUnlock( padlock );
    }
}


static FreeMap()
{
	/* print, for each pe, the number of free blocks per order (only
	 * non-empty orders are shown), followed by a summary of local and
	 * global free bytes and the resulting in-use estimate.  at trace
	 * level 90 the individual free blocks are dumped as well.
	 * the lists are walked without taking any lock.
	 */
	int order, count, localbytes, globalbytes, pe, printed;
	MEMHDR *blk;
	char *cell;

	localbytes = 0;
	/*===  scan through local buddy systems */
	for( pe = 0; pe < MaxPe; pe++ ) {
		printed = 0;
		for( order = 0; order <= MAXBUDDY; order++ ) {
			count = 0;
			blk = LocalFree[pe][order].head.next;
			while( blk != NULL ) {
				count++;
				assert( blk != blk->next );
				blk = blk->next;
			}
			if( count == 0 )
				continue;
			/* the pe label is printed once, before its first entry */
			if( !printed ) {
				printed = 1;
				OsPrintf("PE %d  ", pe);
			}
			OsPrintf("%d:%d ", order, count);
			localbytes += count*SIZE(order);
		}
		if( printed ) OsPrintf("\n");
	}

	/*===  scan through global free chain */
	count = 0;
	for( cell = gmfree; cell != NULL; cell = *((char **)cell) )
		count++;
	globalbytes = count*SIZE(MAXBUDDY);

	OsPrintf("Free summary: %d local + %d global(%d blocks(%d min)) = %d free\n",
		localbytes, globalbytes, count, gcellmin, localbytes+globalbytes);
	OsPrintf("Memory budget: %d in use out of %d total(%d global blocks of %d)\n",
		SysMem - (localbytes+globalbytes), SysMem, SysMem/SIZE(MAXBUDDY), SIZE(MAXBUDDY) );

	TRACE(90,DumpFreeList());
}


static DumpFreeList()
{
	/* print one line per free block, for every pe and bucket: offset in
	 * the pool, order stored in the header, base pe of its chunk, bucket
	 * whose list holds it, inuse value, and the pool offset of its buddy.
	 * a column heading is printed before the first block of each pe.
	 */
	int pe, bucket, flag;
	int off, buddyoff;
	MEMHDR *b;

	OsPrintf("Free list dump\n");
	for (pe = 0; pe < MaxPe; pe++ ) {
		flag = 0;
		for (bucket = 0; bucket < BUCKETS; bucket++ ) {
			for (b = LocalFree[pe][bucket].head.next; b != NULL; b = b->next ) {
				if( !flag ) {
					OsPrintf("addr   bi pi b  use   p=%d\n", pe);
					flag = 1;
				}
				/* offsets are kept as int because they are printed
				 * with %d; the pool is bounded by MAXSIZE, so they fit.
				 */
				off = (int)((char *)b - MemBase);
				buddyoff = off ^ SIZE(bucket);
				OsPrintf("%-6d %-2d %-2d %-2d %-3d %-6d\n", off, b->bucket, BaseProc((char *)b), bucket, b->inuse, buddyoff);
			}
		}
	}
}
