/* File:     ExtHash.cc
 * Purpose:  To implement a simple, in-memory self-growing
 *           hash table.  Deletions are not implemented,
 *           since the FDC code never needs them.
 *
 * RCS:
 ************************************************************
 * $Id: ExtHash.cc,v 1.1 2001/09/03 19:36:11 lw2j Exp $
 * $Log:	ExtHash.cc,v $
// Revision 1.1  2001/09/03  19:36:11  lw2j
// Initial revision
// 
 ************************************************************
 */


#if (USE_EXTHASH == 1) 

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <errno.h>
#include <math.h>
#include <iostream.h>
#include "ExtHash.h"

/* N.B. -- This implementation is simplified; for instance, it does not
 * support deleting entries.  Fortunately, the FDC code never needs that
 * particular operation.
 */


/* Round _num up to the nearest multiple of _div, leaving it unchanged
 * when it already is one.  Both arguments are assumed to be positive
 * (zero is also fine for _num), and each may be evaluated more than
 * once, so pass only side-effect-free expressions.
 */
#define ROUND_UP_MACRO(_num,_div)  ((((_num) + (_div) - 1) / (_div)) * (_div))



/* A single (key, value) record as stored inside a page.
 *
 * Note that the key is a pointer to data _outside_ the table -- that
 * is, the array of doubles isn't stored inside the page itself.
 * Something to keep in mind if this is ever modified to use on-disk
 * storage...
 *
 * This was done to minimize coding time by allowing fixed-length
 * records.
 */
struct entry_t {
  double*      key;   /* array allocated with new double[] by store();
                       * released with delete [] when the owning block
                       * is deleted */
  unsigned int val;   /* value associated with the key */
};


/* Second level blocks need to track more types of data.  These are:
   a) The number of bits being used with THIS prefix.
   b) The address of the first overflow block, if any.  Overflow blocks
      should only occur when the second-level block is full, and cannot
      be effectively split because the keys, when hashed, are identical.
      Of course, we could implement a series of hash functions to provide
      different splits, but we do not yet.
   c) The number of entries currently within this second-level block.
   d) The entries themselves.
*/

/* Forward declaration: block headers point at overflow blocks before
 * the overflow block type itself is defined.
 */
struct overflow_block_t;

/* For figuring out how much space a block requires, sans table.  This 
 * should hopefully mean that I can make fewer assumptions regarding 
 * structure padding, regardless of #pragmas, et al.  
 *
 * The members mirror those of second_level_block_t that precede its
 * entry table; this type is only ever used as an operand of sizeof.
 */
struct unused_second_level_stump_t {
  unsigned char     bits;
  unsigned short    entries;
  overflow_block_t *overflow;
  unused_second_level_stump_t *next;
  unused_second_level_stump_t *prev;
};


/* One padding assumption:  the beginning of the entry table will
 * likely need to be properly aligned, so the header size is rounded
 * up to a multiple of sizeof(entry_t); we waste a few bytes if that
 * assumption is not correct.
 */
static const size_t SECOND_LEVEL_HEADER_BYTES =
    ROUND_UP_MACRO(sizeof(unused_second_level_stump_t), sizeof(entry_t));

/* Number of entries that fit in what remains of one page after the
 * (rounded-up) second-level block header.
 */
static const int ENTRIES_PER_SECOND_LEVEL_BLOCK =
    (ExtHash::PAGE_SIZE - SECOND_LEVEL_HEADER_BYTES) / sizeof(entry_t);


/* Second level blocks form the main pages of each bucket.  
 * They are chained together in a doubly linked list, since iterating
 * over the top-level bucket array would visit the same second-level
 * block more than once (several slots may point to one block), and
 * we need to somehow support a cursor.
 *
 * Each should fit in at most one page, as defined by ExtHash::PAGE_SIZE.
 * Each also tracks how many entries there are in *that* page,
 * (not counting overflows), and how many bits are being used
 * by this bucket (which may be less than the number of bits
 * used by the top-level index if there are multiple pointers to
 * this bucket).  Overflow buckets form a linked list off these
 * blocks.
 */
struct second_level_block_t {
  unsigned char         bits;      /* hash bits distinguished by this bucket */
  unsigned short        entries;   /* used slots in table[], overflow excluded */
  overflow_block_t     *overflow;  /* first overflow block, or NULL */
  second_level_block_t *next;      /* next block in the list of all blocks */
  second_level_block_t *prev;      /* previous block in that list */
  entry_t               table[ENTRIES_PER_SECOND_LEVEL_BLOCK];
};

/* Element type of the top-level bucket array. */
typedef second_level_block_t *p_second_level_block_t;

/* Overflow blocks are similar to second-level blocks, 
 * but do not need to track the number of bits required,
 * nor other second-level blocks.
 *
 * This stump mirrors the members of overflow_block_t that precede its
 * entry table and is only ever used as an operand of sizeof, to work
 * out how many entries fit in a page.
 */

struct unused_overflow_stump_t {
  unsigned short    entries;
  overflow_block_t *overflow;
};


/* Size of the overflow block header, rounded up to a multiple of
 * sizeof(entry_t) so that the entry table that follows it is assumed
 * to start suitably aligned.
 */
static const size_t OVERFLOW_HEADER_BYTES =
    ROUND_UP_MACRO(sizeof(unused_overflow_stump_t), sizeof(entry_t));

/* Number of entries that fit in what remains of one page after the
 * (rounded-up) overflow block header.
 */
static const int ENTRIES_PER_OVERFLOW_BLOCK =
    (ExtHash::PAGE_SIZE - OVERFLOW_HEADER_BYTES) / sizeof(entry_t);

/* A page of extra entries chained off a second-level block (or off
 * another overflow block) once the bucket's primary page is full.
 */
struct overflow_block_t {
  unsigned short         entries;    /* used slots in table[] */
  overflow_block_t      *overflow;   /* next overflow block, or NULL */
  entry_t                table[ENTRIES_PER_OVERFLOW_BLOCK];
};



/* Forward declarations of the file-local helpers defined below. */

/* Block allocation and teardown. */
static second_level_block_t *new_second_level_block(unsigned char bits);
static overflow_block_t     *new_overflow_block(void);
static void                  delete_second_level_block(second_level_block_t *p);
static void                  delete_overflows(overflow_block_t *p);

/* Key hashing, comparison and lookup within one bucket. */
static unsigned int hash_key(const double *key, unsigned int dimensionality);
static unsigned int keycmp(unsigned int dimensionality,
                           const double *key_0,
                           const double *key_1);
static entry_t *find_second_level_entry(const double *key,
                                        unsigned int dimensionality,
                                        second_level_block_t *p);


/* Helper functions */

/* Allocate an empty second-level block that distinguishes `bits_new`
 * hash bits.  The block starts with no entries, no overflow chain and
 * no list neighbours; the slots of its entry table are left
 * uninitialized.
 */
static second_level_block_t *new_second_level_block(unsigned char bits_new=0) {
    second_level_block_t *const block = new second_level_block_t;

    block->prev     = NULL;
    block->next     = NULL;
    block->overflow = NULL;
    block->entries  = 0;
    block->bits     = bits_new;

    return block;
}


/* Allocate an empty overflow block: no entries yet and no successor
 * in the overflow chain.  The slots of its entry table are left
 * uninitialized.
 */
static overflow_block_t*   new_overflow_block(void) {
  overflow_block_t *const block = new overflow_block_t;

  block->overflow = NULL;
  block->entries  = 0;

  return block;
}




/* Free a second-level block together with everything it owns: its
 * overflow chain and the key arrays of its own entries.  A NULL
 * pointer is accepted and ignored.
 */
static void delete_second_level_block(second_level_block_t *p) {
  if (p == NULL) {
    return;
  }

  delete_overflows(p->overflow);

  /* The (double*) keys live outside the page, but we allocated them
   * ourselves, so they must be released here or they would leak.
   */
  for (unsigned int slot = 0; slot < p->entries; slot++) {
    delete [] p->table[slot].key;
  }

  delete p;
}


/* Free an entire overflow chain, starting at `p`, without recursion.
 * The key array of every entry in every block is released as well.
 * A NULL pointer (empty chain) is accepted.
 */
static void delete_overflows(overflow_block_t *p) {
  while (p != NULL) {
    /* The entries' key arrays are separate allocations. */
    for (unsigned int slot = 0; slot < p->entries; slot++) {
      delete [] p->table[slot].key;
    }

    /* Remember the successor before the block itself goes away. */
    overflow_block_t *const rest = p->overflow;
    delete p;
    p = rest;
  }
}




/* Hash a key made of `dimensionality` doubles into an unsigned int.
 *
 * For now this is a simple multiplicative hash over the bytes of each
 * component, taken in memory order.  Three properties matter:
 *
 *  - Bytes are read as unsigned char, so the result does not depend on
 *    whether plain char is signed on the target platform.
 *  - The component size comes from sizeof(double) rather than a
 *    hard-coded 8.
 *  - keycmp() compares components with !=, which treats +0.0 and -0.0
 *    as equal even though their byte patterns differ.  Negative zero is
 *    therefore normalized to positive zero before hashing, so that keys
 *    which compare equal always hash to the same bucket.
 *
 * A key with dimensionality 0 hashes to 0.  Keys containing NaN never
 * compare equal to anything under keycmp(), so no attempt is made to
 * canonicalize them.
 */
static unsigned int hash_key(const double *key, unsigned int dimensionality) {
  static const unsigned int multiplier = 116507; /* arbitrary choice */

  unsigned int hash = 0;
  unsigned int dim  = 0;

  for (dim = 0; dim < dimensionality; dim++) {
    double               component = key[dim];
    const unsigned char *bytes     = (const unsigned char *) &component;
    unsigned int         byte_idx  = 0;

    /* True for both +0.0 and -0.0; the assignment forces the +0.0
     * byte pattern.
     */
    if (component == 0.0) {
      component = 0.0;
    }

    for (byte_idx = 0; byte_idx < sizeof(double); byte_idx++) {
      hash *= multiplier;
      hash += bytes[byte_idx];
    }
  }

  return hash;
}


/* Search one bucket for `key`: first the primary second-level block
 * `p`, then each block of its overflow chain in order.  Keys are
 * compared in full with keycmp() (the raw key, not its hash).
 *
 * Returns the address of the first matching entry (there really
 * should be only one), or NULL if `p` is NULL or no entry matches.
 */
static entry_t *find_second_level_entry(const double  *key, 
                                        unsigned int dimensionality,
                                        second_level_block_t *p) {
  if (p == NULL) {
    return NULL;
  }

  const int primary_count = p->entries;
  assert(primary_count <= ENTRIES_PER_SECOND_LEVEL_BLOCK);

  for (int slot = 0; slot < primary_count; slot++) {
    entry_t *candidate = &p->table[slot];

    if (keycmp(dimensionality, candidate->key, key) == 0) {
      return candidate;
    }
  }

  /* Not in the primary block; walk the overflow chain, if any. */
  for (overflow_block_t *ovr = p->overflow; ovr != NULL; ovr = ovr->overflow) {
    const int overflow_count = ovr->entries;

    for (int slot = 0; slot < overflow_count; slot++) {
      entry_t *candidate = &ovr->table[slot];

      if (keycmp(dimensionality, candidate->key, key) == 0) {
        return candidate;
      }
    }
  }

  return NULL;
}


/* Compare two keys component by component over `dimensionality`
 * doubles.  Returns 0 when every component pair compares equal (which
 * includes the case of zero dimensions) and 1 as soon as one differs.
 */
static unsigned int keycmp(unsigned int dimensionality, 
                           const double *key_0, 
                           const double *key_1) {
  unsigned int remaining = 0;

  for (remaining = dimensionality; remaining > 0; remaining--, key_0++, key_1++) {
    if (*key_0 != *key_1) {
      return 1;
    }
  }

  return 0;
}



/* Method implementations */
/* Build an empty table for keys of `dimensionality_new` doubles.
 *
 * The table starts with zero index bits: a one-slot top-level array
 * whose only bucket is also the sole member (head and tail) of the
 * block list.  The cursor starts out unpositioned.
 *
 * Warning: `in_memory` is ignored!
 */
ExtHash::ExtHash(unsigned int dimensionality_new, int in_memory):
   bits(0), table(NULL), head(NULL), tail(NULL), curs_sec(NULL),
   curs_ovr(NULL), curs_idx(0) {

  (void) in_memory;

  dimensionality = dimensionality_new;

  table    = new p_second_level_block_t[1];
  table[0] = new_second_level_block();

  head = table[0];
  tail = head;
}



/* Release every bucket by walking the block list from `head` (each
 * block is deleted together with its overflow chain and key arrays),
 * then release the top-level bucket array.
 */
ExtHash::~ExtHash(void) {
  while (head != NULL) {
    second_level_block_t *const doomed = head;

    /* Step past the block before it is freed. */
    head = doomed->next;
    delete_second_level_block(doomed);
  }

  delete [] table;
}




/* Look up `key` and return the value stored for it.
 *
 * Returns 0 when the key is not in the table.  A key that was stored
 * with the value 0 is therefore indistinguishable from a missing one.
 */
unsigned int ExtHash::fetch(const double *key) {
  const unsigned int hash = hash_key(key, dimensionality);

  /* Use an unsigned shift: store() lets `bits` grow to
   * 8*sizeof(int)-1, at which point the signed form (1 << bits) - 1
   * would overflow.
   */
  const unsigned int bitmask = (1u << bits) - 1u;

  second_level_block_t *p2 = table[hash & bitmask];
  assert(p2);

  entry_t *pe = find_second_level_entry(key, dimensionality, p2);

  return pe ? pe->val : 0;
}



/* Increment the value stored for `key` and return the new value.
 *
 * If the key is not yet in the table it is inserted with the value 1
 * (via store()), and the result of that store is returned.
 */
unsigned int ExtHash::fetch_plusplus(const double *key) {
  const unsigned int hash = hash_key(key, dimensionality);

  /* Use an unsigned shift: store() lets `bits` grow to
   * 8*sizeof(int)-1, at which point the signed form (1 << bits) - 1
   * would overflow.
   */
  const unsigned int bitmask = (1u << bits) - 1u;

  second_level_block_t *p2 = table[hash & bitmask];
  assert(p2);

  entry_t *pe = find_second_level_entry(key, dimensionality, p2);

  if (pe) {
    /* It's there. */
    (pe->val)++;
    return pe->val;
  }

  /* Store and return a 1. */
  return (store(key, 1));
}





/* A long method...
 */
unsigned int ExtHash::store(const double* key, unsigned int val) {
  unsigned int hash        = hash_key(key, dimensionality);
  unsigned int masked      = 0;
  unsigned int bitmask     = 0;
  int index       = 0;
  int bucket_idx  = 0;
  int bucket2_idx = 0;
  unsigned int mismatch    = 0;
  second_level_block_t *p2 = NULL;
  second_level_block_t **p2arr = NULL;
  entry_t              *pe = NULL;
  overflow_block_t     *po = NULL;


  bitmask = (1 << bits) - 1;    
  bucket_idx = hash & bitmask;

  p2 = table[bucket_idx];
  assert(p2);
  
  /* First, check to see whether it's *already* in that bucket.  If so,
   * we'll replace the old entry.
   */
  if ((pe = find_second_level_entry(key, dimensionality, p2))) {
    /* Already here.  Yay, this makes our job easy. */
    pe->val = val;
    return val;
  } 

  /* Second case:  It's not there, but there's room in the primary 
   * second-level bucket to accept it. 
   */
  if ((p2->entries) < ENTRIES_PER_SECOND_LEVEL_BLOCK) {
    index = (p2->entries)++;
    ((p2->table)[index]).key = new double[dimensionality];
    memcpy((char*) (((p2->table)[index]).key), (char*) key,
           sizeof(double) * dimensionality);

    ((p2->table)[index]).val = val;
    return val;
  }

  /* Third case:  It's not there, but there's room in the last overflow
   * block.
   */
  po = p2->overflow;
  while (po && (po->overflow)) { po = po->overflow; };

  if (po && ((po->entries) < ENTRIES_PER_OVERFLOW_BLOCK)) {
    index = (po->entries)++;
    ((po->table)[index]).key = new double[dimensionality];
    memcpy((char*) (((po->table)[index]).key), (char*) key,
           sizeof(double) * dimensionality);
    po->table[index].val = val;
    return val;
  }


  /* Fourth case:  There's no room in the bucket, or in overflow buckets.
   * However, we're not using as many bits as the top-level index suggests --
   * (p2->bits < ht->bits).  This means that we don't need to reallocate the
   * hash table.  However, we do need to use another bit (if possible), then
   * create a new bucket and rehash the current contents of the old bucket.
   * Then, we recurse.   This should not cause a stack overflow, since 
   * the recursion is pretty limited, but...
   */  
  if ((p2->bits) < bits) {
    bitmask = (0x1 << (p2->bits)) - 1;
    bucket_idx  = hash & bitmask;
    bucket2_idx = bucket_idx;

    /* Since it's not using all the bits, these should still point to the
     * same bucket. 
     */
    table[bucket_idx]  = new_second_level_block(p2->bits + 1);

    if (p2->prev) {
      p2->prev->next = table[bucket_idx];
      table[bucket_idx]->prev = p2->prev;
    } else {
      assert(head == p2);
      head = table[bucket_idx];
    }

    if (p2->next) {
      p2->next->prev = table[bucket_idx];
      table[bucket_idx]->next = p2->next;
    } else {
      assert(tail == p2);
      tail = table[bucket_idx];
    } 

    /* Fix up other buckets that may refer to this. */
    bitmask++;
    for (index=0; index < (0x1 << bits); index++) {
       if (table[index] == p2) {

         if (index & bitmask) {
           if (bucket2_idx == bucket_idx) {
             bucket2_idx = index;
             table[index] = new_second_level_block(p2->bits + 1);

             assert(tail);  /* always at least one bucket */
             tail->next = table[index];    
             table[index]->prev = tail;
             tail = table[index];
           }
           table[index] = table[bucket2_idx];
         } else {
           table[index] = table[bucket_idx];
         }  
       }
    }

    /* Re-insert everything from p2.  They *should* land in just those
     * two buckets.
     */
    for (index=0; index < p2->entries; index++) {
      if ((!store((p2->table)[index].key, (p2->table)[index].val))) {
        delete_second_level_block(p2);
        return 0;
      }
    }
 
    /* Don't forget the overflow blocks, if any. */
    po = p2->overflow;
    while (po) {
      for (index=0; index < po->entries; index++) {
        if ((!store((po->table)[index].key, (po->table)[index].val))) {
          /* oops */
          delete_second_level_block(p2);
          return 0;
        }
      }
      po = po->overflow;
    }

    /* Delete the old blocks. */   
    delete_second_level_block(p2);

    return store(key, val);
  }


  /* Fifth case:  No room in the bucket (including overflow), and the 
   * bucket is using as many bits as the rest of the table.  This is 
   * given, else it would have fallen into a previous case.  The new 
   * constraint is that every object, AND the one to be inserted, 
   * have unique keys that hash to the very same value (considering 
   * the 8*sizeof(int)-1 least-significant bits); in this case, 
   * splits are completely irrelevant and we MUST use overflow blocks.
   * The possible failure mode in this case is that the new() fails, 
   * which will actually cause an exception, instead of a return 
   * value.
   */

  mismatch = 0;
  bitmask  = (((unsigned int) 0x1) << ((8*sizeof(unsigned int) - 1))) - 1;
  masked = hash & bitmask;
 
  for (index=0, pe=&(p2->table[0]); index < p2->entries; index++, pe++) {
    if ((hash_key(pe->key, dimensionality) & bitmask) == masked) {
      /* Coincidence, arrrrrrrrrgh.  (Or a bad hash function). */
    } else {
      mismatch = 1;
      break;
    }
  }

  if (!mismatch) {
     /* Matches everything in the primary block.  Check overflow blocks for
      * mismatches... if none found, then we'll need to go with a new
      * overflow block.  If matches are found, we'll fall through to the next
      * case:  doubling the size of the top-level table. 
      */
     po = p2->overflow;
     while (po && (!mismatch)) {
       for (index=0, pe=&(po->table[0]); index < po->entries; index++, pe++) {
         if ((bitmask & hash_key(pe->key, dimensionality)) == masked) {
           /* Coincidence, arrrrrrrrrgh.  (Or a bad hash function). */
         } else {
           mismatch = 1;
          break;
         }
       }
       po = po->overflow;
     }
 
    if (!mismatch) {
      /* They all matched, at least except for the MSB.  *sigh* Either bug,
       * or incredibly contrived input.
       */
      po = p2->overflow;
      if (!po) {
        p2->overflow = new_overflow_block();
        po = p2->overflow;
      } else {
        while (po->overflow) { 
          /* should be full -- make sure. */
          assert(po->entries == ENTRIES_PER_OVERFLOW_BLOCK);
          po = po->overflow;
        }
        po->overflow = new_overflow_block();
        po = po->overflow;
      }
        
      po->entries = 1;
      po->table[0].key = new double[dimensionality];
      memcpy((char*) (((po->table)[0]).key), (char*) key,
             sizeof(double) * dimensionality);      
      po->table[0].val = val;

      return val;
    }
  }


  /* Sixth case:  The bucket is full, including overflows.  The bucket is also
   * using the same number of bits as the main table, meaning that only one
   * pointer in the table points to it.  Not every key in the bucket hashes to
   * the same value ("same" ignoring MSB).
   *
   * Special constraint for this case:  we have already reached the maximum
   * number of bits allowed ((8*sizeof(int)) - 1).  Like the previous case,
   * this means that we can't split (at all; in the previous case, we could,
   * but it was pointless since they'd all go into exactly the same buckets
   * every split.  Here, we can't because we can't express the indices 
   * needed.  Therefore, we again need to use an overflow block.
   */

   if (bits == ((8*sizeof(int))-1)) {
      po = p2->overflow;
      if (!po) {
        p2->overflow = new_overflow_block();
        po = p2->overflow;
      } else {
        while (po->overflow) { 
          /* should be full -- make sure. */
          assert(po->entries == ENTRIES_PER_OVERFLOW_BLOCK);
          po = po->overflow;
        }
        po->overflow = new_overflow_block();
        po = po->overflow;
      }
        
      po->entries = 1;
      po->table[0].key = new double[dimensionality];
      memcpy((char*) (((po->table)[0]).key), (char*) key,
             sizeof(double) * dimensionality);      
      po->table[0].val = val;
      return val;
   }

  /* Seventh case:  We can, and should, increment the number of bits used
   * overall; double the size of the table; make sure that the new pointers
   * point to the old ones as well.  Then, we recurse; this should fall into
   * the fourth case.
   *
   * We duplicate the pointers by simply memcpy()-ing it twice.
   */


  index = 0x1 << bits;   /* previous size of table, in no. of pointers */

  p2arr = new p_second_level_block_t[index*2];

  for (bucket_idx=0; bucket_idx < index; bucket_idx++) {
    p2arr[bucket_idx] = (table)[bucket_idx];
    assert(p2arr[bucket_idx]);
    p2arr[bucket_idx+index] = p2arr[bucket_idx];
    table[bucket_idx] = NULL;
  }

  delete [] table;
  table = p2arr;  
  bits++;

  return (store(key, val));
}


/* Position the cursor on the first entry of the first non-empty
 * bucket in the block list and copy that entry out through
 * `key_dbls` and `value`.
 *
 * Returns 0 on success, or -1 when no bucket holds any entry (the
 * outputs are then left untouched).
 */
int ExtHash::cursor_first(double *key_dbls, unsigned int &value) {
  curs_ovr = NULL;
  curs_idx = 0;

  for (curs_sec = head; curs_sec; curs_sec = curs_sec->next) {
    if (curs_sec->entries) {
      return cursor_retrieve(key_dbls, value);
    }
  }

  return -1;
}

/* Advance the cursor by one entry and copy out the entry it lands on.
 *
 * Within a bucket the order is: the primary page, then each overflow
 * page in chain order.  Past the end of a bucket the cursor moves to
 * the next non-empty bucket in the block list.  Returns 0 on success
 * and -1 once the cursor has run off the end of the list.
 *
 * If the cursor is not positioned anywhere (never initialized, or
 * already past the end), this behaves like cursor_first().
 */
int ExtHash::cursor_next(double *key_dbls, unsigned int &value) {
  if (!curs_ovr && !curs_sec) {
    /* Not in ANY page, so reset to first. */
    return cursor_first(key_dbls, value);
  }

  curs_idx++;

  if (curs_ovr) {
    /* Currently on an overflow page. */
    if (curs_idx < curs_ovr->entries) {
      /* Still on the same overflow page.  Good. */
      return cursor_retrieve(key_dbls, value);
    }

    /* Off the overflow page... try the next one. */
    curs_ovr = curs_ovr->overflow;
    curs_idx = 0;

    if (curs_ovr) {
      /* There is one, so return the first item on it. */
      return cursor_retrieve(key_dbls, value);
    }

    /* Chain exhausted; move on to the next bucket below.  Note that
     * curs_sec MUST be non-NULL whenever curs_ovr was.
     */
    assert(curs_sec);
  } else {
    /* On the primary page of a bucket. */
    if (curs_idx < curs_sec->entries) {
      /* Still in first page of bucket. */
      return cursor_retrieve(key_dbls, value);
    }

    /* Off page.  Overflow pages come before the next bucket. */
    curs_idx = 0;

    if (curs_sec->overflow) {
      curs_ovr = curs_sec->overflow;
      return cursor_retrieve(key_dbls, value);
    }
  }

  /* Done with this bucket: find the next one that holds anything. */
  for (curs_sec = curs_sec->next; curs_sec; curs_sec = curs_sec->next) {
    if (curs_sec->entries) {
      return cursor_retrieve(key_dbls, value);
    }
  }

  return -1;
}



/* Return 0 on success, -1 on failure.  Failure basically
 * means that the cursor was never initialized, or has 
 * advanced out of the table.
 *
 * Recall that allocating the memory for key_dbls is the 
 * user's responsibility.
 */
int ExtHash::cursor_retrieve(double *key_dbls, unsigned int &value) {
  entry_t e = {0,0};

  if (curs_ovr) {
    /* In an overflow page. */
    e = (curs_ovr->table)[curs_idx];
    memcpy((char*) key_dbls, (char*) (e.key), sizeof(double) * dimensionality);
    value = e.val;
    return 0;
  } else if (curs_sec) {
    /* In a secondary bucket, first page. */
    e = (curs_sec->table)[curs_idx];
    memcpy((char*) key_dbls, (char*) (e.key), sizeof(double) * dimensionality);
    value = e.val;
    return 0;
  } else {
    /* Nowhere, man. */
    return -1;
  }
}

#endif /* USE_EXTHASH */
