/* A hash table facility.  Optimized for speed of access.

   The hash table is as simple as possible: we get the key, and use it
   to look in a place.  If the place is already filled with someone
   else, we munge the key to get a second place to look, and from
   there we just keep incrementing the address until we find it or a
   spot to put it, with wraparound at the end.  A sentinel is used to
   find the end.

   This is O(1) if the hash table isn't very full, but bogs down to
   O(n) as n->m, the table size.  We rehash when n->m/2, which is
   probably earlier than optimal.

   With the current tuning of parameters, just before a rehash there
   are about 1.3 probes per read access and just after about 1.05,
   measured for a pretty wide range of input distributions.  1.0 would
   be optimal.  Secondary clustering is visible, but there isn't enough
   to cause real problems.

   To speed things up at the expense of space, slots for storing the
   keys could be allocated, and instead of "strcmp(str,*j)" in the
   inner loop one would have "key == j->key && strcmp(str,j->str)"
   which is obviously faster.  But this would only be worthwhile if
   the probe figures get high, which will happen only if the rehash
   threshold is increased, which will be done only if space is tight!

   Set bits in HT_MONITOR as follows for various monitoring options:

   1 Short rehash notifications.
   2 Probe count on each access.
   4 Performance estimate on each rehash.
   8 Display hash table configuration on each rehash.  0's are good.
   */

/* Bit mask selecting the monitoring code compiled into this file (see
   the list of bits above).  With the value 1, only the one-line
   "hash table grown" note is written to stderr on each rehash.  */
#define HT_MONITOR 1

/* stdio supplies NULL, plus fprintf/stderr for the HT_MONITOR output. */
#include <stdio.h>
#include <strings.h>
#include "hash.h"

/* make lint happy: old-style declaration (no prototype) that gives
   malloc a char * return type for the calls below. */
extern char *malloc();


/* make_hash_table() creates an empty hash table.  The 'offset' is the
   size of a structure to be allocated preceding each string in the
   table.  For instance, if "foo" is inserted into a table with an
   offset of 8, a block of 12 characters is malloc'ed, and the last 4
   of them get "foo" inserted, and a pointer to that "foo" is returned.
   Thus, the extra 8 characters can be used for making a hairy
   structure.  Just subtract back from the returned string.

   If you're only interested in making unique instances of strings,
   offset should be 0.

   Estimate is an estimate of how many elements will be inserted into
   the table.  It is used to allocate the table at a reasonable
   initial size so lots of rehashing won't be necessary: estimates of
   10 or fewer get 21 slots, larger ones get 2*estimate+1.

   Returns the new table, or NULL if memory for it cannot be
   allocated.  */

hash_table make_hash_table(offset,estimate)
     unsigned offset,estimate;
{
  register hash_table table;
  register char **slots;
  register unsigned i;
  unsigned size = estimate>10 ? 2*estimate+1 : 21;

  table = (struct hash_table_str *) malloc(sizeof(struct hash_table_str));
  if (table == NULL)
    return NULL;

  /* One slot more than 'size' is allocated: intern() keeps a NULL
     sentinel at the top of the array that is never filled.  */
  slots = (char **) malloc((size+1)*sizeof(char *));
  if (slots == NULL)
    {
      free((char *)table);
      return NULL;
    }

  /* Every slot, sentinel included, must start out empty.  The slots
     are cleared through a char ** view, the same view intern() uses,
     so each pointer-sized slot is really set to NULL.  */
  for (i=0; i<size+1; i++)
    slots[i] = NULL;

  table->size = size;
  table->count = 0;
  table->extra_offset = offset;
  table->array = slots;
  return table;
}

/* hash_key() computes the hash code of the NUL-terminated string str.
   Each character is squared and folded into an accumulator that is
   rotated left by 7 bits per character (the 7/25 shift pair is
   written for a 32-bit unsigned); the result is then scrambled by a
   multiplication and the low 10 bits are discarded.  The empty
   string hashes to 0.  */

unsigned hash_key(str)
     char *str;
{
  register unsigned char *str1 = (unsigned char *)str;
  register unsigned cache;
  register unsigned k=0;

  /* If you change this, be sure to monitor performance carefully
     before and after, as it has been tuned.  Idle tweaking is
     unlikely to improve things.  */

  while ((cache = *str1++) != 0)
    k = ( k << 7 | k >> 25 ) ^ cache*cache;
  /* The following magic number is floor( 2^32 * (sqrt(5)-1)/2 ).
     The 'u' suffix matters: without it the constant has type long
     where long is 64 bits, so the product would be computed as a
     signed 64-bit value (which can overflow) instead of wrapping
     modulo 2^32 as the hash intends.  */
  return 2654435769u * k >> 10;
}

/* intern() does the insertions and lookups in hash tables.  ht is the
   hash table to be looked in, str is the string to be looked up or
   inserted, and add is a flag which, if true, causes a copy of the
   string to be added if it isn't found, and if false causes NULL to
   be returned when the string is not found.

   On success the return value points at the table's own copy of the
   string; that copy is preceded by ht->extra_offset bytes of
   (uninitialized) space belonging to the same malloc'ed block.

   NULL is also returned when add is true but memory for the copy, or
   for the larger array needed by a rehash, cannot be allocated.  In
   that case the table is left exactly as it was.  */

char *intern(ht,str,add)
     hash_table ht;
     char *str;
     int add;                   /* If FALSE, won't modify table. */
{
  char **table = ht->array;
  unsigned size = ht->size;
  register unsigned key = hash_key(str);
  register char **j = &table[key % size];
  register char *cache;
  char *result;                 /* The interned copy handed back. */
  char **new_table = NULL;      /* Non-NULL only when a rehash is due. */
  unsigned new_size = 0;
#if (HT_MONITOR & 2)
  unsigned probe_count = 0;
#endif

  /* First we do an initial probe: */
  if ((cache=(*j)) != NULL)
    {
#if (HT_MONITOR & 2)
      probe_count += 1;
#endif
      if (strcmp(str,cache) == 0)
        {
#if (HT_MONITOR & 2)
          fprintf(stderr, "(found %s %d)", str, probe_count);
#endif
          return cache;
        }
      else
        {
          /* Second place to look; from here on the search is linear. */
          j = &table[(key*key) % size];
          /* This 'while' is the inner loop; things are set up to make
             it fast.  To make the test simpler, a sentinel value of
             NULL is kept at the top of the hash table and never filled. */
          while ((cache=(*j)) != NULL)
            {
#if (HT_MONITOR & 2)
              probe_count += 1;
#endif
              if (strcmp(str,cache) == 0)
                {
#if (HT_MONITOR & 2)
                  fprintf(stderr, "(found %s %d)", str, probe_count);
#endif
                  return cache;
                }
              j++;
            }
          if (j == &table[size])
            {

              /* Ooops, we hit the sentinel.  Restart from the top of
                 the table. */

              /* Since we are guaranteed that the hash table has room,
                 we don't need to do any sentinel checking here. */

              j = table;
              /* This is the inner loop again: */
              while ((cache=(*j)) != NULL)
                {
#if (HT_MONITOR & 2)
                  probe_count += 1;
#endif
                  if (strcmp(str,cache) == 0)
                    {
#if (HT_MONITOR & 2)
                      fprintf(stderr, "(found %s %d)", str, probe_count);
#endif
                      return cache;
                    }
                  j++;
                }
            }
        }
    }

  /* Not found; j now points at the empty slot where str belongs. */
  if (add==0)
    {
#if (HT_MONITOR & 2)
      fprintf(stderr, "(not found %s %d)", str, probe_count);
#endif
      return NULL;
    }

  /* Get all the memory this insertion needs before touching the
     table, so that running out of memory leaves the table intact. */
  result = (char *) malloc(strlen(str)+1 + ht->extra_offset);
  if (result == NULL)
    return NULL;
  if (size < 2*(ht->count + 1) + 1)
    {
      /* Getting full, better rehash once the string is in.
         The new size should be primeish (eg. not even). */
      new_size = 3*size | 1;
      new_table = (char **) malloc( (new_size+1) * sizeof( char * ) );
      if (new_table == NULL)
        {
          free(result);
          return NULL;
        }
    }

  /* The string itself lives after the caller's extra_offset bytes. */
  result += ht->extra_offset;
  strcpy(result, str);
  *j = result;
  ht->count += 1;
#if (HT_MONITOR & 2)
      fprintf(stderr, "(inserted %s %d)", str, probe_count);
#endif
  if (new_table != NULL)
    {
      register int i;
      char *to_move;
#if (HT_MONITOR & (1|4|8))
      fprintf(stderr, "Note: hash table grown, %d -> %d (%d in use).\n",
              size, new_size, ht->count);
#endif
      /* Clear every slot, including the sentinel at new_table[new_size]. */
      for (i=0; i<new_size+1; i++)
        new_table[i] = NULL;

      /* Now its time to move everything from the old table to the new one.
         We go through the table BACKWARDS because the reprobes go forwards,
         so we get a better distribution when rehashing a cluster. */

      for (i=size-1; i >= 0; i--)
        if ( (to_move = table[i]) != NULL )
          {
            key = hash_key(to_move);
            j = &new_table[key % new_size];
            /* Find the spot ... */
            if ( *j != NULL )
              {
                j = &new_table[ (key*key) % new_size ];
                while ( *j != NULL )
                  j++;
                if (j == &new_table[new_size])
                  {
                    j = &new_table[0];
                    while ( *j != NULL )
                      j++;
                  }
              }
            /* ... move the blot. */
            *j = to_move;
          }
#if (HT_MONITOR & (4|8))
      {
        unsigned total = 0;
        int i1;

        fprintf(stderr, "Old table: ");
#if (HT_MONITOR & 8)
        fprintf(stderr, "\n");
#endif
        for (i=0; i<size; i++)
          if ( (cache = table[i]) == NULL )
            {
#if (HT_MONITOR & 8)
              fputc('-',stderr);
#endif
            }
          else
            {
              /* i1 is 0 for an entry sitting at its initial probe
                 slot, otherwise one more than its distance past the
                 secondary slot. */
              key = hash_key(table[i]);
              if (i == key%size)
                i1 = 0;
              else
                {
                  i1 = i - (key*key) % size;
                  if (i1 < 0) i1 = i1 + size;
                  i1 += 1;
                }
              total += i1;
#if (HT_MONITOR & 8)
              i1 += '0';
              if (i1 > '9') i1 = '*';
              fputc(i1,stderr);
#endif
            }
#if (HT_MONITOR & 8)
        fprintf(stderr, "\n");
#endif
        fprintf(stderr, "mean bad probes per entry: %f.\nNew table: ",
                ((float)total) / (float)ht->count);

        total = 0;
#if (HT_MONITOR & 8)
        fprintf(stderr, "\n");
#endif
        for (i=0; i<new_size; i++)
          if ( (cache = new_table[i]) == NULL )
            {
#if (HT_MONITOR & 8)
              fputc('-',stderr);
#endif
            }
          else
            {
              key = hash_key(new_table[i]);
              if (i == key%new_size)
                i1 = 0;
              else
                {
                  i1 = i - (key*key) % new_size;
                  if (i1 < 0) i1 = i1 + new_size;
                  i1 += 1;
                }
              total += i1;
#if (HT_MONITOR & 8)
              i1 += '0';
              if (i1 > '9') i1 = '*';
              fputc(i1,stderr);
#endif
            }
#if (HT_MONITOR & 8)
        fprintf(stderr, "\n");
#endif
        fprintf(stderr, "mean bad probes per entry: %f.\n",
                ((float)total) / (float)ht->count);
        fflush(stderr);
      }
#endif
      ht->array = new_table;
      ht->size = new_size;
      /* Only the old slot array is released; the strings themselves
         were moved, not copied. */
      free((char *)table);
    }
  /* Return the saved string pointer: after a rehash the slot it was
     first stored in belongs to the freed array and must not be read. */
  return result;
}
