/*	This file is part of the software similarity tester SIM.
	Written by Dick Grune, Vrije Universiteit, Amsterdam.
	$Header: hash.c,v 2.1 91/06/18 22:01:15 dick Exp $
*/

/*	Text is compared by comparing every substring to all substrings
	to the right of it; this process is in essence quadratic.  However,
	only substrings of length at least 'min_run_size' are of interest,
	which gives us the possibility to speed up this process by using
	a hash table.
	
	For every position in the text, we construct an index which gives
	the next position in the text where a run of min_run_size tokens
	starts that has the same hash code.  If there is no such run, the
	index is 0.  These forward references are kept in the array
	forw_ref[].
	
	To construct this array, we use a hash table hash[], such that
	hash[i] is the index of the latest token with hash_code i,
	or 0 if there is none.  See make_forw_ref().

	For long text sequences (say hundreds of thousands of tokens),
	the hashing is not really efficient any more since too many
	spurious matches occur.  Therefore, the forward reference table is
	scanned a second time, eliminating from any chain all references to
	runs that do not end in the same token.  For the UNIX manuals this
	reduced the number of matches from 91.9% to 1.9% (of which 0.06%
	was genuine).
*/

#include	"private.h"
#include	"debug.h"
#include	"token.h"
#include	"buff.h"
#include	"sim.h"

/* storage allocator, declared by hand in pre-ANSI style (no prototype) */
extern char *calloc();

/* the following are defined elsewhere in the program */
extern char options[];			/* indexed by option character */
extern int ntexts;			/* number of entries in text[] */
extern struct text *text;		/* the texts, each delimited by
					   tx_start and tx_limit */
extern int min_run_size;		/* length of the runs that are hashed */

/* number of entries in the hash table and modulus of hash1() */
#define	N_HASH		10639		/* any suitable prime */

/*	The forward references table, indexed by token position;
	allocated (zero-filled, through calloc()) by make_forw_ref()
	and released by free_forw_ref().  It is 0 if no table could be
	or should be allocated.
*/
unsigned int *forw_ref;			/* to be filled by calloc() */

PRIVATE int
hash1(p)
	TOKEN *p;
{
	/*	hash1(p) returns the hash code, in the range 0..N_HASH-1,
		of the min_run_size tokens starting at p; caller guarantees
		that there are at least min_run_size tokens.

		The code is accumulated in an unsigned int.  Shifting a
		signed int into its sign bit and testing for a negative
		result afterwards relies on undefined behaviour, and a
		compiler is allowed to remove such a test.  Instead, the
		accumulator is folded back modulo N_HASH as soon as one
		more doubling could reach the top bit, so no token ever
		gets shifted out of the hash code unnoticed.
	*/
	register unsigned int h = 0;
	register int n = min_run_size;
	register TOKEN *tkp = p;
	/* largest value whose doubling still leaves the top bit clear */
	register unsigned int limit = ~(unsigned int)0 >> 2;
	
	while (n-- > 0) {
		h = (h << 1) + TOKEN2int(*tkp++);
		if (h > limit) {
			/* getting too large; correct by taking % */
			h %= N_HASH;
		}
	}
	return (int)(h % N_HASH);
}

PRIVATE int
hash2(p)
	TOKEN *p;
{
	/*	The secondary hash code of the run of min_run_size tokens
		starting at p, used in the second sweep over the forward
		references: the first token of the run, shifted left over
		8 bits, plus the last token of the run.
	*/
	register TOKEN *first = p;
	register TOKEN *last = p + min_run_size - 1;

	return (TOKEN2int(*first) << 8) + TOKEN2int(*last);
}

make_forw_ref() {
	/*	Constructs the forward references table forw_ref[]:
		for a position i at which a run may start, forw_ref[i]
		is the next position at which a run of min_run_size
		tokens starts with the same hash1() and hash2() codes,
		or 0 if there is none.

		If the tables cannot be allocated, or if option 'x' is
		set, a message is printed and forw_ref is left 0.
	*/
	/*	Hash[] is a local array and we should like to keep it
		that way, were it not that some feeble compilers/machines
		don't like large arrays on the stack.  So we allocate it
		through calloc() and free it upon exit (all exits!).
	*/
	register unsigned int *hash;	/* to be filled by calloc() */
	register int n;
	register unsigned int i;
	
	/* allocate and clear the arrays */
	hash = (unsigned int *)calloc(N_HASH, sizeof (unsigned int));
	forw_ref =
		options['x'] ? 0 :
		(unsigned int *)
			calloc(text_length(), sizeof (unsigned int));
	if (!hash || !forw_ref) {
		printf(">>> Not enough memory for the hash tables, ");
		printf("this is going to take time!\n\n");
		if (hash) {
			free((char *)hash);
		}
		if (forw_ref) {
			/*	Only hash[] could not be allocated.  The
				zero-filled forw_ref[] must not survive:
				it would claim that no position has a
				successor, rather than that there is no
				table (presumably its users test forw_ref
				for 0, as free_forw_ref() does).
			*/
			free((char *)forw_ref);
			forw_ref = 0;
		}
		return;
	}

	/* set up the forward references using the hash table */
	for (n = 0; n < ntexts; n++) {
		register struct text *txt = &text[n];
		register unsigned int j;
		
		for (	/* all positions in txt except the last min_run_size */
			j = txt->tx_start;
			j + min_run_size < txt->tx_limit + 1;
			j++
		) {
			if (MayBeStartOfRun(tk_buff[j])) {
				register int h = hash1(&tk_buff[j]);
				
				/*	hash[h] is the latest position with
					hash code h, or 0 if there is none;
					link that position forward to j
				*/
				if (hash[h]) {
					forw_ref[hash[h]] = j;
				}
				hash[h] = j;
			}
		}
	}
	free((char *)hash);

#ifdef	DB_HASH
	db_forw_ref();
#endif	/* DB_HASH */

	/* clean out spurious matches */
	for (i = 0; i+min_run_size < text_length(); i++) {
		register unsigned int j = i;
		register int h2 = hash2(&tk_buff[i]);

		/* find first token in chain with same secondary hash code */
		while ((j = forw_ref[j]) && hash2(&tk_buff[j]) != h2) {
			/* do nothing */
		}
		/* short-circuit forward reference to it (0 if none found) */
		forw_ref[i] = j;
	}
#ifdef	DB_HASH
	db_forw_ref();
#endif	/* DB_HASH */

}

free_forw_ref() {
	/*	Releases the forward references table, if there is one.
		Forw_ref is reset to 0 afterwards, so that no dangling
		pointer remains and a second call does not free the
		same block twice.
	*/
	if (forw_ref) {
		free((char *)forw_ref);
		forw_ref = 0;
	}
}
