/* wordtable.c -- Copyright 1989, 1990 Liam R. Quin.  All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file ../COPYRIGHT for full details.
 */

/* Symbol Table Interface to text retrieval database.
 * Handles both the internal and external indexes.
 *
 * This originally used a linked list.  Converting to a hash table reduced
 * the time to index comp.os.vms from nearly an hour to one and a half
 * minutes...
 *
 * Liam Quin, 1989
 */

/* 
 * $Id: wordtable.c,v 2.18 92/08/24 00:23:46 lee Exp $
 */

#ifndef lint
 static char *Rcs = "$Id: wordtable.c,v 2.18 92/08/24 00:23:46 lee Exp $";
#endif

#include <stdio.h>
#include <malloc.h>
#include <ctype.h>
#include <sys/types.h>
#include <fcntl.h> /* for O_RDWR wtc */

#include "globals.h" /* defines and declarations for database filenames */
#include "error.h"

#include "smalldb.h"
#include "fileinfo.h"
#include "wordinfo.h"
#include "pblock.h"
#include "wordrules.h"
#include "emalloc.h"
#include "addfile.h"

/* Defined elsewhere; NewEntry() passes its result to AllocateWID() when a
 * cached word has no WID yet.
 */
extern t_WID GetNextWID();

/* Forward declarations.  Both are defined later in this file and are
 * called by FastDump() and DumpCache() to write one cached word out:
 * NewEntry() when the element's WID is 0, UpdateEntry() otherwise.
 */
static void NewEntry();
static void UpdateEntry();

/* NewEntry() and UpdateEntry() only call MkWIB() for a word when it has at
 * most this many word places.
 * NOTE(review): as parenthesised this evaluates to
 *     WIDBLOCKSIZE - ((MinWordLength + 2) / 3)
 * The doubled outer parentheses suggest that
 *     (WIDBLOCKSIZE - (MinWordLength + 2)) / 3
 * may have been intended -- confirm against the WID block layout before
 * changing it.
 */
#ifndef MaxWordPlacesInAWordBlock
# define MaxWordPlacesInAWordBlock ((WIDBLOCKSIZE-(MinWordLength+2)/3))
#endif

/* Default number of hash slots.  InitHash() recomputes HashSize from
 * MaxWordsInCache whenever MaxWordsInCache is non-zero, so this value is
 * only used as-is when MaxWordsInCache is 0.
 */
#ifndef HASHSIZ
# define HASHSIZ 32768 /* MUST be a power of two */
#endif /*!HASHSIZ*/

#ifndef MAXWORDSINCACHE
# define MAXWORDSINCACHE  (HASHSIZ * 10)
#endif

/* AddWord() dumps the cache once WordsInCache exceeds this value; 0 disables
 * that check.  Note that WordsInCache is incremented on every AddWord()
 * call, so the limit counts cached occurrences rather than distinct words.
 */
int MaxWordsInCache = MAXWORDSINCACHE;

/* Trace level defined elsewhere: the functions in this file print progress
 * and debugging text to stderr when it exceeds various thresholds.
 */
extern int AsciiTrace;

/** System calls and library functions used in this file: **/

/** Lqtext calls */
extern unsigned int Putpblock();
extern void DeleteWordPlaces();
extern t_WordInfo *MakeWordInfo();

/** System calls: */

/** Library Functions: */
extern char *strncpy();
extern void perror();
extern void exit();

#ifdef ASCIITRACE
extern void fprintWordInfo();
#endif
/**/

/* Program name, used as a prefix in the trace messages below. */
extern char *progname;

/* Number of slots in SymbolTable.  InitHash() recomputes it (always to a
 * power of two) when MaxWordsInCache is non-zero.  Hash() reduces with
 * "% HashSize"; the masking form that would require a power of two is
 * commented out there.
 */
static int HashSize = HASHSIZ; /* MUST be a power of two */

#define NPLACES 4
#define NPLACESBIGINCR 16
#define NPLACESHUGEINCR 128
/* Growth policy for the per-word Places array (see AddWord()).
 * NPLACES is small to optimise the common case -- by far the majority of
 * words are used fewer than 10 times.  For the words where that guess is
 * wrong there may be a few thousand occurrences.
 * With the values above, AddWord() grows the array 4, 8, 12, 16 (steps of
 * NPLACES while the size is <= NPLACES * 3), then to 32 (a single
 * NPLACESBIGINCR step, taken while the size is <= NPLACESBIGINCR), and
 * from then on in NPLACESHUGEINCR lumps.
 */

/* One cached word: the word itself plus the occurrences (word places)
 * collected for it since it was last written to the index.
 */
typedef struct s_HashEl {
    char *Word;			/* NUL-terminated private copy made by AddWord() */
    t_WID WID;			/* 0 until a WID is known; see AddWord(), NewEntry() */
    int PlacesUsed;		/* number of valid entries in Places */
    int PlacesAllocated;	/* number of entries Places has room for */
    t_WordPlace *Places;	/* occurrences, in the order AddWord() saw them */
} t_HashEl;

/* The hash table: HashSize pointers allocated by InitHash().  A null
 * pointer marks an empty slot; collisions are resolved in AddWord() by
 * stepping to the next slot, wrapping at HashSize.
 */
static t_HashEl **SymbolTable;
/* 0 until InitHash() has run, after which it equals HashSize; the dump
 * loops use it as their upper bound.
 */
static int LastEl = 0;
/* Incremented by AddWord() for every place stored, and reduced by each
 * dumped element's PlacesUsed in FastDump() and DumpCache().
 */
static int WordsInCache = 0;
/* FastDump() writes out elements whose PlacesUsed is <= this value; -1
 * makes DumpCache() skip FastDump() altogether.  See SetDumpThresh().
 */
static int DumpThresh = DUMP_FAST_THRESH;

void SetDumpThresh(Thresh)
    int Thresh;
{
    /* Choose the threshold used by fast dumps: FastDump() writes out any
     * cached word whose PlacesUsed is <= DumpThresh.
     *   Thresh == 0   selects the built-in default, DUMP_FAST_THRESH;
     *   Thresh == -1  makes DumpCache() always do a full dump instead;
     *   anything else is used as given.
     */
    DumpThresh = (Thresh != 0) ? Thresh : DUMP_FAST_THRESH;
}

/* Allocate the (empty) hash table and record its size.
 *
 * When MaxWordsInCache is non-zero, HashSize becomes the smallest power of
 * two that is >= MaxWordsInCache / 10, subject to a minimum of 1024 slots
 * (1 when compiled with ASCIITRACE).  When MaxWordsInCache is 0 the
 * compiled-in HashSize is used unchanged.
 *
 * Sets SymbolTable and LastEl; AddWord() calls this the first time it
 * finds LastEl == 0.
 */
static void
InitHash()
{
    if (MaxWordsInCache) {
#ifdef ASCIITRACE
        HashSize = 1; /* no minimum when debugging */
#else
        HashSize = 1024; /* silently enforced minimum... */
#endif
        /* The doubling cannot overflow: the target is at most INT_MAX / 10,
         * and HashSize is only doubled while it is still below the target,
         * so it never exceeds twice the target.
         */
        while (HashSize < MaxWordsInCache / 10) {
            HashSize <<= 1;
        }
    }

    if (HashSize < 1) {
        Error(E_FATAL, "InitHash: hash size (%d/%d) is too small!\n",
                                HashSize, MaxWordsInCache);
    }

    /* AddWord() relies on every slot starting out as a null pointer. */
    SymbolTable = (t_HashEl **) ecalloc(HashSize, sizeof(t_HashEl *));
    LastEl = HashSize; /* Used as a sentinel */
#ifdef ASCIITRACE
    if (AsciiTrace > 2) {
        /* Both values are plain ints, so they must be printed with %d;
         * %ld is undefined behaviour where long is wider than int.
         */
        fprintf(stderr, "%s: allocated %d hash slots for up to %d words\n",
            progname,
            HashSize,
            MaxWordsInCache
        );
    }
#endif
}

/* An all-zero element (the remaining members are implicitly zeroed);
 * SetElEmpty() assigns it to clear a freshly allocated t_HashEl.
 */
static t_HashEl ZeroEl = {
    0,
};

static void
SetElEmpty(El)	/* Initialisation function for Hash Elements */
    t_HashEl *El;
{
    *El = ZeroEl; /* structure assignment */
    El->PlacesAllocated = NPLACES;
    El->PlacesUsed = 0;
    El->Places = (t_WordPlace *) emalloc(sizeof(t_WordPlace) * NPLACES);
}

#ifndef Hash
/* Hash the first Length characters of Word into a slot number in the
 * range 0 .. HashSize - 1.
 *
 * The hash is n = c + 65599 * n applied to each character in turn,
 * starting from 0 and reduced modulo HashSize at the end.  A Length of
 * zero or less hashes to 0 in both variants below.
 *
 * Word is a plain char pointer, so characters with the top bit set
 * contribute differently depending on whether char is signed on the
 * platform.
 *
 * Unless NODUFF is defined the loop is unrolled eight times using Duff's
 * device: the switch jumps into the middle of the do-while to deal with
 * the Length % 8 leftover characters first.
 */
INLINE
int
Hash(Word, Length)
    register char *Word;
    register int Length;
{
    register unsigned long n = 0;

#ifndef NODUFF /* clever stuff for speedup... dmr-approved!... */

#define HASHC	n = *Word++ + 65599 * n

    if (Length > 0) {
	register int loop = (Length + 8 - 1) >> 3;

	switch(Length & (8 - 1)) {
	case 0:	do {
		HASHC;	case 7:	HASHC;
	case 6:	HASHC;	case 5:	HASHC;
	case 4:	HASHC;	case 3:	HASHC;
	case 2:	HASHC;	case 1:	HASHC;
		} while (--loop);
	}

    }
#else /* NODUFF */
    /* "> 0" keeps this in step with the unrolled version above: a negative
     * Length must not send the loop running off the end of Word.
     */
    while (Length-- > 0)
	n = *Word++ + 65599 * n;
#endif /* NODUFF */
    /* The alternative "n & (HashSize - 1)" would only be correct when
     * HashSize is a power of two; the modulo works for any positive size.
     */
    return n % HashSize;
}
#endif /* Hash */

void DumpCache(); /* see ../h/addfile.h */
/* Non-zero while AddWord() may skip Word2WID() lookups for new words.
 * AddWord() clears it when GetMaxWID() returns non-zero; FastDump() and
 * DumpCache() clear it after writing the cache out.
 */
static char FirstTimeRound = 1;

void
AddWord(WordInfo)
    t_WordInfo *WordInfo;
{
    register t_HashEl *HashEl;
    int Slot, FirstSlot;

    if (!WordInfo || !WordInfo->Word || !WordInfo->Word[0]) {
	Error(E_WARN, "Null word in AddWord(0x%x)", WordInfo);
	return;
    }

    if (!LastEl) {
	InitHash();
	if (FirstTimeRound) {
	    /* Special check to save looking up the WIDs first time round */
	    t_WID W = GetMaxWID();

	    if (W == 0L) {
		if (AsciiTrace > 1) {
		    fprintf(stderr, "\n%s: first ever run, allocating WIDs.\n",
			    progname);
		}
		FirstTimeRound = 1; /* actually it's already 1 here */
	    } else {
		FirstTimeRound = 0;
	    }
	}
    }
    
    if (MaxWordsInCache && WordsInCache > MaxWordsInCache) {
	DumpCache(DUMP_CACHE);
    }

    if (WordInfo->WordPlace.FID == 0) {
	Error(E_BUG, "AddWord: FID 0 for \"%s\"", WordInfo->Word);
    }

    FirstSlot = Slot = Hash(WordInfo->Word, WordInfo->Length);

    for (;;) {
	if (SymbolTable[Slot] == (t_HashEl *) NULL) {
	    extern char *strcpy();
	    extern t_WID Word2WID();

	    /* make a new element */
	    HashEl = SymbolTable[Slot] = (t_HashEl *) emalloc(sizeof(t_HashEl));
	    SetElEmpty(HashEl);
	    HashEl->Word = emalloc(WordInfo->Length + 1);
	    (void) strcpy(HashEl->Word, WordInfo->Word);
	    if (FirstTimeRound) {
		/* No point looking for the WID, it won't be there;
		 * code in DumpCache() will assign a new one.  Word2WID()
		 * will always return zero on a new index.
		 */
		HashEl->WID = 0;
	    } else {
		HashEl->WID = Word2WID(HashEl->Word, WordInfo->Length);
	    }
	    break;
	} else if (STREQ(SymbolTable[Slot]->Word, WordInfo->Word)) {
	    HashEl = SymbolTable[Slot];
	    break;
	}

	if (++Slot >= HashSize) Slot = 0;

	if (Slot == FirstSlot) {
	    /* We need to dump the cache and start again */
	    DumpCache(DUMP_CACHE|DUMP_FAST);
	}
    }

    /* If we get here, all we need to do is add the WordPlace */

    if (HashEl->PlacesAllocated - HashEl->PlacesUsed <= 0) {
	if (HashEl->PlacesAllocated <= NPLACES * 3) {
	    HashEl->PlacesAllocated += NPLACES;
	} else if (HashEl->PlacesAllocated <= NPLACESBIGINCR) {
	    HashEl->PlacesAllocated += NPLACESBIGINCR;
	} else {
	    HashEl->PlacesAllocated += NPLACESHUGEINCR;
	}
	HashEl->Places = (t_WordPlace *) erealloc(
	    (char *) HashEl->Places,
	    sizeof(t_WordPlace) * HashEl->PlacesAllocated
	);
    }

#ifdef ASCIITRACE
    if (AsciiTrace > 3) {
	if (HashEl->PlacesUsed) {
	    t_WordPlace *wp = &HashEl->Places[HashEl->PlacesUsed - 1];

	    if (wp->FID > WordInfo->WordPlace.FID) {
		Error(E_BUG, "AddWord: %s: place %d: FID %ld > %ld", 
		    HashEl->Word,
		    HashEl->PlacesUsed + 1,
		    wp->FID,
		    WordInfo->WordPlace.FID
		);
	    } else if (wp->FID == WordInfo->WordPlace.FID) {
		if (wp->BlockInFile > WordInfo->WordPlace.BlockInFile) {
		    Error(E_BUG, "AddWord: %s: place %d: FID %ld: Blk %ld > %ld",
			HashEl->Word,
			HashEl->PlacesUsed + 1,
			wp->FID,
			wp->BlockInFile,
			WordInfo->WordPlace.BlockInFile
		    );
		} else if (wp->BlockInFile == WordInfo->WordPlace.BlockInFile &&
			    wp->WordInBlock >= WordInfo->WordPlace.WordInBlock) {
		    Error(E_BUG,
			"AddWord: %s: place %d: FID %ld: Blk %ld: WIB %d > %d",
			HashEl->Word,
			HashEl->PlacesUsed + 1,
			wp->FID,
			wp->BlockInFile,
			wp->WordInBlock,
			WordInfo->WordPlace.WordInBlock
		    );
		}
	    }
	}
    }
#endif
    HashEl->Places[HashEl->PlacesUsed++] = WordInfo->WordPlace;

    WordsInCache++;

#ifdef ASCIITRACE
    if (AsciiTrace > 9) {
	fprintf(stderr, "Slot %d Word %s len %d places %d\n",
		Slot, SymbolTable[Slot]->Word,
		WordInfo->Length, SymbolTable[Slot]->PlacesUsed);
    }
#endif
    return;
}

/* Partial cache dump: write out, and remove from the table, every cached
 * word that has at most DumpThresh places, keeping the heavily used words
 * in memory.
 *
 * Removing entries from an open-addressing table can cut the probe chain
 * that leads to a surviving entry, so every survivor must end up in the
 * slot its hash selects.  A survivor found elsewhere is moved to its home
 * slot if that is empty, swapped with the home slot's occupant if that
 * occupant is about to be dumped anyway, and otherwise dumped itself.
 *
 * CallFree: if the DUMP_NOFREE bit is set, the storage of dumped elements
 * is not released.
 */
static void
FastDump(CallFree)
    int CallFree;
{
    extern void FlushCache();
    register int i;
    register t_HashEl *HashEl;
    int Progress = 0;

    if (AsciiTrace > 1) {
        fprintf(stderr, "fast mode (dump where n <= %d): ", DumpThresh);
    }

    for (i = 0; i != LastEl; i++) {
        int ForceDump;
        int len;

        if (!SymbolTable[i]) continue;

        HashEl = SymbolTable[i];
        len = strlen(HashEl->Word);
        ForceDump = 0;

        if (SymbolTable[i]->PlacesUsed > DumpThresh) {
            /* We can't simply delete the SymbolTable entry, because
             * if there's a collision (two or more entries have the same
             * hash code), we would no longer be able to find the second
             * entry.  So we have to make sure that everything that is left
             * is in the correct slot.
             */
            int Slot = Hash(HashEl->Word, len);

            if (Slot != i) {
                /* See if we can put the entry in the right place */
                if (SymbolTable[Slot] == 0) {
                    /* In this case we can simply put it in the right place*/
                    SymbolTable[Slot] = SymbolTable[i];
                    SymbolTable[i] = 0;
                } else if (SymbolTable[Slot]->PlacesUsed <= DumpThresh) {
                    register t_HashEl *tmp = SymbolTable[Slot];

                    SymbolTable[Slot] = SymbolTable[i];
                    HashEl = SymbolTable[i] = tmp;
                    /* HashEl is now a different word, so the length worked
                     * out above no longer applies; recompute it before it
                     * is handed to NewEntry() or UpdateEntry() below.
                     */
                    len = strlen(HashEl->Word);
                } else {
                    /* Home slot holds another survivor: give up on this one */
                    ForceDump = 1;
                }
            }

            if (!SymbolTable[i]) {
                continue; /* dealt with slot i. */
            }
        }

        if (SymbolTable[i]->PlacesUsed <= DumpThresh || ForceDump) {
            /* We are going to make a new index entry for the word.
             * There are two cases -- depending on whether the word
             * is already indexed or not.
             * In the former case we must merge the new information.
             * In the latter case we don't have to read the old info,
             * but we must make a new entry in the WID Index.
             */

            WordsInCache -= HashEl->PlacesUsed;

            if (HashEl->WID == 0) {
                NewEntry(HashEl, len);
            } else {
                UpdateEntry(HashEl, len);
            }

            /* Reclaim storage */
            if ((CallFree & DUMP_NOFREE) == 0) {
                extern void SlayWordInfo();

                efree(HashEl->Word);
                efree((char *) HashEl->Places);
                efree((char *) HashEl);
            }
            SymbolTable[i] = 0;
        }

        if (AsciiTrace > 1) {
            /* Progress meter: one character per sixteenth of the table. */
            if (i >= Progress * (HashSize / 16)) {
                fputc("1234567890abcdef?!"[Progress], stderr);
                fflush(stderr);
                ++Progress;
            }
        }

    }

    if (AsciiTrace > 1) {
        fprintf(stderr,
            "-- cache now %d/%d]\n",
            WordsInCache,
            MaxWordsInCache
        );
    }

    FirstTimeRound = 0;
    FlushCache(0); /* the number is non-zero if we only need to clear 1 slot */
}


void
DumpCache(CallFree)
    int CallFree;
{
    extern void FlushCache();
    register int i;
    register t_HashEl *HashEl;
    int Progress = 0;

    if (AsciiTrace > 1) {
	fputs("\n[cache dump ", stderr);
    }

    if (DumpThresh == -1) {
	CallFree |= DUMP_SYNC;
    } else if ((CallFree & DUMP_SYNC) == 0 || (CallFree & DUMP_FAST) != 0) {
	int SaveThresh = DumpThresh;
	do {
	    FastDump(CallFree);
	    DumpThresh *= 2;
	} while (!DUMP_CACHE_BELOW_LOW_WATER(WordsInCache, MaxWordsInCache));
	DumpThresh = SaveThresh;
	return;
    }

    for (i = 0; i != LastEl; i++) {
	if (SymbolTable[i]) {
	    unsigned len;

	    HashEl = SymbolTable[i];

	    /* We are going to make a new index entry for the word.
	     * There are two cases -- depending on whether the word
	     * is already indexed or not.
	     * In the former case we must merge the new information.
	     * In the latter case we don't have to read the old info,
	     * but we must make a new entry in the WID Index.
	     */

	    len = strlen(HashEl->Word);

	    if (HashEl->WID == (t_WID) 0) {
		NewEntry(HashEl, len);
	    } else {
		UpdateEntry(HashEl, len);
	    }

	    WordsInCache -= HashEl->PlacesUsed;

	    /* Reclaim storage */
	    if ((CallFree & DUMP_NOFREE) == 0) {
		extern void SlayWordInfo();

		efree(HashEl->Word);
		efree((char *) HashEl->Places);
		efree((char *) HashEl);
	    }
	    SymbolTable[i] = 0;
	}

	if (AsciiTrace > 1) {
	    if (i >= Progress * (HashSize / 16)) {
		fputc("1234567890ABCDEF?!"[Progress], stderr);
		++Progress;
	    }
	}

#if 0 /* TODO */
	if (WordsInCache <= 0) break;
#endif
    }

    WordsInCache = 0;
    FlushCache(0); /* the number is non-zero if we only need to clear 1 slot */

    if (AsciiTrace > 1) {
	fputs("]\n", stderr);
    }
    FirstTimeRound = 0;
}

static void
NewEntry(HashEl, Length)
    t_HashEl *HashEl;
    int Length;
{
    t_pblock *pblock = 0;
    register int i;
    t_WordInfo *WordInfo;
    /* TODO: add MightNeedToSort check */

    /** make a WIDIndex entry and mark it as invalid (NOTDONE) */

    /* In order to do this, we must make a "pblock", a structure that
     * reflects the physical database.  This is fairly low-level stuff
     * for efficiency's sake...
     */

    /* allocate a pblock structure.  These are rather devious things, a
     * structure with an array tacked onto the end.
     */
    pblock = (t_pblock *) emalloc(sizeof(t_pblock) +
			HashEl->PlacesUsed * sizeof(t_WordPlace));
    
    if (HashEl->WID == 0) {
	HashEl->WID = AllocateWID(HashEl->Word, Length, GetNextWID(0));
    }
 
    pblock->WID = HashEl->WID;
    pblock->ChainStart = 0L; /* address on disk -- not there yet, so 0! */
    pblock->NumberOfWordPlaces = HashEl->PlacesUsed;

    /* fill in the WordPlaces */
    for (i = 0; i < HashEl->PlacesUsed; i++) {
	pblock->WordPlaces[i] = HashEl->Places[i]; /* struct copy */
	/* TODO: call qcmp to see if we need a sort */
    }

    /* Now fill in enough of WordInfo to let us use the low-level routines: */
    WordInfo = MakeWordInfo(HashEl->WID, Length, HashEl->Word);
    WordInfo->Offset = 0L;

    WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* First, let's make an index entry: */

    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock) {
	(void) MkWIB(WordInfo, pblock);
    }

    /** write out the new entry */
    if (WordInfo->WordPlacesInHere == pblock->NumberOfWordPlaces) {
	/* In this case it all fits into the WID index */
	pblock->ChainStart = 0L;
    } else {
	(void) Putpblock(WordInfo, pblock);
    }

    if (PutWordInfoIntoIndex(WordInfo, pblock->ChainStart) < 0) {
	Error(E_SYS|E_FATAL,
	    "NewEntry: Couldn't insert \"%s\" in database at 0x%lx",
			    WordInfo->Word, pblock->ChainStart);
    }

    /** reclaim storage */
    if (pblock) {
	(void) efree((char *) pblock);
    }
    SlayWordInfo(WordInfo);
}

static void
UpdateEntry(HashEl, Length)
    t_HashEl *HashEl;
    int Length;
{
    extern t_pblock *Getpblock();
    extern t_WordInfo *WID2WordInfo();
    register int i;
    t_pblock *pblock;
    t_WordInfo *WordInfo;
    int MightNeedToSort = 0;

#ifdef ASCIITRACE
    if (AsciiTrace >= 4) {
	fprintf(stderr, "UpdateEntry(%s/WID %ld, wordlen %d)\n",
			HashEl->Word, HashEl->WID, Length);
    }
#endif

    /** get the old entry */

    if (!HashEl->WID || !(WordInfo = WID2WordInfo(HashEl->WID))) {
	Error(E_BUG, "Word %s WID %ld went away!", HashEl->Word, HashEl->WID);
	NewEntry(HashEl, Length);
	return;
    }

    /* It would be best if we could append to the old entry... which is what
     * I had in mind when I designed the disk storage stuff... but you can't.
     */
#ifdef ASCIITRACE
    if (AsciiTrace & 32) {
	fprintWordInfo(stderr, WordInfo, "UpdateEntry");
    }
#endif

    if (WordInfo->WordPlacesInHere == WordInfo->NumberOfWordPlaces) {
	pblock = (t_pblock *) 0;
    } else {
	pblock = Getpblock(WordInfo);
    }

    if (pblock) {
	pblock = (t_pblock *) erealloc((char *) pblock, sizeof(t_pblock) +
	     (pblock->NumberOfWordPlaces +
				    HashEl->PlacesUsed) * sizeof(t_WordPlace));

    } else {
	pblock = (t_pblock *) emalloc(sizeof(t_pblock) +
			(WordInfo->WordPlacesInHere + HashEl->PlacesUsed) *
			sizeof(t_WordPlace));
	pblock->NumberOfWordPlaces = 0;
        if (WordInfo->WordPlacesInHere < WordInfo->NumberOfWordPlaces) {
	    extern t_WordPlace *GetWordPlaces();

	    if (WordInfo->WordPlaceStart) {
		WordInfo->WordPlaces = GetWordPlaces(
		    WordInfo->WID,
		    WordInfo->WordPlaceStart,
		    WIDBLOCKSIZE - (WordInfo->WordPlaceStart - WordInfo->DataBlock),
		    0L,
		    WordInfo->NumberOfWordPlaces
		);
	    }
	}

	/* Assert: the wordplaces in WordInfo are sorted */
	for (i = 0; i < WordInfo->NumberOfWordPlaces; i++) {
	    pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
				WordInfo->WordPlaces[i]; /* structure copy */

	}
    }

    /* delete the old entry from disk */
    if (WordInfo->Offset) {
	/* Remove the old information from disk.
	 * This isn't as bad as it sounds, as it will be at the start
	 * of the freelist, so when we write it out again it will be
	 * in the buffer cache...  But it would still be faster to append.
	 */
	DeleteWordPlaces(WordInfo->Offset, WordInfo->WID);
    }

    pblock->WID = HashEl->WID;
    WordInfo->Offset = pblock->ChainStart = 0L; /* it's invalid now... */

    /* Merge the WordPlaces */

    /* Assert: we need only compare the last old entry and the
     * first new one to see if we might need a sort.  Note that
     * there must _be_ entries in pblock, as otherwise we'd have called
     * NewEntry() and not UpdateEntry().
     */

    if (pblock->WordPlaces[pblock->NumberOfWordPlaces - 1].FID >=
				HashEl->Places[0].FID) {
	MightNeedToSort = 1;
    }

    for (i = 0; i < HashEl->PlacesUsed; i++) {
	pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
				HashEl->Places[i]; /* copy the struct: */
	/* TODO: call qcmp to check for sorting (actually only need to
	 * check the FIDs of the new entries)
	 */
    }
    
    if (MightNeedToSort) {
	extern void SortWordPlaces();

	SortWordPlaces(pblock->NumberOfWordPlaces, pblock->WordPlaces);
    }

    WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* First, let's make an index entry: */
    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock) {
	(void) MkWIB(WordInfo, pblock);
    }

    /** write out the new entry */
    if (WordInfo->WordPlacesInHere == pblock->NumberOfWordPlaces) {
	/* In this case it all fits into the WID index */
	pblock->ChainStart = 0L;
    } else {
	(void) Putpblock(WordInfo, pblock);
    }
    if (PutWordInfoIntoIndex(WordInfo, pblock->ChainStart) < 0) {
	Error(E_FATAL|E_SYS,
	    "UpdateEntry: Couldn't update \"%s\" in database at 0x%lx",
			    WordInfo->Word, pblock->ChainStart
	);
    }

    /** reclaim storage */
    if (pblock) {
	(void) efree((char *)pblock);
    }
    (void) SlayWordInfo(WordInfo);

/* paranoia */
    WordInfo = WID2WordInfo(HashEl->WID);
    (void) SlayWordInfo(WordInfo);
}

