/***************************************************************************
                          findex.cpp  -  description
                             -------------------
    begin                : Mon Sep 24 2001
    copyright            : (C) 2001 by Yinglian Xie
    email                : ylxie@cs.cmu.edu
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/

// Findex.cpp: implementation of the CFindex class.
//
//////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include <dirent.h>
#include <sys/stat.h>
#include <assert.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <time.h>
#include <list>
#include <string>
#include <vector>
#include <algorithm>
#include "findex.h"
#include "user.h"
#include "utils.h"

//////////////////////////////////////////////////////////////////////
//  class  Icache implementation
//////////////////////////////////////////////////////////////////////
/*
 * Construct an empty cache: no bytes used, and the available budget set
 * to the current value of m_tsize.
 */
Icache::Icache()
{
	m_usedSize = 0;

	/* NOTE(review): m_tsize is not assigned anywhere in this constructor;
	 * presumably it is given a value elsewhere before the cache is used
	 * (CFindex::set_cache_size() sets it) -- confirm. */
	m_availableSize = m_tsize;
}

/* No explicit cleanup is performed when the cache is destroyed. */
Icache::~Icache() {}

void Icache::select_victim()
{
	assert(m_popularWords.size() > 0);

	/* select the last words in the list to kick out */
	CacheWord &w = m_popularWords.back();
	m_availableSize += w.size;
	m_usedSize -= w.size;
	m_popularWords.pop_back();
}

/*
 * Record a newly cached word at the head of m_popularWords.
 *
 * word: the word being cached.
 * size: payload size in bytes; the bookkeeping overhead of the cache
 *       entry itself (sizeof(CacheWord) plus two ints) is added on top.
 *
 * The resulting entry size is charged against m_availableSize and added
 * to m_usedSize.  No check is made here that enough space is available.
 */
void Icache::insert(string& word, int size)
{
	CacheWord entry;
	entry.numRefer = 1;
	entry.lrtime = time(0);
	entry.word = word;
	entry.size = size + sizeof(entry) + 2 * sizeof(int);

	m_popularWords.push_front(entry);
	m_usedSize += entry.size;
	m_availableSize -= entry.size;
}

void Icache::lookup_update(const char* word)
{
	cwordList::iterator i;
	for (i = m_popularWords.begin(); i != m_popularWords.end(); i ++){
		if (word == i->word){
			i->lrtime = time(0);
			i->numRefer ++;
		}
	}
}

/*
 * Drop every cached entry and reset the accounting so that the whole
 * configured size (m_tsize) is available again.
 */
void Icache::clear()
{
	m_popularWords.clear();

	m_usedSize = 0;
	m_availableSize = m_tsize;
}

//////////////////////////////////////////////////////////////////////
//  class  CFindex implementation
//////////////////////////////////////////////////////////////////////

/*
 * Construct an indexer with empty counters and the default flush
 * threshold / memory-check frequency.
 *
 * m_nword (words indexed since the last memory check) is zeroed here as
 * well: index_file() increments and compares it, and nothing else in
 * this file gives it a value before the first indexing run.
 */
CFindex::CFindex(){
	m_num_word = 0;
	m_nword = 0;
	m_indexUsedSize = 0;
	m_wordUsedSize = 0;
	m_fstrUsedSize = 0;
	m_hitUsedSize = 0;
	m_threshhold = DEFAULT_THRESH;
	m_memcheckFrequency = DEFAULT_CHECKFRE;
}

/* No explicit cleanup is performed when the indexer is destroyed. */
CFindex::~CFindex() {}

////////////////////////////////////
// public functions
//////////////////////////////////

/////////////////////////////////////////////////////////////////
//  functions to set up parameters based on configuration options

/*
 * Configure the cache capacity.
 *
 * size: capacity multiplied by 1024 before being stored (i.e. the
 *       argument is in units of 1024 bytes).
 *
 * Both the total size and the available size of the cache are set to
 * the new capacity; the used-size counter is not touched.
 */
void CFindex::set_cache_size(int size)
{
	const int bytes = size * 1024;

	m_cache.m_availableSize = bytes;
	m_cache.m_tsize = bytes;
}

/*
 * Set the maximum length of a word that will be indexed.
 * A non-positive value is a fatal configuration error: a message is
 * printed and the process exits.
 */
void CFindex::set_max_wordlen(int len)
{
	if (len > 0){
		m_maxWordLen = (uint)len;
		return;
	}

	printf("Maximum word length must be positive! exit now...\n");
	exit(0);
}

/*
 * Set the minimum length of a word that will be indexed.
 * Negative values are clamped to 0.
 */
void CFindex::set_min_wordlen(int len)
{
	m_minWordLen = (len < 0) ? 0 : (uint)len;
}

/*
 * Set the characters that may appear inside a word.
 * m_allowDigits is set to 1 when the set contains at least one decimal
 * digit and to 0 otherwise.
 */
void CFindex::set_wordchar(string& wordchar)
{
	m_wordchar.erase();
	m_wordchar = wordchar;

	const bool hasDigit =
		wordchar.find_first_of("0123456789") != string::npos;
	m_allowDigits = hasDigit ? 1 : 0;
}

/*
 * Set the word delimiter characters.  Space, tab, newline and carriage
 * return are always appended to the caller-supplied set.
 */
void CFindex::set_delimitchar(string& delimitchar)
{
	string delimiters = delimitchar;
	delimiters += " \t\n\r";

	m_delimitchar = delimiters;
}

/*
 * Set the memory budget of the in-memory index.
 *
 * size: budget multiplied by 1024*1024 before being stored (i.e. the
 *       argument is in units of 1024*1024 bytes).
 */
void CFindex::set_index_memory(int size)
{
	const int bytesPerUnit = 1024 * 1024;

	m_indexSize = size * bytesPerUnit;
}

/*
 * Parse a list of double-quoted stop words, e.g.  "the" "and" "of",
 * and append each quoted item to m_stopwords.  Text outside the quotes
 * is ignored; parsing stops at the first unmatched opening quote.
 */
void CFindex::set_stopwords(string& stwlist)
{
	string::size_type cursor = 0;

	while (cursor < stwlist.length()){
		/* opening quote */
		const string::size_type quoteOpen = stwlist.find_first_of("\"", cursor);
		if (quoteOpen > stwlist.length())
			break;

		/* closing quote; none means the item is unterminated */
		const string::size_type quoteClose = stwlist.find_first_of("\"", quoteOpen + 1);
		if (quoteClose >= stwlist.length())
			break;

		m_stopwords.push_back(
			stwlist.substr(quoteOpen + 1, quoteClose - quoteOpen - 1));
		cursor = quoteClose + 1;
	}
}

/*
 * Set up the list of files (or file extensions) that need not be
 * indexed, e.g. "/usr0/ylxie/work/mingled/oldindex/lexicon" "*.tar.gz".
 *
 * The argument is a list of double-quoted items.  A leading '*' is
 * stripped from an item before it is appended to m_exceptlist.  Parsing
 * stops at the first unmatched opening quote.
 */
void CFindex::set_exception(string& explist)
{
	string::size_type cursor = 0;

	while (cursor < explist.length()){
		/* opening quote */
		const string::size_type quoteOpen = explist.find_first_of("\"", cursor);
		if (quoteOpen > explist.length())
			break;

		/* closing quote; none means the item is unterminated */
		const string::size_type quoteClose = explist.find_first_of("\"", quoteOpen + 1);
		if (quoteClose >= explist.length())
			break;

		string item = explist.substr(quoteOpen + 1, quoteClose - quoteOpen - 1);
		if (item[0] == '*')
			item.erase(0, 1);	/* "*.ext" is stored as ".ext" */
		m_exceptlist.push_back(item);

		cursor = quoteClose + 1;
	}
}


/////////////////////////////////////////////////////////////////
//  interfaces for module request manager and query processor
//  provide indexing and query lookup results

int CFindex::start_index(const char* name, Identity& user)
{
	string namestr = name;
	int returncode;
	
	/* get file or dir information by "lstat" system call */
	struct stat fst;
	if (lstat(namestr.c_str(), &fst) == -1){
	    mingle_debug1("\nFindex::start_index(): stat error on file ");
	    mingle_debug2(namestr.c_str(), errno);
	    return -1;
	}

	/* if directory, recursively index the dir */
	if (fst.st_mode & 0x4000){
	    returncode = index_dir(namestr.c_str());
	    flush_index();
	    return returncode;
	}

	/* we have a regular file */
	returncode = index_file(namestr.c_str(), fst);
	flush_index();
	return returncode;
}

/*
 * Look up `word` in the index and append to `hl` every hit whose file
 * the given user is permitted to search.
 *
 * If the word's hit list is not in memory it is fetched from disk and
 * the word is registered in the cache, evicting cache entries as needed.
 * When the word is unknown or its hit list cannot be read, `hl` is left
 * untouched.
 */
void CFindex::lookup(string& word, Identity& user, THitList &hl)
{
	int id = hash(word.c_str());

	/* defensive: never index the hash table with an out-of-range bucket */
	if ((id < 0) || (id >= HASH_TABLE_SIZE))
		return;

	TWordList::iterator witer = find_if(m_words[id].begin(),
					    m_words[id].end(),
					    Iword_eq(word.c_str()));
	if (witer == m_words[id].end())
		return;

	if (witer->status != INDISK){
		/* the hit list is already in memory: just refresh cache stats */
		m_cache.lookup_update(word.c_str());
	}else{
		/* need to fetch the word hitlist from the disk */
		if (read_index_item(witer->word.c_str()) != 0)
			return;

		int size = sizeof_word(*witer) + sizeof_hitlist(witer->hitList);

		/*
		 * Make room.  The cache is empty exactly when its available
		 * size equals its total size; stop evicting at that point,
		 * otherwise an item larger than the whole cache would make
		 * select_victim() run on an empty list.
		 */
		while ((m_cache.m_availableSize < size) &&
		       (m_cache.m_availableSize < m_cache.m_tsize))
			m_cache.select_victim();

		m_cache.insert(witer->word, size);
	}

	/* need to check permission for search here */
	THitList::iterator hiter;
	for (hiter = witer->hitList.begin(); hiter != witer->hitList.end(); hiter ++){
		/* skip hits whose file number does not exist in the file table
		 * (a negative number converts to a huge unsigned value) */
		if (static_cast<unsigned long>(hiter->fNo) >= m_files.size())
			continue;

		if (m_usergroup->search_file_permitted(user, m_files[hiter->fNo].fname))
			hl.push_back(*hiter);
	}
}


//////////////////////////////////
// private functions
//////////////////////////////////

/////////////////////////////////////////////////////////////////
//  functions to set up index table from disk

/*
 * Set the directory that holds the on-disk index and derive the paths
 * of the four index files from it.
 *
 * If the directory already exists the index is loaded from it via
 * setup_index().  If it does not exist it is created.  Any other
 * outcome (path exists but fails the directory test, stat() fails for a
 * reason other than ENOENT, mkdir() fails) prints a message and exits.
 */
void CFindex::set_indexdir(string& dir)
{
	get_absolute_fname(dir, m_indexdir);

	const string prefix = m_indexdir + "/";
	m_foutput = prefix + FILE_FNAME;
	m_doutput = prefix + DIR_FNAME;
	m_indexHit = prefix + HIT_FNAME;
	m_indexWord = prefix + WORD_FNAME;

	struct stat info;
	if (stat(m_indexdir.c_str(), &info) != 0){
		/* we need to create a new dir, but only if it is really missing */
		const bool missing = (errno == ENOENT);
		if (!missing ||
		    (mkdir(m_indexdir.c_str(), S_IRWXU | S_IRGRP | S_IROTH) == -1)){
			printf("Index dir set up error! (%s)  exit now...\n", strerror(errno));
			exit(0);
		}
		return;
	}

	/* the path already exists: it has to be a directory */
	if (!(info.st_mode & 0x4000)){
		printf("Index dir setup error! exit now...\n");
		exit(0);
	}
	setup_index();
}

/*
 * Load the persistent index state from the index directory:
 *   1. the list of indexed directories (one name per line),
 *   2. the list of indexed files       (one name per line),
 *   3. the lexicon, via setup_lexicon().
 *
 * A missing directory-list file means there is no index yet and the
 * function returns quietly.  A missing file-list file while the
 * directory list exists is treated as fatal.
 */
void CFindex::setup_index()
{
	FILE *ifp;
	char buf[MAXSTRSIZE];

	/* set up indexed dir list */
	if ((ifp = fopen(m_doutput.c_str(), "r")) == 0){
		// OK, we have the directory, but nothing in, which is OK
		mingle_debug1("\nCFindex::setup_index(): open file error ");
		mingle_debug1(m_doutput.c_str());
		mingle_debug1(" Will create a new one.");
		return;
	}
	while (fgets(buf, MAXSTRSIZE, ifp)){
		/* cut the line at its newline, if any; strcspn never looks past
		 * the terminating NUL written by fgets */
		buf[strcspn(buf, "\n")] = '\0';

		Idir newdir;
		newdir.dname = buf;
		m_dirs.push_back(newdir);
	}
	fclose(ifp);

	/* set up indexed file list */
	if ((ifp = fopen(m_foutput.c_str(), "r")) == 0){
		// we can open dir file but not file-file, this is strange, so abort
		printf("Set up index error! --couldn't open file %s. exit now...\n",
		       m_foutput.c_str());
		exit (0);
	}
	while (fgets(buf, MAXSTRSIZE, ifp)){
		buf[strcspn(buf, "\n")] = '\0';

		Ifile newfile;
		newfile.fname = buf;
		m_files.push_back(newfile);
	}
	fclose(ifp);

	/* read in all the words (lexicon) */
	setup_lexicon();
}

void CFindex::setup_lexicon()
{
	FILE *ifp;

	// open lexicon file 
	if (!(ifp = fopen(m_indexWord.c_str(), "r"))){
		// we can open dir file but not word-file, this is strange, so abort
		printf("Set up index error! --couldn't open file %s. exit now...\n", 
		       m_indexWord.c_str());
		exit (0);
	}
	    
	char buf[MAXSTRSIZE], buf1[MAXSTRSIZE]; 
	uint dindex, totalhits;

	// read in word record
	while (fgets(buf, MAXSTRSIZE, ifp)){
		bzero(buf1, MAXSTRSIZE);
		sscanf(buf, "%s %u %u\n", buf1, &dindex, &totalhits);

		// look the word up in the hash table
		int id = hash(buf1);
		TWordList::iterator witer = find_if(m_words[id].begin(), m_words[id].end(), Iword_eq(buf));
		if (witer != m_words[id].end()){
			printf("Set up index error! -- inconsistent index status in file %s. exit now...\n", m_indexWord.c_str());
			fclose(ifp);
			exit(0);
		}
		
		Iword w;
		w.word = buf1;
		w.dindex = dindex;
		w.totalhits = totalhits;
		w.status = INDISK;
		m_words[id].push_back(w);
	}

	// close the lexicon file
	fclose(ifp);
}

int CFindex::read_index_item(const char* word)
{
	/* find the word record */
	int id = hash(word);
	TWordList::iterator witer = find_if(m_words[id].begin(), m_words[id].end(), Iword_eq(word));
	if (witer == m_words[id].end())
		return -1;
	
	/* the hitlist has not to be in memory */
	assert( (witer->hitList.size() <= 0) && (witer->status == INDISK));
	
	/* open the hitlist file */
	FILE *ifp = fopen(m_indexHit.c_str(), "r");
	if (!ifp){
		mingle_debug1("\nCFindex::read_index_item(): open file error ");
		mingle_debug1(m_indexHit.c_str());
	}

	/* fetch the hitlist from the disk */	
	int numHits, nextP = witer->dindex;
	while (nextP != 0){
		
		// seek to next pointer
		fseek(ifp, nextP, SEEK_CUR);
		
		if (fread(&numHits, sizeof(int), 1, ifp) != 1)
			break;
		
		for (int i = 0; i < numHits; i ++){
			Ihit h;
			if (fread(&h, sizeof(Ihit), 1, ifp) != 1)
				break;
			witer->hitList.push_back(h);
		}
		
		if (fread(&nextP, sizeof(int), 1, ifp) != 1)
			break;
	}
	fclose(ifp);
	
	if (nextP != 0){
		mingle_debug1("\nCFindex::read_index_item(): read file error ");
		mingle_debug1(m_indexHit.c_str());
		return -1;
	}

	/* update word status */
	witer->status = UNCHANGED;
	return 0;
}


/////////////////////////////////////////////////////////////////
//  functions to do actual indexing

int CFindex::insert_file(const char* fname)
{
	Ifile newfile;
	newfile.fname = fname;
	
	m_files.push_back(newfile);
	m_newfiles.push_back(newfile);

	/* update memory usage */
	int size = strlen(fname);
	m_fstrUsedSize += size + sizeof(int);
	m_indexUsedSize += size * 2 + 2 * sizeof(int);

	return m_files.size() - 1;	        	
}

void CFindex::insert_dir(const char* dir)
{
	/* insert dir into index-dir-list */
	Idir newdir;
	newdir.dname = dir;

	m_dirs.push_front(newdir);
	m_newdirs.push_front(newdir);

	/* update memory usage */
	int size = strlen(dir);
	m_fstrUsedSize += size + 2 * sizeof(int);
	m_indexUsedSize += size * 2 + 4 * sizeof(int) ;

}

void CFindex::update_index_word(const char* word, 
				int no, uchar position, long offset)
{
	/* use hash to map to hashtable */
	int id = hash(word);
	TWordList::iterator witer;
	witer = find_if(m_words[id].begin(), m_words[id].end(), Iword_eq(word));

	/* we have a new word */
	if (witer == m_words[id].end()){

		Iword w;
		w.word = word;
		w.status = CREATED;
		w.totalhits = 0;
		w.dindex = 0;
		m_words[id].push_back(w);
		witer = m_words[id].end();
		witer --;
		m_num_word ++;

		/* update memory usage info */
		int size = sizeof_word(w);
		m_wordUsedSize += size;
		m_indexUsedSize += size;
	}

	/* update word status */
	if ((witer->status == UNCHANGED) || (witer->status == INDISK))
	    witer->status = UPDATED;
		
	/* update hit list */
	THitList::iterator hiter = find_if(witer->hitList.begin(),
					   witer->hitList.end(),
					   Ihit_eq(no));
	Ihit h;
	if (hiter == witer->hitList.end()){
		h.fNo = no;
		h.first_location = offset;
		witer->hitList.push_back(h);
		witer->totalhits ++;

		/* update index memory usage */
		int size = sizeof(h) + sizeof(int);
		m_hitUsedSize += size;
		m_indexUsedSize += size;
	}else{
	    if (hiter->first_location == -1)
		hiter->first_location = offset;
	}
}

void CFindex::index_file_name(const char* fname, int no)
{
	string fnstr = fname;
	string word;
	string::size_type pos_s, pos_e, pos_1, pos_2;

	pos_e = fnstr.find_first_of("/");
	while (pos_e < fnstr.length()){
		pos_s = pos_e;
		pos_e = fnstr.find_first_of("/", pos_s+1);
		word = fnstr.substr(pos_s+1, pos_e-pos_s-1);
		
		/* remove invalid word from the file name term */
		pos_1 = pos_2 = 0;
		while (pos_2 < word.length()){
			pos_1 = word.find_first_of(m_wordchar, pos_2);
			if (pos_1 > word.length())
				break;
			pos_2 = word.find_first_not_of(m_wordchar, pos_1);
			string subword = word.substr(pos_1, pos_2-pos_1);
			for (unsigned int i = 0; i < subword.length(); i ++)
				subword[i] = tolower(subword[i]);

			if ((subword.length() < m_minWordLen) || 
			    (subword.length() > m_maxWordLen) || 
			    (isstopword(subword.c_str())) || 
			    (is_digits(subword.c_str()))){
				subword.erase();
				continue;
			}

			update_index_word(subword.c_str(), no, TITLE, -1);
			subword.erase();
		}
		
		word.erase();
	}
	fnstr.erase();
}

/*
 * index_file: main function to index a file
 *
 * fname: path of the file.
 * fst:   result of lstat() on that path.
 *
 * The file is registered in the file list and the words of its path
 * name are indexed.  The content is indexed as well unless the file
 *   - is one of our own index files or is already in the file list
 *     (nothing is done at all in these two cases),
 *   - is not a regular file (symlink, FIFO, device, ...),
 *   - has any execute permission bit set,
 *   - matches the exception list, or
 *   - contains a non-ASCII byte within its first HOWMANY bytes.
 *
 * Every m_memcheckFrequency indexed words the memory usage is checked
 * and the index is flushed to disk when it gets within m_threshhold of
 * the configured budget.
 *
 * Returns 0 on success or when the content is skipped, -1 when the file
 * cannot be registered, opened or read.
 */
int CFindex::index_file(const char* fname, struct stat &fst)
{
	mingle_debug1("\nindexing file ");
	mingle_debug1(fname);

	/* if the file is our index disk file, return */
	if ((m_doutput == fname) || (m_foutput == fname) ||
	    (m_indexWord == fname) || (m_indexHit == fname))
		return 0;

	/* if the file already exists, return */
	TIndexFileList::iterator fiter;
	fiter = find_if(m_files.begin(), m_files.end(), Ifile_eq(fname));
	if (fiter != m_files.end())
		return 0;

	/* insert file into index-file-list */
	int no = insert_file(fname);
	if (no < 0) return -1;

	/* first index file names */
	index_file_name(fname, no);

	/*
	 * Only regular, non-executable files get their content indexed.
	 * Opening a FIFO for reading would block until a writer shows up,
	 * and device nodes have no indexable text, so anything that is not
	 * a regular file (this includes symbolic links) stops here.
	 * 0x49 is the three execute permission bits (octal 0111).
	 */
	if ((!S_ISREG(fst.st_mode)) || (fst.st_mode & 0x49)){
		return 0;
	}

	/* decide if to index content based on exception list */
	if (in_exception_list(fname))
		return 0;

	/* if not ascii, skip the content */
	int fd = open(fname, O_RDONLY);
	if (fd < 0){
		mingle_debug1("\nCFindex::index_file(): cannot open file ");
		mingle_debug2(fname, errno);
		return -1;
	}

	int nbytes;
	unsigned char buf[MAXSTRSIZE];

	/* never ask read() for more than the probe buffer can hold */
	size_t probeLen = HOWMANY;
	if (probeLen > sizeof(buf))
		probeLen = sizeof(buf);

	if ((nbytes = read(fd, (char*)buf, probeLen)) == -1){
		mingle_debug1("\nCFindex::index_file(): read file error ");
		mingle_debug2(fname, errno);
		close(fd);
		return -1;
	}
	close(fd);

	int i;
	for (i = 0; i < nbytes; i ++){
		// not ascii
		if (!is_ascii_char(buf[i]))
			break;
	}
	if (i < nbytes){
		return 0;
	}

	/* we have a ascii file, index file content */
	char nbuf[MAXSTRSIZE];

	FILE *fp = fopen(fname, "r");
	if (!fp){
		mingle_debug1("\nCFindex::index_file(): cannot open file ");
		mingle_debug1(fname);
		return -1;
	}

	int numWord = 0;
	while (!feof(fp)){

		uint wlen;
		/* get next word */
		if ((wlen = get_next_word(nbuf, MAXSTRSIZE, fp)) == 0){
			// file eof
			break;
		}

		if ((wlen < m_minWordLen)|| (wlen > m_maxWordLen) ||
		    (isstopword(nbuf)) || (is_digits(nbuf)))
			continue;

		/* the recorded location is the stream position after the word */
		update_index_word(nbuf, no, CONTENT, ftell(fp));
		numWord ++;
		m_nword ++;

		/* periodic memory check: flush when close to the budget */
		if (m_nword >= m_memcheckFrequency){
			if (m_indexUsedSize > (m_indexSize - m_threshhold)){
				flush_index();
			}
			m_nword = 0;
		}
	}
	fclose(fp);

	return 0;
}

/*
 * index_dir: main function to index a directory
 *
 * Recursively indexes every entry below `dir`.  Nothing is done when
 * `dir` lies below a directory that is already in the indexed-directory
 * list.  Sub-directories matching the exception list are skipped;
 * entries that cannot be lstat()ed are logged and skipped.  When the
 * walk is finished `dir` itself is added to the directory list.
 *
 * Returns 0, or -1 when the directory cannot be opened.
 */
int CFindex::index_dir(const char* dir)
{
	mingle_debug1("\n\nindexing dir: ");
	mingle_debug1(dir);

	/* check if the dir is already covered by an indexed parent dir */
	string dirstr = dir;
	TIndexDirList::iterator siter;
	for (siter = m_dirs.begin(); siter != m_dirs.end(); siter ++){
		string::size_type p;
		string  curdir = siter->dname;
		if (((p = dirstr.find(curdir)) == 0) && (dirstr[p+curdir.length()] == '/')){
			return 0;
		}
	}

	/* open dir and read it */
	DIR *fdir = opendir(dirstr.c_str());
	if (!fdir){
		mingle_debug1("\nCFindex::index_dir(): cannot open dir - ");
		mingle_debug2(dirstr.c_str(), errno);
		return -1;
	}

	struct dirent *dent;
	while ((dent = readdir(fdir))){
		if ((strcmp(dent->d_name, ".") == 0)||
			(strcmp(dent->d_name, "..") == 0))
			continue;

		string stritem = dirstr;
		stritem += "/";
		stritem += dent->d_name;

		struct stat fst;
		if (lstat(stritem.c_str(), &fst) == -1){
			mingle_debug1("\nFindex::index_dir(): lstat error on file ");
			mingle_debug2(stritem.c_str(), errno);
			continue;
		}

		/* if directory, recursively index the dir */
		/* NOTE(review): the 0x4000 bit is also set in the type bits of
		 * block devices and sockets; those end up in opendir() below,
		 * which fails, so they are skipped. */
		if (fst.st_mode & 0x4000){
			/*
			 * An excluded sub-directory is skipped on its own.
			 * Returning from here would leak the open DIR handle
			 * and drop whichever siblings readdir() had not yet
			 * delivered.
			 */
			if (!in_exception_list(stritem.c_str()))
				index_dir(stritem.c_str());
			continue;
		}

		/* regular file, index it */
		index_file(stritem.c_str(), fst);
	}
	closedir(fdir);

	// insert dir into dir_list
	insert_dir(dir);

	return 0;
}

/*
 * update_index_table(): rebuild an inverted index table 
 *                       called regularly based on a timer set by set_index_interval()
 */
void CFindex::update_index_table()
{
	TIndexDirList   odir = m_dirs;
	TIndexFileList  ofile = m_files;

	/* flush cache content */
	m_cache.clear();

	/* flush old index table */
	m_dirs.clear();
	m_files.clear();
	m_nrfiles.clear();
	m_newdirs.clear();
	m_newfiles.clear();
	m_num_word = 0;
	m_nword = 0;
	m_hitUsedSize = m_wordUsedSize = m_indexUsedSize = 0;

	char buf[MAXSTRSIZE];
	sprintf(buf, "rm -f %s %s %s %s", 
		m_doutput.c_str(), m_foutput.c_str(), 
		m_indexHit.c_str(), m_indexWord.c_str());
	system(buf);

	for (int i = 0; i < HASH_TABLE_SIZE; i ++){
		if (m_words[i].size() > 0){
			m_words[i].clear();
		}
	}

	/* build a new one now */
	/* index all the directories */
	TIndexDirList::iterator i;
	for (i = odir.begin(); i != odir.end(); i ++){
		index_dir(i->dname.c_str());
	}

	/* index all the individual files */
	TIndexFileList::iterator j;
	for (j = ofile.begin(); j != ofile.end(); j ++){
		TIndexFileList::iterator k;
		k = find_if(m_files.begin(), m_files.end(), Ifile_eq(j->fname.c_str()));
		if (k == m_files.end()){
			struct stat fst;
			if (lstat(j->fname.c_str(), &fst) == -1){
				mingle_debug1("\nFindex::index_dir(): lstat error on file ");
				mingle_debug2(j->fname.c_str(), errno);
				continue;
			}else{
				index_file(j->fname.c_str(), fst);
			}
		}
	}

	/* flush index to the disk */
	flush_index();
	merge_index();
}


/////////////////////////////////////////////////////////////////
//  functions to flush index table to the disk

/*
 * Append the in-memory hit list of one word to the hit-list file as a
 * new record and drop the list from memory.
 *
 * Record layout:  int numHits; Ihit hits[numHits]; int link;
 * `link` is 0 when the word had no earlier record (dindex was 0);
 * otherwise it is the earlier record's offset minus the file position
 * just past the link field.
 *
 * The word's dindex is set to the position of the new record and its
 * status becomes INDISK.  Write errors are not checked.
 */
void CFindex::flush_word(TWordList::iterator witer, FILE *ifp)
{
	/* remember the previous record, then point the word at the new one */
	const int prevRecord = witer->dindex;
	witer->dindex = ftell(ifp);

	/* hit count followed by the hits themselves */
	int hitCount = witer->hitList.size();
	fwrite(&hitCount, sizeof(int), 1, ifp);

	for (THitList::iterator it = witer->hitList.begin();
	     it != witer->hitList.end();
	     ++it){
		Ihit record = *it;
		fwrite(&record, sizeof(record), 1, ifp);
	}

	/* link to the previous record, using relative value */
	int link = 0;
	if (prevRecord != 0)
		link = prevRecord - (ftell(ifp) + sizeof(link));
	fwrite(&link, sizeof(link), 1, ifp);

	/* the hits now live on disk only */
	witer->status = INDISK;
	witer->hitList.clear();
}

/*
 * Write the in-memory index state to disk:
 *   1. newly indexed directories are appended to the directory list file,
 *   2. newly indexed files are appended to the file list file,
 *   3. the hit list of every CREATED/UPDATED word is appended to the
 *      hit-list file (UNCHANGED words just drop their in-memory list),
 *   4. the lexicon file is rewritten from scratch.
 * Afterwards every word has status INDISK and the memory accounting no
 * longer includes hit lists.
 *
 * If one of the files cannot be opened the function logs the failure
 * and returns at that point; the later steps are not performed.
 */
void CFindex::flush_index()
{
	FILE *ifp;

	/* flush newly indexed dirs */
	if ((ifp = fopen(m_doutput.c_str(), "a")) == 0){
		mingle_debug1("\nCFindex::flush_index(): file open error ");
		mingle_debug1(m_doutput.c_str());
		return;
	}
	TIndexDirList::iterator siter;
	for (siter = m_newdirs.begin(); siter != m_newdirs.end(); siter ++){
		fprintf(ifp, "%s\n", siter->dname.c_str());
	}
	fclose(ifp);
	m_newdirs.clear();

	/* flush newly indexed file list */
	if ((ifp = fopen(m_foutput.c_str(), "a")) == 0){
		mingle_debug1("\nCFindex::flush_index(): file open error ");
		mingle_debug1(m_foutput.c_str());
		return;
	}
	for (unsigned int i = 0; i < m_newfiles.size(); i ++){
		fprintf(ifp, "%s\n", m_newfiles[i].fname.c_str());
	}
	fclose(ifp);
	m_newfiles.clear();

	/* do not read index file in, use append mode */
	// open file
	if ((ifp = fopen(m_indexHit.c_str(), "a")) == 0){
		mingle_debug1("\nCFindex::flush_index(): fopen error ");
		mingle_debug1(m_indexHit.c_str());
		return;
	}

	/*
	 * For a stream opened in append mode it is implementation-defined
	 * whether the initial position is the start or the end of the file,
	 * so move to the end explicitly before asking ftell() whether the
	 * file is still empty.
	 */
	fseek(ifp, 0, SEEK_END);

	// adjust pointer not to be 0: offset 0 means "no record" for a word,
	// so an empty file gets a header line first
	if (ftell(ifp) == 0){
		fprintf(ifp, "Disk file for hitlist:\n");
	}

	// start flushing
	for (int i = 0; i < HASH_TABLE_SIZE; i ++){
		if (m_words[i].size() <= 0){
			continue;
		}

		TWordList::iterator witer;
		for (witer = m_words[i].begin(); witer != m_words[i].end(); witer ++){
			/* word status: INDISK */
			if (witer->status == INDISK)
				continue;

			/* word status: UNCHANGED */
			if (witer->status == UNCHANGED){
				witer->status = INDISK;
				witer->hitList.clear();
				continue;
			}

			/* word status: CREATED or UPDATED */
			flush_word(witer, ifp);
		}
	}
	fclose(ifp);

	/* flush lexicon */
	if ((ifp = fopen(m_indexWord.c_str(), "w")) == 0){
		mingle_debug1("\nCFindex::flush_index(): fopen error ");
		mingle_debug1(m_indexWord.c_str());
		return;
	}

	for (int i = 0; i < HASH_TABLE_SIZE; i ++){
		if (m_words[i].size() <= 0){
			continue;
		}

		TWordList::iterator witer;
		for (witer = m_words[i].begin(); witer != m_words[i].end(); witer ++){
			fprintf(ifp, "%s %u %u\n",
				witer->word.c_str(), witer->dindex, witer->totalhits);
		}
	}

	fclose(ifp);

	// update memory usage: hit lists are no longer held in memory
	m_indexUsedSize = m_wordUsedSize + m_fstrUsedSize;
	m_hitUsedSize = 0;
	return;
}

void CFindex::merge_index()
{
	string tempfile = m_indexdir + "/" + TEMP_FNAME;

	FILE *ifp = fopen(tempfile.c_str(), "w");

	// adjust pointer not to be 0
	if (ftell(ifp) == 0){
		fprintf(ifp, "Disk file for hitlist:\n");
	}

	// start merging word by word
	for (int i = 0; i < HASH_TABLE_SIZE; i ++){
		if (m_words[i].size() <= 0){
			continue;
		}
		
		TWordList::iterator witer;
		for (witer = m_words[i].begin(); witer != m_words[i].end(); witer ++){
			assert(witer->status != UPDATED);

			/* word status: INDISK */
			if (witer->status == INDISK){
				if (read_index_item(witer->word.c_str()) < 0)
					continue;
			}
			
			/* flush the index to the temp file */
			witer->dindex = 0;
			flush_word(witer, ifp);
		}
	}
	fclose(ifp);

	// mv the temp file to the disk index file
	char buf[MAXSTRSIZE];
	sprintf(buf, "mv -f %s %s", 
		tempfile.c_str(), m_indexHit.c_str());
	system(buf);
}


/////////////////////////////////////////////////////////////////
//  help functions 

/*
 * Map a word to a bucket of the word hash table.
 *
 * Multiplicative string hash (factor 31), reduced modulo
 * HASH_TABLE_SIZE along the way.  Each character is taken as an
 * unsigned value: with a plain `char` a byte >= 0x80 can be negative,
 * which would drive the running value -- and the returned bucket
 * number -- below zero.
 *
 * Returns a value in [0, HASH_TABLE_SIZE); 0 for a null or empty word.
 */
int CFindex::hash(const char* word)
{
	int hashval = 0;

	if (word == 0)
		return 0;

	for (; *word != '\0'; word++){
		hashval = static_cast<unsigned char>(*word) + 31 * hashval;
		if (hashval > HASH_TABLE_SIZE)
		  hashval = hashval % HASH_TABLE_SIZE;
	}
	return hashval % HASH_TABLE_SIZE;
}

/*
 * Built-in table of English stop words, in alphabetical order.
 * The table is terminated by a null pointer (0) so it can be walked
 * without knowing its length.  All entries are lower case.
 * Note: "we" is listed twice.
 */
const char *CFindex::m_defaultstopwords[] = {
"a", "above", "according", "across", "actually", "adj", "after",
"afterwards", "again", "against", "all", "almost", "alone", "along",
"already", "also", "although", "always", "among", "amongst", "an", "and",
"another", "any", "anyhow", "anyone", "anything", "anywhere", "are", "aren",
"aren't", "around", "as", "at", "be", "became", "because", "become", "becomes",
"becoming", "been", "before", "beforehand", "begin", "beginning", "behind",
"being", "below", "beside", "besides", "between", "beyond", "billion", "both",
"but", "by", "can", "can't", "cannot", "caption", "co", "could", "couldn",
"couldn't", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", "don",
"don't", "down", "during", "each", "eg", "eight", "eighty", "either", "else",
"elsewhere", "end", "ending", "enough", "etc", "even", "ever", "every",
"everyone", "everything", "everywhere", "except", "few", "fifty", "first",
"five", "for", "former", "formerly", "forty", "found", "four", "from",
"further", "had", "has", "hasn", "hasn't", "have", "haven", "haven't",
"he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon",
"hers", "herself", "him", "himself", "his", "how", "however", "hundred",
"ie", "i.e.", "if", "in", "inc", "inc.", "indeed", "instead", "into", "is",
"isn", "isn't", "it", "its", "itself", "last", "later", "latter", "latterly",
"least", "less", "let", "like", "likely", "ll", "ltd", "made", "make",
"makes", "many", "maybe", "me", "meantime", "meanwhile", "might", "million",
"miss", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "must",
"my", "myself", "namely", "neither", "never", "nevertheless", "next", "nine",
"ninety", "no", "nobody", "none", "nonetheless", "noone", "nor", "not",
"nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one",
"only", "onto", "or", "others", "otherwise", "our", "ours",
"ourselves", "out", "over", "overall", "own", "per", "perhaps", "rather",
"re", "recent", "recently", "same", "seem", "seemed", "seeming", "seems",
"seven", "seventy", "several", "she", "should", "shouldn", "shouldn't",
"since", "six", "sixty", "so", "some", "somehow", "someone", "something",
"sometime", "sometimes", "somewhere", "still", "stop", "such", "taking",
"ten", "than", "that", "the", "their", "them", "themselves", "then",
"thence", "there", "thereafter", "thereby", "therefore", "therein",
"thereupon", "these", "they", "thirty", "this", "those", "though",
"thousand", "three", "through", "throughout", "thru", "thus", "to",
"together", "too", "toward", "towards", "trillion", "twenty", "two", "under",
"unless", "unlike", "unlikely", "until", "up", "upon", "us", "used", "using",
"ve", "very", "via", "was", "wasn", "we", "we", "well", "were", "weren",
"weren't", "what", "whatever", "when", "whence", "whenever", "where",
"whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
"whether", "which", "while", "whither", "who", "whoever", "whole", "whom",
"whomever", "whose", "why", "will", "with", "within", "without", "won",
"would", "wouldn", "wouldn't", "yes", "yet", "you", "your", "yours",
"yourself", "yourselves", 0
};


/* word to file name map */
void CFindex::wnmap(const char* word, string& name)
{
    assert(name.empty());
    name += word[0];
    //    name += word[1];
}

/*
 * Memory charged for one word record, not counting its hit list:
 * the record itself, two ints of overhead and the characters of the
 * word.
 */
int CFindex::sizeof_word(Iword& w)
{
	return sizeof(w) + sizeof(int) * 2 + w.word.length();
}

/*
 * Memory charged for a hit list, not counting the word record:
 * one Ihit plus one int of overhead per hit.
 */
int CFindex::sizeof_hitlist(THitList& hl)
{
	return static_cast<int>(hl.size() * (sizeof(Ihit) + sizeof(int)));
}


