/***************************************************************************
                          Findex.h  -  description
                             -------------------
    begin                : Wed Aug 8 2001
    copyright            : (C) 2001 by Yinglian Xie
    email                : ylxie@cs.cmu.edu
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/
// Findex.h: interface for the CFindex class.
//
//////////////////////////////////////////////////////////////////////

#ifndef FINDEX_H
#define FINDEX_H

#include <stdio.h>
#include <time.h>
#include <ctype.h>
#include <string>
#include <list>
#include <vector>
#include <functional>
#include "user.h"
#include "utils.h"
using namespace std;

// Number of buckets in the in-memory word table
// (CFindex::m_words is declared with this many TWordList entries).
#define HASH_TABLE_SIZE  10029
// NOTE(review): MAXSIZE and HOWMANY are not referenced in this header;
// presumably buffer/size limits used by the implementation - confirm.
#define MAXSIZE 1024
#define HOWMANY            256

// Word status codes. Presumably the values stored in the 2-bit
// Iword::status field (all four fit in 2 bits) - confirm in the
// implementation.
#define UNCHANGED 0
#define CREATED   1
#define UPDATED   2
#define INDISK    3

// Where a word was matched/searched (title vs. content). Not referenced in
// this header; presumably consumed by the query processor - confirm.
// NOTE(review): EITHER is 4, not 3, so the values are not consecutive.
#define TITLE   0
#define CONTENT 1
#define BOTH    2
#define EITHER  4

// Permission results. Not referenced in this header; presumably used with
// the user/group access checks (CUserGroup) - confirm.
#define ALLOW     1
#define DENY      0

//number of new words indexed before disk flush
#define DEFAULT_THRESH      5000 

//frequency to check memory usage (represented as per num_of_words)
#define DEFAULT_CHECKFRE    50000

// File names used for the on-disk index data. The names suggest: list of
// indexed files, list of indexed directories, word lexicon, hit lists and a
// temporary file - confirm against the implementation.
#define FILE_FNAME ".indexedfile"
#define DIR_FNAME  ".indexeddir"
#define WORD_FNAME "lexicon"
#define HIT_FNAME  "hitlist"
#define TEMP_FNAME "temp"


/*----Index Dir List ----------*/
/* One directory record: just the directory name. */
struct Idir{
	std::string dname;   // directory name
};

//typedef vector<Idir, allocator<Idir> > TIndexDirList;
/* List of directory records. */
typedef std::list<Idir> TIndexDirList;

/*
 * Predicate for searching a TIndexDirList by name, e.g. with find_if.
 *
 * The nested argument_type/result_type typedefs are exactly what the
 * former std::unary_function<Idir, int> base supplied; that base class is
 * deprecated since C++11 and removed in C++17, so the typedefs are spelled
 * out here to keep the functor usable with every language level.
 */
class Idir_eq{
	std::string s;       // name to compare against
 public:
	typedef Idir argument_type;
	typedef int  result_type;

	/* ss: directory name to look for; a null pointer is treated as "". */
	explicit Idir_eq(const char* ss): s(ss ? ss : ""){}

	/* Returns 1 when d.dname equals the stored name, 0 otherwise. */
	int operator() (const Idir& d) const{ return d.dname == s;}
};

/*--- Index File List ---------------*/
/* One file record: just the file name. */
struct Ifile{
	std::string fname;   // file name
};

/* Vector of file records. */
typedef std::vector<Ifile> TIndexFileList;

/*
 * Predicate for searching a TIndexFileList by name, e.g. with find_if.
 *
 * The nested argument_type/result_type typedefs are exactly what the
 * former std::unary_function<Ifile, int> base supplied; that base class is
 * deprecated since C++11 and removed in C++17, so the typedefs are spelled
 * out here to keep the functor usable with every language level.
 */
class Ifile_eq{
	std::string s;       // name to compare against
 public:
	typedef Ifile argument_type;
	typedef int   result_type;

	/* ss: file name to look for; a null pointer is treated as "". */
	explicit Ifile_eq(const char* ss): s(ss ? ss : ""){}

	/* Returns 1 when f.fname equals the stored name, 0 otherwise. */
	int operator() (const Ifile& f) const{ return f.fname == s;}
};

/*---- Index Hit List -----------*/
/* One hit record of a word. */
struct Ihit{
	int     fNo;             // presumably the number of the file containing the hit - TODO confirm
	long    first_location;  // presumably the location of the first occurrence in that file - TODO confirm
};

/* Vector of hit records. */
typedef std::vector<Ihit> THitList;

/*
 * Predicate for searching a THitList by fNo, e.g. with find_if.
 *
 * The nested argument_type/result_type typedefs are exactly what the
 * former std::unary_function<Ihit, int> base supplied; that base class is
 * deprecated since C++11 and removed in C++17, so the typedefs are spelled
 * out here to keep the functor usable with every language level.
 */
class Ihit_eq{
	int n;                   // fNo value to compare against
 public:
	typedef Ihit argument_type;
	typedef int  result_type;

	/* nn: the fNo value to look for (copied). */
	explicit Ihit_eq(const int& nn): n(nn){}

	/* Returns 1 when hit.fNo equals the stored value, 0 otherwise. */
	int operator() (const Ihit& hit) const{ return  hit.fNo == n;}
};

/*--- Index Word List ------------*/
/* One indexed word together with its bookkeeping data and hit list. */
struct Iword{
	std::string word;          // word content
	uint     dindex;           // first index in the disk file
	unsigned totalhits: 30;    // total number of hits
	unsigned status: 2;        // current status of the word (2 bits; presumably
	                           // UNCHANGED/CREATED/UPDATED/INDISK - TODO confirm)
	THitList hitList;          // hitlist
};

/* Vector of word records. */
typedef std::vector<Iword> TWordList;

/*
 * Predicate for searching a TWordList by word text, e.g. with find_if.
 *
 * The nested argument_type/result_type typedefs are exactly what the
 * former std::unary_function<Iword, int> base supplied; that base class is
 * deprecated since C++11 and removed in C++17, so the typedefs are spelled
 * out here to keep the functor usable with every language level.
 */
class Iword_eq{
	std::string s;             // word text to compare against
 public:
	typedef Iword argument_type;
	typedef int   result_type;

	/* ss: word to look for; a null pointer is treated as "". */
	explicit Iword_eq(const char* ss): s(ss ? ss : ""){}

	/* Returns 1 when word.word equals the stored text, 0 otherwise. */
	int operator() (const Iword& word) const{ return  word.word == s;}
};

/*--- class Index Cache -------------*/
/* Bookkeeping entry for one word tracked by the index cache. */
struct CacheWord{
	std::string word;    // the word itself
	time_t   lrtime;     // last referenced time
	int      numRefer;   // num of reference
	int      size;       // size of word + hit list size + overhead
};

/* Linked list of cache entries. */
typedef std::list<CacheWord> cwordList;

/*
 * Icache: bookkeeping for a cache of words, held as a list of CacheWord
 * entries (word, last reference time, reference count, size).
 * Only the interface is declared here; the member functions are defined
 * outside this header.
 */
class Icache
{
 public:
	int  m_tsize;           // presumably the total cache size - TODO confirm
	int  m_availableSize;   // presumably the space still free - TODO confirm
	int  m_usedSize;        // presumably the space currently in use - TODO confirm

	cwordList m_popularWords;   // the cache entries

 public:
	Icache();
	~Icache();

	// NOTE(review): the descriptions below are inferred from the names and
	// signatures only; verify against the implementation.
	void select_victim();                   // presumably picks an entry to evict
	void insert(string& word, int size);    // presumably adds `word` with the given size
	void lookup_update(const char* word);   // presumably refreshes the entry for `word`
	void clear();                           // presumably drops the cached entries
};


/*---- class CFIndex -------------*/
/*
 * CFindex: interface of the file indexer.
 *
 * Holds the lists of indexed directories and files, an in-memory word
 * table (an array of HASH_TABLE_SIZE word lists), the indexing
 * configuration (word/delimiter characters, stop words, excluded files,
 * word length limits) and memory-usage counters.
 *
 * Public entry points are the set_* configuration setters, the character /
 * stop-word classification helpers, and start_index / lookup /
 * update_index_table. The members declared `inline` are defined further
 * down in this header; everything else is defined outside this header.
 */
class CFindex  
{
public:
	TIndexDirList    m_dirs;    // directory records
	TIndexFileList   m_files;   // file records

	TStrList    m_nrfiles; // non regular files, like executable, image...
	TWordList   m_words[HASH_TABLE_SIZE];   // word table: one word list per bucket
	                                        // (presumably selected by hash() - TODO confirm)
	string      m_indexdir;   // set through set_indexdir(); presumably where index data is kept - TODO confirm
	int         m_num_word;  // number of words indexed
	int         m_nword;     // number of words found in file (for debug)
	uint        m_maxWordLen;   // upper word-length limit; get_next_word() returns
	                            // m_maxWordLen + 1 to flag a rejected token
	uint        m_minWordLen;   // lower word-length limit (see set_min_wordlen)

	string      m_wordchar;      // characters accepted by iswordchar()
	string      m_delimitchar;   // characters accepted by isdelimitchar()
	TStrArray   m_stopwords;     // words reported by isstopword()
	TStrArray   m_exceptlist; // list of files (extensions) not indexed
	static const char* m_defaultstopwords[];   // built-in stop word table (defined elsewhere)

	CUserGroup *m_usergroup;   // NOTE(review): ownership is not visible from this header

 private:
	Icache      m_cache;   // word cache bookkeeping

	/* indexed file output and indexed dir output */
	string      m_foutput;
	string      m_doutput;
	string      m_indexHit;
	string      m_indexWord;

	/* used for newly indexed dirs and files */
	TIndexDirList    m_newdirs;
	TIndexFileList   m_newfiles;	

	/* whether allow digits for indexing */
	int         m_allowDigits;   // read by is_digits(): when zero it always returns 0

	/* memory usage management */
	int         m_indexSize;
	int         m_threshhold;
	int         m_hitUsedSize;
	int         m_wordUsedSize;
	int         m_fstrUsedSize;
	int         m_indexUsedSize;
	int         m_memcheckFrequency;

public:
	CFindex();
	virtual ~CFindex();

	/* set up parameters based on config file */
	void  set_cache_size(int size);
	void  set_max_wordlen(int len);
	void  set_min_wordlen(int len);
	void  set_indexdir(string& dir);
	void  set_wordchar(string& wordchar);
	void  set_delimitchar(string& delimitchar);
	void  set_index_memory(int size);
	void  set_stopwords(string& stwlist);
	void  set_exception(string& explist);
	
	/* interfaces for query processor so that process is consistent */
	/* each returns 1 on a match and 0 otherwise (defined below in this header) */
	inline int   iswordchar(char c);
	inline int   isdelimitchar(char c);
	inline int   isstopword(const char* w);

	/* interfaces for indexing and searching operation */
	int   start_index(const char* name, Identity& user);
	void  lookup(string& word, Identity& user, THitList& hl);   // results go to the hl out-parameter
        void   update_index_table();

private:

	/* set up index table from disk at start up */
	void   setup_index();
	void   setup_lexicon();
	int    read_index_item(const char* word);
	
	/* flush index table to the disk */
	void   flush_word(TWordList::iterator witer, FILE *ifp);
	void   flush_index();
	void   merge_index();

	/* file and dir indexing functions */
	int    insert_file(const char* fname);
	void   insert_dir(const char* dir);
	void   update_index_word(const char* word, 
				 int no, uchar position, long offsetq);

	void   index_file_name(const char* fname, int no);
	int    index_file(const char* fname, struct stat &fst);
	int    index_dir(const char* dir);

	/* help functions */
	int    hash(const char* word);   // presumably maps a word to a bucket of m_words - TODO confirm
	void   wnmap(const char* word, string& name);
	int    sizeof_word(Iword& w);
	int    sizeof_hitlist(THitList& hl);

	/* inline help functions */
	/* (all four are defined below in this header) */
	inline int    is_ascii_char(char c);
	inline int    is_digits(const char* w);
	inline int    in_exception_list(const char* fname);
	inline uint   get_next_word(char* buf, int size, FILE *fp);
};

//////////////////////////////////////////////////////////////////////
//  class  CFindex implementation
//////////////////////////////////////////////////////////////////////

////////////////////////////////////
// inline  functions
//////////////////////////////////

/*
 * Classify a character by its code: returns 1 for codes from NUL ('\0')
 * up to and including '~', and 0 for anything below or above that range.
 */
inline int CFindex::is_ascii_char(char c)
{
	const bool inRange = (c >= '\0') && (c <= '~');
	return inRange ? 1 : 0;
}

/*
 * Decide whether a word counts as a number. Returns 1 or 0.
 *
 * Always returns 0 while m_allowDigits is zero. Otherwise returns 1 when
 *   - the word starts with "0x", or
 *   - every character is one of 0-9, a-f, A-F, '+', '-', '.'
 *     (this includes the empty word), or
 *   - the first non-decimal character is the last character of the word
 *     (decimal digits followed by exactly one other character; this also
 *     covers any one-character word).
 *
 * NOTE(review): the accepted set is wider than "only digits"; plain words
 * made solely of the letters a-f are accepted too. Confirm this is intended.
 * NOTE(review): returning 0 when m_allowDigits is zero means numeric words
 * are never reported in that mode - confirm against the callers.
 */
inline int CFindex::is_digits(const char* w)
{
	if (!m_allowDigits)
		return 0;

	/* hexadecimal prefix */
	if (w[0] == '0' && w[1] == 'x')
		return 1;

	const std::string text(w);

	/* nothing but digit-like characters (also true for the empty word) */
	const std::string::size_type firstOther =
		text.find_first_not_of("0123456789abcdefABCDEF+-.");
	if (firstOther == std::string::npos)
		return 1;

	/* text is non-empty here, so length() - 1 is a valid index */
	const std::string::size_type firstNonDecimal =
		text.find_first_not_of("0123456789");
	return (firstNonDecimal == text.length() - 1) ? 1 : 0;
}

/*
 * Check a file name against the exception list (m_exceptlist).
 * Returns 1 if the file does not need to be indexed, 0 otherwise.
 *
 * An entry that starts with '.' is handled as an extension: it matches
 * when its last occurrence in the name is at the very end of the name or
 * is directly followed by a space. Any other entry has to equal the whole
 * name.
 */
inline int CFindex::in_exception_list(const char* fname)
{
	const std::string path(fname);

	for (unsigned int idx = 0; idx < m_exceptlist.size(); ++idx){
		/* plain entry: whole-name comparison */
		if (m_exceptlist[idx][0] != '.'){
			if (path == m_exceptlist[idx])
				return 1;
			continue;
		}

		/* extension entry: inspect its last occurrence in the name */
		const int extLen = m_exceptlist[idx].length();
		const std::string::size_type at = path.rfind(m_exceptlist[idx]);
		if (at >= path.length())
			continue;   /* not present at all */
		if ((at + extLen == path.length()) || (path[at + extLen] == ' '))
			return 1;
	}

	return 0;
}

/*
 * Read the next word from fp into buf.
 *
 * buf  : destination buffer, always left NUL-terminated
 * size : capacity of buf in bytes; must be at least 1
 * fp   : input stream
 *
 * Leading delimiter characters are skipped. Word characters are then
 * lower-cased and stored until buf is full (size - 1 characters), the
 * input ends, or a character that is not a word character is read.
 *
 * Returns
 *   - the number of characters stored, when the word ended at a delimiter,
 *     at the end of input, or because buf became full;
 *   - 0 with an empty buf when the input ended before any word started;
 *   - m_maxWordLen + 1 when the token contains a character that is neither
 *     a word character nor a delimiter; the rest of that token is consumed
 *     up to the next delimiter and buf holds the part read so far;
 *   - 0 with an empty buf when the input ended while such a token was
 *     being skipped.
 *
 * The result of fgetc() is kept in an int and compared with EOF, so a read
 * error (where feof() stays false) ends the scan like end-of-file instead
 * of looping forever.
 */
inline uint CFindex::get_next_word(char *buf, int size, FILE *fp)
{
	int ch;

	// skip leading delimiters
	do{
		ch = fgetc(fp);
	}while ((ch != EOF) && (isdelimitchar(static_cast<char>(ch))));

	if (ch == EOF){
		buf[0] = '\0';
		return 0;
	}

	// scan the word; ch is in the unsigned char range here, which is what
	// tolower() requires
	int i = 0;
	while ((i < size - 1) && (ch != EOF) && (iswordchar(static_cast<char>(ch)))){
		buf[i ++] = static_cast<char>(tolower(ch));
		ch = fgetc(fp);
	}

	// the word is complete if the buffer is full, the input ended, or a
	// delimiter character was met
	if ((i == size - 1) || (ch == EOF) || (isdelimitchar(static_cast<char>(ch)))){
		buf[i] = '\0';
		return i;
	}

	// otherwise the token holds a non-word character: skip to the next
	// delimiter and report the token as rejected
	do{
		ch = fgetc(fp);
	}while ((ch != EOF) && (!isdelimitchar(static_cast<char>(ch))));

	if (ch == EOF){
		buf[0] = '\0';
		return 0;
	}

	buf[i] = '\0';
	return m_maxWordLen + 1;
}



/////////////////////////////////////////////////////////////////
//  inline interfaces for query processor 

/* Returns 1 when c occurs in m_wordchar (the characters allowed in a word), 0 otherwise. */
inline int CFindex::iswordchar(char c)
{
	return (m_wordchar.find(c) != std::string::npos) ? 1 : 0;
}

/* Returns 1 when c occurs in m_delimitchar (the delimiter characters), 0 otherwise. */
inline int CFindex::isdelimitchar(char c)
{
	return (m_delimitchar.find(c) != std::string::npos) ? 1 : 0;
}

/*
 * Returns 1 when w equals one of the entries of m_stopwords, 0 otherwise.
 * The list is scanned linearly from the front.
 */
inline int CFindex::isstopword(const char* w)
{
    for (unsigned int idx = 0; idx < m_stopwords.size(); ++idx){
	if (m_stopwords[idx] != w)
	    continue;
	return 1;
    }
    return 0;
}



#endif // define FINDEX_H
