/* wordlist2.c -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 */

/* wordlist2 -- simple program to print sorted wordlist
 *
 * $Id: lqwordlist.c,v 1.4 92/04/21 16:00:16 lee Exp $
 */

#include "globals.h" /* defines and declarations for database filenames */
#include "error.h"
#include "numbers.h"

#include <stdio.h>
#include <sys/types.h>
#include <malloc.h>
#include <ctype.h>

#ifdef BSD
# define USI_MAX ((unsigned int) -1)
#else
# include <limits.h>
  /* for USI_MAX, the largest unsigned integer.
   * 4.3 BSD doesn't seem to have this.  I don't know how to get this
   * on BSD systems.
   */
#endif

#include "fileinfo.h"
#include "wordinfo.h"
#include "smalldb.h"
#include "pblock.h"
#include "wordrules.h"
#include "emalloc.h"

/*** Declarations: ***/
/** System calls and library routines: **/
extern void exit();

/** System calls: **/

/** Unix Library Functions: **/
extern char *strncpy();
#ifndef tolower
 extern int tolower();
#endif

/** lqtext library functions: **/
extern void cleanupdb();
extern void SetDefaults();
extern void DefaultUsage();
extern long GetMaxWID();

/** functions defined within this file: */
/* NOTE(review): PrintWordInfo() is declared here, but no definition or
 * call of it appears in the part of the file visible to this review.
 */
void PrintWordInfo();
void DumpMyCache(), AddSort();
void indexmarch();
static int WantEgrep(), WantAll(), WantPrefix();
/* The filter indexmarch() applies to each word.  It starts as WantAll;
 * main() switches it to WantEgrep for -g and to WantPrefix for -p.
 */
static int (* WantWord)() = WantAll;

/* for more speed: */
/* CanOptimise is set by Compile() when the -g pattern is '^' followed by
 * an alphanumeric character and then either nothing or another
 * alphanumeric character; Opt_First holds that first character.
 * While CanOptimise is non-zero, indexmarch() skips every word whose
 * first byte differs from Opt_First without calling the filter.
 */
static int CanOptimise = 0;
static char Opt_First = 0;

/** Macros and variable definitions **/

/* STRNCMP and STRCMP behave like strncmp() and strcmp(), with a fast
 * path: when the first characters differ the result is computed inline
 * and the library call is skipped.  The arguments are evaluated more
 * than once, so they must not have side effects.
 *
 * STRNCMP yields 0 when n is not positive, exactly as strncmp() does for
 * a zero length.  Without that case an empty prefix would compare
 * unequal to every non-empty word.
 */
#define STRNCMP(henry, utzoo, n) \
     ( ((n) <= 0) ? 0 : \
       ( (*(henry) == *(utzoo)) ? strncmp((henry), (utzoo), (n)) : \
	 ( ((int)(unsigned char) *(henry)) - ((int)(unsigned char) *(utzoo)) )))
#define STRCMP(henry, utzoo) \
     ( (*(henry) == *(utzoo)) ? strcmp((henry), (utzoo)) : \
       ( ((int)(unsigned char) *(henry)) - ((int)(unsigned char) *(utzoo)) ))
/* Note: the double casts are in case there's an 8-bit value, and chars
 * are signed on the local machine, both of which do happen in practice.
 * Thanks to msb@sq.com for pointing this out.
 */


char *progname = 0;
    /* Used for error messages */

int AsciiTrace = 0;
    /* If this is non-zero, we provide debugging information.  The lqtext
     * library also uses this variable.  Setting it to values greater
     * than 1 or 2 will generally provide large amounts of debugging
     * information.  If the library was compiled with -UASCIITRACE,
     * however, there will be much less diagnostic output at higher
     * levels.
     */

char *Prefix = NULL;
int PrefixLength = 0;
static int DoingSort = 0; /* sort off by default */

static char *Revision = "$Revision: 1.4 $";

/** end of declarations... **/

/* main -- parse the command line, then list the words in the index.
 *
 * Options handled here:
 *   -g pat     select words with WantEgrep (the pattern is kept in Prefix)
 *   -p prefix  select words with WantPrefix
 *   -s         collect the words and print them sorted
 *   -V         print the revision string on stderr
 *   -x         print the option summary and exit with status 1
 *   -z arg, -Z accepted and ignored here; SetDefaults() deals with them
 */
int
main(argc, argv)
    int argc;
    char *argv[];
{
    extern int optind, getopt();  /* For getopt(3) */
    extern char *optarg;	  /* For getopt(3) */
    int ShowUsage = 0;		  /* set by -x and by unknown options */

    /* This is seen as a library program, so the full path is kept.
     * lqaddfile(1L) and lqphrase(1L) set progname to be the filename of
     * the command, rather than the full pathname.
     */
    progname = argv[0];

    /* Deal with any arguments that are understood by all lqtext
     * programs.
     */
    SetDefaults(argc, argv);

    for (;;) {
	int opt = getopt(argc, argv, "g:p:sVxZz:");

	if (opt == EOF) {
	    break;
	}

	if (opt == 'g' || opt == 'p') {
	    /* -g and -p both keep their argument in Prefix; they differ
	     * only in the filter they install, and they exclude each
	     * other.
	     */
	    int (*Chosen)() = (opt == 'g') ? WantEgrep : WantPrefix;
	    int (*Other)() = (opt == 'g') ? WantPrefix : WantEgrep;

	    if (WantWord == Other) {
		Error(E_FATAL|E_XHINT, "can't mix -g and -p.");
	    }
	    Prefix = optarg;
	    PrefixLength = strlen(Prefix);
	    WantWord = Chosen;
	} else if (opt == 's') {
	    DoingSort = 1; /* TODO: allow both numeric and alpha sorts */
	} else if (opt == 'V') {
	    fprintf(stderr, "%s version %s\n", progname, Revision);
	} else if (opt == 'x' || opt == '?') {
	    ShowUsage = 1;
	}
	/* 'z' and 'Z' need nothing here: done by SetDefaults() */
    }

    if (ShowUsage) {
	fprintf(stderr, "%s: options are:\n", progname);
	fprintf(stderr, "%s\n%s\n%s\n",
	    "-p prefix -- only print words starting with \"prefix\"\n",
	    "-g e      -- only print words matching egrep pattern \"e\"\n",
	    "-s        -- print the words in sorted order\n"
	);
	/* DefaultUsage() prints the list of the standard options. */
	DefaultUsage();
	exit(1);
    }

    /* NOTE(review): getopt() does not appear to leave optind beyond
     * argc, so this test looks unreachable -- confirm what was intended.
     */
    if (optind > argc) {
	fprintf(stderr, "Usage: %s [-p prefix | -g egrep-pat ] [-xvV]\n",
	    progname
	);
	exit(1);
    }

    InitCache(GetMaxWID());

    indexmarch();
    cleanupdb();
    exit(0); /* 0 or 1 (this is a little devious) */
#ifdef lint
    /*NOTREACHED*/
    return 1;
	/* this is for versions of lint and gcc that don't understand
	 * that exit() doesn't return -- or, if it does, that there is
	 * nothing that can be done about it!
	 */
#endif
}

/* The sort cache: an array of pointers to malloc'd strings, filled by
 * AddSort() and printed, sorted, by DumpMyCache().
 *
 *   DefaultCache  built-in array used when only a few words are expected
 *   MyCache       the array currently in use
 *   MaxInCache    number of entries AddSort() may store before flushing
 *   CacheCount    number of entries currently stored
 */
static char *DefaultCache[10];
static long MaxInCache = 10;
static char **MyCache = DefaultCache;
static long CacheCount = 0;

/* InitCache -- size the sort cache for an index of MaxWords words.
 *
 * When MaxWords is smaller than the number of slots in DefaultCache the
 * built-in array is used; otherwise an array of MaxWords + 2 pointers is
 * allocated and MaxInCache is set to MaxWords + 1.
 *
 * Returns 0.  A failed allocation is reported through Error() with
 * E_FATAL|E_MEMORY, followed by exit(1).
 */
int
InitCache(MaxWords)
    long MaxWords;
{
    /* number of elements in DefaultCache: whole array over one element */
    MaxInCache = sizeof(DefaultCache) / sizeof(DefaultCache[0]);

    if (MaxWords < MaxInCache) {
	MyCache = DefaultCache;
	return 0;
    }
    MyCache = (char **) malloc((MaxWords + 2) * sizeof(char *));
    if (!MyCache) {
	/* the cast makes the argument match the %ld conversion */
	Error(E_FATAL|E_MEMORY,
	    "Couldn't callocate %ld bytes of memory for wordlist",
	    (long) ((MaxWords + 2) * sizeof(char *)));
	exit(1);
    }
    MaxInCache = MaxWords + 1;
    return 0;
}


/* GetEntry -- return a pointer to the index block for word number n.
 *
 * f is the open file descriptor of the word index file and n the block
 * number; block n starts at byte n * WIDBLOCKSIZE of the file.
 *
 * Blocks are served from a static buffer of 1024 blocks.  The buffer is
 * refilled, starting at the requested block, whenever that block lies
 * outside the range held from the previous read.  The pointer returned
 * points into the static buffer, so it is only good until the next call
 * that refills it.
 *
 * A failing lseek() or read() is reported through Error() with
 * E_FATAL|E_SYS.
 */
static char *
GetEntry(f, n)
    int f;
    t_WID n;
{
    static char Buf[WIDBLOCKSIZE * 1024];
    static unsigned long StartPos = 0L;	/* file offset of Buf[0] */
    static unsigned long EndPos = 0L;	/* StartPos + bytes last read */

    unsigned long Where = n * WIDBLOCKSIZE;

    /* NOTE(review): the "+ 1" accepts a block whose last byte lies one
     * past the data read; it looks like an off-by-one, but it only
     * matters after a read that ends in mid-block -- confirm.
     */
    if (Where + WIDBLOCKSIZE > EndPos + 1 || Where < StartPos) {
	int i;

	if (lseek(f, Where, 0) < 0) {
	    Error(E_FATAL|E_SYS,
		"WIDINDEX lseek(%d, pos=%ld, SEEK_SET) failed",
		f,
		(long) Where
	    );
	}
	StartPos = Where;
	if ((i = read(f, Buf, sizeof Buf)) < 0) {
	    /* sizeof yields a size_t; cast it to match the %d conversion */
	    Error(E_FATAL|E_SYS,
		"WIDINDEX read(fd=%d, buf, n=%d) failed",
		f,
		(int) sizeof Buf
	    );
	}
	EndPos = Where + i;
    }
    return &Buf[Where - StartPos];
}

/* indexmarch -- walk the word index and hand every wanted word to AddSort.
 *
 * Opens WidIndexFile read-only and visits blocks 0 .. GetMaxWID() - 1
 * through GetEntry().  For each block the word length is read with
 * sReadNumber(), two further numbers are read from just after the word
 * (the second is kept as the word's total count), and the word is passed
 * to AddSort() when the current WantWord filter accepts it.  After the
 * loop the file is closed and DumpMyCache() prints whatever is cached.
 */
void
indexmarch()
{
    int fd;
    t_WID WID;
    t_WID MaxWid = GetMaxWID();

    if ((fd = open(WidIndexFile, O_RDONLY, 0)) < 0) {
	Error(E_FATAL|E_SYS,
	    "Couldn't open Word Index File \"%s\" for reading",
	    WidIndexFile
	);
    }

    for (WID = 0; WID < MaxWid; WID++) {
	char *Block = GetEntry(fd, WID);
	/* the block starts with the word length; sReadNumber() is given
	 * &Block and presumably leaves Block pointing at the word itself
	 */
	int Len = sReadNumber(&Block);
	unsigned long TotalCount;
	char *p = &Block[Len];	/* first byte after the word */

	/* cheap rejection on the first character, see Compile() */
	if (CanOptimise && Opt_First != *Block) {
	    continue;
	}

	TotalCount = sReadNumber(&p); /* actually the Offset comes first */
	TotalCount = sReadNumber(&p);

	if (WantWord(Len, Block, TotalCount)) {
	    AddSort((int) Len, Block, TotalCount);
	}
    }

    if (close(fd) < 0) {
	Error(E_WARN,
	    "error whilst closing file %d=\"%s\"",
	    fd,
	    WidIndexFile
	);
    }
    DumpMyCache();
}

/* WantAll -- the default word filter: every word is wanted.
 *
 * It is called through WantWord with the same three arguments as the
 * other filters (length, word, total count), so it declares them too,
 * even though none is used; calling a function with more arguments
 * than it defines is undefined behaviour.
 *
 * Always returns 1.
 */
static int
WantAll(Length, Word, TotalCount)
    int Length;
    char *Word;
    unsigned long TotalCount;
{
    return 1;
}

/* WantPrefix -- word filter for the -p option.
 *
 * Returns 1 when no prefix has been set, or when Word is at least
 * PrefixLength bytes long and STRNCMP finds its first PrefixLength
 * bytes equal to Prefix; returns 0 otherwise.  TotalCount is not used.
 */
static int
WantPrefix(Length, Word, TotalCount)
    int Length;
    char *Word;
    t_WID TotalCount;
{
    if (!Prefix) {
	return 1; /* nothing to filter on */
    }
    if (Length < PrefixLength) {
	return 0; /* too short to start with the prefix */
    }
    return STRNCMP(Prefix, Word, PrefixLength) == 0;
}


void
AddSort(Length, Word, TotalCount)
    int Length;
    char *Word;
    unsigned long TotalCount;
{
    if (!DoingSort) {
	(void) printf("%.*s\t%ld\n", Length, Word, TotalCount);
	return;
    }

    if (CacheCount >= MaxInCache) {
	DumpMyCache();
	CacheCount = 1; /* including this word... */
    }

    if ((MyCache[CacheCount] = (char *) malloc(Length + 12)) == (char *) 0) {
	Error(E_FATAL|E_MEMORY, "malloc for %d bytes failed", Length + 12);
	exit(1);
    }

    (void) strncpy(MyCache[CacheCount], Word, Length);
    MyCache[CacheCount][Length] = '\0';
    {
	char buf[20];
	(void) sprintf(buf, "\t%ld", TotalCount);
	(void) strcat(MyCache[CacheCount], buf);
    }
    ++CacheCount;
}

/* CompareStringsByPointersForQsort -- qsort() comparison function for an
 * array of char pointers.
 *
 * s1p and s2p each point at an array element, that is, at a pointer to
 * a string.  The result is that of STRCMP on the two strings.
 */
int
CompareStringsByPointersForQsort(s1p, s2p)
    void *s1p; 
    void *s2p;
{
    char *First = *(char **) s1p;
    char *Second = *(char **) s2p;

    return STRCMP(First, Second);
}

/* DumpMyCache -- sort the cached lines, print them and empty the cache.
 *
 * The first CacheCount entries of MyCache are sorted with qsort() and
 * CompareStringsByPointersForQsort(), written one per line with puts(),
 * and freed.  Each freed slot is cleared and CacheCount is reset to 0,
 * so that calling the function again without adding anything in between
 * is harmless instead of printing and freeing the same memory twice.
 */
void
DumpMyCache()
{
    register long i;

    if (!CacheCount) return;

    (void) qsort(&MyCache[0], CacheCount, (int) sizeof(char *), CompareStringsByPointersForQsort);

    for (i = 0; i < CacheCount; i++) {
	(void) puts(MyCache[i]);
	(void) free(MyCache[i]);
	MyCache[i] = (char *) 0;
    }
    CacheCount = 0;
}


/* Porting: if this causes problems, possibilities are:
 * (1) use Henry Spencer's regexp package, a  p.d. replacement
 * (2) compile with -DNO_REGEXP...
 */

#ifndef NO_REGEXP

/* Message -- return the text that describes regexp error number val.
 *
 * Numbers without an entry in the table yield a generic message.  The
 * strings returned are constants and must not be modified or freed.
 */
static char *
Message(val)
    int val;
{
    static struct {
	int Code;
	char *Text;
    } Known[] = {
	{ 11, "Range endpoint too large [???]" },
	{ 16, "Bad number [????]" },
	{ 25, "``\\digit'' out of range - max is probably 9" },
	{ 36, "Illegal or missing delimiter" },
	{ 41, "No remembered search string" },
	{ 42, "\\( \\) imbalance" },
	{ 43, "Too many \\( -- max is 9" },
	{ 44, "More than 2 numbers given in \\{min,max\\}" },
	{ 45, "} expected after \\ in \\{min,max\\}" },
	{ 46, "First number exceeds second in \\{min,max\\}" },
	{ 49, "[] imbalance; use \\[ and \\] to match brackets" },
	{ 50, "Regular expression too long" }
    };
    int Entries = (int) (sizeof(Known) / sizeof(Known[0]));
    int i;

    for (i = 0; i < Entries; i++) {
	if (Known[i].Code == val) {
	    return Known[i].Text;
	}
    }
    return "[no pre-defined error message, see man 3 regexp]";
}

/* Expression holds a private copy of the compiled pattern; it stays 0
 * until RETURN (below) has run.  WantEgrep() hands it to step().
 */
static char *Expression = 0;
/* Work area that Compile() passes to compile() as the place to build
 * the compiled pattern.
 */
static char buf[8192*3];

/* These macros are defined before <regexp.h> is included; presumably
 * they are the hooks that header's compile() routine expects its user
 * to supply (see regexp(3)) -- confirm against the local manual page.
 * `instring' is not defined in this file; presumably it is compile()'s
 * parameter naming the pattern text.
 */
#define INIT register char *p = instring;
#define GETC() (*p++)
#define PEEKC() (*p)
#define UNGETC(c) (--p)
/* RETURN copies the first (ptr - buf + 1) bytes of buf into memory
 * obtained from emalloc(), stores that copy in Expression, and returns
 * from the enclosing function without a value.
 */
#define RETURN(ptr) \
	{ \
	    Expression = emalloc(ptr - buf + 1); \
	    (void) bcopy(buf, Expression, ptr - buf + 1); \
	    return; \
	}

/* e -- report regexp error number val for the pattern held in Prefix.
 *
 * The number, its text from Message() and the pattern are passed to
 * Error() with E_FATAL.
 */
static void
e(val)
    int val;
{
    Error(E_FATAL,
	"Regexp error %d [%s] in `\"%s\"",
	val,
	Message(val),
	Prefix
    );
}


/* ERROR reports the error through e() and then returns, without a
 * value, from the function in which it is expanded.
 */
#define ERROR(val) { e(val); return; }

#include <regexp.h>

/* Compile -- compile Pattern and work out whether the first-character
 * short cut can be used.
 *
 * Pattern is handed to compile() together with the work area buf.  If
 * Expression is still unset afterwards, the failure is reported through
 * Error() with E_FATAL.
 *
 * The short cut: when Pattern is '^' followed by an alphanumeric
 * character and then either nothing or another alphanumeric character,
 * every match has to begin with that character.  It is saved in
 * Opt_First and CanOptimise is set, so that indexmarch() can reject
 * other words without running the matcher.
 *
 * Returns 0; callers ignore the value.
 */
int
Compile(Pattern)
    char *Pattern;
{
    extern char *compile();
    register char *pp = Pattern;

    (void) compile(Pattern, buf, &buf[sizeof buf], '\0');
    if (!Expression) {
	Error(E_FATAL, "patern compilation into internal form failed");
    }

    /* precompute some optimisations */
    if (*pp == '^') {
	++pp;
	/* the casts keep 8-bit characters in the range isalnum() accepts */
	if (isalnum((unsigned char) *pp)) {
	    Opt_First = *pp;
	    pp++;
	    if (!*pp || isalnum((unsigned char) *pp)) {
		CanOptimise = 1;
	    }
	}
    }
    return 0;
}

/* WantEgrep -- word filter for the -g option.
 *
 * The pattern in Prefix is compiled on the first call.  Word is not
 * NUL-terminated, so the byte at Word[Length] is replaced by a NUL
 * while step() runs and put back afterwards.
 *
 * Returns 1 when step() reports a match and 0 otherwise.  TotalCount is
 * not used.
 */
static int
WantEgrep(Length, Word, TotalCount)
    register int Length;
    register char *Word;
    t_WID TotalCount;
{
    char Saved;
    int Matched;

    if (!Expression) {
	Compile(Prefix);
    }

    Saved = Word[Length];
    Word[Length] = '\0';
    Matched = step(Word, Expression) ? 1 : 0;
    Word[Length] = Saved;

    return Matched;
}

#endif /* !NO_REGEXP */
