/* Root.c -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 */

/*
 * $Id: Root.c,v 2.7 91/03/03 00:13:36 lee Rel1-10 $
 *
 * $Log:	Root.c,v $
 * Revision 2.7  91/03/03  00:13:36  lee
 * cosmetic changes.
 * 
 * Revision 2.6  90/10/06  00:11:59  lee
 * Prepared for first beta release.
 * 
 * Revision 2.5  90/08/29  21:46:42  lee
 * Alpha release.
 * 
 * Revision 2.4  90/08/09  19:16:29  lee
 * BSD lint and fixes...
 * 
 * Revision 2.3  90/03/29  23:00:04  lee
 * Now passes gcc -Wall
 * 
 * Revision 2.2  89/10/08  20:44:56  lee
 * Working version of nx-text engine.  Addfile and wordinfo work OK.
 * 
 * Revision 2.1  89/10/02  01:13:07  lee
 * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
 * 
 *
 */

#include "globals.h" /* defines and declarations for database filenames */

#include <sys/types.h>
#include <errno.h>
#include <fcntl.h> /* for my header files, sorry */
#include <stdio.h>
#include <malloc.h>
#include <ctype.h>

#include "fileinfo.h"
#include "wordinfo.h"
#include "wordrules.h"
#include "emalloc.h"

/** Unix system calls that need to be declared: **/
    /* (none) */
/** C Library functions that need to be declared: **/
extern void perror();
extern int strcmp();
extern int strlen();
extern char *strcpy();
extern char *strcat();
/* Only declare toupper() when <ctype.h> has not provided it as a macro;
 * the guard has to test the same name that is being declared.
 */
#ifndef toupper
 extern int toupper();
#endif

/** lqtext functions that need to be declared: **/
/** Functions from this file that need to be declared: **/
void InsertCommonWord();
/** **/

/** Useful macros **/
#define new(type) ((type *) emalloc(sizeof(type)))
	/* Allocates one object of the given type with emalloc(),
	 * so you can say
	 * struct foo *x = new(struct foo)
	 */

#define STRCMP(s1,s2) ((*(s1) == *(s2)) ? strcmp(s1, s2) : *(s1) - *(s2))
	/* Faster than strcmp in the (common) case where the
	 * strings differ at the first character.
	 * From an idea by Henry Spencer (utzoo!henry)
	 *
	 * Both arguments are evaluated more than once, so they must not
	 * have side effects.  When the first characters differ the result
	 * is the difference of the two plain chars.
	 */

/** **/

extern int AsciiTrace;

/* This routine is only sensible for English (although it could be
 * modified...), but that does not matter.
 */
/* WordRoot -- strip a trailing possessive and/or plural suffix, in place.
 *
 * The string WordInfo->Word is edited directly and WordInfo->Length is
 * updated to match.  WPF_POSSESSIVE is set in WordInfo->WordPlace.Flags
 * when a trailing 's was removed; WPF_WASPLURAL is set when a plural
 * suffix was removed and cleared when the word ends in `s' but is not
 * treated as a plural.
 *
 * Returns WordInfo->Word, or a constant diagnostic string if WordInfo
 * or WordInfo->Word is null.
 *
 * This routine is only sensible for English (although it could be
 * modified...), but that does not matter.
 */
char *
WordRoot(WordInfo)
    t_WordInfo *WordInfo;
{
    char *Word;
    int IsPlural;

    if (!WordInfo) return "@#!!";

    Word = WordInfo->Word;

    if (!Word) {
	return "oh dear";
    }

    if (!*Word) {
	return Word;
    }

    /** delete trailing <'s> and mark possessive */
    while (WordInfo->Length >= 3 && Word[WordInfo->Length - 1] == 's' &&
				    Word[WordInfo->Length - 2] == '\'') {
	WordInfo->Length -= 2;
	Word[WordInfo->Length] = '\0';
	WordInfo->WordPlace.Flags |= WPF_POSSESSIVE;
    }

    /** delete trailing plural suffix and mark plural */

    /* It's important to realise that the purpose of this routine is not
     * in any way to reduce a word to an etymological root.  In other words,
     * no attempt is made to differentiate between plurals and present
     * participles, or words that simply happen to end in `s'.
     * Hence, elephants, blunderbus, hostess, runs and tomatoes are all
     * candidates.  Of course, one would like to do as well as one can!
     * Again, the object isn't to derive the correct singular, but instead
     * to be fairly fast, and, above all, to ensure that any transformations
     * are reversible!
     *
     * The result is that I can store dog and dogs in the same Wordinfo
     * chain.  In the case that either word is unusual, there is a space
     * saving of (on average) 30 or so bytes.  More usefully, if you ask
     * for `Window', I will automatically find `Windows' as well.
     *
     * so...
     * XXXo, XXXss, XXXsh, XXXch, XXXx --> +es
     * 	except: pianos, dynamos, photos
     * XXCy --> XXCies [ C consonant]
     * XXVy --> XXVys [ V vowel ]
     * f or fe --> ves (12 cases only)
     * vowel change:
     * foot/feet (why bother with these? -- use a thesaurus!)
     * need to keep penny/pence separate
     * See Thomson & Martinet, section 8ff (I think)
     */
    if (WordInfo->Length > 2 && Word[WordInfo->Length - 1] == 's') {
	/* Assume a plural until one of the cases below decides otherwise */
	IsPlural = 1;

	switch (Word[WordInfo->Length - 2]) {
	case 'e':
	    /* Length is at least 3 here, so Length - 3 is a valid index */
	    switch (Word[WordInfo->Length - 3]) {
	    case 'i': /* xxcies --> xxcy */
		if (WordInfo->Length > 3) {
		    Word[WordInfo->Length - 3] = 'y';
		    WordInfo->Length -= 2;
		} else { /* ies not a plural, but lies is :-) */
		    WordInfo->Length--; /* just the s */
		}
		break;
	    case 's':
	    case 'h':
	    case 'x':
	    case 'o': /* xxxoes --> xxxo */
		WordInfo->Length -= 2;
		break;
	    default: /* xxxes -> xxxe */
		WordInfo->Length -= 1;
		break;
	    }
	    break;
	case 'y': /* xxxvys --> xxxvy */
	    /* The letter that matters is the one BEFORE the y.  (This used
	     * to test the y itself, so the vowel cases could never match
	     * and words like boys and holidays were never folded.)
	     * Three-letter words are left alone so that the root keeps at
	     * least three letters.
	     */
	    IsPlural = 0;
	    if (WordInfo->Length > 3) switch (Word[WordInfo->Length - 3]) {
	    case 'a': /* flays */
	    case 'e': /* beys */
	    case 'i': /* ??iys?? */
	    case 'o': /* boys */
	    case 'u': /* guys */
		WordInfo->Length--; /* just remove the s */
		IsPlural = 1;
		break;
	    default: /* not a plural, e.g. Unixsys */
		break;
	    }
	    break;
	case 's': /* trailing ss doesn't mark a plural! */
	case 'u': /* ONE bus, thus, omnibus; TWO gnus, TWO emus */
		  /* So it doesn't work for gnus and emus right now! */
	case 'i': /* not a plural.. this, his, fleur-de-lis */
	    IsPlural = 0;
	    break;
	case 'a': /* has */
	case 'o': /* cos */
	    if (WordInfo->Length < 4) {
		IsPlural = 0;
		break;
	    }
	    /* else fall through */
	default: /* just plain s */
	    WordInfo->Length -= 1;
	    break;
	}

	if (IsPlural) {
	    WordInfo->WordPlace.Flags |= WPF_WASPLURAL;
	} else {
	    WordInfo->WordPlace.Flags &=
		    (unsigned short)~(unsigned short)WPF_WASPLURAL;
	}
	Word[WordInfo->Length] = '\0';
    }
    /* Should check for ii --> ius here, but that would increase the length
     * of the word and therefore will break lots of things.
     */
    return WordInfo->Word;
}

/* UnFlag -- rebuild the surface form of a word from its root and flags.
 *
 * WordInfo->Word is copied into a static buffer; then, depending on the
 * bits set in Flags (the Flags argument, not WordInfo->WordPlace.Flags):
 *   WPF_WASPLURAL  -- a plural suffix is put back;
 *   WPF_POSSESSIVE -- 's is appended;
 *   WPF_UPPERCASE  -- the first letter is capitalised.
 *
 * If WordInfo->Length disagrees with the actual length of the word it
 * is corrected as a side effect.
 *
 * Returns a pointer to the static buffer, which is overwritten by the
 * next call, or a constant diagnostic string if there is no word.
 */
char *
UnFlag(WordInfo, Flags)
    t_WordInfo *WordInfo;
    unsigned int Flags;
{
    static char Buffer[MaxWordLength + 5]; /* 's + es + \0 */
    register char *p, *q;
    int Length;

    if (!WordInfo) return "(null word info)";
    if (!WordInfo->Word) return "(null word)";
    if (!WordInfo->Word[0]) return "(empty word)";

    /* Copy at most MaxWordLength characters, which leaves room for the
     * longest suffix combination (es + 's) and the terminating \0.
     */
    p = Buffer;
    q = WordInfo->Word;
    while (*q && p < &Buffer[MaxWordLength]) {
	*p++ = *q++;
    }
    *p = '\0';
    Length = p - Buffer; /* excludes the \0 */

    if (*q == '\0' && WordInfo->Length != Length) {
	/* the caller's count was wrong; the string is the authority */
	WordInfo->Length = Length;
    }

    if (Flags & WPF_WASPLURAL) {
	if (Length >= 2) switch (Buffer[Length - 1]) {
	case 'y':
	    /* Length >= 2, so there is always a letter before the y;
	     * two-letter roots matter too (lies is stored as ly).
	     */
	    switch (Buffer[Length - 2]) {
	    case 'a':
	    case 'e':
	    case 'i':
	    case 'o':
	    case 'u':
		Buffer[Length++] = 's'; /* e.g. days */
		break;
	    default: /* ladies: the y becomes ies */
		Buffer[Length - 1] = 'i';
		Buffer[Length++] = 'e';
		Buffer[Length++] = 's';
		break;
	    }
	    break;
	case 's':
	    /* WordRoot() never folds an -ii plural into -ius, so a plural
	     * root ending in us can only have come from -uses (buses,
	     * houses) and gets es back like any other s.
	     */
	case 'o':
	case 'h':
	case 'x':
	    Buffer[Length++] = 'e';
	    Buffer[Length++] = 's';
	    break;
	default:
	    Buffer[Length++] = 's';
	    break;
	}
	Buffer[Length] = '\0';
    }

    if (Flags & WPF_POSSESSIVE) {
	Buffer[Length++] = '\'';
	Buffer[Length++] = 's';
	Buffer[Length] = '\0';
    }

    if (Flags & WPF_UPPERCASE) {
	/* the ctype functions need a value in the unsigned char range */
	Buffer[0] = toupper((unsigned char) Buffer[0]);
    }

    return Buffer;
}

/* One entry in the singly linked list of common words. */
typedef struct s_WordList {
    char *Word;			/* copy of the word, allocated by InsertCommonWord() */
    unsigned short Flags;	/* flags passed to InsertCommonWord(); not read in this file */
    struct s_WordList *Next;	/* next entry, or 0 at the end of the list */
} t_WordList;

/* Head of the common-word list.  InsertCommonWord() keeps the list in
 * ascending STRCMP order and TooCommon() relies on that to stop early.
 */
static t_WordList *CommonWords = 0;

/* TooCommon -- is this word in the common-word list?
 *
 * Walks the sorted CommonWords list comparing each entry with
 * WordInfo->Word.  Returns 1 on an exact match, and 0 as soon as an
 * entry that sorts after the word is reached or the list runs out.
 */
int
TooCommon(WordInfo)
    t_WordInfo *WordInfo;
{
    register char *Candidate = WordInfo->Word;
    register t_WordList *Entry = CommonWords;

    while (Entry) {
	int Order = STRCMP(Entry->Word, Candidate);

	if (Order >= 0) {
	    /* equal: it's common; greater: we have gone past it */
	    return Order == 0;
	}
	Entry = Entry->Next;
    }
    return 0;
}

/* Name of the common-word file while ReadCommonWords() is running;
 * a placeholder string the rest of the time.
 */
static char *FileName = "Internal Error";
/* should be set before being printed! */

/* ReadCommonWords -- load the list of common words from CommonFile.
 *
 * Each line supplies at most one word: characters are skipped up to the
 * first one accepted by StartsWord(), and the word extends for as long
 * as WithinWord() accepts them.  A `#' before the word starts a comment
 * and the line is ignored.  Each word is reduced with WordRoot() and
 * added to the list with InsertCommonWord().
 *
 * Returns 0 on success, or -1 (after printing a message on stderr) if
 * the file cannot be opened.
 */
int
ReadCommonWords(CommonFile)
    char *CommonFile;
{
    extern int AsciiTrace;

    FILE *fd;
    char Buffer[200];
    t_WordInfo W;
    char *Root;
    t_WordList *WP;

    errno = 0;

    if ((fd = fopen(CommonFile, "r")) == (FILE *) 0) {
	int e = errno; /* fprintf may change errno, so save it for perror */

	fprintf(stderr, "Can't open common word list ");
	errno = e;
	perror(CommonFile);
	return -1;
    }

    FileName = CommonFile;

    while (fgets(Buffer, sizeof(Buffer), fd) != (char *) 0) {
	register char *p;
	char *Start;

	/* If the line did not fit in Buffer, throw the rest of it away;
	 * otherwise the next fgets() would treat the tail (the end of a
	 * long comment, say) as the start of a new line.
	 */
	for (p = Buffer; *p && *p != '\n'; p++)
	    ;
	if (*p != '\n') {
	    int ch;

	    while ((ch = getc(fd)) != EOF && ch != '\n')
		;
	}

	for (p = Buffer; *p; p++) {
	    if (*p == '#') break;
	    if (StartsWord(*p)) break;
	}

	if (*p == '#' || !*p) {
	    continue; /* comment or blank line */
	}

	Start = p;

	for (; *p; p++) {
	    if (!WithinWord(*p)) break;
	    /* a quote only belongs to the word if more word follows it */
	    if (*p == '\'' && !WithinWord(p[1])) break;
	}

	/* NOTE(review): the + 1 lets through words that are one character
	 * shorter than MinWordLength -- confirm that this is intended.
	 */
	if (p - Start + 1 < MinWordLength) continue;

	*p = '\0'; /* delete trailing \n or whatever */
	W.WordPlace.Flags = 0;
	W.Word = Start;
	W.Length = p - Start; /* length excludes the \0 */

	Root = WordRoot(&W);
	InsertCommonWord(Root, W.WordPlace.Flags);
    }
    (void) fclose(fd);

#if 0
    if (!CommonWords) {
	fprintf(stderr, "No common words found in file \"%s\"\n", FileName);
	exit(1);
    }
#endif

    if (AsciiTrace > 1) {
	for (WP = CommonWords; WP; WP = WP->Next) {
	    fprintf(stderr, "Ignore: \"%s\"\n", WP->Word);
	}
    }
    FileName = "Internal Error";
    return 0;
}

/* InsertCommonWord -- add Root to the sorted list of common words.
 *
 * The list is searched in STRCMP order.  If Root is already there
 * nothing happens; otherwise a new entry holding a private copy of
 * Root, together with Flags, is linked in ahead of the first entry
 * that sorts after it (or at the end of the list).
 */
void
InsertCommonWord(Root, Flags)
    char *Root;
    unsigned int Flags;
{
    register t_WordList **Link = &CommonWords;
    t_WordList *Node;

    /* Link ends up pointing at the pointer the new entry must replace */
    while (*Link) {
	int Order = STRCMP((*Link)->Word, Root);

	if (Order == 0) return; /* already listed */
	if (Order > 0) break; /* goes before this one */
	Link = &(*Link)->Next;
    }

    Node = (t_WordList *) emalloc(sizeof(t_WordList));
    Node->Word = emalloc(strlen(Root) + 1);
    (void) strcpy(Node->Word, Root);
    Node->Flags = Flags;
    Node->Next = *Link;
    *Link = Node;
}
