/*
 *  This module contains the routines which build a finite state machine
 *  table based on a series of strings built in kgen.y.
 *
 *  To Do:
 *     add code to reopen stdin
 */

#include <stdio.h>
#ifdef PC
#include <alloc.h>
#else
#include <malloc.h>
#endif

#define EXTERN 
#include <stdio.h>
#include <ctype.h>
#include "kgen.h"

int debug = 0;       /* debug verbosity: 0 is off, larger values turn on
                        increasing levels of debug detail; set from the
                        "-d..." command line argument in main() */


/*
 *  KGEN entry point.
 *
 *  An optional first argument starting with "-d" selects the debug
 *  level: the level is the length of the argument minus one, so "-d"
 *  gives 1, "-dd" gives 2, and so on.  Any other first argument is
 *  reported on stderr and ignored.
 *
 *  The global tables are then initialized and the YACC parser is run.
 *  "END" is written to stdout once the parser returns, and the parser's
 *  status becomes the return value.
 */

main(argc, argv)
int argc;
char *argv[];
{
   int parse_status;

   if (argc > 1) {
      char *opt = argv[1];

      if (strncmp(opt, "-d", 2) != 0)
         fprintf(stderr, "Invalid command line argument ignored...\n");
      else
         debug = strlen(opt) - 1;
   }

   init();
   parse_status = yyparse();     /* hand control to the YACC parser */
   printf("END\n");

   return parse_status;
}



/*
 *  Initialize global variables for rule generator.
 */

init()
{
   reinit();

   memset(all_lex, (char)0, sizeof(all_lex));
   strcpy(all_lex, " ");                  /* fill zeroth position */
   memset(all_surf, (char)0, sizeof(all_surf));
   strcpy(all_surf, " ");
   allusd = 0;
   subsetusd = 1;
   memset(subset, (char)0, (unsigned)(MAXSUBSET*sizeof(struct subset)));
}


/*
 *  Reset the data that describes a single table.  Called once at
 *  start-up (from init) and again after each table has been written
 *  (from build_table).
 */

reinit()
{
   lhs = NULL;

   /* no columns yet */
   colusd = 0;
   memset(colseg, (char)0, sizeof(colseg));
   memset(default_to_fail, (char)0, sizeof(default_to_fail));

   /* state 0 is never used, so the next state to be created is number 2 */
   stateusd = 2;
   memset(state, (char)0, sizeof(state));

   /* segment sets are handed out starting at index 1 */
   segsetusd = 1;
   memset(segset, (char)0, (unsigned)(MAXSUBSET*sizeof(struct segset)));
}


/*
 *  Error reporter called by the YACC parser.
 *
 *  Writes the message and the part of the current input line held in
 *  curline to stderr, marks that position, then reads the remainder of
 *  the line from stdin and echoes it to stderr as well.
 */

yyerror(str)
char *str;
{
   int ch;

   fprintf(stderr, "\nError: %s\n %s<--...", str, curline);

   /* echo the unread rest of the offending line */
   for (ch = getchar(); ch != '\n' && ch != EOF; ch = getchar())
      putc(ch, stderr);

   putc('\n', stderr);
}



/*
 *  Build and print the state transition table for one rule.  Called
 *  once the rule has been completely parsed.
 *
 *  strs is the list of target strings for the rule.  The list is freed
 *  and all per-table globals are reinitialized before returning.
 */

build_table(strs)
struct lstr *strs;
{
   /* 1. turn {A,B,...} alternatives into separate strings */
   if (debug > 1)
      dumplstr("before alternate expansion", strs);
   expand_alternate_strs(strs);
   if (debug != 0)
      dumplstr("after alternate expansion", strs);

   /* 2. work out the column headers */
   create_columns(strs);

   /* 3. enter the explicit transitions */
   insert_strings(strs);
   if (debug >= 2)
      dump_table(1);

   /* 4. fill in everything that is still empty */
   add_default_transitions();
   if (debug >= 2)
      dump_table(1);
   add_back_loops();

   /* 5. merge duplicate rows and emit the final table */
   optimize();
   dump_table(0);

   freestr(strs);
   reinit();         /* get the globals ready for the next table */
}



/*
 *  Walk the string list and expand every string that contains
 *  alternate character sets.
 *
 *  The input sequence {A,B,C} arrives as three consecutive segset
 *  references of which all but the last carry the ALTERNATIVE bit.
 *  For each string the length of every such run is measured; all runs
 *  within one string must have the same length
 *    (e.g.  {A,B}xyz{C,D,E} is not a valid form)
 *  otherwise an error is printed and the program exits.
 *  A string that contains alternatives is handed to
 *  expand_alternate_str, and the copies that call inserts are skipped
 *  so that they are not examined again.
 */

expand_alternate_strs(strs)
struct lstr *strs;
{
   int width;       /* number of alternatives per set in this string */
   int run;         /* length of the set currently being measured */
   int skip;
   short *seg;

   for (; strs != NULL; strs = strs->nextstr) {
      width = 0;
      for (seg = strs->text; *seg; ++seg) {
         if (!(*seg & ALTERNATIVE))
            continue;

         /* count the flagged members plus the unflagged final one;
          * seg is left on that final member */
         run = 1;
         while (*seg & ALTERNATIVE) {
            ++seg;
            ++run;
         }

         if (width != 0 && run != width) {
            fprintf(stderr, "\n\nUnequal length alternate sets...\n");
            exit(1);
         }
         width = run;
      }

      if (width == 0)
         continue;

      expand_alternate_str(strs, width);
      for (skip = 1; skip < width; ++skip)    /* step over the new copies */
         strs = strs->nextstr;
   }
}



/*
 *  Expand one string that contains alternate character sets into
 *  cnt strings, one per alternative.
 *  For example the single string {A,B}xyz{C,D} becomes the two strings
 *  AxyzC and BxyzD.
 *
 *  cnt-1 copies of the string are first inserted directly after it in
 *  the list; then the i-th of the cnt strings is rewritten in place so
 *  that only the i-th member of every alternate set survives.
 */

expand_alternate_str(str, cnt)
struct lstr *str;   /* string containing alternates */
int cnt;            /* number of alternates present (2 in above example) */
{
   int which;       /* index of the alternative kept in the current string */
   int copies;
   short *in;
   short *out;

   for (copies = cnt - 1; copies > 0; --copies)
      lstradd(&(str->nextstr), str->text);

   for (which = 0; which < cnt; ++which, str = str->nextstr) {
      in = out = str->text;

      while (*in) {
         if (!(*in & ALTERNATIVE)) {
            *out++ = *in++;
            continue;
         }

         /* in is at the start of a set of cnt members: keep member
          * number 'which' (with its ALTERNATIVE bit cleared) and
          * step over the others */
         in += which;
         *out++ = *in++ & (~ALTERNATIVE);
         in += cnt - which - 1;
      }
      *out = 0;
   }
}



/*
 *  Determine the column headers of the table.
 *
 *  Every segset referenced by a target string first gets a column;
 *  the column set is then cleaned up in three passes (overlap
 *  splitting, removal of pairs from less specific columns, removal of
 *  columns that ended up empty).  At debug level 2 and above the
 *  headers are dumped after each step.
 */

create_columns(strs)
struct lstr *strs;
{
   short *seg;

   while (strs != NULL) {
      for (seg = strs->text; *seg; ++seg)
         add_column(*seg, 0);
      strs = strs->nextstr;
   }
   if (debug >= 2)
      dump_headers("after add_column");

   fix_overlapping_columns();
   if (debug >= 2)
      dump_headers("after fix overlapping columns");

   delete_less_specific();
   if (debug >= 2)
      dump_headers("after delete less specific");

   remove_unused_columns();
   if (debug >= 2)
      dump_headers("after remove unused");
}



/*
 *  Debugging aid: print every column header to stdout, preceded by a
 *  title line containing str.  For each column the lexical and surface
 *  segment names, the segset's modified flag and its list of pairs are
 *  shown.
 */
dump_headers(str)
char *str;
{
   int col;
   int set;

   printf("\nColumn headers(%s)\n", str);
   for (col = 0; col < colusd; ++col) {
      set = colseg[col];

      /* The two names are printed by separate calls on purpose:
       * presumably segment_name() reuses its result buffer, so the
       * first name must be written before the second is fetched. */
      printf("%6s /", segment_name(segset[set].lexseg));
      printf("%6s (%d) ", segment_name(segset[set].surfseg),
             segset[set].modified);
      dumppairs("", segset[set].segs);
   }
}





/*
 *  When two columns overlap and neither is a subset of the other, each
 *  pair they have in common is split off into a column of its own.
 *  See T54a in book.
 *
 *  Columns added here are appended to the column list and are
 *  themselves visited by the loops below, because colusd is re-read on
 *  every iteration.
 */

fix_overlapping_columns()
{
   int first, second;
   int commit;
   int newseg;
   short *pair;

   for (first = 0; first < colusd-1; ++first) {
      for (second = first+1; second < colusd; ++second) {

         /* nothing to do when one column contains the other */
         if (is_a(colseg[first], colseg[second]) ||
             is_a(colseg[second], colseg[first]))
            continue;

         for (pair = segset[colseg[first]].segs; *pair; ++pair) {
            if (!in_segset(*pair, colseg[second]))
               continue;

            /* shared pair: give it a single-pair column */
            newseg = create_segset(all_lex[*pair], all_surf[*pair]);
            commit = default_to_fail[first] | default_to_fail[second];
            add_column(newseg, commit);
         }
      }
   }
}



/*
 *  A pair that is assigned to several columns is deleted from all of
 *  them except the one(s) containing the fewest entries.
 *  See page 51 "mapping feasible pairs to columns".
 */

delete_less_specific()
{
   int pair;
   int col;
   int holders;        /* how many columns contain the current pair */
   int smallest;       /* size of the smallest such column */
   int size;
#define COLUMN_COUNT tstrlen(segset[colseg[col]].segs)

   for (pair = 1; pair < allusd; ++pair) {

      /* pass 1: count the columns holding the pair, find the smallest */
      holders = 0;
      smallest = 1000;
      for (col = 0; col < colusd; ++col) {
         if (!in_segset(pair, colseg[col]))
            continue;
         ++holders;
         size = COLUMN_COUNT;
         if (size < smallest)
            smallest = size;
      }

      if (holders <= 1)
         continue;

      /* pass 2: drop the pair from every larger column holding it */
      for (col = 0; col < colusd; ++col)
         if ((COLUMN_COUNT > smallest) &&
              in_segset(pair, colseg[col]))
            delete_pair_from_column(pair, col);
   }
}



/*
 *  Remove the given lex/surf pair from the given column.
 *
 *  The column's segset may also be referenced by one of the strings
 *  still to be inserted, so the deletion must not be done on the
 *  shared segset.  The first time a column is edited its segset is
 *  therefore cloned into a fresh slot (marked 'modified') and the
 *  column is pointed at the clone; later edits work on that clone
 *  directly.  The program exits if no slot is left.
 */

delete_pair_from_column(pair, col)
int pair;
int col;
{
   short *in;
   short *out;

   if (segset[colseg[col]].modified == 0) {
      /* still the shared original: switch to a private copy */
      if (segsetusd == MAXSUBSET) {
         fprintf(stderr, "Too many segments referenced...\n");
         exit(1);
      }
      segset[segsetusd] = segset[colseg[col]];
      segset[segsetusd].modified = 1;
      colseg[col] = segsetusd++;
   }

   /* squeeze every occurrence of the pair out of the list */
   out = segset[colseg[col]].segs;
   for (in = out; *in; ++in) {
      if (*in == pair)
         continue;
      *out++ = *in;
   }
   *out = 0;
}



/*
 *  Any columns which no longer have any pairs assigned to them 
 *  must be removed.
 */

remove_unused_columns()
{
   short dst;
   short src;

   dst = 0;
   for (src=0; src<colusd; ++src)
      if (tstrlen(segset[colseg[src]].segs) != 0)
         colseg[dst++] = colseg[src];

   colusd = dst;
}



/*
 *  Make sure there is a column for the given segment set.
 *
 *  seg may carry flag bits; only its index part (INDEXMASK) is used.
 *  Columns are compared by segset name.  If a column with the same name
 *  already exists, commit_flag is OR-ed into its default_to_fail flag;
 *  otherwise a new column is appended with commit_flag as its flag.
 *  The program exits if the column table is full.
 */

add_column(seg, commit_flag)
int seg;
int commit_flag;
{
   int col;
   char colname[32];

   seg &= INDEXMASK;

   for (col = 0; col < colusd; ++col) {
      /* take a copy of the first name before asking for the second */
      strcpy(colname, segset_name(colseg[col]));
      if (strcmp(colname, segset_name(seg)) != 0)
         continue;

      default_to_fail[col] |= commit_flag;
      return;
   }

   if (colusd == MAXCOL) {
      fprintf(stderr, "Too many columns required...\n");
      exit(1);
   }

   colseg[colusd] = seg;
   default_to_fail[colusd] = commit_flag;
   colusd++;
}



/*
 *  Enter every target string of the list into the state transition
 *  table.  Each string starts in state 1 with the commit flag clear.
 */

insert_strings(strs)
struct lstr *strs;
{
   struct lstr *cur;

   for (cur = strs; cur != NULL; cur = cur->nextstr)
      insert_string(cur->text, 1, 0);
}



/*
 *  Enter a target string into the state transition table, starting in
 *  row 'num'.
 *
 *  The string is consumed one segment at a time.  For each segment:
 *    - any leading starred (REPEAT) segments are turned into loops on
 *      the current state; a string that ends in such a segment is
 *      rejected with an error and the program exits;
 *    - the current state records the commit flag in force on entry;
 *    - a segment carrying COMMIT turns the commit flag on for the rest
 *      of the string;
 *    - the target of the transition is state 1 or FAIL for the last
 *      segment (depending on the commit flag), otherwise whatever
 *      calc_next_state decides;
 *    - the target is stored in every column matching the segment,
 *      except that default_to_fail columns are only written for COMMIT
 *      segments.
 *  Processing then continues with the next segment in the target state.
 */

insert_string(str, num, commit_flag)
short *str;              /* string of segsets to insert */
int num;                 /* current state number */
int commit_flag;
{
   int col;
   int transition;

   for (;;) {
      while (*str & REPEAT) {
         add_star_pair(*str, num);
         ++str;
      }
      if (*str == 0) {
         fprintf(stderr, "Translator cannot handle X* at end of pattern...\n");
         exit(1);
      }

      state[num].commit = commit_flag;

      if (*str & COMMIT)
         commit_flag = 1;

      if (str[1] != 0)
         transition = calc_next_state(str, num);
      else if (commit_flag)
         transition = 1;
      else
         transition = FAIL;

      if (debug >= 2) {
         dumpstr("insert_string", str);
         printf("num = %d, commit_flag = %d, transition = %d\n",
                  num, commit_flag, transition);

      }

      for (col = 0; col < colusd; ++col) {
         if (!is_a(colseg[col], str[0]))
            continue;
         if (default_to_fail[col] && !(str[0] & COMMIT))
            continue;

         if (debug >= 2)
            printf("set %s\n", segset_name(colseg[col]));
         state[num].next[col] = transition;
      }

      if (str[1] == 0)
         break;

      /* move on to the next segment, in the state just targeted */
      ++str;
      num = transition;
   }
}



/*
 *  Decide which state the current state 'num' should move to on the
 *  segment str[0].
 *
 *  If all the columns matching the current segment have a transition
 *  defined and it is the same transition for all of them, that
 *  transition is returned.  Otherwise a new state is created and its
 *  state number is returned.
 *
 *  Only columns that insert_string will actually write are examined:
 *  columns flagged default_to_fail are skipped unless the segment
 *  carries COMMIT.  Transitions back to 'num' itself are ignored while
 *  looking for the shared target.  The shared target is also rejected
 *  when some column that does NOT match the segment already leads to
 *  it.
 */

calc_next_state(str, num)
short *str;
int num;              /* current state */
{
   int col;
   int transition = 0;   /* candidate next state; 0 means none found */

   for (col=0; col<colusd; ++col)
      if (is_a(colseg[col], str[0]) &&
           ( !default_to_fail[col] || (str[0] & COMMIT) ) ) {
         if (debug >= 3) 
            printf("calc next state: %s, trans = %d, cur =%d\n",
                   segset_name(colseg[col]), transition, state[num].next[col]);
         if (transition == 0) {
            /* No candidate yet: adopt this column's existing target,
             * unless the entry is empty or loops back to 'num'.
             */
            if (state[num].next[col] != 0 && state[num].next[col] != num)
               transition = state[num].next[col];
         }
         else {
            /*  If we already have a potential next state and we
             *  find another matching column which goes to either
             *  no state at all or a different state, then we need
             *  to create a new state.
             */
            if (state[num].next[col] != transition && 
                   state[num].next[col] != num) {
               transition = 0;
               break;
            }
         }
      }

   /* The candidate cannot be reused if a column that does not match
    * this segment already leads to it.
    */
   for (col=0; col<colusd && transition; ++col)
      if (!is_a(colseg[col], str[0]) && state[num].next[col] == transition)
         transition = 0;

   if (transition == 0)
      transition = create_state(num, str[0]);

   return transition;
}



/*
 *  Allocate the next free state and return its number.
 *
 *  The new state's context is the context of state 'oldnum' with 'seg'
 *  appended, followed by a zero terminator.  The program exits if the
 *  state table is already full.
 */

create_state(oldnum, seg)
int oldnum;       /* old state number */
int seg;
{
   short *from;
   short *to;

   if (stateusd >= MAXSTATE) {
      fprintf(stderr, "Too many states needed...\n");
      exit(1);
   }

   from = state[oldnum].context;
   to = state[stateusd].context;
   for (; *from; ++from, ++to)
      *to = *from;
   to[0] = seg;
   to[1] = 0;

   return stateusd++;
}




/*
 *  Handle a starred pair: every column matching it that has no
 *  transition yet in the current state is made to loop back to that
 *  same state.  Entries that are already filled in are left alone.
 */

add_star_pair(seg, num)
int seg;
int num;          /* current state */
{
   int col;

   for (col = 0; col < colusd; ++col) {
      if (state[num].next[col] != 0)
         continue;
      if (is_a(colseg[col], seg))
         state[num].next[col] = num;
   }
}



/*
 *  Give a default transition to every table entry that no string has
 *  filled in explicitly.
 *
 *  In a committed state the default is always FAIL.
 *  In an uncommitted state the back loop for the entry is computed
 *  first, then:
 *    - for a default_to_fail column the back loop is kept only if it
 *      leads to a committed state, otherwise the entry becomes FAIL;
 *    - for any other column the back loop is kept only if it leads to
 *      an uncommitted state, otherwise the entry returns to state 1.
 */

add_default_transitions()
{
   int row;
   int col;
   int target;

   for (row = 1; row < stateusd; ++row) {
      for (col = 0; col < colusd; ++col) {
         if (state[row].next[col] != 0)
            continue;               /* explicitly filled in */

         if (state[row].commit != 0)
            target = FAIL;
         else {
            target = compute_back_loop(row, col);
            if (default_to_fail[col]) {
               if (!state[target].commit)
                  target = FAIL;
            }
            else if (state[target].commit)
               target = 1;
         }

         state[row].next[col] = target;
      }
   }
}




/*
 *  Revisit every entry that currently goes back to state 1 and see
 *  whether there is a later state whose leading context matches the
 *  trailing context of the current state.  If compute_back_loop finds
 *  one and it is not a committed state, the entry is redirected to it.
 */

add_back_loops()
{
   int row;
   int col;
   int target;

   for (row = 1; row < stateusd; ++row) {
      for (col = 0; col < colusd; ++col) {
         if (state[row].next[col] != 1)
            continue;

         target = compute_back_loop(row, col);
         if (state[target].commit)
            continue;            /* never loop back into a committed state */
         state[row].next[col] = target;
      }
   }
}



/*
 *  Determine which state has the best context that is compatible with
 *  the tail of the current state's context with the current column's
 *  segset appended.
 *
 *  Candidates are states 2 and up.  A candidate is preferred when the
 *  segset at the end of its context is smaller (tstrlen of that
 *  segset's pair list); between candidates whose final segsets are the
 *  same size, the one with the longer context wins.  Candidates whose
 *  final segset has more than 100 entries are never chosen, because
 *  most_specific starts at 100.
 *
 *  Returns the number of the chosen state, or 1 if no candidate
 *  matches.
 */

compute_back_loop(row, col)
int row;
int col;
{
   short tmp[MAXSTR];        /* context of 'row' plus the column's segset */
   int tmplen;
   int specificity;          /* size of the candidate's final segset */
   int most_specific = 100;  /* smallest such size accepted so far */
   int longest = 0;          /* context length of the accepted candidate */
   int longrow = 1;          /* accepted candidate; defaults to state 1 */
   int matchlen;             /* context length of the current candidate */
   int row2;
   short *src = state[row].context;

   /*  Append characters for current column to context.
    */
   for (tmplen=0; *src; ++src, ++tmplen)
      tmp[tmplen] = *src;
   tmp[tmplen++] = colseg[col];
   tmp[tmplen] = 0;

   for (row2=2; row2<stateusd; ++row2) {
      /* measure the candidate's context; src ends on its terminator */
      src = state[row2].context;
      for (matchlen=0; *src; ++matchlen, ++src) {}

      /*  Ignore any row which has context longer than the current
       *  string.  
       */
      if (matchlen > tmplen)
         continue;

      /* Step back to the last context element.
       * NOTE(review): this relies on the candidate's context being
       * non-empty -- confirm every state >= 2 has at least one element.
       */
      --src;
      specificity = tstrlen(segset[*src & INDEXMASK].segs);
      if (specificity > most_specific ||
          (specificity == most_specific && matchlen <= longest))
         continue;

      /*  We don't get here unless the candidate is more specific than
       *  the previously matched context, or equally specific and
       *  longer.  So if this context matches we use it.
       */
      if (string_match(tmp+(tmplen-matchlen), state[row2].context)) {
         most_specific = specificity;
         longest = matchlen;
         longrow = row2;
      }
   }

   return longrow;   /* return index of the best context matched */
                     /* state 1 always matches (with a length of 0) */
}



/*
 *  Determine whether the current string matches a context.
 *  Returns 1 when every element of cur is_a the element of ctx at the
 *  same position, 0 as soon as one is not.
 *  Cur and ctx are guaranteed by the caller to be the same length;
 *  only cur's terminator is tested.
 */

string_match(cur, ctx)
short *cur;
short *ctx;
{
   for (; *cur; ++cur, ++ctx)
      if (!is_a(*cur, *ctx))
         return 0;

   return 1;
}



/*
 *  Write the header lines to stdout: ALPHABET, NULL, ANY, BOUNDARY and
 *  one SUBSET line per defined subset, followed by the "defaults"
 *  rules.
 *
 *  The alphabet is the set of distinct characters found in the lexical
 *  alphabet, the surface alphabet and all subsets, in order of first
 *  appearance (add_uniq leaves out '0' and '#').
 */
dump_kimmo_header()
{
   char alpha[2*MAXALL];
   char *ch;
   int sub;

   /* add_uniq is given a zero-filled buffer to append into */
   memset(alpha, 0, sizeof(alpha));
   add_uniq(alpha, all_lex);
   add_uniq(alpha, all_surf);
   for (sub = 1; sub < subsetusd; ++sub)
      add_uniq(alpha, subset[sub].subs);

   printf("ALPHABET\n   ");
   for (ch = alpha; *ch != 0; ++ch)
      printf("%c ", *ch);
   printf("\n");

   printf("NULL 0\n");
   printf("ANY @\n");
   printf("BOUNDARY #\n");

   for (sub = 1; sub < subsetusd; ++sub) {
      printf("SUBSET %s   ", subset[sub].subname);
      for (ch = subset[sub].subs; *ch != 0; ++ch)
         printf("%c ", *ch);
      printf("\n");
   }

   dump_defaults();
}



/*
 *  Write the "defaults" rules to stdout.
 *
 *  The lex/surf pairs stored from position 1 onward in all_lex and
 *  all_surf are emitted in groups of at most 30.  Each group becomes a
 *  one-state rule with one column per pair plus a final '@' column,
 *  and every entry of its single row is 1.
 */
dump_defaults()
{
   int total;      /* length of all_lex, including the unused position 0 */
   int start;      /* index of the first pair of the current group */
   int width;      /* number of pairs in the current group */
   int k;

   total = strlen(all_lex);
   for (start = 1; start < total; start += 30) {
      width = total - start;
      if (width > 30)
         width = 30;
      printf("\nRULE \"defaults\" 1 %d\n", width+1);

      printf("    ");
      for (k = 0; k < width; ++k)
         printf("%c ", all_lex[start+k]);
      printf("@\n");

      printf("    ");
      for (k = 0; k < width; ++k)
         printf("%c ", all_surf[start+k]);
      printf("@\n");

      printf(" 1: ");
      for (k = 0; k <= width; ++k)
         printf("1 ");
      printf("\n");
   }
}


/*
 *  Append to the string 'alpha' every character of 'str' that is not
 *  already in it.  The characters '0' and '#' are never added.
 *
 *  alpha must be a NUL-terminated string in a buffer with room for the
 *  added characters; the result is always NUL-terminated, so the buffer
 *  does not have to be zero-filled beforehand.
 *  Always returns 0.
 */
int add_uniq(alpha, str)
char *alpha;
char *str;
{
   char *p;

   for (; *str; ++str) {
      if (*str == '0' || *str == '#')
         continue;

      /* look for the character; p stops on it or on the terminator */
      for (p = alpha; *p && (*p != *str); ++p) {}

      if (*p == 0) {
         p[0] = *str;
         p[1] = 0;       /* re-terminate rather than rely on the caller */
      }
   }

   return 0;
}



/*
 *  Write 'text' to stdout right-justified in a field of 'width'
 *  characters.  Text longer than the field is written in full.
 *  (Done by hand because turboC doesn't seem to like %*s.)
 */
static void print_right_justified(text, width)
char *text;
int width;
{
   int pad = width - (int)strlen(text);

   while (pad-- > 0)
      putchar(' ');
   fputs(text, stdout);
}


/*
 *  Dump contents of state transition table to stdout.
 *
 *  The rule name is taken from curline (trailing whitespace is trimmed
 *  in place and a leading "RULE" is dropped), followed by the number of
 *  states and the number of columns including the final '@' column.
 *  Two header lines give the lexical and surface segment names of each
 *  column; then one line is written per state.  A state number is
 *  followed by '.' for a committed state and ':' otherwise.
 *
 *  In a normal dump (dbg == 0) transitions that are not positive are
 *  printed as 0; in a debug dump the stored values are printed as they
 *  are.  When the global debug flag is set each row is followed by the
 *  state's context.
 */

dump_table(dbg)
int dbg;          /* true if this is a debug dump request */
{
   int row;
   int col;
   int width;
   short *p;

   /* trim trailing whitespace
    */
   while ((curlineind > 1) &&
          isspace((unsigned char)curline[curlineind-1])) {
      curline[curlineind-1] = 0;
      --curlineind;
   }

   printf("\nRULE \"%s\"", 
            strncmp(curline, "RULE", 4) == 0 ? curline+4 : curline);
   printf(" %d %d\n", stateusd-1, colusd+1);
   printf("\n    ");

   /* The column width is worked out before the name is fetched, because
    * column_width() itself calls segment_name().
    */
   for (col=0; col<colusd; ++col) {
      width = column_width(col);
      print_right_justified(segment_name(segset[colseg[col]].lexseg), width);
   }
   printf("   @\n    ");

   for (col=0; col<colusd; ++col) {
      width = column_width(col);
      print_right_justified(segment_name(segset[colseg[col]].surfseg), width);
   }
   printf("   @\n");

   for (row=1; row<stateusd; ++row) {
      printf("%2d%c ", row, state[row].commit ? '.' : ':');
      for (col=0; col<colusd; ++col)
         printf("%*d", column_width(col), 
            state[row].next[col] > 0 || dbg? state[row].next[col] : 0);

      /* entry for the '@' column */
      printf("   %d", 1-state[row].commit);

      if (debug) {
         printf("   ");
         for (p=state[row].context; *p; ++p)
            printf("%s", segset_name(*p));
      }
      printf("\n");
   }
   printf("\n");
}


/*
 *  Return the print width of the given column: two more than the
 *  longer of its lexical and surface segment names, and never less
 *  than 3.
 */

column_width(col)
int col;
{
   int width;
   int surf_width;

   /* each length is taken before segment_name is called again */
   width = strlen( segment_name(segset[colseg[col]].lexseg) ) + 2;
   surf_width = strlen( segment_name(segset[colseg[col]].surfseg) ) + 2;

   if (width < surf_width)
      width = surf_width;

   return (width < 3) ? 3 : width;
}



/*
 *  Merge duplicate states.
 *
 *  Rows are compared pairwise, highest numbered first.  Whenever a row
 *  equals a lower numbered one the higher row is removed in favor of
 *  the lower one (remove_row renumbers the table), and the scan starts
 *  over from the last row because the removal may have made further
 *  rows equal.
 *
 *  Only the rows in use, 1 .. stateusd-1, are examined.  Row stateusd
 *  is not a state: it may lie beyond the end of the table when the
 *  table is full, and treating it as a duplicate would discard a real
 *  state.
 */
optimize()
{
   int i;
   int j;

   i = stateusd - 1;
   while (i > 1) {
      /* look for the highest row below i that is identical to row i */
      for (j=i-1; j>0; --j)
         if (equal_rows(i, j))
            break;

      if (j > 0) {
         remove_row(j, i);
         i = stateusd - 1;     /* table changed: rescan from the top */
      }
      else
         --i;
   }
}


/*
 *  Return 1 if two states are interchangeable, i.e. they have the same
 *  commit flag and the same transition in every column; 0 otherwise.
 */
equal_rows(row1, row2)
int row1;
int row2;
{
   int col;

   if (state[row1].commit != state[row2].commit)
      return 0;

   col = 0;
   while (col < colusd) {
      if (state[row1].next[col] != state[row2].next[col])
         return 0;
      ++col;
   }

   return 1;
}


/*
 *  Remove state 'delete_row' from the table, replacing it by state
 *  'keep_row'.
 *
 *  Every transition to delete_row is redirected to keep_row and every
 *  transition to a higher numbered state is reduced by one, since those
 *  states move down one row.  keep_row is stored as given, so it must
 *  be lower than delete_row.
 *
 *  Only the rows in use (1 .. stateusd-1) are read or written; the
 *  table is never accessed at or beyond row stateusd, which may lie
 *  outside the array when the table is full.  The row vacated at the
 *  end is cleared so that unused rows stay zeroed.
 */
remove_row(keep_row, delete_row)
int keep_row;
int delete_row;
{
   int row;
   int col;

   /* renumber the transitions of every state in use */
   for (row=1; row<stateusd; ++row)
      for (col=0; col<colusd; ++col)
         if (state[row].next[col] > delete_row)
            --state[row].next[col];
         else if (state[row].next[col] == delete_row)
            state[row].next[col] = keep_row;

   /* close the gap left by the deleted row */
   for (row=delete_row; row<stateusd-1; ++row)
      memcpy(&state[row], &state[row+1], sizeof(struct state));

   /* the last row in use is now vacant */
   if (delete_row < stateusd)
      memset(&state[stateusd-1], 0, sizeof(struct state));

   --stateusd;
}
