%{

/* WARNING:  
 *   The YACC program outputs the following line at the beginning
 *   of the file y.tab.c (ytab.c under DOS)
 *
 *       extern char *malloc(), *realloc();
 *
 *   This line must be removed before compiling using Turboc or
 *   terrible bizarre crashes happen.
 */


/*
 *  This is the YACC syntax for the PC kimmo table generator program.
 *  
 *  04/18/91  Version 0.2
 *
 *            Do not strip blanks from rule description field.
 *            Allow multiple underscores.
 *            Change DEFAULT keyword to PAIRS.
 *            Allow optional new line after PAIRS and RULE.
 *            Allow optional END keyword which terminates parse.
 */

#ifdef PC
#include <alloc.h>
#endif

#include <stdio.h>
#include <ctype.h>

#define EXTERN extern
#include "kgen.h"


/* Defining YYDEBUG non-zero asks yacc to compile its tracing code into the
 * generated parser; the trace is only produced while yydebug is non-zero. */
#define YYDEBUG 1
/* EXTERN expands to "extern" in this file (see the #define above), but the
 * initializer still makes this line the definition of yydebug.
 * Tracing starts off; the ruleset action turns it on when debug >= 5. */
EXTERN int yydebug = 0;

int subset_section = 1;    /* true whenever an undefined subset name should
                              be treated as the start of a definition */

char segstr[128];          /* temporary holder for strings of segments */
char savesegstr[128];      /* copy of the first line of a PAIRS entry, kept
                              while the second line is collected in segstr */
int segstrusd = 0;         /* next available location in segstr */

int altusd = 0;            /* index of alternate set we are filling */

/* Limit enforced by add_to_alternate() on the number of entries in one
 * {}-list.  Note that the array below is dimensioned with MAXSTR, not
 * MAXALT -- NOTE(review): confirm MAXSTR >= MAXALT in kgen.h. */
#define MAXALT 40

/* Two sets: one rule element can carry at most a lexical and a surface
 * {}-list.  Both are cleared again by expand_alternate(). */
struct altset {               /* zero terminated strings of segments/SUBSETs */
   short alts[MAXSTR];
} altset[2];

int first_rule = 1;        /* true until first RULE seen */

%}

/* Semantic value of a grammar symbol: either a plain integer (a segment
 * character, a subset index or an alternate-set index) or a pointer to a
 * list of segment strings. */
%union {
   int ival;
   struct lstr *lval;
}

/* Non-terminals that yield an integer value */
%type <ival> rule rule_keyword lhs lhs_pair segment alternates
%type <ival> segment_or_subset oneseg alternate 

/* Non-terminals that yield a struct lstr list */
%type <lval> rhs pattern_list pattern_element alternative_list
%type <lval> segment_pair rhs_elm


/* Tokens returned by yylex().  None of them has a declared <type>; the
 * actions read the token value directly from yylval.ival. */
%token SEGMENT
%token SUBSET_KEYWORD
%token SUBSET_NAME
%token PAIRS
%token RULE

%%

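/*
 *  Example input:  (blanks are ignored; every rule is introduced by the
 *  RULE keyword, and the exclusion operator is written /<= )
 *
 *      SUBSET V      a e i o u
 *      SUBSET Vhigh  i u
 *      PAIRS  tcdkk
 *             tcdkg
 *      RULE c:t => _ i:i
 *      RULE c:t <= x:X (z [ f | :g | h: ]) _
 *      RULE c:t /<= x _
 *      RULE c:t <=> _ m '
 */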

/*  Start symbol.  The input consists of optional leading newlines followed
 *  by the subset definitions, the PAIRS lists and the rules, in that order.
 *  The mid-rule action runs after the leading newlines have been parsed and
 *  switches on yacc tracing when the global debug level is 5 or more.
 */
ruleset: opt_eol                     { if (debug >= 5) yydebug = 1; }
         subsets
         pairs
         rules ;

/*  Zero or more subset definitions of the form
 *      SUBSET name  seg seg seg ...
 *
 *  The subset entry itself is allocated by the lexer (check_subset_name)
 *  when it meets the new name, so the entry being defined is always
 *  subset[subsetusd-1].
 *
 *  subset_section is cleared once the name has been seen so that capital
 *  letters among the members are not taken as new definitions, and it is
 *  set again when the definition is complete.
 */
subsets :
        | subsets subset
        ;
subset: SUBSET_KEYWORD opt_eol SUBSET_NAME  { subset_section = 0; }
             segment_string eol      { if (strlen(segstr) >= MAXMEMBERS) {
                                          fprintf(stderr, 
                                              "SUBSET too long...\n");
                                          exit(1);
				       }
                                       /* members collected in segstr become
                                          the body of the newest subset */
                                       strcpy(subset[subsetusd-1].subs,
                                              segstr);
                                       if (debug > 1)
                                          printf("%d [%s] %s\n", 
                                                 subsetusd-1,
                                                 subset[subsetusd-1].subname,
                                                 segstr);
                                       subset_section = 1;
                                     }
      ;

/*  eol is one or more consecutive newlines; opt_eol additionally allows
 *  there to be none at all.
 */
eol : '\n'
    | eol '\n'
    ;
opt_eol :
        | eol
        ;

/*  A run of one or more segments, collected character by character into
 *  the global segstr.  The first segment of a run restarts the buffer
 *  (segstrusd = 0); add_to_segstr() keeps it NUL terminated.
 */
segment_string: oneseg               { segstrusd = 0;
                                       add_to_segstr($1); }
        | segment_string oneseg      { add_to_segstr($2); }
        ;
/*  The token value is read from yylval because SEGMENT has no declared
 *  <type>.  NOTE(review): this assumes the parser reduces here before it
 *  fetches a lookahead token (which would overwrite yylval) -- confirm
 *  for the yacc in use.
 */
oneseg: SEGMENT                      { $$ = yylval.ival; }
      ;


/*-------------------------------------------------------------*/

/*  Zero or more PAIRS entries.  The empty alternative is reduced when the
 *  pairs section is entered; its action clears subset_section so that from
 *  here on unknown names are no longer treated as new subset definitions.
 *
 *  A PAIRS entry is two lines of segments.  The first line must end in
 *  exactly one newline; it is saved in savesegstr because segstr is reused
 *  for the second line.  Both lines are then handed to add_to_defaults(),
 *  first line as the lexical characters, second as the surface characters.
 */
pairs:                              { subset_section = 0; }
     |  pairs pairlist
     ;
pairlist: PAIRS  opt_eol
                 segment_string '\n'   { strcpy(savesegstr, segstr); }
                 segment_string eol    { add_to_defaults(savesegstr, segstr); }
       ;


/*-------------------------------------------------------------*/

/*  One or more rules.  A rule is either
 *      RULE lhs rhs <newline>     -- handed to build_table() as the rhs list
 *  or a bare newline (blank line), which is accepted and ignored.
 */
rules : rule
      | rules rule
      ;

rule: rule_keyword
      lhs rhs '\n'                  { build_table($3); }
    | '\n'                          { $$ = 0;          }
    ;
/*  The RULE keyword, optionally followed by newlines.  The first time it
 *  is seen dump_kimmo_header() is called, once only.
 */
rule_keyword: RULE opt_eol          { if (first_rule) {
                                         dump_kimmo_header();
                                         first_rule = 0;
				      }
                                      $$ = 0;
                                    }
            ;

/*  Left hand side of a rule: a lexical:surface pair followed by the rule
 *  operator.  Note that inside the actions "lhs" is the global list
 *  variable set by lhs_pair (declared outside this file), not the
 *  non-terminal.
 *
 *      =>     optional rule     -> add_opt_lhs()
 *      <=     obligatory rule   -> add_obl_lhs()
 *      /<=    the pair is left untouched; only $$ is set
 *      <=>    both: lstradd() is called with the current text, then the
 *             optional processing is applied to lhs->nextstr and the
 *             obligatory processing to lhs itself.
 *             NOTE(review): presumably lstradd() pushes a copy on the
 *             front of the list so that nextstr is the original -- confirm.
 */
lhs: lhs_pair '=' '>'               { add_opt_lhs(lhs); }
   | lhs_pair '<' '='               { add_obl_lhs(lhs); }
   | lhs_pair '/' '<' '='           { $$ = 1; }
   | lhs_pair '<' '=' '>'           { lstradd(&lhs, lhs->text);
                                      add_opt_lhs(lhs->nextstr);
                                      add_obl_lhs(lhs); }
   ;

/*  The pair itself.  Either side may be a plain segment/subset or a
 *  {}-list; any combination involving a {}-list goes through
 *  expand_alternate().  The result is stored in the global lhs.
 */
lhs_pair : segment ':' segment      { lhs = crtpair($1, $3); }
         | alternate ':' segment    { lhs = expand_alternate(1,  0, 0, $3); }
         | segment ':' alternate    { lhs = expand_alternate(0, $1, 1, 0); }
         | alternate ':' alternate  { lhs = expand_alternate(1,  0, 1, 1); }
         ;


/*  Right hand side: one or more environments separated by '|'.
 *  The environments are combined with lstrapp().
 */
rhs : rhs_elm                       { $$ = $1; }
    | rhs '|' rhs_elm               { $$ = lstrapp($1, $3); }
    ;

/*  A single environment: left context, focus marker, right context, where
 *  either context (but not both) may be missing.  The focus marker is
 *  replaced by the global lhs list, and the pieces are joined with
 *  lstralt(..., 0) in left-to-right order.
 */
rhs_elm :
      pattern_list under               { $$ = lstralt($1, lhs, 0); }
    | pattern_list under pattern_list  { $$ = lstralt($1, 
                                                    lstralt(lhs, $3, 0),
                                                    0); }
    | under pattern_list               { $$ = lstralt(lhs, $2, 0); }
    ;

/*  The focus marker: one or more underscores.  */
under: '_'
     | under '_'
     ;

/*  A context: one or more pattern elements in sequence, joined with
 *  lstralt(..., 1).
 */
pattern_list : pattern_element                    { $$ = $1; }
             | pattern_list pattern_element       { $$ = lstralt($1, $2, 1); }
             ;

/*  One element of a context:
 *      pair             a single lexical:surface pair
 *      pair *           the same with the REPEAT flag set on its first entry
 *      [ a | b | ... ]  a choice between two or more pattern lists
 *      ( a )            an optional pattern list (lstropt)
 */
pattern_element : segment_pair                    { $$ = $1; }       
                | segment_pair '*'                { $1->text[0] |= REPEAT; 
                                                    $$ = $1;
                                                  }
                | '[' pattern_list 
                      alternative_list ']'        { $$ = lstrapp($2, $3); }
                | '(' pattern_list ')'            { $$ = lstropt($2); }
                ;

/*  The second and following choices inside [ ], each introduced by '|'.
 *  They are combined with lstrapp().
 */
alternative_list : '|' pattern_list               { $$ = $2; }
                 | alternative_list 
                           '|' pattern_list       { $$ = lstrapp($1, $3); }
                 ;

/*  A lexical:surface pair inside a context.  Short forms:
 *      x:      surface side omitted  -> x:@
 *      :x      lexical side omitted  -> @:x
 *      x       same on both sides    -> x:x
 *  '@' is the character matches() treats as "anything".
 *  Forms involving a {}-list are built by expand_alternate(); a {}-list on
 *  its own is used for both sides.
 */
segment_pair: segment ':' segment     { $$ = crtpair($1, $3); }
            | segment ':'             { $$ = crtpair($1, '@'); }
            | ':' segment             { $$ = crtpair('@', $2); }
            | segment                 { $$ = crtpair($1, $1); }
            | alternate ':' segment   { $$ = expand_alternate(1,  0, 0, $3); }
            | segment ':' alternate   { $$ = expand_alternate(0, $1, 1, 0); }
            | alternate ':' alternate { $$ = expand_alternate(1,  0, 1, 1); }
            | alternate ':'           { $$ = expand_alternate(1,  0, 0, '@');}
            | alternate               { $$ = expand_alternate(1,  0, 1, 0); }
            ;

/*  A single segment character or a reference to a defined subset.  The
 *  value comes straight from yylval since the tokens have no declared
 *  <type>.  NOTE(review): assumes no lookahead token has been read when
 *  this reduction runs -- confirm for the yacc in use.
 */
segment: SEGMENT          { $$ = yylval.ival; }
       | SUBSET_NAME      { $$ = yylval.ival; }
       ;

/*  A {}-list:  { x , y , ... }  with at least two entries.
 *  Every entry is appended to altset[altusd] by add_to_alternate().  When
 *  the closing brace is seen the value of the list is the index of the set
 *  just filled and altusd moves on to the next set.  expand_alternate()
 *  puts altusd back to 0.
 */
alternate: '{' segment_or_subset       { add_to_alternate($2); }
               alternates '}'          { $$ = altusd++; }
              ;
alternates: ',' segment_or_subset             { add_to_alternate($2); }
          | alternates ',' segment_or_subset  { add_to_alternate($3); }
          ;

/*  One entry of a {}-list; value taken from yylval as for "segment".  */
segment_or_subset: SEGMENT            { $$ = yylval.ival; }
                 | SUBSET_NAME        { $$ = yylval.ival; }
                 ;

%%

/*
 *  Yylex is called by Yacc whenever it is ready to scan the next token
 *  from the input stream (stdin).
 *  We return:
 *
 *      0            on end of file
 *      SEGMENT      a lexical or surface segment
 *      SUBSET_NAME  a subset name
 *      RULE, PAIRS, SUBSET_KEYWORD   the corresponding keyword
 *      c            an ascii delimiter ( _ < > ... )
 *
 *  yylval.ival is set to the character read; for tokens that start with a
 *  capital letter get_subset_name() decides the token type and may replace
 *  the value.
 *
 *  The text of the current input line is accumulated in curline (as far
 *  as it fits).  It is restarted on the call that follows a newline token.
 *
 *  Comments:
 *      ;  the rest of the line is discarded, the newline is still
 *         returned as a token
 *      !  the rest of the line, including the newline, is copied to
 *         stdout and scanning continues on the next line
 */

int
yylex()
{
   int c;
   static int lastc = 0;      /* character returned by the previous call */

   if (lastc == '\n') {
      curlineind = 0;
      curline[0] = 0;
   }

   /*  Skip leading blanks
    */
   while ((c = getchar()) == ' ')
      if (curlineind+1 < sizeof(curline)) {
         curline[curlineind++] = ' ';
         curline[curlineind] = 0;
      }
   if (c == EOF)
      return 0;

   while (c == ';' || c == '!' || c == ' ') {
      /*  Skip comment characters
       */
      if (c == ';') {
         do {
            c = getchar();
            if (c == EOF)
               return 0;
         } while (c != '\n');
         break;
      }
      else if (c == '!') {
         do {
            c = getchar();
            if (c == EOF)        /* test before echoing: EOF is not a */
               return 0;         /* character and must not be printed */
            putchar(c);
         } while (c != '\n');
         c = getchar();
      }
      else
         c = getchar();
   }

   /*  The loop above may have run into the end of the input (for example
    *  when a '!' line is the last line of the file).  Without this test
    *  EOF would be handed to the parser as a SEGMENT.
    */
   if (c == EOF)
      return 0;

   lastc = c;
   yylval.ival = c;
   if (curlineind+1 < sizeof(curline)) {
      curline[curlineind++] = c;
      curline[curlineind] = 0;
   }

   if ( c >= 'A' && c <= 'Z')
      return get_subset_name(c);
   else
      if ( index("=<>/:|_*[]{}(),\n", c) )   /* index() yields an int flag */
         return(c);

   return(SEGMENT);
}


/*
 *  Report whether the character c occurs in the NUL terminated string str.
 *  Returns 1 if it does and 0 if it does not.  The terminating NUL itself
 *  is never matched.
 *
 *  Note: this is a yes/no test, unlike the BSD library routine of the same
 *  name which returns a pointer into the string.
 */

int
index(str, c)
char *str;
int c;
{
   for (; *str; ++str)
      if (*str == c)
         return 1;

   return 0;      /* an int result: 0, not the pointer constant NULL */
}



/*
 *   At this point a token starting with a capital letter appears in the
 *   input.  Gather the first character and all the alphabetic characters
 *   that follow it; the first non-alphabetic character is pushed back.
 *   The characters are also appended to curline.
 *
 *   The keywords are recognized first:
 *       SUBSET -> SUBSET_KEYWORD,  PAIRS -> PAIRS,  RULE -> RULE,
 *       END    -> 0  (the parser sees end of input)
 *
 *   Anything else is passed to check_subset_name(), which applies the
 *   "already defined / not yet defined" checks.  Its result is stored in
 *   yylval.ival; the token type is SUBSET_NAME when the SUBSET bit is set
 *   in that result and SEGMENT otherwise.
 *
 *   A name that does not fit in MAXSTR characters is a fatal error.
 */

int
get_subset_name(c)
char c;              /* first character in token */
{
   char name[MAXSTR];
   int nameind = 0;
   int ch;           /* an int, so that EOF is not confused with a character
                        and isalpha() never sees a negative char value */

   memset(name, 0, MAXSTR);
   name[nameind++] = c;

   while ( isalpha(ch = getchar()) ) {
      if (curlineind+1 < sizeof(curline)) {
         curline[curlineind++] = ch;
         curline[curlineind] = 0;
      }

      if (nameind < MAXSTR-1)
         name[nameind++] = ch;
      else {
         fprintf(stderr, "\n\nSubset name (%s) too long...\n", name);
         exit(1);
      }
   }
   name[nameind] = 0;
   ungetc(ch, stdin);   /* push back character that is not part of name
                           (does nothing if we stopped on end of file) */

   if (strcmp(name, "SUBSET") == 0)
      return SUBSET_KEYWORD;

   if (strcmp(name, "PAIRS") == 0)
      return PAIRS;

   if (strcmp(name, "RULE") == 0)
      return RULE;

   if (strcmp(name, "END") == 0)
      return 0;

   yylval.ival = check_subset_name(name);
   if (debug >= 3)
      printf("check_subset_name(%s) = %d\n", name, yylval.ival);

   return (yylval.ival & SUBSET) ? SUBSET_NAME : SEGMENT;
}


/*
 *   Look a name up in the subset table and decide what it stands for.
 *
 *                     in subset section        not in subset section
 *   name unknown      new entry is created,    one-letter name: the letter
 *                     index|SUBSET returned    itself is returned;
 *                                              otherwise fatal error
 *   name known        fatal error (duplicate   index|SUBSET returned
 *                     definition)
 *
 *   Creating an entry when the table already holds MAXSUBSET entries is
 *   also a fatal error.
 */

check_subset_name(name)
char *name;
{
   int slot = 0;

   /*  Linear search; slot ends up equal to subsetusd when there is no hit.
    */
   while (slot < subsetusd && strcmp(name, subset[slot].subname) != 0)
      ++slot;

   if (slot != subsetusd) {
      /*  Name is already in the table.
       */
      if ( subset_section ) {
         fprintf(stderr, "\n\nDuplicate definition for subset (%s) ...\n", name);
         exit(1);
      }
      else
         return slot | SUBSET;
   }
   else if ( subset_section ) {
      /*  New name inside the definition section: take the next free entry.
       */
      if (subsetusd >= MAXSUBSET) {
         fprintf(stderr, "\n\nToo many sets defined...\n");
         exit(1);
      }
      ++subsetusd;
      strcpy(subset[slot].subname, name);
      return slot | SUBSET;
   }
   else {
      /*  Unknown name outside the definition section.  A single capital
       *  letter is taken to be an ordinary segment, anything longer is
       *  a reference to a subset that was never defined.
       */
      if (strlen(name) == 1)
         return name[0];
      fprintf(stderr, "\n\nUndefined subset (%s) referenced...\n", name);
      exit(1);
   }
}



/*
 *  Append one character to the global 'segstr' buffer, keeping the buffer
 *  NUL terminated.  Running out of room is a fatal error.
 */

add_to_segstr(c)
char c;
{
   /*  One position must remain free for the terminator.
    */
   if (segstrusd+1 >= sizeof(segstr)) {
      fprintf(stderr, "\nToo many characters in set...\n");
      exit(1);
   }

   segstr[segstrusd++] = c;
   segstr[segstrusd] = 0;
}



/*
 *  Called from the parser as soon as it has recognized an optional
 *  ( => ) rule.
 *  Every entry of the left hand side (there is more than one when {}'s
 *  were used) gets its COMMIT flag set, marking it as the COMMIT point,
 *  and is registered as a column through add_column(entry, 1).
 */

add_opt_lhs(str)
struct lstr *str;
{
   short *entry;

   /*  The text is zero terminated; visit every alternate.
    */
   for (entry = str->text; *entry; ++entry) {
      *entry |= COMMIT;
      add_column(*entry, 1);   /* create column and mark default transition
                                  for this column to be to FAIL state */
   }

   if (debug >= 2)
      dumplstr("after add_opt_lhs", str);
}



/*
 *  This routine is called by the parser immediately after it recognizes
 *  that it has an obligatory rule ( <= ).
 *  It modifies, in place, the zero terminated list of pairs in str->text
 *  (the parser passes the global lhs).
 *
 *  Each entry is replaced by a segment set with the same lexical segment
 *  and the COMPLEMENT of the original surface segment.  Both the original
 *  and the replacement are registered with add_column(..., 0).
 *
 *  An entry whose lexical segment is '0' ends the processing: the list is
 *  truncated at that entry.
 */

add_obl_lhs(str)
struct lstr *str;
{
   short *txt = str->text;
   int alt_flag;         /* ALTERNATIVE bit of the entry being replaced */

   while (*txt) {
      /*  Within an obligatory rule 0:X is a special case.
       *  We translate
       *      0:x <= a _ b
       *  To
       *      Forbid  a b
       *  SEE R49 in PC-KIMMO book 
       *  Also the generator cannot handle the following:
       *    {x,y}:{0,z} <= _ b
       *  so we forbid it.
       *
       *  NOTE(review): the test below looks at the LEXICAL segment (0:X)
       *  while the example above and the error message show the zero on
       *  the surface side (X:0) -- confirm which is intended.
       *  NOTE(review): only the entries AFTER the first 0: entry are
       *  checked, so a non-zero entry that precedes it is not reported.
       */
      if (segset[*txt & INDEXMASK].lexseg == '0') {
         short *q = txt+1;
         for (; *q; ++q)
            if (segset[*q & INDEXMASK].lexseg != '0') {
               fprintf(stderr, "Cannot mix X:0 and X:Y in alternate...\n");
               exit(1);
	    }
         *txt = 0;   /* drop this entry and everything after it */
         break;
      }

      /*  Within an obligatory rule we change the specified pair into
       *  a string that looks for any other mapping.
       *  We also make sure that the original pair is one of the columns
       *  so that it can be used to match on the OK case.
       */
      add_column(*txt, 0);
      alt_flag = *txt & ALTERNATIVE;   /* create_segset returns a bare index,
                                          so the flag is saved and put back */
      *txt = create_segset(segset[*txt & INDEXMASK].lexseg,
                       segset[*txt & INDEXMASK].surfseg | COMPLEMENT);
      *txt |= alt_flag;
      add_column(*txt, 0);
      ++txt;
   }

   if (debug >= 2)
      dumplstr("after add_obl_lhs", str);
}



/*
 *  Append one PAIRS entry to the table of default pairs.
 *  'lex' and 'surf' are parallel strings: character n of 'lex' pairs with
 *  character n of 'surf'.  They are added to the end of all_lex and
 *  all_surf and allusd is brought up to date.
 *
 *  Strings of different length, or a table that would reach MAXALL
 *  characters, are fatal errors.
 */

add_to_defaults(lex, surf)
char *lex;
char *surf;
{
   /*  Both lines of the entry must supply the same number of characters.
    */
   if (strlen(lex) != strlen(surf)) {
      fprintf(stderr, "Number of lexical and surface characters does not match...\n");
      exit(1);
   }

   /*  Make sure the combined table still fits.
    */
   if (strlen(lex) + strlen(all_lex) >= MAXALL) {
      fprintf(stderr, "Too many default pairs defined...\n");
      exit(1);
   }

   strcat(all_surf, surf);
   strcat(all_lex, lex);
   allusd = strlen(all_lex);
}



/*
 *  Append one entry, a character or a subset index, to the {}-list that
 *  is currently being collected (altset[altusd]).
 *  The list is not terminated here; the routine relies on the unused part
 *  of the set being zero.  A list that would reach MAXALT entries is a
 *  fatal error.
 */

add_to_alternate(seg)
int seg;
{
   int used = tstrlen(altset[altusd].alts);   /* entries already present */

   if (used+1 >= MAXALT) {
      fprintf(stderr, "Too many characters within {}'s...\n");
      exit(1);
   }

   altset[altusd].alts[used] = seg;
}



/*
 *  Build the lstr for a pair x:y in which at least one side was written
 *  with the {} notation.
 *
 *     is_alt1   non-zero when the lexical side is a {}-list
 *     alt1      the lexical segment when it is not a list
 *     is_alt2   non-zero when the surface side is a {}-list
 *     alt2      the surface segment when it is not a list; when both sides
 *               are lists, 0 means "use the lexical list for the surface
 *               side as well"
 *
 *  The lists are taken from altset[0] (and altset[1] when two were given).
 *  The side that is not a list is repeated to give a 1-1 match, then one
 *  entry is created per position.  Every entry except the last carries the
 *  ALTERNATIVE flag.  Both sets are cleared and altusd is reset before
 *  returning.
 */

struct lstr *
expand_alternate(is_alt1, alt1, is_alt2, alt2)
int is_alt1;
int alt1;
int is_alt2;
int alt2;
{
   short *lexalts = altset[0].alts;
   short *surfalts = altset[1].alts;
   struct lstr *pairs;
   int i;

   if (is_alt1 && is_alt2) {
      /*  Two lists (or one list standing for both sides): they only have
       *  to be of the same length.
       */
      if (alt2 == 0)
         tstrcpy(surfalts, lexalts);
      if (tstrlen(lexalts) != tstrlen(surfalts)) {
         fprintf(stderr, "Unmatched lengths in {}:{} ...\n");
         exit(1);
      }
   }
   else if (is_alt1) {
      /*  Lexical list only: repeat the surface segment once per choice.
       */
      i = 0;
      while (i < tstrlen(lexalts)) {
         surfalts[i] = alt2;
         ++i;
      }
      surfalts[i] = 0;
   }
   else {
      /*  Surface list only.  It was collected in set 0, so move it to
       *  set 1 and fill set 0 with copies of the lexical segment.
       */
      i = 0;
      while (i < tstrlen(lexalts)) {
         surfalts[i] = lexalts[i];
         lexalts[i] = alt1;
         ++i;
      }
      surfalts[i] = 0;
   }

   /*  First position creates the lstr, the others are added to its text.
    */
   pairs = crtpair(lexalts[0], surfalts[0]);
   pairs->text[0] |= ALTERNATIVE;
   for (i = 1; lexalts[i]; ++i) {
      pairs->text[i] = create_segset(lexalts[i], surfalts[i]);
      if (lexalts[i+1])
         pairs->text[i] |= ALTERNATIVE;
   }

   if (debug >= 2)
      dumpstr("Expand alternate", pairs->text);

   /*  Leave both sets empty for the next {}-list.
    */
   memset(altset, 0, sizeof(altset));
   altusd = 0;

   return pairs;
}



/*
 *  Return the index of a segment set containing all lex/surface pairs
 *  which match lex/surf.
 *  Arguments to this routine can either be single segment ascii values
 *  or subset indices.   Subset indices have the SUBSET bit set on.
 */

create_segset(lex, surf)
int lex;
int surf;
{
   struct segset sset;
   int ind;
   int all;

   /*  If we already have a segset which has the correct name,
    *  return its index.   (also fills in name into sset if not found)
    */
   ind = findsegset(lex, surf, &sset);
   if (ind < segsetusd)
      return ind;

   if (segsetusd >= MAXSUBSET) {
      fprintf(stderr, "Too many segments defined...\n");
      exit(1);
   }

   /*  Find all lex/surf pairs which match the passed description
    *  and add them to a segset.
    */
   ind = 0;
   for (all=1; all<allusd; ++all)
      if (matches(all_lex[all], lex) && matches(all_surf[all], surf)) {
         if (ind >= MAXSUBSET) {
            fprintf(stderr, "Segset overflow...\n");
            exit(1);
         }
         sset.segs[ind++] = all;
      }

   /*  If we couldn't find any matching pairs and one of the segments
    *  was a subset, we abort with an error.
    *  Otherwise we add the new pair to the default pair array.
    */
   if (ind == 0) {
      if ((lex & SUBSET) || (surf & SUBSET) || (surf & COMPLEMENT)) {
         fprintf(stderr, "No pairs match %s:", segment_name(lex));
         fprintf(stderr, "%s...\n", segment_name(surf));
         exit(1);
      }
      if (debug > 2)
         printf("add pair %c:%c\n", lex, surf);
      all_lex[allusd] = lex;
      all_surf[allusd] = surf;
      sset.segs[ind++] = allusd++;
      if (allusd >= MAXALL) {
         fprintf(stderr, "Too many pairs defined...\n");
         exit(1);
      }
   }

   sset.segs[ind] = 0;

   memcpy(&segset[segsetusd], &sset, sizeof(sset));
   if (debug >= 2) {
      printf("Create_segset (%x) %s, ", segsetusd, segment_name(lex));
      printf("%s ", segment_name(surf));
      dumppairs("", sset.segs);
   }

   return segsetusd++;
}



/*
 *  Look for an existing segment set that represents the passed pair.
 *  Returns its index, or segsetusd when there is none.
 *
 *  As a side effect *sset is cleared and its lexseg/surfseg fields are
 *  set to the pair, so the caller can go on to complete it as a new set.
 */

findsegset(lex, surf, sset)
int lex;
int surf;
struct segset *sset;
{
   int slot = 0;

   memset(sset, 0, sizeof(struct segset));
   sset->lexseg = lex;
   sset->surfseg = surf;

   /*  Compare against the values as stored in *sset.
    */
   while (slot < segsetusd &&
          !(sset->lexseg == segset[slot].lexseg &&
            sset->surfseg == segset[slot].surfseg))
      ++slot;

   if (debug >= 3) {
      printf("findsegset (%x) %s, ", slot, segment_name(lex));
      printf("%s\n ", segment_name(surf));
   }

   return slot;
}



/*
 *  Create an lstr which contains a single pair of lex/surface segments.
 *
 *  A new lstr is obtained from lstradd() with an empty text, its text is
 *  cleared over the full MAXSTR entries and the first entry is set to the
 *  segment set for lex/surf (see create_segset).
 *  Returns the new lstr.
 */

struct lstr *
crtpair(lex, surf)
int lex;
int surf;
{
   struct lstr *tmp = NULL;
   /*  lstr text is an array of short (see the memset below and the other
    *  lstradd() call in the grammar), so the empty text must be short too.
    */
   static short empty[2] = {0, 0};

   lstradd(&tmp, empty);
   if (tmp == NULL) {
      /*  Defensive: lstradd() is defined elsewhere and its behaviour on
       *  failure is not visible here.  Stop rather than write through a
       *  null pointer.
       */
      fprintf(stderr, "Out of memory...\n");
      exit(1);
   }
   memset(tmp->text, 0, MAXSTR * sizeof(short));
   tmp->text[0] = create_segset(lex, surf);

   return tmp;
}



/*
 *  Decide whether the ascii character 'seg' fits the description 'chk':
 *     1) chk has the SUBSET bit: seg must be a member of that subset
 *     2) chk is '@':             anything fits
 *     3) otherwise:              seg must equal the character in chk
 *
 *  If the COMPLEMENT bit is set in chk the answer is inverted.
 *  Returns 1 for a match and 0 for no match.
 *
 *  NOTE(review): test 2 compares the whole of chk, so '@' with the
 *  COMPLEMENT bit set is handled by test 3 instead -- confirm this is
 *  intended.
 */

matches(seg, chk)
int seg;
int chk;
{
   int hit;

   if (chk & SUBSET) {
      /*  Walk the member string until seg or the terminator is found.
       */
      char *member = subset[chk & INDEXMASK].subs;

      while (*member && *member != seg)
         ++member;
      hit = (*member != 0);
   }
   else if (chk == '@')
      hit = 1;
   else
      hit = ( (chk & INDEXMASK) == seg );

   if (chk & COMPLEMENT)
      hit = !hit;

   if (debug >= 4) {
      printf("Matches  %s, ", segment_name(seg));
      printf("%s = %d\n ", segment_name(chk), hit);
   }

   return hit;
}




