 /*
    [jan94] (air)
    Added command-line invocation
    Changes to 'dangling_terminator' processing, also apostrophied strings.
    Fixes due to examination of wsj90-92 data:
    tightened up abbreviation processing (wrt to .period); 
    removed even more bogus abbrevs, added others.
    more comprehensive treatment of bracketing
    fixed context processing for some (overlooked) tokens

    [30sept93] (air) 

    This is mostly a reworking of the Cambridge version of the 
    MITalk Format module.
    Note that this version is more suitable for language modelling 
    preprocessing than for synthesis.  This code was tested on AP wire 
    data from Tipster.

    Note that this analyzer is too big for lex; you have to use flex.

    Fixes:
    now does ordinals
    numbers in thousands work correctly
    number ranges, mixed digit/alpha string fixed
    currency is correct
    period, quotes done right
    passes SGML stuff
    some logic redone for clarity.
    abbreviation handling revamped.

 */


	/* set up table sizes for LEX */  /* from Cambridge version... */
	/* The %e/%p/%a/%o/%n/%k lines below are AT&T lex internal table-size
	   declarations.  NOTE(review): the flex manual describes these as
	   accepted but ignored, and the header says this scanner needs flex,
	   so they presumably only matter if the file is ever fed to lex. */

%e 5000
%p 15000
%a 7000
%o 7000
%n 1500
%k 3000

%{


/* some constants used by process_abbrev()  */
/* Values for process_abbrev()'s third argument: with INTERNAL the
   abbreviation's trailing period is never treated as a possible end of
   sentence; with UNKNOWN a trailing period sets dangling_period. */
#define	INTERNAL 1
#define UNKNOWN	 0

/* boolean values (e.g. the flush_flag handed to pending_action()) */
#define TRUE	1
#define	FALSE	0

/* some forward declarations */
/* Only these routines are prototyped; the other helpers are called from
   the rules before they are defined and rely on implicit int declarations. */
char *strip_comma(char *string),
     *sound_integer(char *string),
     *sound_year(char *integer);

void pending_action(int flush_flag);

/* a period could either terminate an abbreviation or a number, or also be
an end-of-sentence period.  This flag keeps track of this possibility and
sometimes allows the right thing to happen */
int dangling_period   = 0;

/* end_of_sentence, start_of_sentence and need_a_new_line are all set to 1
   by do_punct() when it sees '.', '?' or '!'; need_a_new_line is also set
   by the newline rule. */
int end_of_sentence   = 0;
int sentence_ended    = 0;
int start_of_sentence = 1;	/* assume that we're always starting a sentence */
int sentence_count = 0;
int need_a_new_line   = 0;
/* consulted by print_quotes() and print_open_bracket() before they queue
   their tokens */
int last_was_whitespace = 0;

char paragraph_id[32];		/* id part of <p> sgml token; note hard-coded length */
/* punctuation/quote/bracket tokens are collected here and written out (and
   the buffer cleared) by print_dangling_terminator(); fixed 128-byte size */
char dangling_terminator[128];	/* string of accumulated tokens */

%}

	/* define types for construct recognition */

D				[0-9]
 /* digit strings carrying an ordinal suffix: "1st", "2nd", "3rd", "4th" */
DST                             {D}+[s][t]
DND                             {D}+[n][d]
DRD                             {D}+[r][d]
DTH                             {D}+[t][h]
 /* exponent part of a real: one of D, E, d, e, an optional sign, then digits */
E				[DEde][-+]?{D}+

 /* LETTERS: one or more capital-plus-period pairs ("U.S.");
    CAPSWD: a run of capitals with no periods */
LETTERS				([A-Z][.])+
CAPSWD				[A-Z]+


 /* WORD: one letter of either case followed by lower-case letters only;
    HYPWORD: two or more WORDs joined by hyphens */
WORD				[A-Za-z][a-z]*
HYPWORD 			{WORD}\-{WORD}(\-{WORD})*
 /* SPLTWD				{WORD}\-$ /**/
 /* MIXHYPWD			\-{WORD} /**/

 /* WSPACEA is a run of blanks/tabs; WSPACEB is exactly one newline */
WSPACEA				[ \t]+
WSPACEB				[\n]
 /* each of the following matches a run of one or more such characters */
QUOTES  			[\"`']+
OPEN_BRACKET			[\(\[\{]+
CLOSE_BRACKET			[\)\]\}]+
PUNCTN  			[\,\;\:\.\?\!]+

%%
"<p.".+[\n]                     {start_paragraph(yytext);}
"</p>"[\n]                      process_paragraph();

 /* Any other line that starts with '<' and ends with '>' (plus optional
    trailing blanks) is discarded.  The former printf("") calls printed
    nothing and only drew zero-length-format warnings, so the actions are
    now explicitly empty. */
^\<.+\>{WSPACEA}*[\n]           { /* siphon off any other SGML */ }

 /* blanks/tabs produce no output; a newline becomes one space and notes
    that a line break is owed */
{WSPACEA}			{ process_whitespace(); }
{WSPACEB}			{ printf(" "); need_a_new_line = 1; process_whitespace(); }

{D}+         			|
{D}{1,3}(","{D}{D}{D})+		print_integer(yytext);
"-"{D}+       			|
"-"{D}{1,3}(","{D}{D}{D})+	print_integer_minus(yytext);
 /* the four patterns above: plain integers, optionally written with
    comma-separated groups of three digits, with or without a leading '-' */

 /* ordinals ("21st", "2nd", "3rd", "4th"), one handler per suffix */
{DST}                           print_first(yytext);
{DND}                           print_second(yytext);
{DRD}                           print_third(yytext);
{DTH}                           print_nth(yytext);

 /* pluralised numbers ("1960s", "'60s") and digit-digit ranges ("1960-64") */
"'"?{D}+"s"                     print_plural_integer(yytext);
{D}+"-"{D}+                     print_integer_range(yytext);


 /* reals: digits with a decimal point and/or an exponent part */
{D}+"."{D}*{E}?				|
{D}*"."{D}+{E}?				|
{D}+{E}					print_real(yytext);


 /* reals whose integer part uses comma grouping; the commas are removed
    by strip_comma() before print_real() sees the text */
{D}{1,3}(","{D}{D}{D})+"."{D}*({E})?	|
{D}{1,3}(","{D}{D}{D})+{E}		print_real(strip_comma(yytext));



 /* the same two families of reals with a leading '-' */
"-"{D}+"."{D}*{E}?			|
"-"{D}*"."{D}+{E}?			|
"-"{D}+{E}				print_real_minus(yytext);

"-"{D}{1,3}(","{D}{D}{D})+"."{D}*({E})?	|
"-"{D}{1,3}(","{D}{D}{D})+{E}		print_real_minus(strip_comma(yytext));



 /* three numeric fields separated by '-', '/' or '.'; the last field has
    two or four digits */
{D}{D}?[-/.]{D}{D}?[-/.]{D}{D}({D}{D})?		print_date(yytext);



 /* clock times: "9:30", "9:30am", "9pm" */
{D}{D}?":"{D}{D}				|
{D}{D}?(":"{D}{D})?(am|pm)			print_time(yytext);



 /* dollar amounts, optionally with cents or a K/M/Bn suffix; the
    comma-grouped forms are stripped of commas first */
"$"{D}+("."{D}{D})?				|
"$"{D}+("."{D}+)?(K|M|Bn)			print_money(yytext);

"$"{D}{1,3}(","{D}{D}{D})+("."{D}{D})?		|
"$"{D}{1,3}(","{D}{D}{D})+("."{D}+)?(K|M|Bn)	print_money(strip_comma(yytext));


 /* "$3.5 million" and the like; the separating blank or newline is part
    of the matched text */
\${D}+([.]{D}{1,3})?({WSPACEA}|{WSPACEB})(million|billion|trillion)     print_big_money(yytext);



[\001-\026]		printf(" ");  /* trash control characters */
 /* NOTE(review): pattern escapes are octal, so the class above ends at
    octal 026 (decimal 22) and leaves octal 027-037 uncovered; confirm
    whether the full control range was intended. */
\#			print_other_punct("#HASH ");
\%			print_other_punct("%PERCENT ");
\&			print_other_punct("&AMPERSAND ");
 /* \*			print_other_punct("times ");  /**/
\+			print_other_punct("+PLUS ");
\-			print_other_punct("-HYPHEN ");
\-\-			print_other_punct("--DASH ");
\/			print_other_punct("/SLASH ");
\<			print_other_punct("<LESS-THAN ");
\=			print_other_punct("EQUALS ");
\>			print_other_punct(">GREATER-THAN ");
 /* \\			print_other_punct("back slash ");  /**/
\^			print_other_punct("CARET ");
 /* NOTE(review): "EQUALS " and "CARET " are the only tokens here that do
    not start with their own symbol; left unchanged in case downstream
    vocabularies depend on them. */
\_			print_other_punct("_UNDERSCORE ");
\@			print_other_punct("@AT-SIGN ");
\|			print_other_punct("|VERTICAL-BAR ");
\~			print_other_punct("~TILDE ");
 /* DEL is octal 177.  This rule used to read \127, which is the letter
    'W'; because it precedes the word rules it won any one-character tie,
    so a free-standing "W" was replaced by a blank. */
\177			printf(" ");


[Ww]orld[ ][Ww]ar[ ]II  process_abbrev(yytext,"World War Two ",UNKNOWN);
[Ww]orld[ ][Ww]ar[ ]I   process_abbrev(yytext,"World War One ",UNKNOWN);
 /* Both patterns require single blanks between the words.  For the text
    "World War II" the first pattern is the longer match, so the scanner's
    longest-match rule selects it over the "I" pattern. */

	/* the list of abbreviations has been culled to remove
	   items that conflict or that are not American usage.
	   Some N. American terms have been added.  Refer to 
	   original to recover British expressions.
	The list in any case still needs to be trimmed; too many weird words.

	process_abbrev() deals with abbrevs that might occur at the
	end of a sentence; need to do this more thoroughly, though.
	*/

"a.m."			process_abbrev(yytext,"A. M. ",UNKNOWN);
"p.m."			process_abbrev(yytext,"P. M. ",UNKNOWN);

 /* Each rule below pairs an abbreviation pattern with the words to emit.
    The last argument is INTERNAL when the abbreviation's period should
    never be taken as a possible end of sentence (titles such as "Mr."),
    UNKNOWN otherwise.  Three emitted words were misspelled and have been
    corrected: advertisement, colloquial, crescent. */

[Aa]bbr[.]?		process_abbrev(yytext,"abbreviated ",UNKNOWN);
[Aa]bbrev[.]?		process_abbrev(yytext,"abbreviation ",UNKNOWN);
[Aa]bs[.]?		process_abbrev(yytext,"absolute ",UNKNOWN);
ac			process_abbrev(yytext,"a. c. ",UNKNOWN);
[Aa]"d inf"[.]?		process_abbrev(yytext,"ad infinitum ",UNKNOWN);
Adm[.]?			process_abbrev(yytext,"Admiral ",INTERNAL);
[Aa]dmin[.]		process_abbrev(yytext,"administration ",UNKNOWN);
[Aa]dvt[.]?		process_abbrev(yytext,"advertisement ",UNKNOWN);
[Aa]gric[.]?		process_abbrev(yytext,"agriculture ",UNKNOWN);
[Aa]mt[.]?		process_abbrev(yytext,"amount ",UNKNOWN);
[Aa]nat[.]?		process_abbrev(yytext,"anatomy ",UNKNOWN);
[Aa]non[.]?		process_abbrev(yytext,"anonymous ",UNKNOWN);
[Aa]ns[.]?		process_abbrev(yytext,"answer ",UNKNOWN);
Apr[.]?			process_abbrev(yytext,"April ",UNKNOWN);
[Aa]rchit[.]?		process_abbrev(yytext,"architecture ",UNKNOWN);
[Aa]ssoc[.]		process_abbrev(yytext,"association ",UNKNOWN);
[Aa]sst[.]		process_abbrev(yytext,"assistant ",UNKNOWN);
Att[.]			process_abbrev(yytext,"Attorney ",UNKNOWN);
Aug[.]			process_abbrev(yytext,"August ",UNKNOWN);
[Bb]al[.]?		process_abbrev(yytext,"balance ",UNKNOWN);
[Bb]iol[.]?		process_abbrev(yytext,"biology ",UNKNOWN);
[Bb]ldg[.]?		process_abbrev(yytext,"building ",UNKNOWN);
Blvd[.]			process_abbrev(yytext,"boulevard ",UNKNOWN);
[Bb]ro[.]		process_abbrev(yytext,"brother ",UNKNOWN);
[Bb]ros[.]		process_abbrev(yytext,"brothers ",UNKNOWN);
Capt[.]			process_abbrev(yytext,"Captain ",INTERNAL);
Card[.]			process_abbrev(yytext,"Cardinal ",INTERNAL);
[Cc]ertif[.]		process_abbrev(yytext,"certificate ",UNKNOWN);
[Cc]hem[.]?		process_abbrev(yytext,"chemistry ",UNKNOWN);
cm			process_abbrev(yytext,"centimetre ",UNKNOWN);
[Cc]o[.]		process_abbrev(yytext,"company ",UNKNOWN);
Col[.]			process_abbrev(yytext,"Colonel ",INTERNAL);
[Cc]olloq[.]?		process_abbrev(yytext,"colloquial ",UNKNOWN);
Comdr[.]		process_abbrev(yytext,"Commander ",INTERNAL);
[Cc]ont[d]?[.]		process_abbrev(yytext,"continued ",UNKNOWN);
[Cc]f[.]?		process_abbrev(yytext,"compare with ",UNKNOWN);
[Cc]orp[.]		process_abbrev(yytext,"corporation ",UNKNOWN);
[Cc]res[.]		process_abbrev(yytext,"crescent ",UNKNOWN);
cu[.]			process_abbrev(yytext,"cubic ",UNKNOWN);
dB			process_abbrev(yytext,"decibel ",UNKNOWN);
[Dd]bl[.]?		process_abbrev(yytext,"double ",UNKNOWN);
Dec[.]?			process_abbrev(yytext,"December ",UNKNOWN);
dec[.]?			process_abbrev(yytext,"deceased ",UNKNOWN);
[Dd]ef[.]?		process_abbrev(yytext,"definition ",UNKNOWN);
[Dd]ep[.]?		process_abbrev(yytext,"departs ",UNKNOWN);
[Dd]ept[.]		process_abbrev(yytext,"department ",UNKNOWN);
[Dd]iam[.]?		process_abbrev(yytext,"diameter ",UNKNOWN);
Dr[.]			process_abbrev(yytext,"doctor ",INTERNAL);
ed[.]?			process_abbrev(yytext,"editor ",UNKNOWN);
[Ee]sp[.]?		process_abbrev(yytext,"especially ",UNKNOWN);
Esq[.]			process_abbrev(yytext,"esquire ",UNKNOWN);
[Ee]tc[.]		process_abbrev(yytext,"et cetera ",UNKNOWN);
[Ee]xec[.]		process_abbrev(yytext,"executive ",UNKNOWN);
Feb[.]			process_abbrev(yytext,"February ",UNKNOWN);
[Ff]ig[.]		process_abbrev(yytext,"figure ",UNKNOWN);
fl[.]?			process_abbrev(yytext,"fluid ",UNKNOWN);
Fri[.]			process_abbrev(yytext,"Friday ",UNKNOWN);
ft[.]?			process_abbrev(yytext,"feet ",UNKNOWN);
Gen[.]			process_abbrev(yytext,"General ",INTERNAL);
[Gg]eog[.]?		process_abbrev(yytext,"geography ",UNKNOWN);
[Gg]eol[.]?		process_abbrev(yytext,"geology ",UNKNOWN);
[Gg]ovt[.]		process_abbrev(yytext,"government ",UNKNOWN);
Gov[.]                  process_abbrev(yytext,"Governor ",INTERNAL);
Hon[.]			process_abbrev(yytext,"Honourable ",INTERNAL);
hr			process_abbrev(yytext,"hour ",UNKNOWN);
Hz			process_abbrev(yytext,"Hertz ",UNKNOWN);
[Ii]ncorp[.]		process_abbrev(yytext,"incorporated ",UNKNOWN);
[Ii]nc[.]		process_abbrev(yytext,"incorporated ",UNKNOWN);
[Ii]ndiv[.]?		process_abbrev(yytext,"individual ",UNKNOWN);
[Ii]nst[.]		process_abbrev(yytext,"institute ",UNKNOWN);
[Ii]ntro[d]?[.]?	process_abbrev(yytext,"introduction ",UNKNOWN);
Jan[.]			process_abbrev(yytext,"January ",UNKNOWN);
J[n]?r[.]		process_abbrev(yytext,"Junior ",UNKNOWN);
Jul[.]			process_abbrev(yytext,"July ",UNKNOWN);
Jun[.]			process_abbrev(yytext,"June ",UNKNOWN);
[Jj]unc[.]?		process_abbrev(yytext,"junction ",UNKNOWN);
kg[.]?			process_abbrev(yytext,"kilogramme ",UNKNOWN);
km[.]?			process_abbrev(yytext,"kilometre ",UNKNOWN);
km[/]?h[.]?		process_abbrev(yytext,"kilometres per hour ",UNKNOWN);
kW			process_abbrev(yytext,"kilowatt ",UNKNOWN);
kWh			process_abbrev(yytext,"kilowatt hour ",UNKNOWN);
[Ll]ang[.]?		process_abbrev(yytext,"language ",UNKNOWN);
Lieut[.]		process_abbrev(yytext,"Lieutenant ",INTERNAL);
Lt[.]			process_abbrev(yytext,"Lieutenant ",INTERNAL);
[Ll]td[.]		process_abbrev(yytext,"limited ",UNKNOWN);
Maj[.]			process_abbrev(yytext,"Major ",INTERNAL);
Mar[.]			process_abbrev(yytext,"March ",UNKNOWN);
mb			process_abbrev(yytext,"millibar ",UNKNOWN);
Mdm[.]			process_abbrev(yytext,"Madam ",INTERNAL);
Messrs[.]		process_abbrev(yytext,"Messrs ",INTERNAL);
[Mm]ech[.]?		process_abbrev(yytext,"mechanical ",UNKNOWN);
[Mm]frs[.]		process_abbrev(yytext,"manufacturers ",UNKNOWN);
mg			process_abbrev(yytext,"milligramme ",UNKNOWN);
[Mm]isc[.]?		process_abbrev(yytext,"miscellaneous ",UNKNOWN);
ml			process_abbrev(yytext,"millilitre ",UNKNOWN);
mm			process_abbrev(yytext,"millimetre ",UNKNOWN);
Mme[.]			process_abbrev(yytext,"Madam ",UNKNOWN);
[Mm]pg[.]?		process_abbrev(yytext,"miles per gallon ",UNKNOWN);
[Mm]ph[.]?		process_abbrev(yytext,"miles per hour ",UNKNOWN);
[Mm]on[.]?              process_abbrev(yytext,"Monday ",UNKNOWN);
Mr[.]			process_abbrev(yytext,"Mr. ",INTERNAL);
Mrs[.]			process_abbrev(yytext,"Mrs. ",INTERNAL);
Ms[.]			process_abbrev(yytext,"Ms. ",INTERNAL);
ms			process_abbrev(yytext,"millisecond ",UNKNOWN);
Natl[.]			process_abbrev(yytext,"national ",UNKNOWN);
No[.]			process_abbrev(yytext,"number ",INTERNAL);
[Nn]os[.]		process_abbrev(yytext,"numbers ",INTERNAL);
Nov[.]			process_abbrev(yytext,"November ",UNKNOWN);
ns			process_abbrev(yytext,"nanosecond ",UNKNOWN);
Oct[.]			process_abbrev(yytext,"October ",UNKNOWN);
[Oo]rig[.]?		process_abbrev(yytext,"origin ",UNKNOWN);
oz[.]?			process_abbrev(yytext,"ounce ",UNKNOWN);
[Pp]hys[.]?		process_abbrev(yytext,"physics ",UNKNOWN);
[Pp]op[.]		process_abbrev(yytext,"population ",UNKNOWN);
pp[.]?			process_abbrev(yytext,"pages ",UNKNOWN);
Pres[.]			process_abbrev(yytext,"President ",INTERNAL);
Prof[.]			process_abbrev(yytext,"Professor ",INTERNAL);
[Pp]seud[.]?		process_abbrev(yytext,"pseudonym ",UNKNOWN);
[Qq]ty[.]?		process_abbrev(yytext,"quantity ",UNKNOWN);
Rep[.]                  process_abbrev(yytext,"Representative ",INTERNAL);
Rev[.]                  process_abbrev(yytext,"Reverend ",INTERNAL);
[Rr]ecd[.]?		process_abbrev(yytext,"received ",UNKNOWN);
[Rr]ef[.]?		process_abbrev(yytext,"reference ",UNKNOWN);
[Rr]etd[.]?		process_abbrev(yytext,"retired ",UNKNOWN);
Rt[.]?			process_abbrev(yytext,"right ",UNKNOWN);
Sat[.]			process_abbrev(yytext,"Saturday ",UNKNOWN);
[Ss]ec[.]		process_abbrev(yytext,"secretary ",UNKNOWN);
Sen[.]			process_abbrev(yytext,"Senator ",INTERNAL);
Sept[.]			process_abbrev(yytext,"September ",UNKNOWN);
Sgt[.]			process_abbrev(yytext,"Sergeant ",INTERNAL);
S[n]?r[.]		process_abbrev(yytext,"senior ",UNKNOWN);
St[.]			process_abbrev(yytext,"saint ",UNKNOWN);
Sun[.]			process_abbrev(yytext,"Sunday ",UNKNOWN);
[Ss]ubj[.]		process_abbrev(yytext,"subject ",UNKNOWN);
[Ss]yst[.]		process_abbrev(yytext,"system ",UNKNOWN);
[Tt]el[.]		process_abbrev(yytext,"telephone ",UNKNOWN);
Thur[s]?[.]		process_abbrev(yytext,"Thursday ",UNKNOWN);
[Tt]reas[.]?		process_abbrev(yytext,"treasurer ",UNKNOWN);
Tue[s]?[.]		process_abbrev(yytext,"Tuesday ",UNKNOWN);
uk			process_abbrev(yytext,"U. K. ",UNKNOWN);
[Uu]niv[.]		process_abbrev(yytext,"university ",UNKNOWN);
Wed[.]			process_abbrev(yytext,"Wednesday ",UNKNOWN);
[Ww]k[s]?[.]?		process_abbrev(yytext,"weeks ",UNKNOWN);
[Ww]t[.]?		process_abbrev(yytext,"weight ",UNKNOWN);
Xmas			process_abbrev(yytext,"Christmas ",UNKNOWN);
yd			process_abbrev(yytext,"yard ",UNKNOWN);

 /* this list needs to be completed (items so far added only if observed) */
 /* US state abbreviations; every pattern requires the closing period.
    "Mich." previously used a bare '.', which matches any character, so
    text such as "Mich," was consumed and expanded as well; it now uses
    [.] like its neighbours.  The spelling of "Pennsylvania" is corrected. */
Ala[.]			process_abbrev(yytext,"Alabama ",UNKNOWN);
Ariz[.]			process_abbrev(yytext,"Arizona ",UNKNOWN);
Ark[.]			process_abbrev(yytext,"Arkansas ",UNKNOWN);
Calif[.]		process_abbrev(yytext,"California ",UNKNOWN);
Conn[.]			process_abbrev(yytext,"Connecticut ",UNKNOWN);
Fla[.]			process_abbrev(yytext,"Florida ",UNKNOWN);
Ill[.]			process_abbrev(yytext,"Illinois ",UNKNOWN);
La[.]			process_abbrev(yytext,"Louisiana ",UNKNOWN);
Mass[.]			process_abbrev(yytext,"Massachusetts ",UNKNOWN);
Md[.]			process_abbrev(yytext,"Maryland ",UNKNOWN);
Mich[.]			process_abbrev(yytext,"Michigan ",UNKNOWN);
Mo[.]			process_abbrev(yytext,"Missouri ",UNKNOWN);
Minn[.]			process_abbrev(yytext,"Minnesota ",UNKNOWN);
Neb[.]			process_abbrev(yytext,"Nebraska ",UNKNOWN);
Nev[.]			process_abbrev(yytext,"Nevada ",UNKNOWN);
Okla[.]			process_abbrev(yytext,"Oklahoma ",UNKNOWN);
Tenn[.]			process_abbrev(yytext,"Tennessee ",UNKNOWN);
Va[.]			process_abbrev(yytext,"Virginia ",UNKNOWN);
Pa[.]			process_abbrev(yytext,"Pennsylvania ",UNKNOWN);
Wash[.]			process_abbrev(yytext,"Washington ",UNKNOWN);
Wis[.]			process_abbrev(yytext,"Wisconsin ",UNKNOWN);
Wyo[.]			process_abbrev(yytext,"Wyoming ",UNKNOWN);


 /* the following rules are probably bogus, as they allow for mixed-case expressions */
 /* Every letter may be in either case and every period is optional, so
    ordinary lower-case words with the same letters (e.g. "cod") are
    expanded too: these rules are listed before the generic word rule and
    therefore win when both match the same text. */
[Aa][.]?[Kk][.]?[Aa][.]?		process_abbrev(yytext,"ALSO KNOWN AS ",UNKNOWN);
[Cc][.]?[Oo][.]?[Dd][.]?		process_abbrev(yytext,"CASH ON DELIVERY ",UNKNOWN);
[Ww][Pp][Mm][.]?			process_abbrev(yytext,"WORDS PER MINUTE ",UNKNOWN);
[Ww][.]?[Rr][.]?[Tt][.]?		process_abbrev(yytext,"WITH RESPECT TO ",UNKNOWN);
[Pp][Pp][Mm][.]?			process_abbrev(yytext,"PARTS PER MILLION ",UNKNOWN);
[Rr][Pp][Mm][.]?			process_abbrev(yytext,"REVOLUTIONS PER MINUTE ",UNKNOWN);




{WORD}[']/({WSPACEA}|{WSPACEB})         print_apost_single(yytext);
{WORD}[']{WORD}                         print_apost(yytext);
['][Nn]({WSPACEA}|{WSPACEB})		print_word(yytext);
['][Ee][Mm]({WSPACEA}|{WSPACEB})	print_word(yytext);
 /* Apostrophe forms.  The first rule only looks ahead at the whitespace
    (trailing context), whereas the 'n and 'em rules include the following
    blank or newline in the text handed to print_word(). */

{WORD}          			print_word(yytext);
{HYPWORD}       			print_hypword(yytext);
 /* {MIXHYPWD}	print_mixhypword(yytext);
 /* {SPLTWD}	print_split(yytext); /**/

 /* letters plus a hyphen, taken only when a digit follows (the digit
    itself is left for the numeric rules) */
[A-Za-z]+"-"/{D}		print_hypword(yytext);

 /* all-capital runs, dotted initials, and anything left over that is
    made of letters in some other case mix */
{CAPSWD}			print_caps(yytext);
{LETTERS}			print_abbrev(yytext);
{WORD}({WORD})+			print_word(yytext);	/* random strings */
{WORD}({WORD})+[']{WORD}	print_word(yytext);	/* random strings's */

 /* runs of quotes, brackets and punctuation marks */
{QUOTES}        		print_quotes(yytext);
{OPEN_BRACKET}			print_open_bracket(yytext);
{CLOSE_BRACKET}			print_close_bracket(yytext);
{PUNCTN}			print_punctuation(yytext);


%%

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* define all units to be used by following routines. */

/* Spoken form of a single digit, indexed by its value.  Index 0 is "oh";
   the routines that use this table special-case a lone "0" as "zero". */
static char digit_unit[10][6] = {
  "oh", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"
};

/* Names for 10..19, indexed by the units digit that follows a tens digit
   of 1. */
static char digit_teen[10][10] = {
  "ten", "eleven", "twelve", "thirteen", "fourteen", 
  "fifteen", "sixteen", "seventeen", "eighteen", "nineteen"
};

/* Names for the tens, indexed by the tens digit; entries 0 and 1 are
   empty because sound_integer() handles those digits separately. */
static char dig_tens[10][8] = {
  "", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"
};

/* Names of the three-digit groups above the units group, lowest first. */
static char digit_triad[4][9] = {
  "thousand", "million", "billion", "trillion"
};

/* Month names, January first.  NOTE(review): not referenced in the part
   of the file reviewed here; presumably used by print_date(). */
static char month[12][10] = {
  "January", "February", "March", "April", "May", "June", 
  "July", "August", "September", "October", "November", "December"
};


static char help_string[] = "usage: %s [-v vocabulary] [<] textfile\n";
char dict_file[128] = {""};
FILE *inp = NULL;

/* Command-line entry point.
 *
 *   -v <file>   vocabulary file consulted by print_caps(); defaults to
 *               "cmudict_vocab" when not given
 *   -h          (or any unknown option) report usage via quit()
 *   <textfile>  optional single input file; stdin is read otherwise
 *
 * quit() is defined elsewhere in the file.  NOTE(review): it is presumed
 * to print the formatted message and terminate; the option handling below
 * is nevertheless written so that nothing invalid is dereferenced or
 * copied even if it returns.
 *
 * Fixes relative to the previous version: "-v" as the final argument no
 * longer passes a NULL pointer to strcpy(), and a vocabulary name that
 * does not fit in dict_file is rejected instead of overrunning it.
 */
int main (int argc, char **argv)
{
  int iarg;

  for (iarg = 1; iarg < argc; iarg++) {
    if (argv[iarg][0] != '-') {
      /* a second file name is an error */
      if (inp) quit(-1,help_string,argv[0]);
      if ((inp=fopen(argv[iarg],"r"))==NULL)
        quit(-1,"%s: No such file\n", argv[iarg]);
    }
    else
    switch (argv[iarg][1]) {
      case 'v': {
        /* consume the option's argument, if there is one */
        char *name = (iarg + 1 < argc) ? argv[++iarg] : NULL;

        if (name == NULL)
          quit(-1,help_string,argv[0]);
        else if (strlen(name) >= sizeof dict_file)
          quit(-1,"%s: vocabulary file name too long\n", name);
        else
          strcpy(dict_file,name);
        break;
      }
      case 'h':
      default:  quit (-1,help_string,argv[0]);
    }
  }

  if(strlen(dict_file)==0) strcpy(dict_file,"cmudict_vocab");

  if (inp) yyin = inp; else yyin = stdin;
  yylex();
  return 0;
}



print_dangling_terminator() {
    /* write out whatever tokens have been queued, then empty the queue */
    fputs(dangling_terminator, stdout);
    dangling_terminator[0] = '\0';
}

/* Translate one punctuation character into its output token.

   Returns a pointer to a static buffer holding the token (for example
   ".PERIOD "); the buffer is reused by the next call.  A character that
   is not one of . , : ; ? ! leaves the buffer as the previous call left
   it.  For '.', '?' and '!' the flags need_a_new_line, end_of_sentence
   and start_of_sentence are all set to 1. */
char *do_punct (char punct)
{
  static char token[32];
  const char *name = NULL;      /* token text, when punct is recognised */
  int closes_sentence = 0;

  if (punct == '.') {
    name = ".PERIOD ";
    closes_sentence = 1;
  }
  else if (punct == ',') name = ",COMMA ";
  else if (punct == ':') name = ":COLON ";
  else if (punct == ';') name = ";SEMI-COLON ";
  /* Treating '?' and '!' as sentence ends is not always right (think of
     "I saw 'Hello Dolly!' last night"), but it is right often enough. */
  else if (punct == '?') {
    name = "?QUESTION-MARK ";
    closes_sentence = 1;
  }
  else if (punct == '!') {
    name = "!EXCLAMATION-POINT ";
    closes_sentence = 1;
  }

  if (name != NULL) strcpy(token,name);
  if (closes_sentence)
    need_a_new_line = end_of_sentence = start_of_sentence = 1;

  return(token);
}

/* Handle a run of one or more punctuation characters.

   Any period left dangling by the previous token is forgotten first.
   The exact string "..." is printed as an ellipsis token.  A single mark
   is not printed here: its token replaces the contents of
   dangling_terminator.  Any other run is printed mark by mark. */
print_punctuation(char *string) 
{ 
  char *mark;

  dangling_period = 0;

  if (strcmp(string,"...") == 0) {
    printf("...ELLIPSIS ");
  }
  else if (string[0] != '\0' && string[1] == '\0') {
    strcpy(dangling_terminator,do_punct(string[0]));
  }
  else {
    /* this should probably be more intelligent */
    for (mark = string; *mark != '\0'; mark++)
      printf("%s",do_punct(*mark));
  }
}

/* Emit the token for a miscellaneous symbol.  The token is bracketed by
   pending_action()/clean_up() exactly as the word-printing routines do,
   so symbols take part in context handling like ordinary words.
   string is written verbatim. */
print_other_punct(char *string)
{
  pending_action(FALSE);
  fputs(string, stdout);
  clean_up();
}


/* Append one quote token to dangling_terminator.  If the token would not
   fit in what is left of the buffer, the buffer is written out first;
   since print_quotes() prints the whole buffer at its end anyway, the
   output is unchanged. */
static void queue_quote_token(const char *token)
{
  if (strlen(dangling_terminator) + strlen(token) >= sizeof dangling_terminator)
    print_dangling_terminator();
  strcat(dangling_terminator,token);
}

/* Emit tokens for a run of quote characters (", ` and ').

   The exact strings `` and '' each count as one double quote; in any
   other run every character gets its own token.  The tokens are added
   behind whatever is already queued in dangling_terminator and the whole
   queue is then printed.

   The scanner passes runs of any length, and the previous unbounded
   strcat() calls overran the 128-byte dangling_terminator after about
   nine quotes; appends are now bounded. */
print_quotes(char *string) 
{
  int i=0;
 
  if ((strlen(dangling_terminator)==0) || last_was_whitespace) pending_action(FALSE);

  if (!strcmp(string,"``") || !strcmp(string,"''"))
    queue_quote_token("\"DOUBLE-QUOTE ");
  else
  while (string[i]!='\0') 
    switch (string[i++]) {
    case '\'': queue_quote_token("'SINGLE-QUOTE "); break;
    case '`':  queue_quote_token("`SINGLE-QUOTE "); break;
    case '"':  queue_quote_token("\"DOUBLE-QUOTE "); break;
    default: break;
    }

  print_dangling_terminator();
}

/* Append one opening-bracket token to dangling_terminator, writing the
   buffer out first if the token would not fit.  print_open_bracket()
   prints the whole buffer at its end, so the output is unchanged. */
static void queue_open_bracket_token(const char *token)
{
  if (strlen(dangling_terminator) + strlen(token) >= sizeof dangling_terminator)
    print_dangling_terminator();
  strcat(dangling_terminator,token);
}

/* Emit a token for each character in a run of opening brackets.  The
   tokens are added behind whatever is already queued in
   dangling_terminator and the whole queue is then printed.

   Runs may be arbitrarily long; the previous unbounded strcat() calls
   could overrun the 128-byte dangling_terminator, so appends are now
   bounded. */
print_open_bracket(char *string)
{
  int i=0;

  if ((strlen(dangling_terminator)==0) || last_was_whitespace) pending_action(FALSE);

  while (string[i]!='\0') 
    switch (string[i++]) {
    case '[': queue_open_bracket_token("[LEFT-BRACKET "); break;
    case '(': queue_open_bracket_token("(LEFT-PAREN "); break;
    case '{': queue_open_bracket_token("{LEFT-BRACE "); break;

    default: break;
    }

  print_dangling_terminator();
}

/* Append one closing-bracket token to dangling_terminator, writing the
   buffer out first if the token would not fit.  print_close_bracket()
   prints the whole buffer at its end, so the output is unchanged. */
static void queue_close_bracket_token(const char *token)
{
  if (strlen(dangling_terminator) + strlen(token) >= sizeof dangling_terminator)
    print_dangling_terminator();
  strcat(dangling_terminator,token);
}

/* Emit a token for each character in a run of closing brackets.  Unlike
   the opening-bracket routine, pending_action() is not called first: the
   tokens simply join whatever is already queued in dangling_terminator,
   and the queue is then printed.

   Runs may be arbitrarily long; the previous unbounded strcat() calls
   could overrun the 128-byte dangling_terminator, so appends are now
   bounded. */
print_close_bracket(char *string)
{
  int i=0;

  while (string[i]!='\0') 
    switch (string[i++]) {
    case ']': queue_close_bracket_token("]RIGHT-BRACKET "); break;
    case ')': queue_close_bracket_token(")RIGHT-PAREN "); break;
    case '}': queue_close_bracket_token("}RIGHT-BRACE "); break;

    default: break;
    }

  print_dangling_terminator();
}

/* Print a hyphenated word or phrase.

   The text is split at every non-alphabetic character.  Each alphabetic
   run is printed followed by a blank; a run of exactly one letter is
   printed as an initial ("A. ").  A '-' between runs becomes "-HYPHEN ",
   any other separator is copied through unchanged.

   The runs are printed straight from the input.  The previous version
   copied each run into a 64-byte local array with no length check, so a
   component of 64 or more letters overran the stack. */
print_hypword(char *string) {
  char *p;
  char *run = string;           /* start of the current alphabetic run */
  int len;

  pending_action(FALSE); 

  for (p = string; ; p++) {
    if (*p != '\0' && isalpha((unsigned char)*p))
      continue;                 /* still inside a run */

    /* the run (possibly empty) ends just before p */
    len = (int)(p - run);
    if (len > 1) printf("%.*s ",len,run);
    else if (len == 1) printf("%c. ",*run);

    if (*p == '\0') break;

    if (*p=='-') printf("-HYPHEN "); else putchar(*p);
    run = p + 1;
  }

  clean_up();
}

/* A word that begins with a hyphen, most likely because a number came
   before it (e.g. the "-year" of "100-year old").  The hyphen is emitted
   as its own token and the remainder is printed as an ordinary word. */

print_mixhypword(char *string) {
  char *word = string + 1;      /* text after the leading hyphen */

  fputs("-HYPHEN ", stdout);
  clean_up();

  print_word(word);
}

/* Decide whether a number should be read as a year rather than as an
   integer.

   Returns 1 when string is exactly four decimal digits with a value from
   1100 to 1999 inclusive, and 0 otherwise; any other character (a comma,
   for instance) disqualifies it.  Of course, this guess cannot be right
   all the time.

   Accepts exactly the strings the previous version accepted, but no
   longer relies on the legacy BSD rindex() or on atoi(), neither of which
   is declared by the headers this file includes, and now has an explicit
   return type. */

int could_be_a_year(char *string)
{
  int value = 0;
  int i;

  if (strlen(string) != 4) return (0);

  for (i = 0; i < 4; i++) {
    if (!isdigit((unsigned char)string[i])) return (0);
    value = value * 10 + (string[i] - '0');
  }

  return ((value >= 1100) && (value <= 1999));
}

/* Print a (hopefully) plain integer in words.  Text that
   could_be_a_year() accepts is read as a year; anything else has its
   commas stripped and is read as an ordinary number. */
print_integer(char *string)
{
  char *spoken;

  pending_action(FALSE); 

  if (could_be_a_year(string))
    spoken = sound_year(string);
  else
    spoken = sound_integer(strip_comma(string));
  printf("%s",spoken);

  clean_up();
}


/* deal with the special case of ordinal numbers:
   the general idea is to get the number part processed normally, then do a specific
   substitutions for the last word of the number
*/

/* Sound out the number part of an ordinal such as "21st".

   string  the token; its two-character suffix is cut off in place
   output  receives the spoken number (sound_integer()'s static buffer)
           with the trailing blank removed

   Returns a pointer to the last word inside *output so that the caller
   can overwrite or extend it.

   strrchr() replaces the legacy rindex(), which <string.h> does not
   declare under strict ISO C; the two length checks keep the function
   from writing before the start of a too-short string. */
char *sound_ordinal(char *string,char **output)
{
  char *ptr;
  size_t len;

  len = strlen(string);
  if (len >= 2) string[len-2] = '\0';       /* erase the "st/etc." string from the end */

  *output = sound_integer(string);          /* convert the number part */

  len = strlen(*output);
  if (len > 0) (*output)[len-1] = '\0';     /* erase the trailing blank */

  ptr = strrchr(*output,' ');               /* go back one word */
  if (!ptr) ptr = *output; else ptr++;      /* ... there was only one word */

  return(ptr);
}

/* Print an ordinal written with the suffix "st" (e.g. "21st").

   When the suffix fits the number, the last spoken word is replaced by
   "first"; otherwise the number is spoken normally and followed by a
   literal "st".

   "st" fits only a number that ends in 1 and is not a teen.  The
   previous check looked at the last digit alone, so "11st" had "eleven"
   overwritten and came out as "first". */
print_first(char *string) {
  char *output,*ptr;
  size_t len;
  int consistent;

  pending_action(FALSE); 

  ptr = sound_ordinal(string,&output);      /* also strips the suffix from string */
  len = strlen(string);
  consistent = (len > 0) && (string[len-1]=='1')
               && !((len > 1) && (string[len-2]=='1'));

  if (consistent) {                         /* is affix consistent? */
    strcpy(ptr,"first");
    printf("%s ",output);
  }
  else printf("%s st ",output);             /* inconsistent, punt on it */
  clean_up();

}

/* Print an ordinal written with the suffix "nd" (e.g. "22nd").

   When the suffix fits the number, the last spoken word is replaced by
   "second"; otherwise the number is spoken normally and followed by a
   literal "nd".

   "nd" fits only a number that ends in 2 and is not a teen.  The
   previous check looked at the last digit alone, so "12nd" had "twelve"
   overwritten and came out as "second". */
print_second(char *string) {
  char *output,*ptr;
  size_t len;
  int consistent;
 
  pending_action(FALSE); 

  ptr = sound_ordinal(string,&output);      /* also strips the suffix from string */
  len = strlen(string);
  consistent = (len > 0) && (string[len-1]=='2')
               && !((len > 1) && (string[len-2]=='1'));

  if (consistent) {                         /* is affix consistent? */
    strcpy(ptr,"second");
    printf("%s ",output);
  }
  else printf("%s nd ",output);             /* inconsistent, punt on it */ 
  clean_up();

}

/* Print an ordinal written with the suffix "rd" (e.g. "23rd").

   When the suffix fits the number, the last spoken word is replaced by
   "third"; otherwise the number is spoken normally and followed by a
   literal "rd".

   "rd" fits only a number that ends in 3 and is not a teen.  The
   previous check looked at the last digit alone, so "13rd" had
   "thirteen" overwritten and came out as "third". */
print_third(char *string) {
  char *output,*ptr;
  size_t len;
  int consistent;

  pending_action(FALSE); 

  ptr = sound_ordinal(string,&output);      /* also strips the suffix from string */
  len = strlen(string);
  consistent = (len > 0) && (string[len-1]=='3')
               && !((len > 1) && (string[len-2]=='1'));

  if (consistent) {                         /* is affix consistent? */
    strcpy(ptr,"third");
    printf("%s ",output);
  }
  else printf("%s rd ",output);             /* inconsistent, punt on it */  
  clean_up();

}

/* Print an ordinal written with the suffix "th" (e.g. "25th").

   The number is sounded out and its last word is turned into an ordinal:
   words listed in the table below are replaced outright, anything else
   simply gets "th" appended. */
print_nth(char *string) {
  /* last words for which tacking on "th" gives the wrong form */
  static const char *irregular[][2] = {
    { "five",    "fifth"      },
    { "eight",   "eighth"     },
    { "nine",    "ninth"      },
    { "twelve",  "twelfth"    },
    { "twenty",  "twentieth"  },
    { "thirty",  "thirtieth"  },
    { "forty",   "fortieth"   },
    { "fifty",   "fiftieth"   },
    { "sixty",   "sixtieth"   },
    { "seventy", "seventieth" },
    { "eighty",  "eightieth"  },
    { "ninety",  "ninetieth"  }
  };
  const int count = (int)(sizeof irregular / sizeof irregular[0]);
  char *spoken, *last;
  int k;

  pending_action(FALSE); 

  last = sound_ordinal(string,&spoken);

  for (k = 0; k < count; k++)
    if (strcmp(last,irregular[k][0]) == 0) break;

  if (k < count)
    strcpy(last,irregular[k][1]);
  else
    strcat(spoken,"th");  /* default ought to work... */

  printf("%s ",spoken);
  clean_up();

}


/* Print a pluralised number such as "1960s" or "'60s".

   The optional leading apostrophe and the final "s" are removed, the
   number is sounded out (as a year when could_be_a_year() accepts it),
   and the last spoken word is made plural: "six" and the tens have their
   own plural forms, every other word simply gets an "s".

   NOTE(review): like the original, this assumes sound_year() returns a
   modifiable string that ends in a blank, as sound_integer() does.

   strrchr() replaces the legacy rindex(), which <string.h> does not
   declare under strict ISO C, and empty strings are now guarded. */
print_plural_integer(char *string)
{
  char *str, *output, *ptr;
  size_t len;

  pending_action(FALSE);

  str = string;
  if (*str=='\'') str++;                   /* skip over the initial apostrophe, if any */
  len = strlen(str);
  if (len > 0) str[len-1] = '\0';          /* erase the "s" string from the end */

  if (could_be_a_year(str)) output = sound_year(str);
  else output =  sound_integer(str);       /* convert the number part */

  len = strlen(output);
  if (len > 0) output[len-1] = '\0';       /* erase the trailing blank */

  ptr = strrchr(output,' ');               /* go back one word */
  if (!ptr) ptr = output; else ptr++;      /* ... there was only one word */

  if (!strcmp(ptr,"six")) strcpy(ptr,"sixes");
  else if (!strcmp(ptr,"twenty")) strcpy(ptr,"twenties");
  else if (!strcmp(ptr,"thirty")) strcpy(ptr,"thirties");
  else if (!strcmp(ptr,"forty")) strcpy(ptr,"forties");
  else if (!strcmp(ptr,"fifty")) strcpy(ptr,"fifties");
  else if (!strcmp(ptr,"sixty")) strcpy(ptr,"sixties");
  else if (!strcmp(ptr,"seventy")) strcpy(ptr,"seventies");
  else if (!strcmp(ptr,"eighty")) strcpy(ptr,"eighties");
  else if (!strcmp(ptr,"ninety")) strcpy(ptr,"nineties");

  else strcat(output, "s");  /* the default pluralization */

  printf("%s ",output);
  clean_up();

}


/* Print a range such as "34-56" or "1960-64".  The string is split in
   place at the first hyphen and each half goes through print_integer(),
   with a "-HYPHEN " token in between.

   strchr() replaces the legacy index(), which <string.h> does not
   declare under strict ISO C.  A string without a hyphen, which the
   calling rule never produces, is printed as a single integer instead of
   dereferencing a NULL pointer. */
print_integer_range(char *string)
{
  char *ptr;

  pending_action(FALSE);

  ptr = strchr(string,'-');
  if (ptr == NULL) {
    print_integer(string);
  }
  else {
    *ptr = '\0';
    print_integer(string);
    printf("-HYPHEN ");
    print_integer(ptr+1);
  }
  clean_up();

}

/* Append word followed by suffix to buf, provided both fit within cap
   bytes including the terminator.  Returns 1 when appended, 0 when the
   pair was dropped because it would not fit. */
static int sound_integer_add(char *buf, size_t cap, const char *word, const char *suffix)
{
  size_t used = strlen(buf);

  if (used + strlen(word) + strlen(suffix) + 1 > cap) return 0;
  strcpy(buf + used, word);
  strcat(buf + used, suffix);
  return 1;
}

/* Convert a string of decimal digits to words, e.g. "123" becomes
   "one hundred and twenty three ".

   Returns a pointer to a static buffer that the next call overwrites.
   Every word, the last included, is followed by one blank.  Callers edit
   the result in place (dropping the blank, replacing or extending the
   last word), so the last 16 bytes of the buffer are never filled here.

   A number of fewer than sixteen digits that does not start with '0' is
   read out in full.  Anything else is read digit by digit with "oh" for
   zero, except that a lone "0" is "zero".

   Changes from the previous version:
   - every append is bounded; a digit-by-digit reading that would not fit
     (roughly 170 digits or more) is cut short instead of overrunning the
     buffer;
   - a string containing a non-digit is read digit by digit with the
     foreign characters skipped, instead of indexing outside the word
     tables;
   - the unused "triads" variable and the sprintf() scratch buffer are
     gone. */

char *sound_integer(char *string) {

  static char buffer[1024];
  const size_t cap = sizeof buffer - 16;   /* head-room for callers' edits */
  int i, digits, q, flag;
  int all_digits;

  buffer[0] = '\0';  /* zero the output buffer */

  flag = 0;
  digits = strlen(string);
  all_digits = (strspn(string,"0123456789") == (size_t)digits);

  if (all_digits && (digits < 16) && (string[0] != '0')) {

    /* loop through digit by digit from beginning to end */
    for (i = 0; i < digits; i++) {
      int d = string[i] - '0';

      /* q is the digit sum of the three digits ending here (or 1 when
         fewer than three digits precede); zero means the group is empty
         and its multiplier must not be spoken */
      if (i > 2)
        q = d + (string[i - 1] - '0') + (string[i - 2] - '0');
      else
        q = 1;

      /* hundreds digit: a zero in the lowest group yields just "and" when
         tens or units follow; a non-zero digit yields "<n> hundred",
         plus "and" when tens or units follow */
      if ((digits - i) % 3 == 0) {
        int rest_nonzero = (string[i + 1] != '0') || (string[i + 2] != '0');

        if ((d == 0) && (i > (digits - 4)) && rest_nonzero)
          sound_integer_add(buffer,cap,"and"," ");
        else if (d > 0) {
          sound_integer_add(buffer,cap,digit_unit[d]," hundred ");
          if (rest_nonzero)
            sound_integer_add(buffer,cap,"and"," ");
        }
      }

      /* tens digit: 1 selects the teen name from the next digit and sets
         flag so that the units step is skipped; 2..9 give the tens name;
         0 gives nothing */
      if ((digits - i) % 3 == 2) {
        if (d == 1) {
          sound_integer_add(buffer,cap,digit_teen[string[i + 1] - '0']," ");
          flag = 1;
        }
        else if (d > 1)
          sound_integer_add(buffer,cap,dig_tens[d]," ");
      }

      /* units digit: spoken unless it is zero or was consumed by a teen;
         then, outside the lowest group and unless the group was all
         zeros, the group name (thousand, million, ...) follows */
      if ((digits - i) % 3 == 1) {
        if (flag == 1)
          flag = 0;
        else if (d != 0)
          sound_integer_add(buffer,cap,digit_unit[d]," ");

        if ((i < (digits - 2)) && (q > 0))
          sound_integer_add(buffer,cap,digit_triad[((digits - i) / 3) - 1]," ");
      }
    }
  }

  /* sixteen or more digits, or a leading '0': read digit by digit */
  else {
    if ((digits == 1) && (string[0] == '0'))
      sound_integer_add(buffer,cap,"zero"," ");
    else
      for (i = 0; i < digits; i++) {
        if (string[i] < '0' || string[i] > '9') continue;   /* not a digit */
        if (!sound_integer_add(buffer,cap,digit_unit[string[i] - '0']," "))
          break;                                             /* buffer full */
      }
  }

  return(buffer);

}


/* Read a digit string out one digit at a time.  The string "0" on its
   own is "zero"; in every other case each '0' is read as "oh". */

print_number(char *string) {

  char *digit;

  pending_action(FALSE);

  if (string[0] == '0' && string[1] == '\0') {
    printf("zero ");
  }
  else {
    for (digit = string; *digit != '\0'; digit++)
      printf("%s ", digit_unit[*digit - '0']);
  }
  clean_up();

}


/* Print a real number, with or without an exponent.

   The token is taken apart as
       [integer digits] [ '.' [fraction digits] ] [ exponent letter, exponent ]
   The integer part is read as a number, the fraction digit by digit
   after ".POINT ", and the exponent through print_exponent().

   A token ending in '.' has no fraction; it may really be an integer at
   the end of a sentence, so dangling_period is set for it (again after
   clean_up()).  If such a token also looks like a year it is read as one.

   The pieces are handed on by temporarily placing a NUL in string and
   restoring the character afterwards.  string must be writable; the
   callers seen here pass yytext or strip_comma()'s result.

   Changes from the previous version:
   - no fixed 20-byte scratch array, which an integer or fraction part of
     20 or more digits used to overrun;
   - a string without '.' or exponent no longer reaches strncpy() with a
     negative length;
   - an exponent that follows an empty fraction ("5.e3") is printed
     instead of being silently dropped. */

print_real(char *string) {

  size_t len = strlen(string);
  size_t j, k;
  char saved;
  int ends_with_period = (len > 0) && (string[len - 1] == '.');

  pending_action(FALSE);

  /* set flag if real has no fraction part (this stuff used to happen
     more or less for free, since reals used to be routed through the
     standard print_integer() code). */
  if (ends_with_period) dangling_period = 1;

  /* integer part: string[0..j-1] */
  j = 0;
  while (isdigit((unsigned char)string[j])) j++;
  if (j > 0) {
    int as_year;

    saved = string[j];
    string[j] = '\0';
    /* if this token looks suspiciously like a date at the end of a 
       sentence, just treat it as such.  */
    as_year = dangling_period && could_be_a_year(string);
    if (!as_year) printf("%s",sound_integer(string));
    string[j] = saved;
    /* NOTE(review): as before, sound_year() is given the complete token,
       trailing '.' included, not just the four digits. */
    if (as_year) printf("%s",sound_year(string));
  }

  /* fraction: the digits string[j+1..k-1] after a '.' at string[j];
     afterwards k indexes the first character not yet consumed */
  k = j;
  if (string[j] == '.') {
    k = j + 1;
    while (isdigit((unsigned char)string[k])) k++;
    if (k > j + 1) {
      saved = string[k];
      string[k] = '\0';
      printf(".POINT ");
      print_number(string + j + 1);
      string[k] = saved;
    }
  }

  /* whatever remains is the exponent letter followed by the exponent;
     skip the letter and pass the rest on if there is any */
  if ((string[k] != '\0') && (string[k + 1] != '\0'))
    print_exponent(string + k + 1);

  clean_up();
  if (ends_with_period) dangling_period = 1;

}



/* Print the exponent of a real number.  string is the exponent text with
   the exponent letter already removed: an optional sign, then digits.
   A '-' is handed to print_integer_minus(); a '+' is skipped. */

print_exponent(char *string) {

  printf("times ten to the power ");

  switch (string[0]) {
  case '-':
    print_integer_minus(string);
    break;
  case '+':
    printf("%s",sound_integer(string + 1));
    break;
  default:
    printf("%s",sound_integer(string));
    break;
  }
}


/* an all-caps word is spelled out; the assumption is apparently that it
   must be an acronym.  This is clearly wrong.  A better solution is to
   first check it against a standard dictionary; if it's there, don't
   spell it out.  This will obviously fail on neologisms.
*/

FILE *dict_fp;
int dictionary_loaded = FALSE; 
int dict_tried = FALSE;

/* Try to open the dictionary named by dict_file for reading.  On success
   dictionary_loaded becomes TRUE; on failure a warning goes to stderr.
   Either way dict_tried is set so the attempt is made only once. */
void find_dict() {
  dict_fp = fopen(dict_file, "r");

  if (dict_fp != NULL) {
    dictionary_loaded = TRUE;
  } else {
    fprintf(stderr,"warning: unable to open dictionary file %s!\n",dict_file);
  }

  dict_tried = TRUE;
}

/* Look `string` up in the open dictionary stream; returns whatever
   search_vocab() reports for it. */
int in_dict(char *string) {
  int found;

  found = search_vocab(dict_fp, string);
  return found;
}

/* Handle an all-caps token.  If the dictionary is loaded and contains the
   token it is printed as an ordinary word; otherwise it is spelled out
   letter by letter, each letter followed by ". ". */
print_caps(char *string) {
  char *letter;

  /* the dictionary is opened lazily, the first time it is needed */
  if (!dict_tried) {
    find_dict();
  }

  if (!(dictionary_loaded && in_dict(string))) {
    pending_action(FALSE);
    for (letter = string; *letter != '\0'; letter++) {
      printf("%c. ", *letter);
    }
    clean_up();
  } else {
    print_word(string);
  }
}

/* Spell out a dotted abbreviation: every character other than '.' is
   printed followed by ". ".  clean_up() is deliberately not called here,
   since it would reset the dangling_period flag set below. */
print_abbrev(char *string) {
  int len = strlen(string);
  char *p;
  char *stop = string + len;

  pending_action(FALSE);

  for (p = string; p < stop; p++) {
    if (*p == '.') {
      continue;
    }
    printf("%c. ", *p);
  }

  /* a multiple-letter abbreviation might also be closing the sentence */
  if (len > 2) {
    dangling_period = 1;
  }

}

/* Handle abbreviations caught by the enumerated list.

   orig     - the abbreviation as it appeared in the input
   string   - the expansion to print
   internal - non-zero when the abbreviation is believed to be
              phrase-internal (e.g. "Mr. Smith")

   The expansion is printed through print_word(), with its first letter
   upper-cased when the original started with an upper-case letter.  The
   caller's `string` is never modified; a private copy is used for the
   case adjustment. */
process_abbrev(char *orig, char *string, int internal) 
{
  size_t len = strlen(orig);
  char *str = malloc(strlen(string) + 1);

  pending_action(FALSE);

  if (str == NULL) {
    /* out of memory: still emit the expansion, just without the
       cosmetic case adjustment */
    print_word(string);
  }
  else {
    strcpy(str, string);

    /* adjust case (for neatness, I guess); the casts keep the <ctype.h>
       calls defined for characters with the high bit set */
    if (isupper((unsigned char)orig[0]) && islower((unsigned char)str[0]))
      str[0] = toupper((unsigned char)str[0]);
    print_word(str);
    free(str);
  }

  /*  Unless there's reason to believe that it's a phrase-internal 
      abbreviation (e.g. "Mr. Smith"), consider that the ending
      period might be an end-of-sentence.  The len check keeps an empty
      `orig` from indexing before the start of the string.  */
  if (!internal && len > 0 && (orig[len-1]=='.')) dangling_period = 1;
}


/* Speaks a negative integer: prints "minus ", steps over the sign
   character, strips any commas from the remaining digits (in place) and
   prints what sound_integer() returns for them. */

print_integer_minus(char *string) {

  char *digits = string + 1;	/* everything after the sign */

  pending_action(FALSE);

  printf("minus ");
  printf("%s", sound_integer(strip_comma(digits)));

  clean_up();

}


/* Speaks a negative real number: prints "minus ", then hands everything
   after the sign character to print_real().

   NOTE(review): print_real() appears to run its own clean_up() and may set
   dangling_period afterwards for a trailing '.'; if so, the clean_up()
   below clears that flag again -- confirm this is intended. */
print_real_minus(char *string) {

  char *unsigned_part = string + 1;	/* skip the sign */

  pending_action(FALSE);

  printf("minus ");
  print_real(unsigned_part);

  clean_up();

}


/* Takes a numeric date laid out as day, separator, month, separator, year
   and prints it in words.  A "separator" is any character that sorts
   below '0' (the code tests `c - '0' < 0`).

   NOTE(review): the month[] table is indexed without a range check; this
   relies on the scanner only passing months 1..12 -- confirm against the
   matching rule. */

print_date(char *string) {

  char substring[10];

/* Day: if the second character is not a digit the day has one digit,
otherwise two.  A leading zero is dropped.  The day is sounded out with
sound_integer and the string pointer is moved to the first character of
the month. */

  if(string[1] - '0' < 0) {
    substring[0] = string[0];
    substring[1] = 0;
    string = string + 2;
    }
  else {
    if(string[0] == '0') {
      substring[0] = string[1];
      substring[1] = 0;
      }
    else {
      substring[0] = string[0];
      substring[1] = string[1];
      substring[2] = 0;
      }
    string = string + 3;
    }
  printf("%s",sound_integer(substring));

/* Month: exactly one of the three layouts below is chosen, judged on the
month field itself.  The tests must be mutually exclusive: once a branch
has advanced the pointer, `string` is the year, and re-testing it (as
independent ifs would) misreads a year such as "05" as a second month
and steps past the end of the string. */

  if(string[1] - '0' < 0) {
    /* one-digit month */
    printf("%s ", month[string[0] - '0' - 1]);
    string = string + 2;
    }
  else if(string[0] - '0' == 1) {
    /* two-digit month 10..12 */
    printf("%s ", month[string[1] - '0' + 9]);
    string = string + 3;
    }
  else if(string[0] - '0' == 0) {
    /* two-digit month with a leading zero */
    printf("%s ", month[string[1] - '0' - 1]);
    string = string + 3;
    }

/* Year: a two-digit year is always prefixed with 'nineteen'; if both
digits are zero 'hundred' follows, otherwise print_integer prints them.
Any other length is handed to print_integer whole, which contains the
exception rules that make 1066 come out as TEN SIXTY SIX. */

  if(strlen(string) == 2) {
    printf("nineteen ");
    if((string[0] == '0') && (string[1] == '0'))
      printf("hundred ");
    else
      print_integer(string);
    }
  else
    print_integer(string);
  clean_up();

}


/* If print_integer finds something with four digits that could be
pronounced in year format it sends it here.  This routine renders wxyz
in the form wx yz, so that 1991 becomes NINETEEN NINETY ONE.  If the last
two digits are zeros then the word 'hundred' is substituted.

Returns a pointer to a static buffer holding the words; the contents are
overwritten by the next call, so use or copy the result before calling
again.  (The buffer has to be static: handing back the address of an
automatic array would leave the caller with a dangling pointer.) */

char *sound_year(char *string) {

  static char buffer[1024];
  char substring[10];
  char *words;

  /* first pair of digits */
  strncpy(substring, string, 2);
  substring[2] = 0;
  buffer[0] = 0;
  strncat(buffer, sound_integer(substring), sizeof buffer - 1);

  /* second pair; only looked at when the first pair was really there,
     so a short string is never read past its terminator */
  if(substring[1] != 0) {
    string = string + 2;
    if((string[0] - '0' == 0) && (string[1] - '0' == 0))
      words = "hundred ";
    else
      words = sound_integer(string);
    strncat(buffer, words, sizeof buffer - strlen(buffer) - 1);
    }

  return(buffer);
}


/* Handles a word that was hyphenated across a line break.  Everything
except the final character of the token (the hyphen) is printed, with
no trailing space or newline, so that whatever is printed next is joined
directly onto it.  The characters are printed as they are; their case is
not changed. */

print_split(char *string) {

  int keep = (int)strlen(string) - 1;	/* all but the last character */

  if (keep > 0) {
    printf("%.*s", keep, string);
  }
}


/* prints out times including constructs such as 9am, 10:47pm, 9:01, 22:47 etc.
   The hour and (when a ':' follows it) the two minute digits are sounded
   out with sound_integer; an 'a' or 'p' marker becomes "A. M. " / "P. M. ". */

print_time(char *string) {

  size_t len = strlen(string);
  size_t j = 0;			/* number of leading digits (the hour) */
  size_t n;
  char substring[10];

/* find out how many digits in hour. separate hour and print. */

  while(j < len && isdigit((unsigned char)string[j]))
    j++;
  if(j > 0) {
    /* never copy more than the local buffer can hold */
    n = (j < sizeof substring) ? j : sizeof substring - 1;
    strncpy(substring, string, n);
    substring[n] = 0;
    printf("%s",sound_integer(substring));
    }

/* if next char is ':' then pull off minutes and print. */

  if(string[j] == ':') {
    strncpy(substring, string + j + 1, 2);
    substring[2] = 0;
    printf("%s",sound_integer(substring));
    }

/* if 'am' or 'pm' there are two possibilities. Could be after hour as
in 9am or after minutes 9:01am this is not elegant but it works!
The second probe is only made when index j + 3 lies inside the string,
so nothing beyond the terminator is ever read. */

  if(string[j] == 'a')
    printf("A. M. ");
  if(string[j] == 'p')
    printf("P. M. ");
  if((len > 5) && (j + 3 < len)) {
    if(string[j + 3] == 'a')
      printf("A. M. ");
    if(string[j + 3] == 'p')
      printf("P. M. ");
    }
  clean_up();

}


/* deal with expressions of the form "$1.23 million" or "$12 billion".

   The amount (everything after the first character up to the last digit)
   is spoken with print_real() when it contains a '.', otherwise with
   sound_integer(); the multiplier word that follows is then printed,
   followed by "dollars ".

   The token is NUL-terminated in place only while the amount is being
   spoken and is restored afterwards, so a multiplier that touches the
   digits keeps its first letter.  Both scans are bounded by the ends of
   the string. */
print_big_money(char *string) {
  char *end = string + strlen(string);
  char *num_end = end;		/* one past the last digit */
  char *word;			/* start of the million/billion word */
  char saved;

  pending_action(FALSE);

  /* find the last digit, without running off the front of the string */
  while (num_end > string && !isdigit((unsigned char)num_end[-1]))
    num_end--;

  /* find the start of the multiplier word, without running off the end */
  word = num_end;
  while (word < end && !isalpha((unsigned char)*word))
    word++;

  /* speak the amount, if there is one after the leading currency sign */
  if (num_end > string + 1) {
    saved = *num_end;
    *num_end = '\0';		/* cap it off */
    if (rindex(string,'.')) print_real(string+1); 
      else printf("%s",sound_integer(string+1));
    *num_end = saved;
  }

  printf("%s dollars ",word);
  clean_up();

}


/* routine to cope with money. Supports $2M as well as $2.12

   The first character of the token is taken to be the currency sign.
   Two layouts are handled:
     - token ends in a digit: "<integer> dollars", then, if anything
       follows the integer part, the next-but-one two characters are read
       as cents and printed as "and <cents> cents" unless both are '0';
     - token ends in a suffix letter: integer part, optional ".POINT" plus
       decimal digits, then the multiplier for K / M / B, then "dollars".
   Every path finishes with clean_up(). */

print_money(char *string) {

  int i, j, k, n;
  int len = strlen(string);
  char substring[15];
  int max_digits = (int)sizeof substring - 1;

  pending_action(FALSE);

  /* j = index of the first character after the run of digits that follows
     the currency sign (0 when string[1] is not a digit) */
  j = 0;
  for(i = 1; i < len && isdigit((unsigned char)string[i]); i++)
    j = i + 1;

  if(len > 0 && isdigit((unsigned char)string[len - 1])) {

/* figure does not contain suffix letter therefore read as 'integer
dollars and integer cents' first separate the dollars integer part and
use print_integer to print. Follow by 'DOLLARS'. */ 

    if(j > 1) { 
      n = (j - 1 < max_digits) ? j - 1 : max_digits;	/* stay inside substring */
      strncpy(substring, string + 1, n); 
      substring[n] = 0; 
      print_integer(substring); 
    }
    printf("dollars ");

    /* no cents, no nothing: fall through to the common clean_up() below
       rather than returning early and skipping the housekeeping */
    if(string[j] != '\0') {

      /* strip out two digits which are read by sound_integer and
         followed by 'CENTS' unless both are zero. */

      strncpy(substring, string + j + 1, 2);
      substring[2] = 0;
      if((substring[0] != '0') || (substring[1] != '0')) {
        printf("and ");
        printf("%s",sound_integer(substring));
        printf("cents ");
      }
    }
  }

/* otherwise we have something of the form integer plus decimal times
multiplier dollars eg $2.345M print integer part as before then follow
by 'POINT' if there is decimal part. Decimal part then found and
read by print_number. */

  else {   
    if(j > 1) {
      n = (j - 1 < max_digits) ? j - 1 : max_digits;
      strncpy(substring, string + 1, n);
      substring[n] = 0;
      printf("%s",sound_integer(substring));
      }

    /* k = index of the first character after the decimal digits
       (0 when there are none); the character at j itself is skipped */
    k = 0;
    for(i = j + 1; i < len && isdigit((unsigned char)string[i]); i++)
      k = i + 1;
    if(k - j > 1) {
      n = (k - j - 1 < max_digits) ? k - j - 1 : max_digits;
      strncpy(substring, string + j + 1, n);
      substring[n] = 0;
      printf(".POINT ");
      print_number(substring);
      }

/* now print out correct multiplier followed by 'DOLLARS' */

    if(k == 0)
      k = j;
    switch(string[k]) {
      case 'K': printf("thousand "); break;
      case 'M': printf("million ");  break;
      case 'B': printf("billion ");  break;
      default:  break;
      }
    printf("dollars ");
    }
  clean_up();

}

    
/* Strips the commas out of a string, in place, and returns the same
   pointer.  Done in a single pass: `src` reads every character and `dst`
   writes back only the ones that are not commas, so the cost is linear in
   the length of the string rather than one shift of the tail per comma. */

char *strip_comma(char *string) {

  char *src = string;
  char *dst = string;

  while(*src != 0) {
    if(*src != ',')
      *dst++ = *src;
    src++;
    }
  *dst = 0;

  return string;
}


/* Prints out words with apostrophes without modification: the token is
   handed straight to print_word(), so it gets exactly the treatment of an
   ordinary word.
   Might want to do more for stuff like initials ("Z's"). */

print_apost(char *string) {

    print_word(string);

}


/* As a special case, deal with words that end in a single apostrophe.
   Unless the preceding character is an S (either case), treat the
   apostrophe as a closing quote: it is cut off the token (in place) and
   the word is printed followed by 'SINGLE-QUOTE.  When the preceding
   character is an S the whole token goes to print_word().

   Tokens shorter than two characters have no preceding character; they
   are handled without looking in front of the string.

   NOTE(review): the closing-quote branch prints directly and so does not
   go through print_word(), pending_action() or clean_up() -- confirm that
   this is intended. */

print_apost_single(char *string) {

  size_t len = strlen(string);

  if ((len >= 2) && ((string[len-2]=='s')||(string[len-2]=='S'))) {
    print_word(string);
  } else if (len > 0) {
    string[len-1] = '\0';
    printf("%s 'SINGLE-QUOTE ",string);
  }
}




/* Print an ordinary word followed by a space.

   If a period is still "dangling" from the previous token and this word
   starts with an upper-case letter, the period is taken to have ended a
   sentence: ".PERIOD " is emitted and the end/start-of-sentence and
   new-line flags are raised before the word is printed.  Otherwise the
   dangling period is simply forgotten.

   A one-character word other than a, A or I is printed as an initial,
   i.e. followed by ". ". */
print_word(char *string) {

  /* check if the current word has characteristics of a real word */
  /* basically: starts with an upper-case */
  /* the logic here will obviously fail in some cases... */
  if(dangling_period) {
    /* isupper() already implies "is a letter"; the cast keeps the call
       defined for characters with the high bit set */
    if (isupper((unsigned char)*string))  {
	printf(".PERIOD ");
	end_of_sentence = start_of_sentence = need_a_new_line = TRUE;
    }
    dangling_period = 0; /* if it doesn't take, forget about it... */
  }

  pending_action(FALSE);

  /* check for single-letter initials; ignore the ones that commonly
     stand on their own (a, I);  not a perfect rule */
  if (string[0] != '\0' && string[1] == '\0') {
    switch (string[0]) {
	case 'a':
	case 'A':
	case 'I':  printf("%s ",string); break;
	 default:  printf("%s. ",string); break;
    }
  }
  else printf("%s ", string);
  clean_up();
}


/* Start a new paragraph: echo the paragraph tag, reset the sentence
   counter, mark that a sentence is about to start, and remember the
   paragraph ID string.

   The ID is the text between the first '.' and the last '>' of the tag.
   It is copied with an explicit length, clipped to the size of
   paragraph_id, so an over-long ID cannot overrun the fixed-size buffer.
   If either delimiter is missing, or the '>' comes before the '.', the
   tag is treated as malformed and the ID becomes "???". */
start_paragraph (char *string)
{ 
  char *p,*q;
  size_t n;

  start_of_sentence = 1;
  sentence_count = 0;
  printf("%s",string);
  p = index(string,'.');
  q = rindex(string,'>');
  if ((p==0)||(q==0)||(q<p)) strcpy(paragraph_id,"???"); /* apparently malformed id */
  else
  {
    n = (size_t)(q - p) - 1;			/* characters between '.' and '>' */
    if (n > sizeof paragraph_id - 1)
      n = sizeof paragraph_id - 1;		/* clip to the buffer */
    strncpy(paragraph_id,p+1,n);
    paragraph_id[n]='\0';
  }
}

/* various housekeeping stuff at the end of a paragraph: 
   flush pending output, etc. */
process_paragraph()
{
  /* end of pgh probably means it was a real period. */
  if (dangling_period) { 
    printf(".PERIOD ");
    dangling_period = 0;
  }

  need_a_new_line   = 1;
  end_of_sentence   = 1;
  start_of_sentence = 0;

  pending_action(FALSE);

  printf("</p>\n");
  clean_up();
}


process_whitespace() 
{
  last_was_whitespace = 1;
}


/* Flush whatever is pending before the next token is processed, in this
   order: the accumulated dangling_terminator text, a deferred new line,
   a deferred end-of-sentence tag (at most once per sentence), and a
   deferred start-of-sentence tag carrying the paragraph id and the
   running sentence number.  The flush_flag argument is not used. */
void pending_action (int flush_flag) {

  fputs(dangling_terminator, stdout);
  dangling_terminator[0] = '\0';

  if (need_a_new_line) {
    need_a_new_line = 0;
    putchar('\n');
  }

  if (end_of_sentence && !sentence_ended) {
    fputs("</s>\n", stdout);
    sentence_ended = 1;
    end_of_sentence = 0;
  }

  if (start_of_sentence) {
    start_of_sentence = 0;
    sentence_count += 1;
    printf("<s.%s.%d>\n",paragraph_id,sentence_count);
  }
}

/* Housekeeping after every 'real' token: clears the dangling-period,
   last-was-whitespace and sentence-ended flags. */
clean_up () {

  sentence_ended = last_was_whitespace = dangling_period = 0;

}


/* End-of-input hook: invokes flex's yyterminate().
   NOTE(review): in flex-generated scanners yyterminate() is normally a
   macro that expands to a `return` statement.  If that holds here, this
   call only returns from finish_up() itself and does not stop yylex() --
   confirm against the generated scanner and the call site. */
finish_up()
{
  yyterminate();
}
