 /*
   [20jan94] (air)
   adding some stuff to identify tabular material: paragraphs with
   lots more puncts and numbers, relative to words.

   [15jan94] (air) 
   fixed to handle WSJ90-92 data 
	- wsj90-92 preamble format is more complex than wsj87-89
	- lead paragraph attached to following text
	- separate stories within TEXTs made into separate TEXTs
	  (these are things like separate letters to the editor)
   some minor efficiency fixes.

   [8oct93] (air)
   Strips out some of the  SGML stuff in ap, wsj files
   and generally spruces up the text.

   It assumes the following model for the data:
   <DOC>
   ...				|
   <DOCNO> ... </DOCNO>		| preamble
   ...				|
   <TEXT>    |
   ...       | this part repeats 1+ times.
   </TEXT>   |
   </DOC>

   Anything else will be trashed, possibly silently.

  All paragraphs will be numbered consecutively ([1..n]) within TEXTs. 
  Note that the id scheme differs from Doug Paul's, though it can be
  easily fixed if need be (this is due to multiple texts per document).
  All paragraph id's should be unique, though no actual test for this 
  is performed.
  */

%{
#include <string.h>	/* strcpy, strncmp */
#include <strings.h>

#define TRUE 1
#define FALSE 0

/*
 * Scanner state.  All of it is file-global because the rule actions and
 * the handler functions in the user-code section communicate only
 * through these variables.
 */
int in_doc = FALSE;		/* set at <DOC>, cleared at </DOC>; not read anywhere in this file */
int in_text = FALSE;		/* inside a TEXT (or lead paragraph); cleared at </TEXT> and </DOC> */
int in_preamble = FALSE;	/* between <DOC> and the first start of text */
int first_para = FALSE;		/* no paragraph has been opened yet in the current TEXT */
int first_sent = FALSE;		/* set whenever a paragraph is opened; not read anywhere in this file */
int ready_for_docno = FALSE;	/* <DOCNO> seen: the next word token is taken as the document id */
int lead_paragraph = FALSE;	/* <LP> seen: the TEXT was already opened, so <TEXT> must not reopen it */
int need_EOP = FALSE;		/* set at </LP>; makes </DOC> emit a closing </p> first */
int between_texts = FALSE;	/* set at </LP>; whitespace and newlines are dropped until text starts */
int word_count,number_count,punct_count;	/* token tallies for the current paragraph */

int wsj90_92 = FALSE;		/* current document id starts with WSJ90, WSJ91 or WSJ92 */

char para_id[32];		/* document id, used as the prefix of every paragraph id */
int para_num, text_num;		/* paragraph number within the TEXT, TEXT number within the document */

/*
 * Handlers defined in the user-code section.  They are called from the
 * rule actions, which the generated scanner places ahead of their
 * definitions, so they need to be declared here.
 */
int start_document(char *string);
int finish_document(char *string);
int start_paragraph_id(void);
int divide_text(char *string);
int start_text(char *string);
int end_text(char *string);
int print_docno(char *string);
int print_token(char *string);
int print_wspace(char *string);
int print_blank(char *string);
int print_punct(char *string);
int print_number(char *string);
int print_nl(char *string);
int map_symbol(char *string);
%}


/* Named patterns used by the rules section. */
/* PUNCT:  one character that is not a letter, digit, blank, tab or newline. */
/* NL:     a run of one or more newlines.                                     */
/* STRING: a run of ASCII letters (a "word").                                 */
/* NUMBER: a run of decimal digits.                                           */
/* WSPACE: a run of blanks and/or tabs.                                       */
PUNCT		    [^A-Za-z0-9 \t\n]
NL                  [\n]+
STRING              [A-Za-z]+
NUMBER		    [0123456789]+
WSPACE              [ \t]+

%%

"<DOC>"			{ /* echo the tag; the preamble starts here */ start_document(yytext);}
"<DOCNO>"      		{ /* the next word token will be taken as the document id */ ready_for_docno= TRUE;}
"WSJ"{NUMBER}-{NUMBER}  { /* a complete WSJ id in one token; handled whether or not DOCNO was seen */ print_docno(yytext);}
"</DOCNO>"  		{ready_for_docno= FALSE;}
"<LP>"			{ /* lead paragraph opens the TEXT early; the later TEXT tag then only clears between_texts */ start_text(yytext); lead_paragraph= TRUE;}
[\n]*"</LP>\n"		{ /* whitespace and newlines are dropped from here until start_text runs again */ need_EOP = TRUE; between_texts = TRUE;}
"<TEXT>"		{start_text(yytext);}
"</TEXT>"		{end_text(yytext);}
"   ---"[\n]*		{ /* separator between unrelated items that share one TEXT */ divide_text(yytext);}
"</DOC>"		{finish_document(yytext);}
^"   "			{ /* three blanks at the start of a line mark a new paragraph */ start_paragraph_id();}

"]"			{ /* listed ahead of the PUNCT rule so it wins the tie for a lone right bracket */ map_symbol(yytext);}

{STRING}		{ /* word */ print_token(yytext);}
^{WSPACE}[\n]+		{ /* whitespace-only line: discarded */ print_blank(yytext);}
{WSPACE}		{ /* run of blanks and tabs: collapsed to one blank */ print_wspace(yytext);}
{PUNCT}			{ /* any other single non-alphanumeric character */ print_punct(yytext);}
{NUMBER}		{ /* run of digits */ print_number(yytext);}
{NL}			{ /* run of newlines: collapsed to one newline */ print_nl(yytext);}

%%
#include <stdio.h>

/* Cut-off for a paragraph's (punctuation + numbers) / words ratio: the
   per-paragraph statistics line written to stderr is marked with "*"
   when the ratio is greater than this value. */
#define THRESHOLD 5.0

/*
 * Handle the opening document tag.
 *
 * Echoes `string` (the matched tag) on a line of its own, marks the
 * scanner as being inside a document and inside its preamble, and
 * restarts TEXT numbering for the new document.
 *
 * Always returns 0.
 */
int start_document(char *string)
{
  printf("%s\n", string);

  in_doc = TRUE;
  in_preamble = TRUE;
  text_num = 0;
  return 0;
}

/*
 * Handle the closing document tag.
 *
 * If a lead paragraph was closed without a following end of text
 * (need_EOP still set), the pending "</p>" is emitted first.  Then
 * `string` (the matched tag) is echoed on a line of its own and the
 * document/text state flags are cleared.
 *
 * Always returns 0.
 */
int finish_document(char *string)
{
  if (need_EOP) {
    printf("</p>\n");
    need_EOP = FALSE;
  }
  printf("%s\n", string);

  in_doc = FALSE;
  lead_paragraph = FALSE;
  in_text = FALSE;
  return 0;
}

start_paragraph_id()
{
  if (!in_text) return(0);
  para_num++;
  if (!first_para) printf("</p>\n");
  printf("<p.%s-%d.%d>\n",para_id,text_num,para_num);
  first_sent = TRUE;

  if (!first_para) {
  fprintf(stderr,"%s-%d.%d\twrds:%d\tnos:%d\tpunct:%d\t%4.2f %s\n",
	para_id,text_num,(para_num-1),
	word_count,number_count,punct_count,
	(float)(punct_count+number_count)/(float)word_count,
	(((float)(punct_count+number_count)/(float)word_count)>THRESHOLD)?"*":"");
  word_count = number_count = punct_count = 0;
  }

  first_para = FALSE;
}

/* unrelated items in wsj90-92 may be in the same TEXT; split these */
divide_text(char *string)
{
  if(!in_text) return (0);
  printf("</p>\n</TEXT>\n<TEXT>\n");
  text_num++;
  first_para=TRUE;
  para_num=0;
  start_paragraph_id();
}

start_text(char *string)
{
  between_texts = FALSE;
  if (lead_paragraph) return (0); /* marker and flags already dealt with */
  printf("<TEXT>",string);
  in_text = TRUE; 
  in_preamble = FALSE;
  text_num++;
  first_para=TRUE;
  para_num=0;
}

end_text(char *string)
{
  printf("</p>\n%s\n",yytext);
  lead_paragraph= FALSE;
  in_text= FALSE;
  need_EOP = FALSE;

  fprintf(stderr,"%s-%d.%d\twrds:%d\tnos:%d\tpunct:%d\t%4.2f %s\n",
	para_id,text_num,(para_num-1),
	word_count,number_count,punct_count,
	(float)(punct_count+number_count)/(float)word_count,
	(((float)(punct_count+number_count)/(float)word_count)>THRESHOLD)?"*":"");
  word_count = number_count = punct_count = 0;

}

/*
 * Record and echo the document id.
 *
 * Prints `string` wrapped in a DOCNO element and keeps a copy in
 * para_id, where it serves as the prefix of every paragraph id.  The
 * copy is bounded by the size of para_id: an id that does not fit is
 * truncated there rather than written past the end of the buffer (the
 * echoed DOCNO line always carries the full id).
 *
 * Also guesses the origin of the document from the id so that
 * corpus-specific fixes can be applied: wsj90_92 is set exactly when the
 * id starts with WSJ90, WSJ91 or WSJ92.
 *
 * Always returns 0.
 */
int print_docno(char *string)
{
  /* preserve the doc number for future reference */
  printf("<DOCNO> %s </DOCNO>\n",string);
  snprintf(para_id, sizeof para_id, "%s", string);

  if (strncmp(string,"WSJ90",5) == 0 ||
      strncmp(string,"WSJ91",5) == 0 ||
      strncmp(string,"WSJ92",5) == 0)
    wsj90_92 = TRUE;
  else
    wsj90_92 = FALSE;

  return 0;
}

print_token (char *string)  
{
//printf(" [t %s]",string);
   if (ready_for_docno) {
      print_docno(string);
      ready_for_docno = 0;
   }
   else
   if (in_preamble) return(0);
   else
   if (in_text) printf("%s",yytext);
   word_count++;
}
 
/*
 * Handle a run of blanks and tabs.
 *
 * Inside a TEXT the whole run is replaced by a single blank; in the
 * preamble, between texts, or outside a TEXT nothing is printed.
 * `string` (the matched run) is therefore never echoed.
 *
 * Always returns 0.
 */
int print_wspace(char *string)
{
  if (in_preamble || between_texts) return 0;

  if (in_text) printf(" "); /* collapse extended whitespace */
  return 0;
}
 
/* do not emit blank lines */
/*
 * Called for a line that holds only blanks and tabs.  The matched text
 * in `string` is discarded: nothing is printed and no state changes.
 *
 * Always returns 0.
 */
int print_blank(char *string)
{
  (void)string;
  return 0;
}

/*
 * Handle a single punctuation character.
 *
 * Dropped inside the preamble; otherwise echoed and counted towards the
 * current paragraph's punctuation tally.
 *
 * NOTE(review): unlike print_token this echoes whenever the scanner is
 * not in the preamble, including outside a TEXT; confirm that is
 * intended.
 *
 * Always returns 0.
 */
int print_punct(char *string)
{
  if (in_preamble) return 0;

  printf("%s",string);
  punct_count++;
  return 0;
}

/*
 * Handle a run of digits.
 *
 * Dropped inside the preamble; otherwise echoed and counted towards the
 * current paragraph's number tally.
 *
 * NOTE(review): unlike print_token this echoes whenever the scanner is
 * not in the preamble, including outside a TEXT; confirm that is
 * intended.
 *
 * Always returns 0.
 */
int print_number(char *string)
{
  if (in_preamble) return 0;

  printf("%s",string);
  number_count++;
  return 0;
}

/*
 * Handle a run of newlines.
 *
 * Inside a TEXT the whole run is replaced by a single newline; in the
 * preamble, between texts, or outside a TEXT nothing is printed.
 * `string` (the matched run) is therefore never echoed.
 *
 * Always returns 0.
 */
int print_nl(char *string)
{
  if (in_preamble || between_texts) return 0;

  if (in_text) printf("\n"); /* collapse multiple nl's */
  return 0;
}
 
/* remap some symbols that got lost during original text preparation */
/*
 * `string` is the matched symbol.  In WSJ90-92 documents a right
 * bracket stands for an exclamation mark and is replaced by one; in any
 * other document, and for any other symbol, the text is passed on
 * unchanged instead of being dropped.
 *
 * The result is handed to print_punct, so a remapped symbol is treated
 * like every other punctuation character: suppressed inside the
 * preamble, echoed and counted otherwise.
 *
 * Always returns 0.
 */
int map_symbol(char *string)
{
  static char exclamation[] = "!";
  char *mapped = string;

  switch(string[0]) {
	case ']': if (wsj90_92) mapped = exclamation; break;
	default:  break;
  }

  print_punct(mapped);
  return 0;
}

/**/
