// ======================================================================
// align.C
// Align a bilingual pair of documents at the sentence level.
// Adam Berger 11/99 updated 2/00
//
// Copyright (C) 2000, Carnegie Mellon University and Adam Berger
// All rights reserved.
//
// This software is made available for research purposes only.  It may be
// redistributed freely for this purpose, in full or in part, provided
// that this entire copyright notice is included on any copies of this
// software and applications and derivations thereof.
//
// This software is provided on an "as is" basis, without warranty of any
// kind, either expressed or implied, as to any matter including, but not
// limited to warranty of fitness of purpose, or merchantability, or
// results obtained from use of this software.
// ======================================================================

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>

#include "User.H"
#include "Array.H"
#include "LogProb.H"
#include "WordList.H"

// for tokenization: spaceOutTokens() surrounds each of these characters
// with spaces so that it becomes a token of its own
static const char *DELIMITERS = "'-.,:;()[]{}\"'`"; 

// set to 1 to compile in the trace output of the alignment loop in main()
#define _DEBUG_ 0

// back-pointer kept for each cell of the alignment table in main(): the
// (e,f) cell from which that cell's best score was reached
typedef struct {int e; int f;}  TRACEBACK;

static Array<char *>  rawEnglishText, rawFrenchText; // straight from disk
static Array<Array<char *> > englishText, frenchText;// tokenized
static Array<int> frenchAnchorOfEnglishAnchor;       // anchor alignment
static double nullProb;                  // set in main() to 1/u_getNFrenchTerms()
static char *anchorText;                 // anchor prefix, copied from argv[3]

// Report an unrecoverable error and terminate the program.
//   msg, ...: printf-style format string and its arguments.
// Writes "FATAL ERROR: <formatted message>\n" to stderr, then calls
// abort().  Never returns.
void fatalError(const char *msg, ...) {
  va_list args;
  va_start(args, msg);
  // Format directly onto stderr rather than through a fixed-size scratch
  // buffer, so an arbitrarily long message (e.g. one containing a long
  // path) cannot overflow anything.
  fputs("FATAL ERROR: ", stderr);
  vfprintf(stderr, msg, args);
  fputs("\n", stderr);
  va_end(args);
  abort();
}

// Copy 'in' to 'out', putting a space on each side of every character
// that appears in DELIMITERS; all other characters are copied unchanged.
//   e.g. in="(this.is)" gives out=" ( this . is ) "
// 'out' must have room for 3*strlen(in)+1 bytes (worst case: every
// input character is a delimiter).  The result is NUL-terminated.
void spaceOutTokens(const char *in, char *out) { 
  char *dst = out;
  // walk to the terminator instead of re-evaluating strlen(in) on
  // every iteration
  for (const char *src = in; *src; src++) {
    if (strchr(DELIMITERS, *src)) { *dst++ = ' '; *dst++ = *src; *dst++ = ' '; }
    else *dst++ = *src;
  }
  *dst = 0;
}

// Split 'buff' on spaces, tabs and newlines, storing a strdup'ed copy of
// each piece in 'tokens' in order of appearance.
// 'buff' is modified in place by strtok().  'tokens' is first grown to
// strlen(buff) entries (an upper bound on the number of tokens) and is
// finally passed the actual token count -- presumably Array::Grow sets
// the array's size; see Array.H.
void tokenizeString(char *buff, Array<char *> &tokens) { 
  const char *separators = " \t\n";
  tokens.Grow(strlen(buff));

  int count = 0;
  char *word = strtok(buff, separators);
  while (word) {
    tokens[count] = strdup(word);
    count++;
    word = strtok(NULL, separators);
  }
  tokens.Grow(count);
}

// Return 1 if 'buff' consists solely of newlines, tabs and spaces
// (the empty string included), 0 otherwise.
int isBlank(const char *buff) { 
  // measure the leading run of blank characters; the string is blank
  // exactly when that run extends to the terminating NUL
  const size_t lead = strspn(buff, "\n\t ");
  if (buff[lead] == '\0') return 1;
  return 0;
}

int isAnchor(const char *buff) { 
  // Does anchor string reside in the beginning of buff?
  const int N = strlen(anchorText);
  if ((int) strlen(buff) < N) return 0;
  return (strncmp(buff, anchorText, N)==0)? 1 : 0;
}

// Return 1 if the two token arrays have the same number of tokens and
// every pair of corresponding tokens compares equal under strcmp();
// return 0 otherwise.
int sentencesAreEqual(const Array<char *> &a1, const Array<char *> &a2) {
  if (a1.Size()!=a2.Size()) return 0;

  // advance while tokens agree; reaching the end means no mismatch
  const int len = (int) a1.Size();
  int pos = 0;
  while (pos < len && strcmp(a1[pos], a2[pos]) == 0)
    pos++;
  return (pos == len)? 1 : 0;
}

// Write the tokens of 'sentence' to stdout on one line, each token
// followed by a single space, and finish the line with a newline.
void printSentence(const Array<char *> &sentence) {
  const int nTokens = (int) sentence.Size();
  int t = 0;
  while (t < nTokens) {
    printf("%s ", sentence[t]);
    t++;
  }
  printf("\n");
}

// Read all text in the file at 'path'.  Sentences are assumed to be
// newline-delimited, one per line.
//   raw[i]:    i'th line of the file with trailing whitespace removed.
//   cooked[i]: array of tokens from raw[i]; an anchor line is not
//              tokenized but kept whole as a single token.
// Returns the number of anchor lines found.  Calls fatalError() if the
// file cannot be opened or if any line is blank.
int readData(const char *path, 
             Array<char*> &raw, Array<Array<char*> > &cooked)  { 
  // 1st pass: count sentences 
  FILE *f = fopen (path, "r");
  if (!f) fatalError("Cannot open %s for input\n", path);
  char buff[MAXLINE];
  int n;
  for (n=0; fgets(buff, MAXLINE, f); n++) 
    if (isBlank(buff)) fatalError("Line %i of %s was blank\n", n+1,path);
  raw.Grow(n);
  cooked.Grow(n); 
  
  // 2nd pass: read in sentences and tokenize
  rewind(f);
  int nAnchors=0;
  for (n=0; fgets(buff, MAXLINE, f); n++) { 
    // excise trailing spaces from line; the x>=0 test keeps the scan
    // from stepping in front of the buffer
    int x;
    for (x=(int) strlen(buff)-1; x>=0 && strchr("\t\n ", buff[x]); x--);
    x++;
    buff[x]=0;
    raw[n] = strdup(buff);
    if (isAnchor(buff)) {
      cooked[n].Grow(1); 
      cooked[n][0] = strdup(buff); 
      nAnchors++;
    }
    else { 
      // spaceOutTokens() can triple the length of its input
      char newbuff[3*MAXLINE];
      spaceOutTokens(buff, newbuff);  // space out things like ( and :
      tokenizeString(newbuff, cooked[n]);
    }
  }
  fclose(f);                     // done with the file; don't leak the handle
  fprintf(stderr, "Read %i sentences from %s; found %i anchors.\n", 
          n, path, nAnchors);
  return nAnchors;
}


void alignAnchors(int nE, int nF) { 
  // fill in 'frenchAnchorOfEnglishAnchor' array, a correspondence 
  // between anchors in the two texts.

 // pass through French text, putting anchors into a set
  const int F = frenchText.Size();
  WordList frenchAnchors(nF);
  Array<int> positionOfFrenchAnchor(nF);
  int nFAnchors=0, nEAnchors=0;
  for (int f=0; f<F; f++) {
    const char *putativeAnchor = frenchText[f][0];
    if (!isAnchor(putativeAnchor)) continue;
    nFAnchors++;
    if (frenchAnchors[putativeAnchor]!=-1) 
      fatalError("In line %i of French, found duplicate anchor %s\n",
		 f, putativeAnchor);
    const int idx = frenchAnchors.add(putativeAnchor);
    positionOfFrenchAnchor[idx] = f;
  }
  int frenchAnchorWasMatched[nFAnchors];
  memset(frenchAnchorWasMatched, 0, sizeof(int)*nFAnchors);

  // pass through English text, finding corresponding French anchors
  const int E =englishText.Size();
  frenchAnchorOfEnglishAnchor.Grow(E);
  for (int e=0; e<E; e++) {
    const char *putativeAnchor = englishText[e][0];
    if (!isAnchor(putativeAnchor)) continue;
    nEAnchors++;
    const int idx = frenchAnchors[putativeAnchor];
    if (idx==-1) {
      frenchAnchorOfEnglishAnchor[e]=-1; // no corresponding French anchor
      fprintf(stderr, "WARNING: English anchor %s missing a French counterpart\n",
	      putativeAnchor);
    }
    else {
      frenchAnchorOfEnglishAnchor[e]=positionOfFrenchAnchor[idx];
      frenchAnchorWasMatched[idx]=1;
    }
  }

  for (int i=0; i<nFAnchors; i++) 
    if (frenchAnchorWasMatched[i]==0)  
      fprintf(stderr, "WARNING: French anchor %s missing a English counterpart\n",
	      frenchAnchors[i]);
}


// Score for pairing a French sentence with an English sentence:
// prob (french sentence | english sentence).
// If either sentence is an anchor the score is 1 when their first tokens
// are identical and 0 otherwise.  For ordinary sentences the result is
//   LEXICAL_WEIGHT * lScore + (1 - LEXICAL_WEIGHT) * tScore
// where lScore is 1 iff the sentences are token-for-token identical and
// tScore is the product, over the French words, of the word's
// translation probability averaged over all the English words.
// Aborts if u_getTranslationProb() yields a value outside (0, 1].
LogProb getProbOfSentencePair(const Array<char*> &french, 
                              const Array<char *> &english) { 
  // anchors can only be paired with an identical anchor
  if (isAnchor(french[0]) || isAnchor(english[0])) {
    if (strcmp(french[0], english[0])==0) return (LogProb) 1;
    return (LogProb) 0;
  }

  // lexical component: are the two strings lexically identical?
  LogProb lScore = (sentencesAreEqual(french, english))? 1 : 0;

  // translation-model component
  const int nFrench  = (int) french.Size();
  const int nEnglish = (int) english.Size();
  LogProb tScore=1;
  int x = 0;
  while (x < nFrench) {
    // sum of this French word's probability over the English words...
    double s=0;
    for (int y=0; y<nEnglish; y++) {
      const double p = u_getTranslationProb(french[x], english[y]);
      if (p<=0 || p>1) {
        fprintf(stderr, "All probs must be in the range (0, 1]\n"); 
        abort(); 
      } 
      s += (LogProb) p;
    }
    // ...turned into an average, then folded into the running product
    s /= english.Size();
    tScore *= s;
    x++;
  }

  return (LEXICAL_WEIGHT)*lScore + (1.0-LEXICAL_WEIGHT)*tScore;
}

// Probability that a French sentence arose spontaneously, i.e. with no
// English counterpart: nullProb multiplied together once per word.
// An anchor line is charged as if it had AVERAGE_SENT_LEN words;
// any other sentence is charged for its actual number of tokens.
LogProb probThatFrenchIsSpontaneous(const Array<char *> &f) { 
  int nWords;
  if (isAnchor((const char*) f[0])) nWords = AVERAGE_SENT_LEN;
  else nWords = f.Size();

  LogProb prob = 1;
  for (int remaining = nWords; remaining > 0; remaining--)
    prob *= nullProb;
  return prob;
}

int main(int argc, char *argv[]) { 
  if (argc != 4)  
      fatalError("Usage: %s [etext] [ftext] [anchor string]\
                    etext: path of English text\
                    ftext: path of French text\
                    anchor string: text to be used as anchor\n", argv[0]);
  const char *englishFile = strdup(argv[1]);
  const char *frenchFile  = strdup(argv[2]);
  anchorText              = strdup(argv[3]);

  u_open();

  nullProb = 1.0 / u_getNFrenchTerms();

  fprintf(stderr, "Reading English and French sentences...\n");
  const int nEAnchors = readData(englishFile, rawEnglishText, englishText);
  const int nFAnchors = readData(frenchFile,  rawFrenchText,  frenchText);

  // check that both texts terminate in an anchor  
  if (!isAnchor(frenchText[frenchText.Size()-1][0]) || 
      !isAnchor(englishText[englishText.Size()-1][0])) 
    fatalError("Texts must terminate with an anchor.\n");

  fprintf(stderr,"Aligning anchors in text...\n");
  alignAnchors(nEAnchors, nFAnchors);

  fprintf(stderr,"Allocating data structures for alignment search...\n");
  LogProb    **score     = new LogProb    *[MAX_SEGMENT];
  TRACEBACK **traceback = new TRACEBACK *[MAX_SEGMENT];
  for (int s=0; s<MAX_SEGMENT; s++) {
    score[s]     = new LogProb    [MAX_SEGMENT];
    traceback[s] = new TRACEBACK [MAX_SEGMENT];
  }

  // establish some boundary conditions 
  score[0][0] = 1;        
  for (int x=1; x<MAX_SEGMENT; x++) {
    traceback[x][0].e = x-1;
    traceback[x][0].f = 0;
    traceback[0][x].e = 0;
    traceback[0][x].f = x-1;
  }
  
  fprintf(stderr, "Entering main alignment loop...\n");
  int currentE=0, currentF=0, nSegments=0;
 
  while (currentE<(int) englishText.Size()-1 || 
	 currentF<(int) frenchText.Size()-1) { 

    // scan ahead to find next anchor
    int e,f;
    for (e=currentE+1; 
	 (!isAnchor(englishText[e][0])||
	  frenchAnchorOfEnglishAnchor[e]==-1); 
	 e++);

    const int ne = e - currentE +1;
    const int nf = frenchAnchorOfEnglishAnchor[e]-currentF +1;
    if (ne>=MAX_SEGMENT) 
      fatalError("English segment too long: starting at %i; length: %i\n",
		 currentE, ne);
    if (nf>=MAX_SEGMENT) 
      fatalError("French segment too long: starting at %i; length: %i\n", 
		 currentF, nf);
    fprintf(stderr, "Handling e[%i...%i] and f[%i...%i]\n",
	    currentE, currentE+ne-1, currentF, currentF+nf-1);
    nSegments++;
    
    for (e=1; e<ne; e++) 
      score[e][0] = score[e-1][0] * ((LogProb) 1.0);
    for (f=1; f<nf; f++) 
      score[0][f] = 
	score[0][f-1]*probThatFrenchIsSpontaneous(frenchText[currentF+f]);
  
    for (e=1; e<ne; e++) {
      for (f=1; f<nf; f++) {
	LogProb s1 =
	  score[e][f-1]*probThatFrenchIsSpontaneous(frenchText[currentF+f]);
	LogProb s2 =
	  score[e-1][f-1]*getProbOfSentencePair(frenchText[currentF+f],
						englishText[currentE+e]);
	LogProb s3 = score[e-1][f] * ((LogProb) 1.0);

#if _DEBUG_ 
	printSentence(englishText[currentE+e]);
	printSentence(frenchText[currentF+f]);
	LogProb pa = getProbOfSentencePair(frenchText[currentF+f],
					  englishText[currentE+e]);
	fprintf(stderr, "SCORE=%1.14f\n", (double) pa);
	LogProb ps = probThatFrenchIsSpontaneous(frenchText[currentF+f]);
	fprintf(stderr, "f spontaneous=%1.14f\n", (double) ps);
#endif

	if (s1>s2 && s1>s3) { 
	  // French f arose spontaneously
	  score[e][f]       = s1;
	  traceback[e][f].e = e;
	  traceback[e][f].f = f-1;
	}
	else if (s2 > s3) { 
	  // (f,e) are aligned
	  score[e][f]       = s2;
	  traceback[e][f].e = e-1;
	  traceback[e][f].f = f-1;
	}
	else { 
	  // English has no pair in the French
	  score[e][f]       = s3;
	  traceback[e][f].e = e-1;
	  traceback[e][f].f = f;
	}

#if _DEBUG_ 
	fprintf(stderr, "TRACEBACK[%i][%i] = {%i, %i}\n\n", e,f,
		traceback[e][f].e,traceback[e][f].f); 
#endif
      }
    }
  
    // Compute traceback "in closed form"
    TRACEBACK viterbi[e+f+2];
    e--; 
    f--;
    // As an anchor, the final (e,f) pair must align to one another
    viterbi[0].e = e;
    viterbi[0].f = f;
    int n;
    for (n=1; ;n++) { 
#if _DEBUG_ 
      printf("n=%i     e=%i f=%i\n",n,e,f); 
#endif
      viterbi[n] = traceback[e][f];
      int eNew   = traceback[e][f].e;
      int fNew   = traceback[e][f].f;
      if (e==0 && f==0) break; 
      e=eNew; 
      f=fNew;
    }
    
    // spit out aligned sentences for this segment
    for (int m=n-1; m>=0; m--) {   
      const int eIDX = viterbi[m].e;
      const int fIDX = viterbi[m].f;
      if (m<n-1 && viterbi[m].e==viterbi[m+1].e) 
	fprintf(stdout, "[Empty]\n");
      else fprintf(stdout, "%s\n", rawEnglishText[currentE+eIDX]);
      if (m<n-1 && viterbi[m].f==viterbi[m+1].f) 
	fprintf(stdout, "[Empty]\n");
      else fprintf(stdout, "%s\n",  rawFrenchText[currentF+fIDX]);
      fprintf(stdout, "\n");    // delimiter between aligned sentence pairs
    }
    currentE+=ne;
    currentF+=nf;
  }

  fprintf(stderr, "** SUMMARY **\n");
  fprintf(stderr, "  Total # of English words: %i\n", englishText.Size());
  fprintf(stderr, "  Total # of French words: %i\n",  frenchText.Size()); 
  fprintf(stderr, "  Average # of English words/segment: %f\n",
	  ((double) englishText.Size())/nSegments);
  return 0;
}
