#!/usr/local/bin/perl

# This script reads a lexicon file given in LLF format and generates an
# xfer-style lexicon (or, optionally, a simple bilingual dictionary) from it.

use strict;
use warnings;

# Command line: <llf-lexicon-file> [type] [output-dir]
#   type       - 'xfer' (the default) writes the xfer lexicon files; any value
#                containing 'dict' prints a bilingual dictionary instead
#   output-dir - directory for the xfer lexicon files; defaults to the
#                project data directory below
my $fileName = shift;
my $type = shift || 'xfer';
my $outDir = shift || "/afs/cs/project/avenue-1/Avenue/Urdu-MT/data";
my %lexicon = ();

# Without a file name there is nothing to read, so stop with a usage message
# instead of failing later inside open().
die("Usage: $0 <llf-lexicon-file> [xfer|dict-giza] [output-dir]\n")
	unless(defined $fileName and length $fileName);

#[TODO] Read the pos mapping from docs/POS-mapping.txt
# Maps the lower-cased POS labels found in the LLF file to the tags used in
# the generated output. Several labels may share one tag.
my %posMap = (noun => 'N',
		verb => 'V',
		adj => 'JJ',
		adv => 'RB',
		none => 'LEX',
		pron => 'PRP',
		tmp => 'IN',
		loc => 'IN',
		num => 'CD',
		whq => 'WH',
		infl => 'INFL',
		dem => 'DEM',
		conj => 'CC',
		det => 'DET',
		other => 'LEX');

loadLLFLexiconFromFile($fileName, \%lexicon);
printXferLexicon(\%lexicon, $outDir) if($type eq 'xfer');
printBiLingualDict(\%lexicon) if($type =~ /dict/);


# Takes in a hash and prints that out as a simple bilingual dict with one
# translation pair on every line: English word, Urdu word, POS tag.
# printBiLingualDict(hashRef)
#   hashRef - lexicon: word => { id => entry id, POS tag => [English words] }
# The pairs are only printed when the file-level $type is 'dict-giza'; the
# number of pairs is always printed on a final line.
sub printBiLingualDict{
	my $lexiconRef = shift;
	# Start at zero: "my $count++" left the counter at 1 before anything was
	# counted, so the reported total was one too high.
	my $count = 0;
	foreach my $uWord (sort keys %{$lexiconRef}){
		my $entryRef = ${$lexiconRef}{$uWord};
		# Sorted so that the output order does not depend on hash ordering
		foreach my $pos (sort keys %{$entryRef}){
			next if($pos eq 'id'); # 'id' holds the entry id, not a word list
			foreach my $eWord (@{ ${$entryRef}{$pos} }){
				$count++;
				print "$eWord $uWord $pos\n" if($type eq 'dict-giza');
# 				print "$eWord @ $uWord\n" if($type eq 'dict-hun');
			}
		}
	}
	print "$count\n";
}

# Takes in a hash and writes it out as Xfer format lexicon files.
# printXferLexicon(hashRef, outDir)
#   hashRef - lexicon: word => { id => entry id, POS tag => [English glosses] }
#   outDir  - directory the .lex files are written to
# Every tag in %posMap is written once. The closed class tags (WH, CD, INFL,
# PRP, DEM, CC, DET) share the file fullform-dvd-others.lex; every other tag
# gets its own fullform-dvd-<TAG>.lex file.
# Dies when a file cannot be opened or cannot be written completely.
sub printXferLexicon{
	my $lexiconRef = shift;
	my $outDir = shift;

	my %closedClass = map { $_ => 1 } qw(WH CD INFL PRP DEM CC DET);
	my %donePOS = (); # Several posMap keys share a tag; write each tag once
	my %started = (); # Files that were already created during this call

	# Sorted so that the order of the sections in the shared file is stable
	foreach my $key (sort keys %posMap){
		my $pos = $posMap{$key};
		next if(defined $donePOS{$pos});
		$donePOS{$pos} = 1;
		print "Writing $pos lexicon..\n";

		my $path = "$outDir/fullform-dvd-$pos.lex";
		my $what = $pos;
		if($closedClass{$pos}){
			$path = "$outDir/fullform-dvd-others.lex";
			$what = 'closed class';
		}

		# Create (truncate) a file only the first time it is used and append
		# afterwards. Reopening the shared closed class file with '>' for
		# every tag threw away everything written for the previous tags.
		my $mode = $started{$path}++ ? '>>' : '>';
		open(my $ofile, $mode, $path) or die("couldn't open file to write $what lexicon\n");

		foreach my $word (sort keys %{$lexiconRef}){
			my $entryRef = ${$lexiconRef}{$word};
			# Skip words without glosses for this tag (and do not create
			# empty lists in the caller's lexicon while looking)
			next unless(defined ${$entryRef}{$pos});

			my $count = 1; # Running number of the rules of this word and tag
			my %done = (); # Keep track of repeated entries
			foreach my $gloss (@{ ${$entryRef}{$pos} }){
				# Drop a trailing "(...)" remark; work on a copy so that the
				# caller's lexicon is left untouched
				(my $eWord = $gloss) =~ s/\s*\(.*\)$//;
				next if(defined $done{$eWord});
				$done{$eWord} = 1;
				print {$ofile} '{' . ${$entryRef}{'id'} . "-$pos,$count}\n";
				print {$ofile} "${pos}::${pos} |: ";
				print {$ofile} qq{["$word"] -> };
				print {$ofile} qq{["$eWord"]\n};
				print {$ofile} "(\n\t(X1::Y1)\n";
				print {$ofile} "\t((x0 lex) = $word)\n";
				print {$ofile} ")\n\n";
				$count++;
			}
		}
		# Buffered write errors only show up when the handle is closed
		close($ofile) or die("couldn't finish writing $path: $!\n");
	}
}

# Load the lexicon from a LLF format file
# loadLLFLexiconFromFile(fileName, hashRef)
#   fileName - path of the LLF file
#   hashRef  - lexicon hash that processEntry fills in
# Everything up to and including the line starting with <LCTL_LEXICON is
# skipped; the entries after it are read with readEntry (through the shared
# IFILE handle) and handed to processEntry.
# Dies when no file name is given, the file cannot be opened, or the
# <LCTL_LEXICON header line is missing.
sub loadLLFLexiconFromFile{
	my $fileName = shift;
	my $lexiconRef = shift;

	# Three-argument open with an undefined name would quietly open an
	# anonymous temporary file, so reject that case explicitly.
	die("No lexicon file name given\n") unless(defined $fileName);

	# Explicit read mode: the file name can never be taken for a mode or pipe
	open(IFILE, '<', $fileName) or die("Couldn't open the file $fileName\n");

	# Read off the header. Stop at the end of the file: looping until the
	# header shows up would never terminate for a file without one.
	my $sawHeader = 0;
	while(defined(my $head = <IFILE>)){
		next unless($head =~ /^<LCTL_LEXICON/);
		$sawHeader = 1;
		last;
	}
	unless($sawHeader){
		close IFILE;
		die("No <LCTL_LEXICON header found in $fileName\n");
	}

	# Start processing the entries; readEntry returns '' at the end of the file
	my $entry = readEntry();
	while($entry ne ''){
		processEntry($entry,$lexiconRef);
		$entry = readEntry();
	}
	close IFILE;
}

# Parse the text of one entry and add it to the lexicon.
# processEntry(string,hashref)
#   string  - one entry: the <ENTRY id="LEX-URD-n"> line, the <WORD> line and
#             then the remaining lines, of which those containing <POS are
#             handed to extractTranslations
#   hashref - lexicon to fill: word => { id => n, POS tag => [English words] }
# An entry whose id or word cannot be found is reported on STDERR and skipped.
sub processEntry{
	my $entry = shift;
	my $lexiconRef = shift;
	my @lines = split /\n/,$entry;

	# Tolerate blank lines in front of the <ENTRY> line
	shift @lines while(@lines and $lines[0] !~ /\S/);

	# Get the entry id and the entry word. The captures are taken from the
	# match result itself: after a failed match $1 still holds the capture of
	# an earlier match, which used to store entries under their numeric id.
	my ($entryId, $word);
	($entryId) = $lines[0] =~ /<ENTRY id="LEX-URD-(\d+)">/ if(defined $lines[0]);
	($word) = $lines[1] =~ /<WORD>(.+)<\/WORD>/ if(defined $lines[1]);
	unless(defined $entryId and defined $word){
		my $firstLine = defined $lines[0] ? $lines[0] : '';
		warn "Skipping malformed entry starting with: $firstLine\n";
		return;
	}

	${${$lexiconRef}{$word}}{'id'} = $entryId;

	# The last line is left out, as before; it is the one on which readEntry
	# found </ENTRY>.
	foreach my $i (2 .. $#lines - 1){
		next unless($lines[$i] =~ /<POS/);
		my ($pos, undef, $wordsRef) = extractTranslations($lines[$i]);
		push @{ ${${$lexiconRef}{$word}}{$pos} },@{$wordsRef};
	}
}

# Extract the pos and all the English meanings in this line
# extractTranslations(String)
#   String - a line of the form <POS>..</POS><GLOSS source="..">..</GLOSS>
# Returns (pos tag, gloss source, reference to the list of English words).
# The pos tag is the %posMap entry for the lower-cased POS label; it is undef
# (and the line is reported on STDERR) when the label has no mapping. Labels
# containing white space are treated as 'other'.
# For a line that does not have the expected form the result is
# (undef, undef, reference to an empty list).
sub extractTranslations{
	my $line = shift;

	# Only use the captures of a successful match; after a failed match
	# $1..$3 would still hold whatever an earlier match captured.
	unless($line =~ /<POS>(.+)<\/POS><GLOSS source="(.+)">(.+)<\/GLOSS>/){
		print STDERR "Unparsable POS line: $line\n";
		return (undef, undef, []);
	}
	my ($pos, $src, $eString) = ($1, $2, $3);

	$pos = 'other' if($pos =~ /\s+/); # Handle "Adj or Adv"
	my $posKey = lc($pos);
	# Report the unknown label together with the line it came from ($_ is not
	# set by this sub, so printing it said nothing about the problem)
	print STDERR "$posKey $line\n" unless(defined $posMap{$posKey});
	$pos = $posMap{$posKey};

# 	print "$pos $src $eString\n";

	# The words are comma separated; drop the blanks around each of them so
	# that "a, b" yields "a" and "b" rather than "a" and " b"
	$eString =~ s/^\s+|\s+$//g;
	my @engWords = split /\s*,\s*/,$eString;

	return ($pos, $src, \@engWords);
}

# readEntry()
# Collects lines from the IFILE handle up to and including the next line that
# contains </ENTRY> and returns them joined into one string. Returns "" when
# the file ends before such a line is seen.
sub readEntry{
	my @collected = ();
	while(my $line = <IFILE>){
		push @collected, $line;
		return join('', @collected) if($line =~ /<\/ENTRY>/);
	}
	return ""; # End of the file
}