#!/usr/local/bin/perl

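## Driver for preparing an English-Urdu parallel corpus: walks a data directory and
## runs one processing stage (flattening, resegmentation, alignment bookkeeping, ...)
## per invocation. Stages are switched on by uncommenting the matching call below.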

use strict;
use warnings;
use File::Path;
use File::chdir;
use File::Temp "tempfile";

binmode STDOUT, ":utf8";

# The single command-line argument is the top-level data directory. It has to
# contain a '/' so that it can be split into a parent directory (left in $1
# and used as the base of every output location below) and the directory name.
my $topDir = shift;
die("Usage: $0 <path/to/top-directory>\n") unless(defined $topDir);
# Without this check a path with no '/' leaves $1 undefined and the output
# directory silently becomes "/reseg" at the filesystem root.
$topDir =~ /^(.+)\/(.+?)\/?$/ or die("Cannot split '$topDir' into parent/name; pass a path that contains a '/'\n");

# # Flatten
# visitAndMirrorDir($topDir,"$1/flat",\&flattenLtf);

# # English Segmentation
 visitAndMirrorDir($topDir,"$1/reseg",\&resegEnglish);

# # Urdu Segmentation
# Word => count table filled by collectSenEndStats and consulted by resegUrdu.
my %dict = ();
# my $dictDir = '/usr0/users/abhaya/data/Nist-DVD/LCTL_Urdu_v1.0/Parallel_Text/flat/reseg/Train';
# visitDir($dictDir,\&collectSenEndStats,\%dict);
# foreach (keys %dict){
# 	delete $dict{$_} if($dict{$_} < 2);
# }
# 
# visitAndMirrorDir($topDir,"$1/reseg",\&resegUrdu);

# # Collect all files to align directory for sentence alignment
# visitDir($topDir,\&collectForAlign);

# # Check data for anomalies
# $topDir =~ /^(.+)\/(.+?)\/?$/;
# visitDir($topDir,\&checkDataForAligner);

# Collect aligned sentences
# $topDir =~ /^(.+)\/(.+?)\/?$/;
# visitDir($topDir,\&collectAlignedData,"$1/aligned-1");

# Count the number of sentences
# my %sCounts = ();
# visitDir($topDir,\&collectSentenceCounts,\%sCounts);
# # print "Total\tEnglish\t${$sCounts{'eng'}}{'-1'}\tUrdu\t${$sCounts{'urd'}}{'-1'}\n";
# foreach ( sort { $a <=> $b } keys %{$sCounts{'eng'}}){
# 	print "$_\tEnglish\t${$sCounts{'eng'}}{$_}\tUrdu\t${$sCounts{'urd'}}{$_}\n";
# }

# Collect the unaligned files
# $topDir =~ /^(.+)\/(.+?)\/?$/;
# visitDir($topDir,\&checkAndCopyUnaligned,"$1/unaligned");

# Collect the unaligned sentences and copy to new directory
# $topDir =~ /^(.+)\/(.+?)\/?$/;
# visitDir($topDir,\&collectUnaligned,"$1/align-1");

# Apply champollion
#$topDir =~ /^(.+)\/(.+?)\/?$/;
#visitDir($topDir,\&applyChamp,"$1/aligned.champ.eval");

# Compute length ratio for Chinese
# $topDir =~ /^(.+)\/(.+?)\/?$/;
# visitDir($topDir,\&lengthRatio,'');

 # Collect sentences
#  $topDir =~ /^(.+)\/(.+?)\/?$/;
#  visitDir($topDir,\&collectData,"$1/lm");


# Recursively walk $cdir and invoke a callback on every non-directory entry.
#
# Arguments:
#   $cdir     - directory to walk
#   $callback - code reference, called as $callback->("$cdir/$entry", $cbarg)
#   $cbarg    - opaque extra argument handed to every callback invocation
#
# Entries are visited in sorted order. Dies if a directory cannot be opened.
# Returns nothing.
sub visitDir{
	my $cdir = shift;
	my $callback = shift;
	my $cbarg = shift;

	# Take a snapshot of the listing and release the handle before any callback
	# runs: several callbacks create or rename files inside the directory being
	# walked, and changing a directory while readdir() is iterating over it is
	# not reliable. A lexical handle also keeps recursive calls independent.
	opendir(my $dirHandle, $cdir) or die("Couldn't open the dir: $cdir\n");
	my @entries = sort grep { !/^\.\.?$/ } readdir($dirHandle);
	closedir($dirHandle);

	foreach my $entry (@entries){
		if(-d "$cdir/$entry"){
			print STDERR "visting $cdir/$entry..\n";
			visitDir("$cdir/$entry",$callback,$cbarg);
		}
		else{
			$callback->("$cdir/$entry",$cbarg);
		}
	}
	return;
}

# Recursively walk $inDir and call a callback for every file, handing it the
# matching path inside a mirror of the tree that is created under $outDir.
#
# Arguments:
#   $inDir    - directory to walk; must have a parent component ("parent/name")
#   $outDir   - directory under which "<name of $inDir>/..." is re-created
#   $callback - code reference, called as
#               $callback->("$inDir/$entry", "$outDir/<name>/$entry", $cbarg)
#   $cbarg    - opaque extra argument passed through to the callback
#
# Entries named exactly "English" are skipped. While one level is processed the
# working directory is switched (through File::chdir's $CWD) to the parent of
# $inDir; it is restored when the call returns.
#
# Dies if $inDir cannot be split into parent/name or a directory cannot be
# opened. Returns nothing.
sub visitAndMirrorDir{
	my $inDir = shift;
	my $outDir = shift;
	my $callback = shift;
	my $cbarg = shift;

	# Make both paths absolute before the working directory changes; a relative
	# $inDir or $outDir would otherwise be re-interpreted against the new $CWD on
	# every level of the recursion and point at the wrong place.
	require File::Spec;
	$inDir = File::Spec->rel2abs($inDir);
	$outDir = File::Spec->rel2abs($outDir);

# Start by going to the directory one level above the input directory. That is where we will mirror the directory
	$inDir =~ /^(.+)\/(.+?)\/?$/ or die("Cannot split '$inDir' into a parent directory and a name\n");
	my ($parentDir, $cdir) = ($1, $2);
	local $CWD = $parentDir;

	mkpath("$outDir/$cdir", 0, 0755);

	# Snapshot the listing so the handle is closed (with closedir, not close)
	# before callbacks and recursive calls run.
	opendir(my $dirHandle, $cdir) or die("Couldn't open the dir: $cdir\n");
	my @entries = sort grep { !/^\.\.?$/ && !/^English$/ } readdir($dirHandle);
	closedir($dirHandle);

	foreach my $entry (@entries){
		if(-d "$cdir/$entry"){
			print STDERR "visting $cdir/$entry..\n";
			visitAndMirrorDir("$inDir/$entry","$outDir/$cdir",$callback,$cbarg);
		}
		else{
			$callback->("$inDir/$entry","$outDir/$cdir/$entry",$cbarg);
		}
	}
	return;
}

# Append the contents of one file to a per-language collection file.
#
# Arguments:
#   $fileName - file to append; a name containing "_URD_" goes to the Urdu
#               collection, everything else to the English one
#   $collect  - path prefix of the collection; ".urd" or ".eng" is appended
#
# The copy is done in Perl instead of through `cat`, so names containing
# spaces or shell metacharacters are handled and failures are reported.
# Dies if either file cannot be opened or written. Returns nothing.
sub collectData{
	my $fileName = shift;
	my $collect = shift;

	my $collectFile = $collect.(($fileName =~ /_URD_/) ? '.urd' : '.eng');

	open(my $in, '<:raw', $fileName) or die("Couldn't open $fileName: $!\n");
	open(my $out, '>>:raw', $collectFile) or die("Couldn't open $collectFile for appending: $!\n");

	# Copy in fixed-size chunks so large files are never held in memory whole.
	local $/ = \65536;
	while(defined(my $chunk = <$in>)){
		print {$out} $chunk or die("Couldn't write to $collectFile: $!\n");
	}
	close($in);
	close($out) or die("Couldn't finish writing $collectFile: $!\n");
	return;
}

# Print, for every line pair of a ".ch." file and its ".en." twin, the last
# token index of the English line and the English/source token-count ratio.
#
# Arguments:
#   $uFileName - path of the source-side file; the twin path is derived by
#                replacing every ".ch." in it with ".en."
#
# Output: "<last English token index> <ratio>" per line pair on STDOUT; the
# file name, and every pair whose ratio is exactly 1, on STDERR.
# Source lines without tokens are skipped with a warning (they would divide by
# zero), and reading stops with a warning if the English file is shorter.
# Dies if either file cannot be opened. Returns nothing.
sub lengthRatio{
	my $uFileName = shift;
	my $eFileName = $uFileName;
	print STDERR $uFileName,"\n";
	$eFileName =~ s/\.ch\./\.en\./g;

	open(my $uFh, '<', $uFileName) or die("Couldn't open the file $uFileName\n");
	open(my $eFh, '<', $eFileName) or die("Couldn't open the file $eFileName\n");
	my $snum = 0;
	while (my $urd = <$uFh>){
		my $eng = <$eFh>;
		$snum++;
		unless(defined $eng){
			warn("$eFileName ends before $uFileName (line $snum)\n");
			last;
		}
		chomp($urd);
		chomp($eng);

		my @ul = split /\s+/,$urd;
		my @el = split /\s+/,$eng;
		if(@ul == 0){
			warn("$uFileName line $snum has no tokens; ratio skipped\n");
			next;
		}
		my $r = scalar(@el)/scalar(@ul);
		print "$#el $r\n";
		print STDERR "$urd $snum $#el $#ul\n" if($r == 1);
	}
	close($uFh);
	close($eFh);
	return;
}

# For one "*_urd.snt" file and its "*_eng.snt" twin, write the sentences that
# do not appear in the corresponding ".aligned" files into $outDir.
#
# Arguments:
#   $file   - candidate path; anything not ending in "_urd.snt" is ignored
#   $outDir - existing directory that receives the unaligned sentences
#
# Behaviour:
#   * both ".aligned" files must exist and have the same number of lines,
#     otherwise the sub dies;
#   * if nothing was aligned, both sentence files are copied unchanged;
#   * otherwise each sentence file is scanned once and a line is dropped when
#     it equals the next not-yet-consumed aligned line;
#   * nothing is written when either side has no unaligned text left.
#
# Copies are made with File::Copy rather than `cp`, so paths containing spaces
# or shell metacharacters work. Returns nothing.
sub collectUnaligned{
	my $file = shift;
	my $outDir = shift;

	return unless($file =~ /_urd\.snt$/);
	my $twinFile = $file;
	$twinFile =~ s/_urd\.snt$/_eng.snt/;

	require File::Basename;
	require File::Copy;

	# Read every line of $path, dying with the supplied message on failure.
	my $readLines = sub {
		my ($path, $errorMessage) = @_;
		open(my $fh, '<', $path) or die($errorMessage);
		my @lines = <$fh>;
		close($fh);
		return @lines;
	};

	my @alignedU = $readLines->("$file.aligned", "Couldn't find the aligned file for $file\n");
	my @alignedE = $readLines->("$twinFile.aligned", "Couldn't find the aligned file for $twinFile\n");
	my $ulen = $#alignedU;
	my $elen = $#alignedE;
	die("Something is worng in $file $twinFile | $ulen $elen\n") if($ulen != $elen);

	if($ulen == -1){
		foreach my $src ($file, $twinFile){
			my $dest = "$outDir/".File::Basename::basename($src);
			File::Copy::copy($src, $dest) or die("Couldn't copy $src to $dest: $!\n");
		}
		return;
	}

	# Return the text of $path with the aligned lines removed, in order.
	my $withoutAligned = sub {
		my ($path, @aligned) = @_;
		my $rest = "";
		foreach my $line ($readLines->($path, "Couldn't open $path\n")){
			if(@aligned and $line eq $aligned[0]){
				shift @aligned;
				next;
			}
			$rest .= $line;
		}
		return $rest;
	};
	my $unalignedU = $withoutAligned->($file, @alignedU);
	my $unalignedE = $withoutAligned->($twinFile, @alignedE);

	return if($unalignedU eq "" or $unalignedE eq "");

	# basename() replaces the former unchecked regex capture, which reused a
	# stale $1 whenever the path contained no '/'.
	foreach my $output ([$file, $unalignedU], [$twinFile, $unalignedE]){
		my ($src, $text) = @{$output};
		my $dest = "$outDir/".File::Basename::basename($src);
		open(my $fh, '>', $dest) or die("Couldn't open $dest\n");
		print {$fh} $text;
		close($fh) or die("Couldn't finish writing $dest: $!\n");
	}
	return;
}

# Copy a sentence-file pair into $outDir when its alignment file is empty.
#
# Arguments:
#   $file   - candidate path; anything not ending in "_urd.snt" is ignored
#   $outDir - existing directory that receives "$file" and its "_eng.snt" twin
#
# "$file.aligned" must exist (the sub dies otherwise). If it contains at least
# one line the pair counts as aligned and nothing happens; if it is empty the
# path is printed on STDOUT and both files are copied. Copies go through
# File::Copy instead of `cp`, so unusual characters in paths are safe and a
# failed copy is reported. Returns nothing.
sub checkAndCopyUnaligned{
	my $file = shift;
	my $outDir = shift;

	return unless($file =~ /_urd\.snt$/);

	open(my $aFh, '<', "$file.aligned") or die ("Couldn't find alignment file for $file\n");
	my @contents = <$aFh>;
	close($aFh);
	return if(@contents);

	print $file,"\n";

	my $twinFile = $file;
	$twinFile =~ s/_urd\.snt$/_eng\.snt/;

	require File::Basename;
	require File::Copy;
	foreach my $src ($file, $twinFile){
		my $dest = "$outDir/".File::Basename::basename($src);
		File::Copy::copy($src, $dest) or die("Couldn't copy $src to $dest: $!\n");
	}
	return;
}

# Accumulate sentence statistics of one ".snt" file into a shared table.
#
# Arguments:
#   $file         - candidate path; anything not ending in ".snt" is ignored
#   $countHashRef - reference to a hash of the form
#                   { eng => { KEY => count }, urd => { KEY => count } }
#
# Keys of the inner hashes:
#   '-1'  - total number of lines seen for that language
#   N>=0  - number of lines whose last token index is N (N+1 tokens)
# For every bucket touched, the other language's bucket is initialised to 0 so
# both sides can be printed together.
#
# The language is taken from the path ("_ENG_"/"_eng" or "_URD_"/"_urd");
# files of neither language are skipped instead of being counted under an
# empty key. Lines without tokens are counted in the total only: their last
# token index is -1, which would otherwise collide with the total key.
# Dies if the file cannot be opened. Returns nothing.
sub collectSentenceCounts{
	my $file = shift;
	my $countHashRef = shift;

	return unless($file =~ /\.snt$/);

	my $lang;
	if($file =~ /(_ENG_|_eng)/){
		$lang = 'eng';
	}
	elsif($file =~ /(_URD_|_urd)/){
		$lang = 'urd';
	}
	else{
		return;
	}

	open(my $iFh, "<:encoding(utf-8)", $file) or die("Couldn't open file $file\n");
	my @lines = <$iFh>;
	close($iFh);

	$countHashRef->{$lang}{'-1'} += scalar(@lines);

	foreach my $line (@lines){
		chomp($line);
		my @tokens = split /\s+/, $line;
		next unless(@tokens);
		my $bucket = $#tokens;
		foreach my $side ('eng', 'urd'){
			$countHashRef->{$side}{$bucket} = 0 unless(defined $countHashRef->{$side}{$bucket});
		}
		$countHashRef->{$lang}{$bucket}++;
	}
	return;
}

# Append one aligned Urdu file and its English twin to the combined outputs.
#
# Arguments:
#   $ufile   - candidate path; anything not ending in "_urd.snt.aligned" is
#              ignored
#   $outFile - path prefix of the combined files; "$outFile.urd" receives the
#              Urdu text and "$outFile.eng" the text of the "_eng.snt.aligned"
#              twin
#
# The append is done in Perl instead of through `cat`, so paths containing
# spaces or shell metacharacters work. Dies if a file cannot be opened or
# written. Returns nothing.
sub collectAlignedData{
	my $ufile = shift;
	my $outFile = shift;

	return unless($ufile =~ /_urd\.snt\.aligned$/);

	my $efile = $ufile;
	$efile =~ s/_urd\.snt\.aligned$/_eng.snt.aligned/;

	foreach my $pair ([$ufile, "$outFile.urd"], [$efile, "$outFile.eng"]){
		my ($src, $dest) = @{$pair};
		open(my $in, '<:raw', $src) or die("Couldn't open $src: $!\n");
		open(my $out, '>>:raw', $dest) or die("Couldn't open $dest for appending: $!\n");
		# Fixed-size records keep memory use flat for large files.
		local $/ = \65536;
		while(defined(my $chunk = <$in>)){
			print {$out} $chunk or die("Couldn't write to $dest: $!\n");
		}
		close($in);
		close($out) or die("Couldn't finish writing $dest: $!\n");
	}
	return;
}

# Set aside sentence-file pairs that cannot sensibly be aligned.
#
# Arguments:
#   $file - candidate path; anything not ending in "_urd.snt" is ignored
#
# A pair ("$file" plus its "_eng.snt" twin) is renamed to "<name>.mismatch",
# and a message is printed on STDOUT, when
#   * the Urdu file is empty or whitespace only,
#   * the English file is empty or whitespace only, or
#   * the token counts differ by more than the factor tested below.
#
# Files are read whole. The previous `$/ = ""` selected paragraph mode, so only
# the text before the first blank line was counted, and the assignment was
# global rather than localised.
# Dies if a file cannot be opened; a failed rename only produces a warning.
# Returns nothing.
sub checkDataForAligner{
	my $file = shift;

	return unless($file =~ /_urd\.snt$/);
	my $twinFile = $file;
	$twinFile =~ s/_urd\.snt$/_eng.snt/;

	# Read a complete file into one string.
	my $slurp = sub {
		my ($path) = @_;
		local $/ = undef;
		open(my $fh, '<', $path) or die("Couldn't open $path\n");
		my $text = <$fh>;
		close($fh);
		return $text;
	};
	# Rename both files of the pair so later stages no longer pick them up.
	my $setAside = sub {
		foreach my $path ($file, $twinFile){
			rename($path, "$path.mismatch") or warn("Couldn't rename $path to $path.mismatch: $!\n");
		}
	};

	my $urd = $slurp->($file);
	unless(defined $urd and $urd =~ /\S/){
		print $file," - Empty urdu file\n" ;
		$setAside->();
		return;
	}
	my @urdCnt = split /\s+/,$urd;

	my $eng = $slurp->($twinFile);
	unless(defined $eng and $eng =~ /\S/){
		print $file," - Empty english file\n";
		$setAside->();
		return;
	}
	my @engCnt = split /\s+/,$eng;

	if($#urdCnt - $#engCnt > 3*$#engCnt or $#engCnt - $#urdCnt > 3*$#urdCnt){
		print $file," - size mismatch\n";
		$setAside->();
	}
	return;
}

# Run the champollion sentence aligner on one Urdu/English file pair and
# append the aligned sentences to $outFile.
#
# Arguments:
#   $fileName - candidate path; anything not ending in "_urd.snt" is ignored
#   $outFile  - file that the alignment listing is appended to
#
# Steps:
#   1. write "$fileName.clean": the Urdu file without trailing whitespace;
#   2. run champollion.EA on the "_eng.snt" twin and the clean file, which is
#      expected to leave its result in "$fileName.champ";
#   3. for every result line of the form "<eng indices> <=> <urd indices>"
#      (comma-separated, 1-based; lines containing "omitted" are skipped)
#      append the English sentences on one line and the Urdu ones on the next.
#
# The Urdu file is read whole. The previous `$/ = ""` selected paragraph mode
# and silently dropped everything after the first blank line, while the
# English side was always read in full.
# Dies if a file cannot be opened. Returns nothing.
sub applyChamp{
	my $fileName = shift;
	my $outFile = shift;

	return unless($fileName =~ /_urd\.snt$/);
	my $twinFile = $fileName;
	$twinFile =~ s/_urd\.snt$/_eng.snt/;

	my $data = do {
		local $/ = undef;
		open(my $rawFh, '<', $fileName) or die("Couldn't open $fileName\n");
		my $text = <$rawFh>;
		close($rawFh);
		defined $text ? $text : "";
	};
	$data =~ s/\s+$//;
	open(my $cleanFh, '>', "$fileName.clean") or die("Couldn't open $fileName for writing\n");
	print {$cleanFh} $data,"\n";
	close($cleanFh) or die("Couldn't finish writing $fileName.clean: $!\n");
	`export CTK=/afs/cs.cmu.edu/user/abhayaa/abhaya-local/downloads/champollion-1.1;\n/afs/cs.cmu.edu/user/abhayaa/abhaya-local/downloads/champollion-1.1/bin/champollion.EA $twinFile $fileName.clean $fileName.champ`;

	open(my $uFh, '<', "$fileName.clean") or die("Urdu file not found\n");
	my @urdu = <$uFh>;
	close($uFh);
	open(my $eFh, '<', $twinFile) or die("English file not found\n");
	my @english = <$eFh>;
	close($eFh);
	chomp(@urdu);
	chomp(@english);

	open(my $oFh, '>>', $outFile) or die("$outFile couldn't be opened\n");
	open(my $aFh, '<', "$fileName.champ") or die("No alignment file found\n");
	print {$oFh} "\n$fileName\n";
	while (my $line = <$aFh>){
		chomp($line);
		next if($line =~ /omitted/);
		my ($eng, $urd) = split /<=>/, $line;
		# A line without "<=>" carries no alignment; skip it.
		next unless(defined $eng and defined $urd);
		$eng =~ s/\s+//g;
		$urd =~ s/\s+//g;
		my @ei = split /,/, $eng;
		my @ui = split /,/, $urd;

		my $ulen = 0;
		my $elen = 0;
# 		print OFILE "@ei -> @ui\n";
		foreach my $eind (@ei){
			# Ignore indices that do not name an existing sentence instead of
			# dereferencing an undefined (or, for 0, the last) array element.
			unless($eind =~ /^\d+$/ and $eind >= 1 and $eind <= @english){
				warn("$fileName.champ: English index '$eind' is out of range\n");
				next;
			}
			my @ew = split /\s+/, $english[$eind-1];
			$elen += $#ew+1;
			print {$oFh} "$english[$eind-1] ";
		}
		print {$oFh} "\n";
		foreach my $uind (@ui){
			unless($uind =~ /^\d+$/ and $uind >= 1 and $uind <= @urdu){
				warn("$fileName.champ: Urdu index '$uind' is out of range\n");
				next;
			}
			my @uw = split /\s+/, $urdu[$uind-1];
			$ulen += $#uw+1;
			print {$oFh} "$urdu[$uind-1] ";
		}
		print {$oFh} "\n";
# 		unless(($elen+1 > 100) or ($ulen+1 > 100) or ($ulen > 20 and $elen > 2*$ulen) or ($elen > 20 and $ulen > 2*$elen)){
# 			print OFILE "eng\n";
# 			foreach my $eind (@ei){
# 				print OFILE "$english[$eind-1]\n";
# 			}
# 			print OFILE "urd\n";
# 			foreach my $uind (@ui){
# 				print OFILE "$urdu[$uind-1]\n";
# 			}
# 			print OFILE "#\n";
# 			
# 		}
	}
	close($aFh);
	close($oFh) or die("Couldn't finish writing $outFile: $!\n");
	return;
}

# Copy one resegmented file into the flat alignment directory under a
# normalised name.
#
# Arguments:
#   $fileName - candidate path; only names ending in ".ltf.xml.flat.reseg"
#               are handled
#   $alignDir - optional destination directory; defaults to the align-eval
#               directory this script has always used
#
# The new name is "<category>_<base>_<lang>.snt", where
#   <category> is the first of Found / Translations / Special_Corpora that
#              occurs in the path,
#   <base>     is the file name without the ".ltf.xml.flat.reseg" suffix and
#              without its "_ENG_"/"English_" or "_URD_"/"Urdu_" marker,
#   <lang>     is "eng" if the path contains "_ENG_" or "English", else "urd".
#
# Files that do not carry the suffix are ignored, and files outside the three
# categories are skipped with a warning; previously both cases went on with a
# stale or undefined $1. Dies if the copy fails. Returns nothing.
sub collectForAlign{
	my $fileName = shift;
	my $alignDir = shift;
	$alignDir = '/usr0/users/abhaya/data/Nist-DVD/LCTL_Urdu_v1.0/Parallel_Text/flat/reseg/align-eval' unless(defined $alignDir);

	return unless($fileName =~ /([^\/]+)\.ltf\.xml\.flat\.reseg$/);
	my $name = $1;

	unless($fileName =~ /(Found|Translations|Special_Corpora)/){
		warn("No corpus category in $fileName; not collected\n");
		return;
	}
	my $category = $1;
	$name = $category."_".$name;

	if($fileName =~ /(_ENG_)|(English)/){
		$name =~ s/_ENG_/_/;
		$name =~ s/English_//;
		$name .= "_eng.snt";
	}
	else{
		$name =~ s/_URD_/_/;
		$name =~ s/Urdu_//;
		$name .= "_urd.snt";
	}

	require File::Copy;
	File::Copy::copy($fileName, "$alignDir/$name") or die("Couldn't copy $fileName to $alignDir/$name: $!\n");
	return;
}


# Count the words that occur directly before a sentence terminator.
#
# Arguments:
#   $file    - path of a UTF-8 text file; paths containing "_ENG_" or
#              "English" are ignored
#   $dictRef - hash reference; the count of every word found in front of
#              U+06D4, U+061F or '!' is incremented in it
#
# Dies if the file cannot be opened.
sub collectSenEndStats{
	my ($file, $dictRef) = @_;

	if($file =~ /_ENG_|English/){
		return;
	}

	open(my $in,"<:encoding(utf-8)",$file) or die("Couldn't open $file\n");
	while (defined(my $text = readline($in))){
		# Put spaces around the first terminator of the line so the word in
		# front of it becomes a separate token.
		# NOTE(review): only the first terminator per line is padded (no /g);
		# confirm whether later terminators are meant to be counted too.
		$text =~ s/([\x{06D4}\x{061F}!])/ $1 /;

		# In list context the global match yields every captured word at once.
		my @wordsBeforeEnd = ($text =~ /(\S+)\s+[\x{06D4}\x{061F}!]/g);
		foreach my $word (@wordsBeforeEnd){
			$dictRef->{$word}++;
		}
	}
	return close($in);
}


# Split an English text file into sentences with the OpenNLP sentence detector.
#
# Arguments:
#   $inFile  - file fed to the detector on standard input
#   $outFile - output path prefix; the result is written to "$outFile.reseg"
#
# The Java binary and the OpenNLP installation are taken from fixed locations.
# Both file names are single-quoted for the shell, so paths containing spaces
# or metacharacters no longer break the redirections. A non-zero exit status
# is reported with a warning instead of being ignored. Returns nothing.
sub resegEnglish{
	my $inFile = shift;
	my $outFile = shift;

	my $onlpDir = "/usr0/users/abhaya/downloads/opennlp-tools-1.3.0";
	my $classPath = join(':',
		"$onlpDir/lib/trove.jar",
		"$onlpDir/lib/maxent-2.4.0.jar",
		"$onlpDir/output/opennlp-tools-1.3.0.jar");

	# Wrap in single quotes; an embedded single quote becomes '\''.
	my ($quotedIn, $quotedOut) = map {
		my $quoted = $_;
		$quoted =~ s/'/'\\''/g;
		"'$quoted'";
	} ($inFile, "$outFile.reseg");

# Run the openNLP sentence splitter
	system("/usr0/users/abhaya/downloads/java/bin/java -cp $classPath opennlp.tools.lang.english.SentenceDetector $onlpDir/EnglishSD.bin.gz < $quotedIn > $quotedOut");
	warn("Sentence detector failed for $inFile (status $?)\n") if($? != 0);
	return;
}

# Normalise an Urdu text file and re-split it into one sentence per line.
#
# Arguments:
#   $inFile  - UTF-8 input file
#   $outFile - output path prefix; the result is written to "$outFile.reseg"
#
# Processing:
#   1. Extended Arabic-Indic and Arabic-Indic digits become ASCII digits.
#   2. Arabic punctuation is mapped to ASCII counterparts, tatweel is removed
#      and the full stop U+06D4 is surrounded by spaces.
#   3. Curly quotes become straight quotes.
#   4. If the first two lines are identical the first one is dropped.
#   5. All lines are joined with spaces; a line that does not end in a
#      terminator but whose last word is a key of the file-level %dict gets a
#      U+06D4 appended.
#   6. A line break is inserted after every terminator (U+06D4, '?', '!').
#
# The file is read whole. The previous `$/ = ""` selected paragraph mode, so
# everything after the first blank line was silently lost, and the assignment
# was global. An empty input produces no output file.
# Dies if a file cannot be opened. Returns nothing.
sub resegUrdu{

	my $inFile = shift;
	my $outFile = shift;

	open(my $iFh, "<:encoding(utf-8)", $inFile) or die("Couldn't open $inFile\n");
	my $converted = do { local $/ = undef; <$iFh> };
	close($iFh);
	return if(!defined $converted or $converted eq "");

	# Convert the numbers
	$converted =~ tr/\x{06F0}-\x{06F9}\x{0660}-\x{0669}/0-90-9/;

	# Convert the punct
	$converted =~ s/\x{060C}/,/g; #Arabic Comma
	$converted =~ s/\x{060D}/-/g; #Arabic Date separator
	$converted =~ s/\x{061B}/;/g;
	$converted =~ s/\x{061F}/?/g;
	$converted =~ s/\x{0640}//g; # tatweel (used to elongate characters for justification)
	$converted =~ s/\x{066A}/%/g;
	$converted =~ s/\x{066B}/./g; # decimal separator
	$converted =~ s/\x{066C}/,/g; # thousands separator
	$converted =~ s/\x{066D}/*/g; # five point star
	$converted =~ s/\x{06D4}/ \x{06D4} /g; # full stop

	# Convert the direction quotes to neutral quotes
	$converted =~ s/[\x{2018}\x{2019}]/'/g;
	$converted =~ s/[\x{201C}\x{201D}]/"/g;

	$converted =~ s/^\s+//;
	$converted =~ s/\s+$//;

	my @lines = split /\n/,$converted;
	if($#lines > 0 and $lines[0] eq $lines[1]){
		shift @lines;
	}

	$converted = "";
	foreach my $line (@lines){
		$line =~ s/^\s+//;
		$line =~ s/\s+$//;
		# Blank lines (now visible because the whole file is read) add nothing.
		next if($line eq "");

		# Only a line that lacks a terminator and ends in a known
		# sentence-final word gets a full stop added.
		if($line !~ /[\x{06D4}\?!]$/ and $line =~ /([^\x{06D4}\?! ]+)$/ and $dict{$1}){
			$converted .= $line."\x{06D4} ";
		}
		else{
			$converted .= $line." ";
		}
	}
	$converted =~ s/([\x{06D4}\?!])\s+/$1\n/g;

	open(my $oFh, ">:encoding(utf-8)", $outFile.".reseg") or die("Couldn't open file for output\n");
	print {$oFh} $converted;
	close($oFh) or die("Couldn't finish writing $outFile.reseg: $!\n");
	return;
}

# Flatten an LTF XML file to plain text, one segment per line.
#
# Arguments:
#   $inFile  - LTF file to read
#   $outFile - output path prefix; the result is written to "$outFile.flat"
#
# For every <SEG ...>...</SEG> element the contents of its <TOKEN ...> elements
# are joined with single spaces (each token is followed by one space), any
# whitespace in front of a '.' is removed, and the line is written out.
#
# The input is read before the output is created, so an unreadable input no
# longer leaves an empty ".flat" file behind. Elements are found with a single
# left-to-right scan instead of repeatedly deleting the first match from the
# whole document, which was quadratic in the file size.
# Dies if a file cannot be opened or the output cannot be completed.
# Returns nothing.
sub flattenLtf{
	my $inFile = shift;
	my $outFile = shift;

	# read in the file
	open(my $inFh, '<', $inFile) or die("can't open \"$inFile\"");
	my $file = do { local $/ = undef; <$inFh> };
	close($inFh);
	$file = "" unless(defined $file);

	open(my $outFh, '>', "$outFile.flat") or die("can't open output file ($outFile)!");
	# while there are segments
	while($file =~ /\<SEG.*?\>(.*?)\<\/SEG\>/sg){
		my $seg = $1;
		# while there are tokens
		my $sent = "";
		while($seg =~ /\<TOKEN.*?\>(.*?)\<\/TOKEN\>/sg){
			$sent .= "$1 ";
		}
		$sent =~ s/\s+\././g;
		print {$outFh} "$sent\n";
	}
	# close output file
	close($outFh) or die("can't finish writing \"$outFile.flat\": $!");
	return;
}
