=introduction

	SYNERGY NER system
	Author: Rushin Shah
	
                  Language Technologies Institute
                     Carnegie Mellon University
                         Copyright (c) 2010
                        All Rights Reserved.

  Permission is hereby granted, free of charge, to use and distribute
  this software and its documentation without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of this work, and to
  permit persons to whom this work is furnished to do so, subject to
  the following conditions:
   1. The code must retain the above copyright notice, this list of
      conditions and the following disclaimer.
   2. Any modifications must be clearly marked as such.
   3. Original authors' names are not deleted.
   4. The authors' names are not used to endorse or promote products
      derived from this software without specific prior written
      permission.

  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK
  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE
  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
  THIS SOFTWARE.

=cut

package NER;
BEGIN { push @INC, "/ldc/lib/perl5/site_perl" unless ( grep m{/ldc/lib/perl5/site_perl}, @INC ) }
use List::Util qw(max);
use List::Util qw(min);
use Text::English;
use WebService::Google::Language;
use Encode;
use Encode::Buckwalter;
use Encode qw/encode decode is_utf8/;
use REST::Google::Translate;
use Algorithm::Diff qw(LCS LCS_length LCSidx diff sdiff compact_diff traverse_sequences traverse_balanced );
use String::LCSS_XS qw(lcss lcss_all);

#use WWW::Babelfish;
#use SAMA;
#use DB_File;
#use Fcntl;
#require Encode::Detect;
#use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;

=documentation
This is the documentation for the SYNERGY NER system for Arabic and Swahili, first presented in this paper.

The system is implemented in Perl. It requires the following Perl modules to be installed:

List::Util qw(max)
List::Util qw(min)
Text::English
WebService::Google::Language
Encode
Encode::Buckwalter
Encode qw/encode decode is_utf8/
REST::Google::Translate
Algorithm::Diff qw(LCS LCS_length LCSidx diff sdiff compact_diff traverse_sequences traverse_balanced )
String::LCSS_XS qw(lcss lcss_all)


To use the system, first include the following line in your code:
  use NER;

Then instantiate an object using the following command:

$ner = NER->(%args);

The arguments hash can take the following optional parameters:

align - specifies which alignment to use from {"giza", "en", "src"}. default "giza"
pp - post-processing (0 - off, 1 - on). default 1.
coref - whether to also perform CRR (1) or not  (0). default 0.
translate - which translator to use: google ("g") or bing ("b"). default "g".

Thus a sample usage would look like this:
$ner = NER->new(align => "giza", pp => 1, coref => 0, translate => "g");

Then, to use any function in this module, say X, call it as follows:
$ner->X(@args);

The function to call to perfrom NER is translate, and must be supplied an array of file names. e.g.
$ner->translate("file1_arb.txt", "file2_arb.txt");

File names must end in either "_arb.txt" for Arabic, or "_swh.txt" for Swahili.

Other useful functions include make_table, lbj and stanford. Sample usage:

($stan_table, $lbj_table, $corr_table, $union_table) = $ner->make_table($outfile, @infiles);

First argument must be the name of the outfile, subsequent arguments must be raw input files. By raw input files, we mean English text files without any labeling. They must have _raw at the end, i.e. be named in the following format: xyz_raw.txt

Before using make_table function for some $infile, following commands must be run:
  $ner->lbj($infile);
  $ner->stanford($infile);
 
This will produce the lbj and stanford tagged versions of the raw file. You must have lbj and stanford systems installed, and the folders LbjNerTagger1.11.release and stanford-ner-2009-01-16 must be in your current folder. LBJ files produced will be of the form _lbj.txt and Stanford files produced will be of the form _tempstan.txt and _stanford.txt . Do not modify this.

So, a full use case would look like this:

    for $file(@infiles)
    {
        $ner->lbj($file);
        $ner->stanford($file);
    }
    ($stan_table, $lbj_table, $corr_table, $union_table) = $ner->make_table("table_all.txt", @infiles);

Now you can either use the tables returned in $stan_table, etc. or the output file table_all.txt

If labeled version of the raw file exists, it must be of the form _correct.txt . If so, then $corr_table will be non-empty, otherwise empty string.
=cut

my $in_flag = 0;
my $treesize_i;
my $treesize;
my $ifh;
my $filename = "XXX";

$mytypes{PER}++; $mytypes{LOC}++; $mytypes{ORG}++; $mytypes{MIS}++;
#webservice::google::language - all except ar->en #rest::google::translate - all except en->sw
$service = WebService::Google::Language->new('http://example.com/');
REST::Google::Translate->http_referer('http://example.com');

$debug = STDOUT;

sub new
{

	my $name = shift;
	my %opt = @_;
	
	#control variables: $lang, $translator, $l_compare

	$l_compare = (exists($opt{align})) ? $opt{align} : "giza";
	$l_coref = (exists($opt{coref})) ? $opt{coref} : 0;
	$pp = (exists($opt{pp})) ? $opt{pp} : 1;
	$translator = (exists($opt{translate})) ? $opt{translate} : "g";
	$l_jt = 1;
	
	binmode STDOUT, ":utf8";
	%commons = ();
	open(my $cfh, "<", "commons.txt");
	while(<$cfh>)
	{
		$word = $_;
		chomp($word);
		chop($word) if(substr($word, -1, 1) eq "\n" or substr($word, -1, 1) eq "\r");
		next if($word eq "");
		$commons{$word}++;
	}
	close($cfh);	

	%swh_commons = ();
	open($cfh, "<", "swh_commons.txt");
	while(<$cfh>)
	{
		$word = $_;
		chomp($word);
		chop($word) if(substr($word, -1, 1) eq "\n" or substr($word, -1, 1) eq "\r");
		next if($word eq "");
		$swh_commons{$word}++;
	}
	close($cfh);	

	%arb_commons = ();
	open($cfh, "<", "arb_commons.txt");
	while(<$cfh>)
	{
		$word = $_;
		chomp($word);
		chop($word) if(substr($word, -1, 1) eq "\n" or substr($word, -1, 1) eq "\r");
		next if($word eq "");
		$arb_commons{$word}++;
	}
	close($cfh);	

	%propers = ();
	open(my $pfh, "<", "propers.txt");
	while(<$pfh>)
	{
		$word = $_;
		chomp($word);		
		chop($word) if(substr($word, -1, 1) eq "\n" or substr($word, -1, 1) eq "\r");		
		next if($word eq "");		
		$propers{lc $word}++;
	}
	close($pfh);	

	%swh_propers = ();
	open($pfh, "<", "swh_propers.txt");
	while(<$pfh>)
	{
		$word = $_;
		chomp($word);
		chop($word) if(substr($word, -1, 1) eq "\n" or substr($word, -1, 1) eq "\r");
		next if($word eq "");
		$swh_propers{lc $word}++;
	}
	close($pfh);

	%arb_propers = ();
	open($pfh, "<", "arb_propers.txt");
	while(<$pfh>)
	{
		$word = $_;
		chomp($word);
		chop($word) if(substr($word, -1, 1) eq "\n" or substr($word, -1, 1) eq "\r");
		next if($word eq "");
		$arb_propers{$word}++;
	}
	close($pfh);
	
	warn "NER 1.0 Rushin Shah\n";
	bless { align => $l_compare, coref => $l_coref, pp => , $pp, translate => $translator };
}

sub evaluate
{
	#basically, an augmented version of calcstats
	my $name = shift;
	#my $outfile = shift;
	my @files = @_;
	#open($ofh, ">", $outfile);
	my $sum_tp = 0;
	my $sum_fp = 0;
	my $sum_fn = 0;
	my $sum_tn = 0;
	%c_s = ();
	%c_l = ();
	%c_u = ();
	my $sum_tokens = 0;
	for my $infile (@files)
	{		
		($num, $str_tokens, $str_stan, $str_lbj, $str_union, $str_corr) = sync("sync", $infile);
		if($str_corr eq "" or $str_tcorr eq "")
		{
			print "Correct file doesn't exist for $infile, skipping\n";
			next;
		}

		$sum_tokens += $num;
		@stan_tokens = split "\n", $str_tokens;
		@lbj_tokens = split "\n", $str_tokens;
		@union_tokens = split "\n", $str_tokens;
		@corr_tokens = split "\n", $str_tokens;
		
		@stan_tags = split "\n", $str_stan;
		@lbj_tags = split "\n", $str_lbj;
		@union_tags = split "\n", $str_union;
		@corr_tags = split "\n", $str_corr;
		
		$tp = 0;
		$fp = 0;
		$fn = 0;
		$tn = 0;

		$s_tp = 0;
		$s_fp = 0;
		$s_fn = 0;
		$s_tn = 0;

		$u_tp = 0;
		$u_fp = 0;
		$u_fn = 0;
		$u_tn = 0;
		
		for($i = 0; $i < $num; $i++)
		{			
			if($stan_tags[$i] eq $lbj_tags[$i])
			{
				$union_tags[$i] = $stan_tags[$i];
			}
			if($stan_tags[$i] eq "NOT" and $lbj_tags[$i] ne "NOT")
			{
				$union_tags[$i] = $lbj_tags[$i];
			}
			if($stan_tags[$i] ne "NOT" and $lbj_tags[$i] eq "NOT")
			{
				$union_tags[$i] = $stan_tags[$i];
			}
			if($stan_tags[$i] ne "NOT" and $lbj_tags[$i] ne "NOT" and $stan_tags[$i] ne $lbj_tags[$i])
			{
				$union_tags[$i] = $lbj_tags[$i];
			}

			$tp++ if($corr_tags[$i] ne "NOT" and $union_tags[$i] ne "NOT");
			$fp++ if($corr_tags[$i] eq "NOT" and $union_tags[$i] ne "NOT");
			$fn++ if($corr_tags[$i] ne "NOT" and $union_tags[$i] eq "NOT");
			$tn++ if($corr_tags[$i] eq "NOT" and $union_tags[$i] eq "NOT");

			$s_tp++ if($corr_tags[$i] ne "NOT" and $stan_tags[$i] ne "NOT");
			$s_fp++ if($corr_tags[$i] eq "NOT" and $stan_tags[$i] ne "NOT");
			$s_fn++ if($corr_tags[$i] ne "NOT" and $stan_tags[$i] eq "NOT");
			$s_tn++ if($corr_tags[$i] eq "NOT" and $stan_tags[$i] eq "NOT");
			
			$u_tp++ if($corr_tags[$i] ne "NOT" and $lbj_tags[$i] ne "NOT");
			$u_fp++ if($corr_tags[$i] eq "NOT" and $lbj_tags[$i] ne "NOT");
			$u_fn++ if($corr_tags[$i] ne "NOT" and $lbj_tags[$i] eq "NOT");
			$u_tn++ if($corr_tags[$i] eq "NOT" and $lbj_tags[$i] eq "NOT");

			$c_s{PER}{tp}++ if($corr_tags[$i] eq "PER" and $stan_tags[$i] eq "PER");
			$c_s{PER}{fp}++ if($corr_tags[$i] ne "PER" and $stan_tags[$i] eq "PER");
			$c_s{PER}{fn}++ if($corr_tags[$i] eq "PER" and $stan_tags[$i] ne "PER");
			$c_s{PER}{tn}++ if($corr_tags[$i] ne "PER" and $stan_tags[$i] ne "PER");

			$c_s{LOC}{tp}++ if($corr_tags[$i] eq "LOC" and $stan_tags[$i] eq "LOC");
			$c_s{LOC}{fp}++ if($corr_tags[$i] ne "LOC" and $stan_tags[$i] eq "LOC");
			$c_s{LOC}{fn}++ if($corr_tags[$i] eq "LOC" and $stan_tags[$i] ne "LOC");
			$c_s{LOC}{tn}++ if($corr_tags[$i] ne "LOC" and $stan_tags[$i] ne "LOC");

			$c_s{ORG}{tp}++ if($corr_tags[$i] eq "ORG" and $stan_tags[$i] eq "ORG");
			$c_s{ORG}{fp}++ if($corr_tags[$i] ne "ORG" and $stan_tags[$i] eq "ORG");
			$c_s{ORG}{fn}++ if($corr_tags[$i] eq "ORG" and $stan_tags[$i] ne "ORG");
			$c_s{ORG}{tn}++ if($corr_tags[$i] ne "ORG" and $stan_tags[$i] ne "ORG");

			$c_s{MIS}{tp}++ if($corr_tags[$i] eq "MIS" and $stan_tags[$i] eq "MIS");
			$c_s{MIS}{fp}++ if($corr_tags[$i] ne "MIS" and $stan_tags[$i] eq "MIS");
			$c_s{MIS}{fn}++ if($corr_tags[$i] eq "MIS" and $stan_tags[$i] ne "MIS");
			$c_s{MIS}{tn}++ if($corr_tags[$i] ne "MIS" and $stan_tags[$i] ne "MIS");

			$c_l{PER}{tp}++ if($corr_tags[$i] eq "PER" and $lbj_tags[$i] eq "PER");
			$c_l{PER}{fp}++ if($corr_tags[$i] ne "PER" and $lbj_tags[$i] eq "PER");
			$c_l{PER}{fn}++ if($corr_tags[$i] eq "PER" and $lbj_tags[$i] ne "PER");
			$c_l{PER}{tn}++ if($corr_tags[$i] ne "PER" and $lbj_tags[$i] ne "PER");

			$c_l{LOC}{tp}++ if($corr_tags[$i] eq "LOC" and $lbj_tags[$i] eq "LOC");
			$c_l{LOC}{fp}++ if($corr_tags[$i] ne "LOC" and $lbj_tags[$i] eq "LOC");
			$c_l{LOC}{fn}++ if($corr_tags[$i] eq "LOC" and $lbj_tags[$i] ne "LOC");
			$c_l{LOC}{tn}++ if($corr_tags[$i] ne "LOC" and $lbj_tags[$i] ne "LOC");

			$c_l{ORG}{tp}++ if($corr_tags[$i] eq "ORG" and $lbj_tags[$i] eq "ORG");
			$c_l{ORG}{fp}++ if($corr_tags[$i] ne "ORG" and $lbj_tags[$i] eq "ORG");
			$c_l{ORG}{fn}++ if($corr_tags[$i] eq "ORG" and $lbj_tags[$i] ne "ORG");
			$c_l{ORG}{tn}++ if($corr_tags[$i] ne "ORG" and $lbj_tags[$i] ne "ORG");

			$c_l{MIS}{tp}++ if($corr_tags[$i] eq "MIS" and $lbj_tags[$i] eq "MIS");
			$c_l{MIS}{fp}++ if($corr_tags[$i] ne "MIS" and $lbj_tags[$i] eq "MIS");
			$c_l{MIS}{fn}++ if($corr_tags[$i] eq "MIS" and $lbj_tags[$i] ne "MIS");
			$c_l{MIS}{tn}++ if($corr_tags[$i] ne "MIS" and $lbj_tags[$i] ne "MIS");

			$c_u{PER}{tp}++ if($corr_tags[$i] eq "PER" and $union_tags[$i] eq "PER");
			$c_u{PER}{fp}++ if($corr_tags[$i] ne "PER" and $union_tags[$i] eq "PER");
			$c_u{PER}{fn}++ if($corr_tags[$i] eq "PER" and $union_tags[$i] ne "PER");
			$c_u{PER}{tn}++ if($corr_tags[$i] ne "PER" and $union_tags[$i] ne "PER");

			$c_u{LOC}{tp}++ if($corr_tags[$i] eq "LOC" and $union_tags[$i] eq "LOC");
			$c_u{LOC}{fp}++ if($corr_tags[$i] ne "LOC" and $union_tags[$i] eq "LOC");
			$c_u{LOC}{fn}++ if($corr_tags[$i] eq "LOC" and $union_tags[$i] ne "LOC");
			$c_u{LOC}{tn}++ if($corr_tags[$i] ne "LOC" and $union_tags[$i] ne "LOC");

			$c_u{ORG}{tp}++ if($corr_tags[$i] eq "ORG" and $union_tags[$i] eq "ORG");
			$c_u{ORG}{fp}++ if($corr_tags[$i] ne "ORG" and $union_tags[$i] eq "ORG");
			$c_u{ORG}{fn}++ if($corr_tags[$i] eq "ORG" and $union_tags[$i] ne "ORG");
			$c_u{ORG}{tn}++ if($corr_tags[$i] ne "ORG" and $union_tags[$i] ne "ORG");

			$c_u{MIS}{tp}++ if($corr_tags[$i] eq "MIS" and $union_tags[$i] eq "MIS");
			$c_u{MIS}{fp}++ if($corr_tags[$i] ne "MIS" and $union_tags[$i] eq "MIS");
			$c_u{MIS}{fn}++ if($corr_tags[$i] eq "MIS" and $union_tags[$i] ne "MIS");
			$c_u{MIS}{tn}++ if($corr_tags[$i] ne "MIS" and $union_tags[$i] ne "MIS");
		}
		
		$prec = $s_tp / ($s_tp + $s_fp);
		$recl = $s_tp / ($s_tp + $s_fn);
		$f1 = 2 * $prec * $recl / ($prec + $recl);	
		print "Tokens: $num Cumulative: $sum_tokens\n";
		printf "Stan: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $prec, $recl, $f1;

		$prec = $u_tp / ($u_tp + $u_fp);
		$recl = $u_tp / ($u_tp + $u_fn);
		$f1 = 2 * $prec * $recl / ($prec + $recl);		
		printf "LBJ: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $prec, $recl, $f1;

		$prec = $tp / ($tp + $fp);
		$recl = $tp / ($tp + $fn);
		$f1 = 2 * $prec * $recl / ($prec + $recl);		
		printf "Union: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n\n", $prec, $recl, $f1;
		
		$sum_s_tp += $s_tp;
		$sum_s_fp += $s_fp;
		$sum_s_fn += $s_fn;
		$sum_s_tn += $s_tn;

		$sum_u_tp += $u_tp;
		$sum_u_fp += $u_fp;
		$sum_u_fn += $u_fn;
		$sum_u_tn += $u_tn;

		$sum_tp += $tp;
		$sum_fp += $fp;
		$sum_fn += $fn;
		$sum_tn += $tn;
		
	}

	$c_s{PER}{prec} = $c_s{PER}{tp} / ($c_s{PER}{tp} + $c_s{PER}{fp});
	$c_s{PER}{recl} = $c_s{PER}{tp} / ($c_s{PER}{tp} + $c_s{PER}{fn});
	$c_s{PER}{f1} = 2 * $c_s{PER}{prec} * $c_s{PER}{recl} / ($c_s{PER}{prec} + $c_s{PER}{recl});		

	$c_s{LOC}{prec} = $c_s{LOC}{tp} / ($c_s{LOC}{tp} + $c_s{LOC}{fp});
	$c_s{LOC}{recl} = $c_s{LOC}{tp} / ($c_s{LOC}{tp} + $c_s{LOC}{fn});
	$c_s{LOC}{f1} = 2 * $c_s{LOC}{prec} * $c_s{LOC}{recl} / ($c_s{LOC}{prec} + $c_s{LOC}{recl});		

	$c_s{ORG}{prec} = $c_s{ORG}{tp} / ($c_s{ORG}{tp} + $c_s{ORG}{fp});
	$c_s{ORG}{recl} = $c_s{ORG}{tp} / ($c_s{ORG}{tp} + $c_s{ORG}{fn});
	$c_s{ORG}{f1} = 2 * $c_s{ORG}{prec} * $c_s{ORG}{recl} / ($c_s{ORG}{prec} + $c_s{ORG}{recl});		

	$c_s{MIS}{prec} = $c_s{MIS}{tp} / ($c_s{MIS}{tp} + $c_s{MIS}{fp});
	$c_s{MIS}{recl} = $c_s{MIS}{tp} / ($c_s{MIS}{tp} + $c_s{MIS}{fn});
	$c_s{MIS}{f1} = 2 * $c_s{MIS}{prec} * $c_s{MIS}{recl} / ($c_s{MIS}{prec} + $c_s{MIS}{recl});		
	printf "Stan PER: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_s{PER}{prec}, $c_s{PER}{recl}, $c_s{PER}{f1};
	printf "Stan LOC: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_s{LOC}{prec}, $c_s{LOC}{recl}, $c_s{LOC}{f1};
	printf "Stan ORG: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_s{ORG}{prec}, $c_s{ORG}{recl}, $c_s{ORG}{f1};
	printf "Stan MIS: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_s{MIS}{prec}, $c_s{MIS}{recl}, $c_s{MIS}{f1};
	$c_s{AVG}{f1} = ($c_s{PER}{f1} + $c_s{LOC}{f1} + $c_s{ORG}{f1} + $c_s{MIS}{f1}) / 4;
	print "Stan AVG f1 score: ", $c_s{AVG}{f1}, "\n";

	$prec = $sum_s_tp / ($sum_s_tp + $sum_s_fp);
	$recl = $sum_s_tp / ($sum_s_tp + $sum_s_fn);
	$f1 = 2 * $prec * $recl / ($prec + $recl);
	printf "Overall Stan Precision: %.3f Recall: %.3f F1: %.3f\n", $prec, $recl, $f1;

	$c_l{PER}{prec} = $c_l{PER}{tp} / ($c_l{PER}{tp} + $c_l{PER}{fp});
	$c_l{PER}{recl} = $c_l{PER}{tp} / ($c_l{PER}{tp} + $c_l{PER}{fn});
	$c_l{PER}{f1} = 2 * $c_l{PER}{prec} * $c_l{PER}{recl} / ($c_l{PER}{prec} + $c_l{PER}{recl});		

	$c_l{LOC}{prec} = $c_l{LOC}{tp} / ($c_l{LOC}{tp} + $c_l{LOC}{fp});
	$c_l{LOC}{recl} = $c_l{LOC}{tp} / ($c_l{LOC}{tp} + $c_l{LOC}{fn});
	$c_l{LOC}{f1} = 2 * $c_l{LOC}{prec} * $c_l{LOC}{recl} / ($c_l{LOC}{prec} + $c_l{LOC}{recl});		

	$c_l{ORG}{prec} = $c_l{ORG}{tp} / ($c_l{ORG}{tp} + $c_l{ORG}{fp});
	$c_l{ORG}{recl} = $c_l{ORG}{tp} / ($c_l{ORG}{tp} + $c_l{ORG}{fn});
	$c_l{ORG}{f1} = 2 * $c_l{ORG}{prec} * $c_l{ORG}{recl} / ($c_l{ORG}{prec} + $c_l{ORG}{recl});		

	$c_l{MIS}{prec} = $c_l{MIS}{tp} / ($c_l{MIS}{tp} + $c_l{MIS}{fp});
	$c_l{MIS}{recl} = $c_l{MIS}{tp} / ($c_l{MIS}{tp} + $c_l{MIS}{fn});
	$c_l{MIS}{f1} = 2 * $c_l{MIS}{prec} * $c_l{MIS}{recl} / ($c_l{MIS}{prec} + $c_l{MIS}{recl});		
	printf "LBJ PER: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_l{PER}{prec}, $c_l{PER}{recl}, $c_l{PER}{f1};
	printf "LBJ LOC: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_l{LOC}{prec}, $c_l{LOC}{recl}, $c_l{LOC}{f1};
	printf "LBJ ORG: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_l{ORG}{prec}, $c_l{ORG}{recl}, $c_l{ORG}{f1};
	printf "LBJ MIS: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_l{MIS}{prec}, $c_l{MIS}{recl}, $c_l{MIS}{f1};
	$c_l{AVG}{f1} = ($c_l{PER}{f1} + $c_l{LOC}{f1} + $c_l{ORG}{f1} + $c_l{MIS}{f1}) / 4;
	print "LBJ AVG f1 score: ", $c_l{AVG}{f1}, "\n";

	$prec = $sum_u_tp / ($sum_u_tp + $sum_u_fp);
	$recl = $sum_u_tp / ($sum_u_tp + $sum_u_fn);
	$f1 = 2 * $prec * $recl / ($prec + $recl);
	printf "Overall LBJ Precision: %.3f Recall: %.3f F1: %.3f\n", $prec, $recl, $f1;

	$c_u{PER}{prec} = $c_u{PER}{tp} / ($c_u{PER}{tp} + $c_u{PER}{fp});
	$c_u{PER}{recl} = $c_u{PER}{tp} / ($c_u{PER}{tp} + $c_u{PER}{fn});
	$c_u{PER}{f1} = 2 * $c_u{PER}{prec} * $c_u{PER}{recl} / ($c_u{PER}{prec} + $c_u{PER}{recl});		

	$c_u{LOC}{prec} = $c_u{LOC}{tp} / ($c_u{LOC}{tp} + $c_u{LOC}{fp});
	$c_u{LOC}{recl} = $c_u{LOC}{tp} / ($c_u{LOC}{tp} + $c_u{LOC}{fn});
	$c_u{LOC}{f1} = 2 * $c_u{LOC}{prec} * $c_u{LOC}{recl} / ($c_u{LOC}{prec} + $c_u{LOC}{recl});		

	$c_u{ORG}{prec} = $c_u{ORG}{tp} / ($c_u{ORG}{tp} + $c_u{ORG}{fp});
	$c_u{ORG}{recl} = $c_u{ORG}{tp} / ($c_u{ORG}{tp} + $c_u{ORG}{fn});
	$c_u{ORG}{f1} = 2 * $c_u{ORG}{prec} * $c_u{ORG}{recl} / ($c_u{ORG}{prec} + $c_u{ORG}{recl});		

	$c_u{MIS}{prec} = $c_u{MIS}{tp} / ($c_u{MIS}{tp} + $c_u{MIS}{fp});
	$c_u{MIS}{recl} = $c_u{MIS}{tp} / ($c_u{MIS}{tp} + $c_u{MIS}{fn});
	$c_u{MIS}{f1} = 2 * $c_u{MIS}{prec} * $c_u{MIS}{recl} / ($c_u{MIS}{prec} + $c_u{MIS}{recl});		
	printf "Union PER: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_u{PER}{prec}, $c_u{PER}{recl}, $c_u{PER}{f1};
	printf "Union LOC: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_u{LOC}{prec}, $c_u{LOC}{recl}, $c_u{LOC}{f1};
	printf "Union ORG: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_u{ORG}{prec}, $c_u{ORG}{recl}, $c_u{ORG}{f1};
	printf "Union MIS: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_u{MIS}{prec}, $c_u{MIS}{recl}, $c_u{MIS}{f1};
	$c_u{AVG}{f1} = ($c_u{PER}{f1} + $c_u{LOC}{f1} + $c_u{ORG}{f1} + $c_u{MIS}{f1}) / 4;
	print "Union AVG f1 score: ", $c_u{AVG}{f1}, "\n";
	
	$prec = $sum_tp / ($sum_tp + $sum_fp);
	$recl = $sum_tp / ($sum_tp + $sum_fn);
	$f1 = 2 * $prec * $recl / ($prec + $recl);
	printf "Overall Union Precision: %.3f Recall: %.3f F1: %.3f\n\n", $prec, $recl, $f1;
	print "Cumulative: $sum_tokens\n";
	#close($ofh);	
}

sub sync
{
	my $name = shift;
	my ($infile) = @_;

	$stanfile = $infile;
	$stanfile =~ s/_raw/_tempstan/g; 	
	$lbjfile = $infile;
	$lbjfile =~ s/_raw/_lbj/g; 
	$corrfile = $infile;
	$corrfile =~ s/_raw/_correct/g; 
	$coreffile = $infile . ".LBJ";
	
	{
		open(RAW, "<", $infile) || die "cannot open file $infile";
		local $/;
		$doc = <RAW>;
		close(RAW);
	}

	@raw_tokens = split /\s+/, $doc;
	$num = scalar @raw_tokens;	
	$str_traw = join "\n", @raw_tokens;
	
	@raw_stags = ();
	@raw_ltags = ();
	@raw_utags = ();
	@raw_ctags = ();	
			
	if(-e $stanfile)
	{

		{
			open($ifh, "<", $stanfile) || warn "sync: cannot open file $stanfile";
			local $/;
			$s_str = <$ifh>;
			close($ifh);
		}
		
		@stan_tagstokens = split /\s+/, $s_str;
		@stan_tokens = ();
		@stan_tags = ();
		
		for(@stan_tagstokens)
		{
			($token, $tag) = split "/", $_;
			#\ | ( ) [ { ^ $ * + ? . or $token =~ /\s+/ or $token =~/[:;,\.\[|"']+/
			#next if($l_compare eq "giza" and ($token eq "" or $token eq "-LRB-" or $token eq "-RRB-")); #or $token eq "s" or $token !~ /\w+/		
			#next if($token eq "" or $token eq "-LRB-" or $token eq "-RRB-" or $token eq "'s" or $token eq "s" or $token !~ /\w+/);
			#$token =~ s/\W//g;
			$tag = ($tag eq "O") ? "NOT" : substr($tag, 0, 3);
			$tag = "MIS" if($lang eq "sw" and (lc $token eq "january" or lc $token eq "february" or lc $token eq "march" or lc $token eq "april" or lc $token eq "may" or lc $token eq "june" or lc $token eq "july" or lc $token eq "august" or lc $token eq "september" or lc $token eq "october" or lc $token eq "november" or lc $token eq "december" or lc $token eq "sunday" or lc $token eq "monday" or lc $token eq "tuesday" or lc $token eq "wednesday" or lc $token eq "thursday" or lc $token eq "friday" or lc $token eq "saturday"));

			#$tag = "MIS" if($swh_propers{lc $token} ne "" and $swh_commons{lc $token} eq "");
			#$tag = "MIS" if($propers{lc $token} ne "" and $commons{lc $token} eq "");		
			
			push @stan_tokens, $token;
			push @stan_tags, $tag;
		}

		for (my $i = 0; $i < scalar @raw_tokens; $i++)
		{
			if($raw_tokens[$i] eq $stan_tokens[$i])
			{
				$raw_stags[$i] = $stan_tags[$i];
			}
			if($stan_tokens[$i] eq "-LRB-" or $stan_tokens[$i] eq "-RRB-")
			{
				splice @stan_tokens, $i, 1;
				splice @stan_tags, $i, 1;
			}
			if($raw_tokens[$i] ne $stan_tokens[$i])
			{
				$temptag = "NOT";
				$tempstr = "";
				$flag = 0;
				for($j = 0; $j <= 5; $j++)
				{
					$temptag = $stan_tags[$i+$j] if($stan_tags[$i+$j] ne "NOT");
					$tempstr = $tempstr . $stan_tokens[$i + $j];
					if($tempstr eq $raw_tokens[$i] or "(" . $tempstr eq $raw_tokens[$i])
					{
						$flag = 1;
						last;
					}
				}
				if($flag == 1)
				{
					$raw_stags[$i] = $temptag;				
					$stan_tokens[$i] = $raw_tokens[$i];
					$stan_tags[$i] = $raw_stags[$i];
					splice @stan_tokens, $i+1, $j;
					splice @stan_tags, $i+1, $j;				
				}
				else
				{
					$temptag = "NOT";
					for($j = 1; $j <= 5; $j++)
					{
						$temptag = $stan_tags[$i+$j] if($stan_tags[$i+$j] ne "NOT");
						if($raw_tokens[$i] eq $stan_tokens[$i+$j])
						{
							$flag = 1;
							last;
						}
					}
					if($flag == 1)
					{
						splice @stan_tokens, $i, $j;
						splice @stan_tags, $i, $j;
						$raw_stags[$i] = $temptag;
					}				
					else
					{
						$raw_stags[$i] = $stan_tags[$i];
					}				
				}
			}
			#print "stan_sync: |$raw_tokens[$i]|$stan_tokens[$i]|\n";
		}
		$str_sraw = join "\n", @raw_stags;	
	}
	else
	{
		$str_sraw = "";
	}

	if(-e $lbjfile)
	{
	
		{
			open($ifh, "<", $lbjfile) || warn "sync: cannot open file $lbjfile";
			local $/;
			$l_str = <$ifh>;
			close($ifh);
		}

		@lbj_tagstext = split /[\[\]]/, $l_str;
		@lbj_tokens = ();
		@lbj_tags = ();
		for my $chunk (@lbj_tagstext)
		{
			if($chunk =~ /^PER / or $chunk =~ /^LOC / or $chunk =~ /^ORG / or $chunk =~ /^MISC /)
			{
				$tag = substr($chunk, 0, 3);
				$chunk = ($chunk=~ /MISC/) ? substr($chunk, 5, -2) : substr($chunk, 4, -2);
			}
			else
			{
				$tag = "NOT";
			}
			$chunk =~ s/—/ /g;
			@parts = split /\s+/, $chunk;
			for my $token(@parts)
			{
				#next if($token eq "" or $token eq "s" or $token !~ /\w+/);
				#$token =~ s/\W//g;
				push @lbj_tokens, $token;
				push @lbj_tags, $tag;
			}		
		}

		for (my $i = 0; $i < scalar @raw_tokens; $i++)
		{
			if($raw_tokens[$i] eq $lbj_tokens[$i])
			{
				$raw_ltags[$i] = $lbj_tags[$i];
			}
			if($lbj_tokens[$i] eq "-LRB-" or $lbj_tokens[$i] eq "-RRB-")
			{
				splice @lbj_tokens, $i, 1;
				splice @lbj_tags, $i, 1;
			}
			if($raw_tokens[$i] ne $lbj_tokens[$i])
			{
				$temptag = "NOT";
				$tempstr = "";
				$flag = 0;
				for($j = 0; $j <= 5; $j++)
				{
					$temptag = $lbj_tags[$i+$j] if($lbj_tags[$i+$j] ne "NOT");
					$tempstr = $tempstr . $lbj_tokens[$i + $j];
					if($tempstr eq $raw_tokens[$i] or "(" . $tempstr eq $raw_tokens[$i])
					{
						$flag = 1;
						last;
					}
				}
				if($flag == 1)
				{
					$raw_ltags[$i] = $temptag;				
					$lbj_tokens[$i] = $raw_tokens[$i];
					$lbj_tags[$i] = $raw_ltags[$i];
					splice @lbj_tokens, $i+1, $j;
					splice @lbj_tags, $i+1, $j;				
				}
				else
				{
					$temptag = "NOT";
					for($j = 1; $j <= 5; $j++)
					{
						$temptag = $lbj_tags[$i+$j] if($lbj_tags[$i+$j] ne "NOT");
						if($raw_tokens[$i] eq $lbj_tokens[$i+$j])
						{
							$flag = 1;
							last;
						}
					}
					if($flag == 1)
					{
						splice @lbj_tokens, $i, $j;
						splice @lbj_tags, $i, $j;
						$raw_ltags[$i] = $temptag;
					}				
					else
					{
						$raw_ltags[$i] = $lbj_tags[$i];
					}				
				}
			}
			#print "lbj_sync: |$raw_tokens[$i]|$lbj_tokens[$i]|\n";
		}
		$str_lraw = join "\n", @raw_ltags;
	}
	else
	{
		$str_lraw = "";
	}
	
	if(-e $stanfile and -e $lbjfile)
	{
		for (my $i = 0; $i < scalar @raw_tokens; $i++)
		{
			if($raw_stags[$i] ne "NOT" or $raw_ltags[$i] ne "NOT")
			{
				$raw_utags[$i] = $raw_stags[$i] if($raw_stags[$i] ne "NOT");
				$raw_utags[$i] = $raw_ltags[$i] if($raw_ltags[$i] ne "NOT");
			}
			else
			{
				$raw_utags[$i] = "NOT";
			}
		}
		$str_uraw = join "\n", @raw_utags;
	}
	else
	{
		$str_uraw = "";
	}

	if(-e $corrfile)
	{
		
		{
			open($cfh, "<", $corrfile);
			local $/;
			$c_str = <$cfh>;
			close($cfh);
		}

		@corr_tagstext = split /[\[\]]/, $c_str;
		@corr_tokens = ();
		@corr_tags = ();
		for my $chunk (@corr_tagstext)
		{
			if($chunk =~ /^PER / or $chunk =~ /^LOC / or $chunk =~ /^ORG / or $chunk =~ /^MISC /)
			{
				$tag = substr($chunk, 0, 3);
				$chunk = ($chunk=~ /MISC/) ? substr($chunk, 5, -2) : substr($chunk, 4, -2);
			}
			else
			{
				$tag = "NOT";
			}
			$chunk =~ s/—/ /g;
			@parts = split /\s+/, $chunk;
			for my $token(@parts)
			{
				#next if($token eq "" or $token eq "s" or $token !~ /\w+/);
				#$token =~ s/\W//g;
				push @corr_tokens, $token;
				push @corr_tags, $tag;
			}		
		}
	
		for (my $i = 0; $i < scalar @raw_tokens; $i++)
		{
			if($raw_tokens[$i] eq $corr_tokens[$i])
			{
				$raw_ctags[$i] = $corr_tags[$i];
			}
			if($corr_tokens[$i] eq "-LRB-" or $corr_tokens[$i] eq "-RRB-")
			{
				splice @corr_tokens, $i, 1;
				splice @corr_tags, $i, 1;
			}
			if($raw_tokens[$i] ne $corr_tokens[$i])
			{
				$temptag = "NOT";
				$tempstr = "";
				$flag = 0;
				for($j = 0; $j <= 5; $j++)
				{
					$temptag = $lbj_tags[$i+$j] if($corr_tags[$i+$j] ne "NOT");
					$tempstr = $tempstr . $corr_tokens[$i + $j];
					if($tempstr eq $raw_tokens[$i] or "(" . $tempstr eq $raw_tokens[$i])
					{
						$flag = 1;
						last;
					}
				}
				if($flag == 1)
				{
					$raw_ctags[$i] = $temptag;				
					$corr_tokens[$i] = $raw_tokens[$i];
					$corr_tags[$i] = $raw_ctags[$i];
					splice @corr_tokens, $i+1, $j;
					splice @corr_tags, $i+1, $j;				
				}
				else
				{
					$temptag = "NOT";
					for($j = 1; $j <= 5; $j++)
					{
						$temptag = $corr_tags[$i+$j] if($corr_tags[$i+$j] ne "NOT");
						if($raw_tokens[$i] eq $corr_tokens[$i+$j])
						{
							$flag = 1;
							last;
						}
					}
					if($flag == 1)
					{
						splice @corr_tokens, $i, $j;
						splice @corr_tags, $i, $j;
						$raw_ctags[$i] = $temptag;
					}				
					else
					{
						$raw_ctags[$i] = $corr_tags[$i];
					}				
				}
			}
			#print "corr_sync: |$raw_tokens[$i]|$corr_tokens[$i]|\n";
		}
		$str_craw = join "\n", @raw_ctags;	
	}
	else
	{
		$str_craw = "";
	}

	if(-e $coreffile)
	{
		my $viewoutput_LBJ = "$infile.LBJ_list";
		my $mapfile = "$infile.MAP";
		warn "Extract_lbj: Processing... $infile\n";
		my %hash = ();
		my %counta = ();
		my @row;
		@coref_tokens = ();

		############################################
		############# -- Load LBJ -- ###############
		############################################
		open(LM, "<", $coreffile);
		open(TMP, ">", "$coreffile.tmp");
		while (<LM>) 
		{
		  @row = split(//, $_);
		}
		my $i = 0;
		#print scalar @row, "\n";
		while ($i < scalar(@row) -1) {
			# -- *
			if (($row[$i] eq "*") && ($row[$i+1] ne "_")) {
				print TMP "<coref>\n";
			}
			# -- *_
			elsif (($row[$i] eq "*") && ($row[$i+1] eq "_")) {
				print TMP "\n</coref set";
				while (($row[$i+1] ne "*") && ($row[$i+1] ne " ")) {
					#last if($i > scalar(@row) -1);
					print TMP $row[$i+1];
					$i++;					
				}
				print TMP ">";
			}
			# -- normal case
			elsif ($row[$i] eq " ") {
				print TMP "\n";
			}
			else {
				print TMP $row[$i];
			}			
			$i++;
			#print "i $i\n";
		}
		close(LM);
		close(TMP);
		#print "got out\n";
		my $id = "";
		my $word = "";
		my $name = "";
		my @stack = ();
		my $pos = 0;
		my $wnum = 0;
		my $num = 0;
		my $result;
		my $found;
		open(LM, "<", "$coreffile.tmp");
		while (<LM>) 
		{
		  chomp;
		  if (index($_,"<coref>") >= 0) {
			  #$id = substr($_, index($_,"<coref>")+11);
			  #$id = substr($id, 0, index($id,">"));
			  #print $id."\n";
			  
			  #push(@stack, $id);
			  push(@stack, length($name));
			  push(@stack, $wnum);
		  }
		  else {
			  if (index($_,"</coref") < 0) {
				  $word = $_;
				  if ($word =~ m/[a-zA-Z0-9]/) {
					 #print $word."\n";
					 $wnum ++;
					 while (!(substr($word,-1) =~ m/[a-zA-Z0-9]/)) {
						 $word = substr($word, 0, length($word)-1);
					 }
					 push(@coref_tokens, $word);
					 if (scalar(@stack)>0) {
						$name = $name.$word." ";
					 }
				  }
			  }
			  
			  if (index($_, "</coref") >= 0) {
				  #push(@stack, $id);
				  $num = pop(@stack);
				  $pos = pop(@stack);
				  #$id = pop(@stack);
				  $id = substr($_, index($_,"</coref set_")+12);
				  $id = substr($id, 0, index($id,">"));
				  $hash{$num."\t".($wnum - $num)."\t".substr($name, $pos)."\t".$id} = $num;
				  $counta{$id} ++;
			  #	  print $num."\t".($wnum - $num)."\t".substr($name, $pos)."\t".$id."\n";
			  }
		  }		  
		}
		close(LM);

		#***************************NEW CODE ***************************
		$k = 0;
		@track = ();
		for (my $i = 0; $i < scalar @raw_tokens; $i++)
		{
			if($raw_tokens[$i] eq $coref_tokens[$k])
			{
				#$raw_ctags[$i] = $corr_tags[$i];
				$track[$k] = $i;
			}
			if($raw_tokens[$i] ne $coref_tokens[$k])
			{
				#$temptag = "NOT";
				$tempstr = "";
				$flag = 0;
				for($j = 0; $j <= 5; $j++)
				{
					#$temptag = $lbj_tags[$i+$j] if($corr_tags[$i+$j] ne "NOT");
					$tempstr = $tempstr . $coref_tokens[$k + $j];
					if($tempstr eq $raw_tokens[$i] or "(" . $tempstr eq $raw_tokens[$i])
					{
						$flag = 1;
						last;
					}
				}
				if($flag == 1)
				{
					#$raw_ctags[$i] = $temptag;				
					$track[$k] = $i;
					for($l = 1; $l <= $j; $l++)
					{
						$k++;
						$track[$k] = $i;
					}
					#$corr_tags[$i] = $raw_ctags[$i];
					#splice @coref_tokens, $i+1, $j;
					#print "Case 1\n";
					#splice @corr_tags, $i+1, $j;				
				}
				else
				{
					#$temptag = "NOT";
					if($raw_tokens[$i] eq $coref_tokens[$k-1])
					{
						$k--;
						$track[$k] = $i;
						#splice @coref_tokens, $i-1, 0, "NULL";
						#print "Case 2\n";
					}
					else
					{
						for($j = 1; $j <= 5; $j++)
						{
							#$temptag = $corr_tags[$i+$j] if($corr_tags[$i+$j] ne "NOT");
							if($raw_tokens[$i] eq $coref_tokens[$k+$j])
							{
								$flag = 1;
								last;
							}
						}
						if($flag == 1)
						{
							$track[$k] = $i;
							for($l = 1; $l <= $j; $l++)
							{	
								$k++;							
								$track[$k] = $i;								
							}

							#splice @coref_tokens, $i, $j;
							#print "Case 3\n";
							#splice @corr_tags, $i, $j;
							#$raw_ctags[$i] = $temptag;
						}				
						else
						{
							#$raw_ctags[$i] = $corr_tags[$i];
							$track[$k] = $i;								
						}
					}					
				}
			}
			#print "coref_sync: |$raw_tokens[$i]|$coref_tokens[$k]|\n";
			$k++;				
		}
		open(my $mfh, ">", $mapfile);
		for (my $i = 0; $i < scalar @coref_tokens; $i++)
		{
			print $mfh "$i\t$track[$i]\t$coref_tokens[$i]\t$raw_tokens[$track[$i]]\n";
		}
		close($mfh);
		#print "File ends " . scalar @raw_tokens . " $k\n";
		#print $wnum."\n";
		#print $tokens[324]."\n";
		#print scalar(@tokens)."\n";
		#print "LBJ Loaded\n";

		############################################
		########## -- Formatting List -- ###########
		############################################
		my $key;
		open(OUTFILE, ">", $viewoutput_LBJ) || die "cannot write file\n";
		foreach $key ( sort { $hash{$a} <=> $hash{$b} } keys %hash ) {
			my @tmpk = split(/\t/,$key);
			$num = $tmpk[0];
			$start = $track[$num];
			$len = $tmpk[1];
			$id = $tmpk[3];
			$end = $track[$num+$len-1];
			$word = "";
			for($i = $start; $i <= $end; $i++)
			{
				$word = ($word eq "") ? $raw_tokens[$i] : $word . " " . $raw_tokens[$i];
			}
			$newlen = $end - $start + 1;
			print OUTFILE "$start\t$newlen\t$word\t$id\n";
			#print OUTFILE $key. "\n";		#old line
		} 
		close(OUTFILE);	
		#*******************************NEW CODE ENDS *****************
	}	
	return ($num, $str_traw, $str_sraw, $str_lraw, $str_uraw, $str_craw);
}

sub make_list
{
	my $name = shift;
	my ($en_file, $src_file) = @_;
	($en_num, $en_str_tokens, $en_str_stan, $en_str_lbj, $en_str_union, $en_str_corr) = sync("sync", $en_file);
	@en_stan_tokens = split "\n", $en_str_tokens;
	@en_lbj_tokens = split "\n", $en_str_tokens;
	@en_union_tokens = split "\n", $en_str_tokens;
	@en_corr_tokens = split "\n", $en_str_tokens;
	
	@en_stan_tags = split "\n", $en_str_stan;
	@en_lbj_tags = split "\n", $en_str_lbj;
	@en_union_tags = split "\n", $en_str_union;
	@en_corr_tags = split "\n", $en_str_corr;	
	
	($src_num, $src_str_tokens, $src_str_stan, $src_str_lbj, $src_str_union, $src_str_corr) = sync("sync", $src_file);
	@src_stan_tokens = split "\n", $src_str_tokens;
	@src_lbj_tokens = split "\n", $src_str_tokens;	
	@src_union_tokens = split "\n", $src_str_tokens;
	@src_corr_tokens = split "\n", $src_str_tokens;

	@src_stan_tags = split "\n", $src_str_stan;
	@src_lbj_tags = split "\n", $src_str_lbj;
	@src_union_tags = split "\n", $src_str_union;
	@src_corr_tags = split "\n", $src_str_corr;
	
	#en_corr and src_union
	my %en_list = ();
	my %src_list = ();
	my %comb_list = ();
	for($i = 0; $i < $src_num; $i++)
	{
		if($src_union_tags[$i] ne "NOT")
		{
			#print "src: $src_union_tokens[$i] Stem: " , Text::English::stem($src_union_tokens[$i]), "\n";
			@stem  = Text::English::stem($src_union_tokens[$i]);
			$src_list{$stem[0]}++;
			$comb_list{$stem[0]}++;
		}
	}
	$srclist = join ",", keys %src_list;	
	if($en_str_corr eq "" or $en_str_tcorr eq "")
	{
		print "Correct file doesn't exist for $en_file, so using stan file\n";
		for($i = 0; $i < $en_num; $i++)
		{
			if($en_union_tags[$i] ne "NOT")
			{
				#print "en: $en_corr_tokens[$i] Stem: " , Text::English::stem($en_corr_tokens[$i]), "\n";
				@stem = Text::English::stem($en_union_tokens[$i]);
				$en_list{$stem[0]}++;
				$comb_list{$stem[0]}++;
			}
		}
		#return (0,0,0);
	}
	else
	{
		for($i = 0; $i < $en_num; $i++)
		{
			if($en_corr_tags[$i] ne "NOT")
			{
				#print "en: $en_corr_tokens[$i] Stem: " , Text::English::stem($en_corr_tokens[$i]), "\n";
				@stem = Text::English::stem($en_corr_tokens[$i]);
				$en_list{$stem[0]}++;
				$comb_list{$stem[0]}++;
			}
		}
	}
	$enlist = join ",", keys %en_list;
	
	return(compare_lists("compare_lists", \%en_list, \%src_list, \%comb_list));
}

sub giza
{
	my $name = shift;
	my $lang = shift;
	my $in_cfile = shift;  #in_corpus.txt
	my $out_cfile = shift; #out_corpus.txt
	my $align_pref = shift;
	
	my @files = @_;
		
	#make parallel input and output corpora
	unlink($in_cfile) if(-e $in_cfile);
	unlink($out_cfile) if(-e $out_cfile);
	if(lc $^O eq "mswin32")
	{
		open($ofh_i, ">>", $in_cfile);
		open($ofh_o, ">>", $out_cfile);
	}
	for my $file (@files)
	{
		my $incorp = $file;
		if($lang eq "sw")
		{
			$incorp =~ s/\/alasiri/\/sw_alasiri/g;
			$incorp =~ s/_swh/_raw/g;
		}
		else
		{
			($incorp =~ /_arb/) ? $incorp =~ s/_arb/_raw/g : $incorp =~ s/-normalized-derivations/_raw/g;			
		}
		if(lc $^O eq "mswin32")
		{
			$tc = 0;
			open($ifh, "<",$incorp);
			while(<$ifh>)
			{
				print $ofh_i $_;
				$tc++;
			}
			push @tcs1, $tc;
			close($ifh);			
		}
		else
		{			
			`cat $incorp >> $in_cfile`;
		}				
		
	}
	for my $file (@files)
	{
		my $outcorp = $file;
		if($lang eq "sw")
		{
			$outcorp =~ s/\/alasiri/\/sw_alasiri/g;	
		}
		else
		{
			$outcorp =~ s/-normalized-derivations/_arb/g if($outcorp !~ /_arb/);
		}
		if(lc $^O eq "mswin32")
		{
			open($ifh, "<",$outcorp);
			$tc2 = 0;
			while(<$ifh>)
			{
				print $ofh_o $_;
				$tc2++;
			}
			push @tcs2, $tc2;
			close($ifh);			
		}
		else
		{
			`cat $outcorp >> $out_cfile`;			
		}				
	}
	if(lc $^O eq "mswin32")
	{
		close($ofh_i);
		close($ofh_o);
	}	
	for($tc = 0; $tc < 50; $tc++)
	{
		print "file: $files[$tc] 1: $tcs1[$tc] 2: $tcs2[$tc]\n" if($tcs1[$tc] != $tcs2[$tc]);
	}
	
	#run GIZA++. by default we've used in_corpus3, out_corpus3, dic3 (i.e. dic3.AA3.final)
	$inc_pref = $in_cfile;
	$outc_pref = $out_cfile;	
	$inc_pref =~ s/\.txt//g;
	$outc_pref =~ s/\.txt//g;
	
	system("giza-pp/GIZA++-v2/plain2snt.out $inc_pref.txt $outc_pref.txt");
	system("giza-pp/mkcls-v2/mkcls -p$inc_pref.txt -V$inc_pref.vcb.classes");	
	system("giza-pp/mkcls-v2/mkcls -p$outc_pref.txt -V$outc_pref.vcb.classes");
	system("giza-pp/GIZA++-v2/GIZA++ -S $inc_pref.vcb -T $outc_pref.vcb -C $inc_pref\_$outc_pref.snt -O $align_pref");
	#print "Error: $!\n";
	
	#break AA3.final into files into individual files	
	my $cfile = $align_pref . ".AA3.final";
	open($cfh, "<", $cfile);	#"dic3.AA3.final"
	my $l_total;
	for my $file (@files)
	{
		my $infile = $file;
		if($lang eq "sw")
		{
			$infile =~ s/\/alasiri/\/sw_alasiri/g;
		}
		else
		{
			$infile =~ s/-normalized-derivations/_arb/g if($infile !~ /_arb/);
		}
		my $l_count = 0;
		open(my $ifh,  "<", $infile);
		while(<$ifh>)
		{
			$l_count++;
		}
		close($ifh);
		$l_total += $l_count;
		#print "$file: $l_count lines, $l_total total\n";
		my $outfile = $file;
		if($lang eq "sw")
		{
			$outfile =~ s/\/alasiri/\/sw_alasiri/g;
			$outfile =~ s/_swh/_corps/g;
		}
		else
		{
			($outfile =~ /_arb/) ? $outfile =~ s/_arb/_corps/g : $outfile =~ s/-normalized-derivations/_corps/g;
		}
		open(my $ofh, ">", $outfile);
		for($i = 0; $i < 3 * $l_count; $i++)
		{
			$line = <$cfh>;
			print $ofh $line;
		}
		close($ofh);
	}
	close($cfh);
}

sub compare_lists
{
	my $name = shift;
	my(%en_list) = %{$_[0]};
	my(%src_list) = %{$_[1]};
	my(%comb_list) = %{$_[2]};
	my $tp = 0;
	my $fp = 0;
	my $fn = 0;
	my $tn = 0;
	
	my $tw_tp = 0;
	my $tw_fp = 0;
	my $tw_fn = 0;
		
	for my $key (keys %comb_list)
	{
		$en_list{$key} = 0 if($en_list{$key} eq "");
		$src_list{$key} = 0 if($src_list{$key} eq "");
		#print "KEY $key in correct: $en_list{$key} in translated: $src_list{$key}\n";
		$lkey = lc $key;
		$src_list{$key} = 0 if($commons{$lkey} ne "" and $propers{$key} eq ""); # and $lkey eq $key);
		$src_list{$key} = 0 if($swh_commons{$lkey} ne "" and $swh_propers{$key} eq ""); # and $lkey eq $key);
		
		if($en_list{$key} == $src_list{$key})
		{
			$tp += $en_list{$key};
		}
		if($en_list{$key} > $src_list{$key})
		{
			$tp += $src_list{$key};
			$fn += $en_list{$key} - $src_list{$key};
			print "Recl Error: KEY $key in correct: $en_list{$key} in translated: $src_list{$key}\n";	
		}
		if($en_list{$key} < $src_list{$key})
		{
			$tp += $en_list{$key};
			$fp += $src_list{$key} - $en_list{$key};
			print "Prec Error: KEY $key in correct: $en_list{$key} in translated: $src_list{$key}\n";	
		}
		#type-wise
		$tw_tp++ if($en_list{$key}  != 0 and $src_list{$key}  != 0);
		$tw_fp++ if($en_list{$key}  == 0 and $src_list{$key}  != 0);
		$tw_fn++ if($en_list{$key}  != 0 and $src_list{$key}  == 0);		
	}
	my $prec = $tp / ($tp + $fp);
	my $recl = $tp / ($tp + $fn);
	my $f1 = 2 * $prec * $recl / ($prec + $recl);
	print "Sentence-level Precision: $prec Recall: $recl F1 $f1\n";
	$prec = $tw_tp / ($tw_tp + $tw_fp);
	$recl = $tw_tp / ($tw_tp + $tw_fn);
	$f1 = 2 * $prec * $recl / ($prec + $recl);
	print "\nTypeWise Sentence-level Precision: $prec Recall: $recl F1 $f1\n";
	
	return ($tp, $fp, $fn, $tw_tp, $tw_fp, $tw_fn);
}

sub make_table
{
	my $name = shift;
	my $outfile = shift;
	my @files = @_;
	if($outfile ne "") # and not -e $outfile)
	{
		print "Creating table file $outfile\n";
		open($ofh, ">", $outfile);			
	}
	else
	{
		#print "Running make_table in silent mode\n";
		$ofh = "";
	}
	print $ofh "Filename\tPosition\tEntity\tSystem\tNE_Type\tCoref_ID\n";
	$stan_table = "";
	$lbj_table = "";
	$corr_table = "";
	$union_table = "";
	
	for my $infile (@files)
	{
		($num, $str_tokens, $str_stan, $str_lbj, $str_union, $str_corr) = sync("sync", $infile);

		$filename = $infile;		
		$filename =~ s/.+\\//g if($filename =~ /\\/);
		$filename =~ s/.+\///g if($filename =~ /\//);
		#$filename =~ s/_raw.*//g;
		
		@stan_tokens = split "\n", $str_tokens;
		@lbj_tokens = split "\n", $str_tokens;
		@union_tokens = split "\n", $str_tokens;
		@corr_tokens = split "\n", $str_tokens;
		
		@stan_tags = split "\n", $str_stan;
		@lbj_tags = split "\n", $str_lbj;
		@union_tags = split "\n", $str_union;
		@corr_tags = split "\n", $str_corr;
		
		if($str_stan ne "")
		{
			$hfile = $infile;
			$hfile =~ s/_raw/_stanford/g;
			$hfile =~ s/\.(txt)|(sgm)|(text)/\.html/g;
			open($hfh, ">", $hfile);
			print $hfh "<html><FONT SIZE=5>\n";
			for($i = 0; $i < $num; $i++)
			{
				if($stan_tags[$i] eq "NOT")
				{
					print $hfh "$stan_tokens[$i] ";
				}
				if($stan_tags[$i] ne "NOT")
				{
					$starttag = $stan_tags[$i];
					$j = $i+1;
					$j++ while($stan_tags[$j] eq $starttag and $j < $num and $stan_tokens[$j-1] !~ /,$/);
					$start = $i;
					$end = $j;
					$i = $j - 1;
					$entity = "";
					for($k = $start; $k < $end; $k++)
					{
						$entity = ($entity eq "") ? $stan_tokens[$k] : $entity . " " . $stan_tokens[$k];
					}
					print $ofh "$filename\t$start\t$entity\tNE_STAN\t$stan_tags[$start]\tNULL\n";
					$stan_table = $stan_table . "$filename\t$start\t$entity\tNE_STAN\t$stan_tags[$start]\tNULL\n";
					if($starttag eq "PER")
					{
						print $hfh "<FONT COLOR=RED>$entity</FONT> ";
					}
					if($starttag eq "LOC")
					{
						print $hfh "<FONT COLOR=BLUE>$entity</FONT> ";
					}
					if($starttag eq "ORG")
					{
						print $hfh "<FONT COLOR=GREEN>$entity</FONT> ";
					}
					if($starttag eq "MIS")
					{
						print $hfh "<FONT COLOR=PURPLE>$entity</FONT> ";
					}
					
				}
			}
			print $hfh "\n";
			print $hfh "\n</FONT></html>\n";
			close($hfh);		
		}
		
		if($str_lbj ne "")
		{
			$hfile = $infile;
			$hfile =~ s/_raw/_lbj/g;
			$hfile =~ s/\.(txt)|(sgm)|(text)/\.html/g;
			open($hfh, ">", $hfile);
			print $hfh "<html><FONT SIZE=5>\n";
			for($i = 0; $i < $num; $i++)
			{
				if($lbj_tags[$i] eq "NOT")
				{
					print $hfh "$lbj_tokens[$i] ";
				}
				if($lbj_tags[$i] ne "NOT")
				{
					$starttag = $lbj_tags[$i];
					$j = $i+1;
					$j++ while($lbj_tags[$j] eq $starttag and $j < $num);
					$start = $i;
					$end = $j;
					$i = $j - 1;
					$entity = "";
					for($k = $start; $k < $end; $k++)
					{
						$entity = ($entity eq "") ? $lbj_tokens[$k] : $entity . " " . $lbj_tokens[$k];
					}
					print $ofh "$filename\t$start\t$entity\tNE_LBJ\t$lbj_tags[$start]\tNULL\n";
					$lbj_table = $lbj_table . "$filename\t$start\t$entity\tNE_LBJ\t$lbj_tags[$start]\tNULL\n";
					if($starttag eq "PER")
					{
						print $hfh "<FONT COLOR=RED>$entity</FONT> ";
					}
					if($starttag eq "LOC")
					{
						print $hfh "<FONT COLOR=BLUE>$entity</FONT> ";
					}
					if($starttag eq "ORG")
					{
						print $hfh "<FONT COLOR=GREEN>$entity</FONT> ";
					}
					if($starttag eq "MIS")
					{
						print $hfh "<FONT COLOR=PURPLE>$entity</FONT> ";
					}
					
				}
			}
			print $hfh "\n";
			print $hfh "\n</FONT></html>\n";
			close($hfh);		
		}

		if($str_corr ne "")
		{
			$hfile = $infile;
			$hfile =~ s/_raw/_correct/g;
			$hfile =~ s/\.(txt)|(sgm)|(text)/\.html/g;
			open($hfh, ">", $hfile);
			print $hfh "<html><FONT SIZE=5>\n";		
			for($i = 0; $i < $num; $i++)
			{
				if($corr_tags[$i] eq "NOT")
				{
					print $hfh "$corr_tokens[$i] ";
				}			
				if($corr_tags[$i] ne "NOT")
				{
					$starttag = $corr_tags[$i];
					$j = $i+1;
					$j++ while($corr_tags[$j] eq $starttag and $j < $num);
					$start = $i;
					$end = $j;
					$i = $j - 1;
					$entity = "";
					for($k = $start; $k < $end; $k++)
					{
						$entity = ($entity eq "") ? $corr_tokens[$k] : $entity . " " . $corr_tokens[$k];
					}
					print $ofh "$filename\t$start\t$entity\tNE_CORR\t$corr_tags[$start]\tNULL\n";
					$corr_table = $corr_table . "$filename\t$start\t$entity\tNE_CORR\t$corr_tags[$start]\tNULL\n";
					if($starttag eq "PER")
					{
						print $hfh "<FONT COLOR=RED>$entity</FONT> ";
					}
					if($starttag eq "LOC")
					{
						print $hfh "<FONT COLOR=BLUE>$entity</FONT> ";
					}
					if($starttag eq "ORG")
					{
						print $hfh "<FONT COLOR=GREEN>$entity</FONT> ";
					}
					if($starttag eq "MIS")
					{
						print $hfh "<FONT COLOR=PURPLE>$entity</FONT> ";
					}
					
				}
			}
			print $hfh "\n";
			print $hfh "\n</FONT></html>\n";
			close($hfh);					
		}

		if($str_union ne "")
		{
			$hfile = $infile;
			$hfile =~ s/raw/union/g;
			$hfile =~ s/\.(txt)|(sgm)|(text)/\.html/g;
			open($hfh, ">", $hfile);
			print $hfh "<html><FONT SIZE=5>\n";		
			for($i = 0; $i < $num; $i++)
			{
				if($union_tags[$i] eq "NOT")
				{
					print $hfh "$union_tokens[$i] ";
				}
				if($union_tags[$i] ne "NOT")
				{
					$starttag = $union_tags[$i];
					$j = $i+1;
					$j++ while($union_tags[$j] eq $starttag and $j < $num);
					$start = $i;
					$end = $j;
					$i = $j - 1;
					$entity = "";
					for($k = $start; $k < $end; $k++)
					{
						$entity = ($entity eq "") ? $union_tokens[$k] : $entity . " " . $union_tokens[$k];
					}
					print $ofh "$filename\t$start\t$entity\tNE_UNION\t$union_tags[$start]\tNULL\n";
					$union_table = $union_table . "$filename\t$start\t$entity\tNE_UNION\t$union_tags[$start]\tNULL\n";
					if($starttag eq "PER")
					{
						print $hfh "<FONT COLOR=RED>$entity</FONT> ";
					}
					if($starttag eq "LOC")
					{
						print $hfh "<FONT COLOR=BLUE>$entity</FONT> ";
					}
					if($starttag eq "ORG")
					{
						print $hfh "<FONT COLOR=GREEN>$entity</FONT> ";
					}
					if($starttag eq "MIS")
					{
						print $hfh "<FONT COLOR=PURPLE>$entity</FONT> ";
					}				
				}
			}
			print $hfh "\n";
			print $hfh "\n</FONT></html>\n";
			close($hfh);
		}		
	}
	close($ofh);
	return ($stan_table, $lbj_table, $corr_table, $union_table);
}

sub email_pre
{
	my $name = shift;
	my ($infile) = @_;	
	open($ifh, "<", $infile);
	$str = "";
	while(<$ifh>)
	{
		$str = ($str eq "") ? $_ : $str . $_;
	}
	$str =~ s/=20(\n|\r)/ /g;
	$str =~ s/=01;(\n|\r)/ /g;
	$str =~ s/=01(;|,|&|\d+)/ /g;
	$str =~ s/=(\n|\r)+//g;
	$str =~ s/\//\\/g;
	$str =~ s/<.+>/ /g;
	close($ifh);
	open($ifh, ">", $infile);
	print $ifh $str;
	close($ifh);
}

sub lbj
{
	my $name = shift;
	my ($infile) = @_;
	die "Enter an input file\n" if ($infile eq "");
	$ofile = $infile;
	$ofile =~ s/_raw/_lbj/g; 
	$tf = "true";
	#unlink($ofile) if(-e $ofile);
	return if(-e $ofile);
	print "Running lbj on $infile\n";	
	if(lc $^O eq "linux")
	{
		$command = "java -classpath LBJ2.jar:LBJ2Library.jar:LbjNerTagger.jar -Xmx1500m LbjTagger.NerTagger -annotate ../$infile ../$ofile $tf Config/allFeaturesBigTrainingSet.config";
		
	}
	else
	{
		$command = "java -classpath \"LBJ2.jar;LBJ2Library.jar;LbjNerTagger.jar;LBJ2.jar:LBJ2Library.jar:LbjNerTagger.jar;\" -Xmx1500m LbjTagger.NerTagger -annotate ../$infile ../$ofile $tf Config/allFeaturesBigTrainingSet.config";
	}
	if(lc $^O eq "mswin32")
	{
		chdir("LbjNerTagger1.11.release");
		system($command);
		chdir("..");
	}
	else
	{
		`cd LbjNerTagger1.11.release\n$command\ncd ..\n`;
	}
}

sub stanford
{
	my $name = shift;
	my ($infile) = @_;
	die "Enter an input file\n" if ($infile eq "");	
	$ofile = $infile;
	$ofile =~ s/_raw/_tempstan/g; 
	#unlink($ofile) if(-e $ofile);
	return if(-e $ofile);
	print "Running stanford on $infile\n";	
	if(lc $^O eq "linux")
	{
		$command = "java -mx1000m -cp stanford-ner.jar edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier classifiers/ner-eng-ie.crf-4-conll-distsim.ser.gz -textFile ../$infile > ../$ofile";
	}
	else
	{
		$command = "java -mx1000m -cp \"stanford-ner.jar;\" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier classifiers/ner-eng-ie.crf-4-conll-distsim.ser.gz -textFile ../$infile > ../$ofile";
	}
	if(lc $^O eq "mswin32")
	{
		chdir("stanford-ner-2009-01-16");
		system($command);
		chdir("..");
	}
	else
	{
		`cd stanford-ner-2009-01-16\n$command\ncd ..\n`;
	}

	open($ifh, "<", $ofile);
	$count = 0;
	$str = "";
	while(<$ifh>)
	{
		$str = $str . " " . $_;
		$count++;
	}
	close($ifh);

	$o2file = $infile;
	$o2file =~ s/_raw/_stanford/g; 
	$o2htm = $o2file;
	$o2htm  =~ s/stanford/stanford2/g;
	$o2htm =~ s/\.(txt)|(sgm)|(text)/\.html/g;
	
	open($ofh, ">", $o2file);
	open($hfh, ">", $o2htm);
	@tokens = split /\s+/, $str;
	$count = 0;
	@formatted_tokens = ();
	@hformatted_tokens = ();
	$prevtag = "";

	for(@tokens)
	{
		($token, $tag) = split "/", $_;
		$formatted = $token;
		$hformatted = $token;
		$formatted = "[PER $formatted  ]" if($tag eq "PERSON");
		$hformatted = "<FONT COLOR=RED>$hformatted</FONT>" if($tag eq "PERSON");
		$formatted = "[ORG $formatted  ]" if($tag eq "ORGANIZATION");
		$hformatted = "<FONT COLOR=GREEN>$hformatted</FONT>" if($tag eq "ORGANIZATION");
		$formatted = "[LOC $formatted  ]" if($tag eq "LOCATION");
		$hformatted = "<FONT COLOR=BLUE>$hformatted</FONT>" if($tag eq "LOCATION");
		$formatted = "[MIS $formatted  ]" if($tag eq "MISC");
		$hformatted = "<FONT COLOR=PURPLE>$hformatted</FONT>" if($tag eq "MISC");
		if($tag eq $prevtag and ($tag eq "PERSON" or $tag eq "ORGANIZATION" or $tag eq "LOCATION" or $tag eq "MISC"))
		{
			$formatted_tokens[$count - 1] = substr($formatted_tokens[$count - 1], 0, -3);
			$hformatted_tokens[$count - 1] = substr($hformatted_tokens[$count - 1], 0, -7);
			
			#$formatted = ($tag eq "MISC") ? substr($formatted, 6) : substr($formatted, 5); #deprecated
			
			$formatted = substr($formatted, 5);
			if($tag eq "PERSON")
			{
				$hformatted = substr($hformatted, 16);
			}
			if($tag eq "ORGANIZATION")
			{
				$hformatted = substr($hformatted, 18);
			}
			if($tag eq "LOCATION")
			{
				$hformatted = substr($hformatted, 17);
			}
			if($tag eq "MISC")
			{
				$hformatted = substr($hformatted, 19);
			}
			
		}
		push @formatted_tokens, $formatted;
		push @hformatted_tokens, $hformatted;
		$prevtag = $tag;
		$count++;
	}

	print $hfh "<html><FONT SIZE=5>\n";
	for(@formatted_tokens)
	{
		print $ofh "$_ ";
	}
	for(@hformatted_tokens)
	{
		print $hfh "$_ ";
	}	
	print $ofh "\n";
	print $hfh "\n";
	print $hfh "\n</FONT></html>\n";
	close($ofh);
	close($hfh);
}

sub hcs_xml2txt
{
	my $name = shift;
	my ($infile) = @_;
	die "Enter an input file\n" if ($infile eq "");	
	open($ifh, "<", $infile);
	$ofile = $infile;
	$ofile =~ s/\.xml/_swh\.txt/g;
	print "hcs_xml2txt: $infile -> $ofile\n";
	open($ofh, ">", $ofile);
	$num = 0;
	$doc = "";
	while(<$ifh>)
	{
		$line = $_;
		chomp($line);
		$line =~ s/&quot;/"/g;
		$newline = "";
		@lineparts = split "<", $line;
		for my $linepart (@lineparts)
		{
			if($linepart =~ /w .+ type="(.+?)".+>(.+)/)
			{
				$type = $1;
				$word = $2;	
				$trans = "";
				if($type =~ /PROP/)
				{
					$netype = "NE";
					if($linepart =~ /trans="(.+?)"/)
					{
						$trans = $1;
					}
					else
					{
						$trans = $word;						
					}
					#print "Trans: $trans\n";					
				}
				else
				{
					$netype = "O";
				}
				
				#print "Word $word Type $type\n";
				if($trans ne "")
				{
					print $ofh "$word\t$netype\t$trans\n";
				}
				else
				{
					print $ofh "$word\t$netype\n";
				}
				#@parts = split /\s+/, $2;
				#$num += scalar @parts;
				#$tag = $type . " ". $2;
				#$word = "[" . $type . " " . $2 . "  ]";
				#$newline = $newline . $word . " ";
				#print "$tag\n";
			}
		}
		#$newline = substr($newline, 0, -1);
		#$newline = $newline . "\n";
		#$doc = $doc . $newline;
	}
	#print $ofh $doc;
	close($ifh);
	close($ofh);
}

sub lingpipe_xml2txt
{
	my $name = shift;
	my ($infile) = @_;
	die "Enter an input file\n" if ($infile eq "");
	print "Starting lingpipe_xml2txt on $infile\n";
	open($ifh, "<", $infile);
	$num = 0;
	$doc = "";
	while(<$ifh>)
	{
		$line = $_;
		chomp($line);
		$line =~ s/&quot;/"/g;
		$newline = "";
		@lineparts = split "<", $line;
		for my $linepart (@lineparts)
		{
			if($linepart =~ /s i=.+>(.+)/)
			{
				$newline = $newline . $1. " ";
			}
			if($linepart =~ /ENAMEX TYPE="(.+)">(.+)/)
			{
				$type = ($1 =~ /MISC/) ? substr($1, 0, 4) : substr($1, 0, 3);
				@parts = split /\s+/, $2;
				$num += scalar @parts;
				$tag = $type . " ". $2;
				$word = "[" . $type . " " . $2 . "  ]";
				$newline = $newline . $word . " ";
				#print "$tag\n";
			}
			if($linepart =~ /\/ENAMEX>(.+)/)
			{
				$newline = $newline . $1 . " ";
			}
		}
		$newline = substr($newline, 0, -1);
		#$newline = $newline . "\n";
		$doc = $doc . $newline;
	}
	$ofile = $infile;
	$ofile =~ s/xml/txt/g;
	open($ofh, ">", $ofile);
	print $ofh $doc;
	close($ofh);
}

sub mix
{
	my $name = shift;
	my ($infile) = @_;
	open(my $ifh, "<", $infile);
	my @part = ();
	my $tnum = 0;
	#takes a _mix file and produces a table_ file and a _stanford file
	$mlang = "swh";
	while(<$ifh>)
	{
		$line = $_;
		chop($line) if(substr($line, -1, 1) eq "\n" or substr($line, -1, 1) eq "\r");
		chop($line) if(substr($line, -1, 1) eq "\n" or substr($line, -1, 1) eq "\r");
		if($line ne "")
		{
			if($line !~ /[a-zA-Z]/)
			{
				$mlang = "arb";
				$tnum = 0;
			}
			else
			{
				$result = $service->detect($line);
				#printf "Detected language: %s\n", $result->language;
				if($result->language eq "en")
				{
					$tnum = 1;
				}
				else
				{
					$tnum = 0;
				}
			}
		}
		#print "|$line|\n";
		#if($line eq "***")
		#{
		#	print "Toggled\n";
		#	$tnum = 1 - $tnum;
		#}
		$part[$tnum] = $part[$tnum] . "\n" . $line;
	}
	close($ifh);
	my $part0 = $infile;
	$part0 =~ s/_mix.txt//g;
	$part0 = $part0 . "_part0_$mlang.txt";
	my $part1 = $infile;
	$part1 =~ s/_mix.txt//g;
	$part1 = $part1 . "_part1_raw.txt";
	
	open(my $ofh, ">", $part0);
	print $ofh $part[0];
	close($ofh);
	open($ofh, ">", $part1);
	print $ofh $part[1];
	close($ofh);

	my $split = $infile;
	$split =~ s/_mix.txt/_split.html/g;
	open(my $sfh, ">", $split);
	print $sfh "<html><FONT SIZE=5>\n";
	print $sfh "\n<FONT COLOR=RED>\n";
	print $sfh $part[0];
	print $sfh "\n</FONT><FONT COLOR=BLACK>\n";
	print $sfh $part[1];	
	print $sfh "\n</FONT></FONT></html>\n";
	close($sfh);
	
	if($^O =~ /Win/)
	{
		$part0 =~ s/.+\\//g;
		$part1 =~ s/.+\\//g;
	}
	else
	{
		$part0 =~ s/.+\///g;
		$part1 =~ s/.+\///g;
	}
	warn "New filenames: |$part0| |$part1|\n";
	
	translate("translate", $part0);
	stanford("stanford", $part1);	
	make_table("make_table", "", $part1);
	
	my $outfile = $infile;
	$outfile =~ s/_mix/_picked/g;
	my $part0_p = $part0;
	$part0_p =~ s/_(swh|arb)/_stanford/g;
	my $part1_p = $part1;
	$part1_p =~ s/_raw/_stanford/g;
	
	open($ofh, ">", $outfile);
	open($ifh, "<", $part0_p);
	while(<$ifh>)
	{
		print $ofh $_;
	}	
	close($ifh);
	open($ifh, "<", $part1_p);
	while(<$ifh>)
	{
		print $ofh $_;
	}
	close($ifh);
	close($ofh);

	#for html
	$outfile =~ s/\.(txt)|(sgm)|(text)/\.html/g;
	$part0_p =~ s/\.(txt)|(sgm)|(text)/\.html/g;
	$part1_p =~ s/\.(txt)|(sgm)|(text)/\.html/g;
	
	open($ofh, ">", $outfile);
	open($ifh, "<", $part0_p);
	while(<$ifh>)
	{
		$line = $_;
		print $ofh $line if($line !~ /\/html/);
	}	
	close($ifh);
	
	open($ifh, "<", $part1_p);
	<$ifh>;
	print $ofh "<br><br>";
	while(<$ifh>)
	{
		print $ofh $_;
	}
	close($ifh);
	close($ofh);
}

## -- Running LBJ Coref System --
sub lbj_coref {
	
	my $name = shift;	
	my ($infile) = @_;
	die "Enter an input file\n" if ($infile eq "");
	warn "Running lbj coref on $infile\n";
	return if (-e "$infile.LBJ" or $^O ne "linux");
	chdir("LBJ");
	system("java -Xmx1300m -jar LBJCoref.jar ../$infile > ../$infile.LBJ");
	chdir("..");
	warn "Done\n";
}
## -- 

## -- Running List Maker --
sub extract_lbj {
	my $name = shift;
	my ($infile, $ext) = @_;
	$ext = "raw" if(not defined $ext);
	warn "Perl script make_lists called\n";
	return if (!(-e "$infile.LBJ"));
	system("perl Make_Lists.pl -i $infile -l $ext");
}
## -- 

## -- Running Eval Matrix Maker --
sub make_imatrix {
	my $name = shift;
	(my $infile) = @_;
	return if (!(-e "$infile"));
	system("perl Make_Eval.pl -i $infile");
}
## -- 

sub b3_score {
	my $name = shift;
	(my $infile) = @_;
	return if(not -e $infile);
	#system("perl B3Scorer.pl -i $infile");
	my $file = "$infile.eval";
	my $precision;
	my $recall;
	my $fscore;
	my $pre_denom = 0;
	my $rec_denom = 0;
	my @a;
	my @b;

	open(FILE, $file) || die "cannot open file";
	my (@lines) = <FILE>;
	my @rec;
	my $n;
	my $m;
	my $i;
	my $j;

	## process first line
	@rec = split(" ", $lines[0]);
	$n = $rec[0];
	$m = $rec[1];

	## process last two lines
	@a = split(" ",$lines[$n+1]);
	foreach (@a) {
		$rec_denom += $_;
	}
	@b = split(" ",$lines[$n+2]);
	foreach (@b) {
		$pre_denom += $_;
	}

	## process the matrix
	for ($i=0;$i<$n;$i++){
		@rec = split(" ",$lines[$i+1]);
		for ($j=0;$j<$m;$j++) {
			if ($rec[$j] ne 0) {
				$recall += ($rec[$j]*$rec[$j])/$a[$i];
				$precision += ($rec[$j]*$rec[$j])/$b[$j];
			}
		}
	}

	## output the B3 score
	my $pre_num = $precision;
	my $rec_num = $recall;
	$precision /= $pre_denom;
	$recall /= $rec_denom;
	$fscore = ($precision + $recall == 0) ? 0 : (2*$precision*$recall)/($precision+$recall);
	print "Coref: $file: $precision\t$recall\t$fscore.\n";	
	close(FILE);
	return ($pre_num, $pre_denom, $rec_num, $rec_denom);
}

sub strip_sgm
{
	my $name = shift;
	(my $sgmfile) = @_;
	my $ext;
	if(uc $sgmfile =~ /ARABIC/)	{ $ext = "arb"; }
	elsif(uc $sgmfile =~ /CHINESE/) { $ext = "chi"; }
	else { $ext = "raw"; }	
	my $rawfile = $sgmfile;	
	$rawfile =~ s/\.(SGM|sgm)/_$ext\.txt/g;
	#print "SGMfile: $sgmfile Rawfile: $rawfile\n";
	return $rawfile if(-e $rawfile);
	my $doc = "";	
	my $sentence = "";	
	my $ifh;
	#if($ext ne "raw")	#problematic for other languages coz dunno if <p> and </p> are counted or not, for the moment replace with dummy false
	if($ext eq "xyz")
	{
		open($ifh, "<", $sgmfile);
		while(<$ifh> !~ /<TEXT>/)
		{
			next;
		}
		while(<$ifh>)
		{
			$line = $_;
			chomp($line);
			#print "$line\n";
			if($line =~ /<\/TEXT>/)
			{
				if($doc eq "")
				{
					$sentence = substr($sentence, 0, length($sentence) - 2) . " .";
					$doc = $sentence;	
				}
				$doc = $doc . "\n";
				last;
			}
			if($line =~ /<P>/)
			{
				$sentence = "";
				next;
			}
			if($line =~ /<\/P>/)
			{
				$sentence = substr($sentence, 0, length($sentence) - 2) . " .";
				$doc = ($doc eq "") ? $sentence : $doc . "\n" . $sentence;
				next;
			}
			$sentence = ($sentence eq "") ? $line : $sentence . " " . $line;		
		}
		close($ifh);		
	}
	else
	{
		{
			open($ifh, "<", $sgmfile) || die "cannot open file $sgmfile";
			local $/;
			$doc = <$ifh>;
			close($ifh);
		}
		$doc = substr($doc, index($doc, "<TEXT>")+7);
		$doc = substr($doc, 0, index($doc, "</TEXT>"));	
	}
	
	open(my $ofh, ">", $rawfile);
	print $ofh $doc;
	close($ofh);
	return $rawfile;
}

sub make_golds
{
	my $name = shift;
	my $sgmfile = $_[0];
	my $ext;
	if(uc $sgmfile =~ /ARABIC/)	{ $ext = "arb"; }
	elsif(uc $sgmfile =~ /CHINESE/) { $ext = "chi"; }
	else { $ext = "raw"; }	
	my $rawfile = $sgmfile;	
	$rawfile =~ s/\.(SGM|sgm)/_$ext\.txt/g;
	
	my $dtd;
	$dtd = ($rawfile =~ /LDC2005/) ? "ACE04" : (($rawfile =~ /LDC2006/) ? "ACE05" : "ACE01");
	my $glistfile = "$rawfile.Gold_list";
	my $gnlistfile = "$rawfile.Gold_nerlist";
	#return if (-e $glistfile and -e $gnlistfile);
	my $xmlfile = $rawfile;
	if($dtd eq "ACE04") { $xmlfile =~ s/_$ext\.txt/\_APF\.XML/g; }
	elsif($dtd eq "ACE05") 	{ $xmlfile =~ s/_$ext\.txt/\.apf\.XML/g; }
	else { $xmlfile =~ s/_$ext\.txt/\.sgm\.tmx\.rdc\.xml/g; }

	if (not -e $xmlfile)
	{
		print "No gold standard xml file $xmlfile available for $rawfile\n";
		return;
	}
	print "Running make_golds on $rawfile\n";
	open(GTEXT, "<", $xmlfile);
	open(OUTFILE, ">", $glistfile);
	open(OUTNFILE, ">", $gnlistfile);
	
	my $offset;
	my $NE;
	my $start;
	my $end;
	my $flag = 0;
	my $class =0;
	my @spaces;
	my $string;
	my $position;
	my $doc;
	
	{
		open(SGM, "<", $sgmfile) || die "cannot open file $sgmfile";
		local $/;
		$doc = <SGM>;
		close(SGM);
	}
	
	my $docText = substr($doc, index($doc, "<TEXT>")+7);
	$docText = substr($docText, 0, index($docText, "</TEXT>"));

	my $charIt = 0;
	my $tokenCharIt = 0;
	my $tokenIt = 0;
	my @charTokenMap = ();
	
	my @tokens = split /\s+/, $docText;		
	my %ghash = ();
	my %ghash2 = ();
	my %gnhash = ();
	my %gnhash2 = ();
	while (($charIt < length($docText)) && ($tokenIt < scalar(@tokens)))
	{
		push(@charTokenMap, $tokenIt);
		if (substr($docText,$charIt,1) eq substr($tokens[$tokenIt],$tokenCharIt,1)) {
			if ($tokenCharIt+1 < length($tokens[$tokenIt])) {
				$tokenCharIt ++; 
			}
			else {
				$tokenCharIt = 0;
				$tokenIt ++;
			}
		}
		$charIt ++;
	}

	my $eflag = 0;
	my $nflag = 0;
	if($dtd eq "ACE01")		#ACE 2 English
	{
		$offset = 143;
		while (<GTEXT>) {
			my $line = $_;
			chomp($line);
		
			if (($flag == 1) && ($eflag == 1)) {
				$start = substr($line, index($line, "<start>") + 7);
				$start = substr($start, 0, index($start, "</start>"));
				
				$end = substr($line, index($line, "<end>") + 5);
				$end = substr($end, 0, index($end, "</end>"));
						
			#DEB	if ($NE ne substr($doc, $start+143, $end-$start+1)) {
				#for(my $tc = 0; $tc < 20; $tc++)
				#{
				#	print "$tc: " . $NE." vs ".substr($doc, $start+$offset+$tc, $end-$start+1), "\n";
				#}
			
			#DEB		print $NE." vs ".substr($doc, $start+143, $end-$start+1);
			#DEB		print "--ERROR--\n";
			#DEB	}
				#print $posi."\n";
				my $tNE = $NE;
				@spaces = ($tNE =~ /\s/g);
				
				$start= ($start+$offset-index($doc, "<TEXT>")-7);
				#print "$charTokenMap[$start]\t";
				#print $charTokenMap[$start]."\t".(scalar(@spaces)+1)."\t".$NE."\t".$class."\n";
				if (!defined $ghash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"}) {
					#print OUTFILE "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n";
					$ghash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"} = 1;					
					$ghash2{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t". $class."\n"} = $charTokenMap[$start];
					
				}
				if($nflag == 1)
				{
					if (!defined $gnhash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"}) {
						#print OUTNFILE "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n";
						#print "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n";
						$gnhash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"} = 1;
						$gnhash2{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t". $class."\n"} = $charTokenMap[$start];
					}
				}				
			}
			$flag = 0;
			if (index($_, "<!-- string = \"") > 0) {
				$flag = 1;
				$NE = substr($line, index($line, "<!-- string = \"") + 15);
				$NE = substr($NE, 0, index($NE, "\" -->"));
				#print $NE."\t";
			}

			$nflag = 1 if($line =~ /<entity_mention TYPE="NAME"/);
			$nflag = 0 if($line =~ /<entity_mention TYPE/ and $line !~ /TYPE="NAME"/);
			
			$eflag = 1 if (index($line, "<extent>") > 0);
			$eflag = 0 if (index($line, "</extent>") > 0);
			
			if (index($line, "<relation ID") > 0) {
				last;
			}
			if (index($line, "</entity>") > 0) {
				$class++;
			}
		}
	}
	else	#ACE 4 and 5 English only
	{
		$offset = ($dtd eq "ACE04") ? 126 : 111;
		#print "DTD: $dtd offset: $offset\n";
		while (<GTEXT>) {
			my $line = $_;
			chomp($line);
			#print "GTEXT line: |$line|\n";
			if($flag == 1 and $eflag == 1)
			{
				#$start = $1;
				#$end = $2;
				#$NE = $3;
				#print "|$start|$end|$NE|\n";

				#DEB	if ($NE ne substr($doc, $start+143, $end-$start+1)) {
				#for(my $tc = 0; $tc < 20; $tc++)
				#{
				#	print "$tc: " . $NE." vs ".substr($doc, $start+$offset, $end-$start+1), "\n";
				#}
				#DEB		print "--ERROR--\n";
				#DEB	}
				#print $posi."\n";
				my $tNE = $NE;
				@spaces = ($tNE =~ /\s/g);
				$start= ($start+$offset-index($doc, "<TEXT>")-7);
				#print "start: $start ctmap $charTokenMap[$start] scalar ", scalar @charTokenMap, "\n";
				#print "$charTokenMap[$start]\t";
				#print $charTokenMap[$start]."\t".(scalar(@spaces)+1)."\t".$NE."\t".$class."\n";
				if (!defined $ghash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"}) {
					#print OUTFILE "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n";
					#print "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n";
					$ghash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"} = 1;
					$ghash2{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t". $class."\n"} = $charTokenMap[$start];
				}
				if($nflag == 1)
				{
					if (!defined $gnhash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"}) {
						#print OUTNFILE "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n";
						#print "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n";
						$gnhash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"} = 1;
						$gnhash2{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t". $class."\n"} = $charTokenMap[$start];
					}
				}
			}
			$flag = 0;
			if($line =~ /<charseq START="(\d+)" END="(\d+)">(.+)<\/charseq>/)
			{
				$start = $1;
				$end = $2;
				$NE = $3;
				$flag = 1;
			}
			elsif($line =~ /<charseq START="(\d+)" END="(\d+)">(.+)/)
			{
				$start = $1;
				$end = $2;
				$NE = $3;
				#print $line . "\n";
				
					$line = <GTEXT>;
					chomp($line);
					if($line =~ /(.+)<\/charseq>/)
					{
						$NE = $NE . " " . $1;
					}

				#print "$NE\n";
				$flag = 1;
			}

			$nflag = 1 if($line =~ /<entity_mention ID.+TYPE="NAM"/);
			$nflag = 0 if($line =~ /<entity_mention ID/ and $line !~ /TYPE="NAM"/);
			
			$eflag = 1 if ($line =~ /<extent>/);
			$eflag = 0 if ($line =~ /<\/extent>/);			
			
			if ($line =~ /<relation ID/) {
				last;			
			}
			if ($line =~ /<\/entity>/) {
				$class++;
			}
		}		
	}
	close(GTEXT);
	for my $key (sort {$ghash2{$a} <=> $ghash2{$b}} keys %ghash2)
	{
		print OUTFILE $key;
	}
	close(OUTFILE);
	for my $key (sort {$gnhash2{$a} <=> $gnhash2{$b}} keys %gnhash2)
	{
		print OUTNFILE $key;
	}
	close(OUTNFILE);
}

sub make_clusters
{
	my $name = shift;
	my $rawfile = $_[0];
	my $tfile = $rawfile;
	$tfile =~ s/_raw/_table/g;
	my $outfile = $rawfile;
	$outfile =~ s/_raw/_cls/g;
	#return if(-e $outfile);
	print "Running make_clusters on $rawfile\n";
	my @temps = make_table("make_table", $tfile, $rawfile);
	my $stan_table = $temps[0];
	#print $stan_table, "\n";
	my @in_table = split "\n", $stan_table;
	$color = 0;
	@colors = ();
	push @colors, $color;
	#Now, cluster stan_table
	for (my $i = 1; $i < scalar @in_table; $i++)
	{
		$record = $in_table[$i];
		@strs = split "\t", $record;
		$str = lc $strs[2];
		$str =~ s/'s//g;
		#$str =~ s/[^ \.\-\w]//g;
		
		$flag = 0;
		for($j = 0; $j < $i; $j++)
		{
			$prev_record = $in_table[$j];
			@prev_strs = split "\t", $prev_record;
			$prev_str = $prev_strs[2];	
			$prev_str = lc $prev_strs[2];
			$prev_str =~ s/'s//g;
			#$prev_str =~ s/[^ \.\-\w]//g;

			my $lcs_str = lcss($str, $prev_str);
			#$s1 =~ /\Q$s2\E/
			#print "|$str|$prev_str|$lcs_str|\n";
			
			if($str eq $prev_str or $str =~ /\Q$prev_str\E/ or $prev_str =~ /\Q$str\E/ or (length($lcs_str) > 5 and length($lcs_str) > (min(length($str), length($prev_str)) / 2.0)))	#add more conditions here
			{
				if($str !~ /-/ and $prev_str !~ /-/)
				{
					push @colors, $colors[$j];
					$flag = 1;
					#print "str has hpyhen\n" if($str =~ /-/);
					#print "prev_str has hpyhen\n" if($prev_str =~ /-/);
					#print "|$str|$prev_str|$lcs_str|\n" if($str =~ /\Q$prev_str\E/ or $prev_str =~ /\Q$str\E/);
					#print "Case 1\n" if($str =~ /\Q$prev_str\E/);
					#print "Case 2\n" if($prev_str =~ /\Q$str\E/);
					last;
				}
			}
		}
		if($flag == 0)
		{
			$color++;
			push @colors, $color;
		}
	}
	#Now, make out_table using in_table and colors
	open(my $ofh, ">", $outfile);
	for ($i = 0; $i < scalar @in_table; $i++)
	{
		$record = $in_table[$i];	
		@strs = split "\t", $record;
		$str = $strs[2];
		@tokens = split " ", $str;
		$length = scalar @tokens;
		$new_record = "$strs[1]\t$length\t$strs[2]\t$colors[$i]\n";
		print $ofh $new_record;
	}
	close($ofh);	
	
}

sub english_nercoref
{
	my $name = shift;
	return if($^O ne "linux"); #for sake of coref	
	my $outfile = $_[0];	#_raw
	
	#NER
	my $tfile = $outfile;
	$tfile =~ s/_raw/_table/g;	
	lbj("lbj", $outfile);
	stanford("stanford", $outfile);
	make_table("make_table", $tfile, $outfile); 

	#COREF
	lbj_coref("lbj_coref", $outfile);  #makes .LBJ file
	#extract_lbj("extract_lbj", $outfile);   #makes .LBJ_list file
	make_imatrix("make_imatrix", $outfile);
	b3_score("b3_score", $outfile);
}

sub setT
{
	my $name = shift;
	my $val = $_[0];
	$l_jt = $val;
}

sub translate
{
	my $name = shift;
	@infiles = @_;
	#%arb_propers = ();
	#%arb_commons = ();
	for my $infile (sort @infiles)
	{
		#start with infile, infile -> outfile, outfile -> tfile, trans_file, picked_file, correct_file
		print "Translating $infile\n";
		$lang = ($infile =~ /normalized-derivations/ or $infile =~ /_arb/) ? "ar" : "sw";
		
		open($ifh, "<", $infile);
		@sentences = ();
		@tr_sentences = ();
		$document = "";
		@src_tokens = ();
		%src_trans = ();
		@src_poss = ();
		@src_lemmas = ();
		$labeled = 0;
		if($infile =~ /normalized-derivations/ or ($infile =~ /alasiri/ and $infile !~ /sw_alasiri/))
		{
			$infile =~ s/-normalized-derivations/_arb/g;
			$infile =~ s/alasiri_/sw_alasiri_/g;
			$labeled = 1;
			while(1)
			{
				$inflag = 1;
				$sentence = "";
				
				#get buckwalter data, convert to utf8
				#so each tree corresponds to a sentence
				while($inflag == 1)
				{
					##@got_arr = getdata();
					@got_arr = getdata_aner();
					goto LOOP if(! @got_arr);
					##($token, $lemma, $llemma, $morph, $inflag) = @got_arr;	
					$token = $got_arr[0];
					$lemma = $got_arr[1];
					$inflag = $got_arr[2];
					$trans = (scalar @got_arr == 4) ? $got_arr[3] : "";
					#print "t: |$token|$lemma|\n";
					#next if($token eq "" or $lemma eq ""  or $token =~ /\s+/ or $lemma =~ /\s+/ or $llemma eq "" or $llemma =~ /\s+/);
					next if($token eq "" or $lemma eq ""  or $token =~ /\s+/ or $lemma =~ /\s+/);
					$sentence = ($sentence eq "") ? $token : $sentence . " ".  $token;
					##$src_token = decode("Buckwalter", $token);
					$src_token = $token;
					#print "s: $src_token\n";
					push @src_tokens, $src_token;
					push @src_poss, $lemma;
					$src_trans{lc $trans}++ if($trans ne "");
					#push @src_lemmas, $llemma;
					#if($lemma =~ /NOUN_PROP/)
					#{
					#	$arb_propers{$src_token}++;
					#}
					#else
					#{
					#	$arb_commons{$src_token}++;
					#}
					#print "Arb: |$src_token|$lemma|\n";										
				}
				##$sentence = decode("Buckwalter", $sentence);
				#we got the sentence. now let's break it into parts
				if(length($sentence) > 700)
				{
					print "Big Length: ", length($sentence), "\n";
					$f_split = 1;
					$remainder = length($sentence) % 700;
					$times = (length($sentence) - $remainder) / 700;
					
					for($temp1 = 0; $temp1 < $times; $temp1++)
					{
						for($tempc = 650; $tempc < 700; $tempc++)
						{
							if(substr($sentence, $tempc, 1) eq " ")
							{
								last;
							}
						}
						$part_sent[$temp1] = substr($sentence, 0, $tempc) . ".";
						print "split: ", length($part_sent[$temp1]), "\n";
						$sentence = substr($sentence, $tempc);
					}
					if(length($sentence) > 700)
					{
						for($tempc = 650; $tempc < 700; $tempc++)
						{
							if(substr($sentence, $tempc, 1) eq " ")
							{
								last;
							}
						}
						$part_sent[$times] = substr($sentence, 0, $tempc) . ".";
						$part_sent[$times + 1] = substr($sentence, $tempc);
						print "split: ", length($part_sent[$times]), "\n";	
						print "split: ", length($part_sent[$times+1]), "\n";	
					}
					else
					{
						$part_sent[$times] = $sentence;
						print "split: ", length($part_sent[$times]), "\n";	
					}
				}
				else
				{
					$part_sent[0] = $sentence;
				}
				for($tempc = 0; $tempc < scalar @part_sent; $tempc++)
				{
					next if($part_sent[$tempc] eq ".");
					push @sentences, $part_sent[$tempc];
					$document = ($document eq "") ? $part_sent[$tempc] : $document . "\n" . $part_sent[$tempc];
				}

				#part-breaking ends here				
			}
LOOP:
			print scalar @src_tokens, " tokens, ", scalar @sentences, " sentences in $infile\n";
			close($ifh);
			if(not -e $infile)
			{
				open($ofh, ">", $infile);
				print $ofh "$_\n" for(@sentences);
				close($ofh);
			}
			open($ifh, "<", $infile);
		}
		#next;
		
		$outfile = $infile;
		($lang eq "ar") ? ($outfile =~ s/_arb/_raw/g) : ($outfile =~ s/_swh/_raw/g);
		
		#for unlabeled docs, make sure there is one sentence per line, and it ends with a . also, no blank sentences.
		
		$f_split = 0;
		$linenum = 0;
		@sentences = ();
		$document = "";
		while(<$ifh>)
		{
			$sentence = $_;
			$linenum++;
			chop($sentence) if(substr($sentence, -1, 1) eq "\n" or substr($sentence, -1, 1) eq "\r");
			chop($sentence) if(substr($sentence, -1, 1) eq "\n" or substr($sentence, -1, 1) eq "\r");
			next if($sentence eq "" or $sentence =~ /^\s+$/);
			#print "|$sentence|\n"; #if($sentence =~ /^\s+$/);
			#src_tokens, src_poss, src_lemmas
			@temp_tokens = split /\s+/, $sentence;
			for(@temp_tokens)
			{
				#s/\W+//g if($lang ne "ar");
				next if($_ eq "");
				push @src_tokens, $_ if($labeled == 0);
			}
			
			#translate to english
			next if(-e $outfile);
			@part_sent = ();
			if(length($sentence) > 700)
			{
				print "Big Length: ", length($sentence), "\n";
				$f_split = 1;
				warn "***********************f_split activated on labeled data, shouldn't have.*********************\n" if($labeled == 1);	
				$remainder = length($sentence) % 700;
				$times = (length($sentence) - $remainder) / 700;
				
				for($temp1 = 0; $temp1 < $times; $temp1++)
				{
					for($tempc = 650; $tempc < 700; $tempc++)
					{
						if(substr($sentence, $tempc, 1) eq " ")
						{
							last;
						}
					}
					$part_sent[$temp1] = substr($sentence, 0, $tempc) . ".";
					print "split: ", length($part_sent[$temp1]), "\n";
					$sentence = substr($sentence, $tempc);
				}
				if(length($sentence) > 700)
				{
					for($tempc = 650; $tempc < 700; $tempc++)
					{
						if(substr($sentence, $tempc, 1) eq " ")
						{
							last;
						}
					}
					$part_sent[$times] = substr($sentence, 0, $tempc) . ".";
					$part_sent[$times + 1] = substr($sentence, $tempc);
					print "split: ", length($part_sent[$times]), "\n";	
					print "split: ", length($part_sent[$times+1]), "\n";	
				}
				else
				{
					$part_sent[$times] = $sentence;
					print "split: ", length($part_sent[$times]), "\n";	
				}
			}
			else
			{
				$part_sent[0] = $sentence;
			}
			for($tempc = 0; $tempc < scalar @part_sent; $tempc++)
			{
				#print "line no: $linenum length: ", length($part_sent[$tempc]), "\n" if($linenum % 100 == 0);
				#print "Sentence: $part_sent[$tempc]\n";
				if($translator eq "g" or $lang ne "ar")
				{
					if($lang eq "ar")
					{
						do
						{
							$res = REST::Google::Translate->new(
									q => $part_sent[$tempc],
									langpair => 'ar|en'
							);
							print "Waiting for translation ", $res->responseStatus, "\n";
						} until($res->responseStatus == 200);
						$translated = $res->responseData->translatedText;
					}
					else
					{
						do
						{
							$result = $service->translate($part_sent[$tempc]);
							print "Waiting for translation ", $result->error, "\n";
						}  while ($result->error and $result->code != 400);
						$translated = $result->translation;
					}
				}
				else
				{
					$command = "./bing en infile.txt outfile.txt";
					open($bing_ifh, ">", "infile.txt");
					print $bing_ifh $part_sent[$tempc];
					close($bing_ifh);
					`$command`;
					open($bing_ofh, "<", "outfile.txt");
					$translated = "";
					while(<$bing_ofh>)
					{
						$translated = ($translated eq "") ? $_ : $translated . " " . $_;
					}
					close($bing_ofh);
				}
				$translated =~ s/&quot;/\"/g;
				$translated =~ s/&#39;//g;
				$translated =~ s/<.+>/ /g;
				$translated =~ s/\///g; 
				$translated =~ s/-(\d+)-/$1/g;
				$translated =~ s/(\d+)-(\d+)/$1$2/g;
				$translated =~ s/(\w+):(\w+)/$1$2/g;
				$translated =~ s/(\w+)\.(\w+)/$1$2/g;
				substr($translated, -1, 1) = " ";
				$translated = $translated . ".";
				printf "%s\n", $translated;

				push @tr_sentences, $translated;
				push @sentences, $part_sent[$tempc];
				$document = ($document eq "") ? $part_sent[$tempc] : $document . "\n" . $part_sent[$tempc];
			}
		}
		$n_tokens += scalar @src_tokens;
		close($ifh);

		if(not -e $outfile)
		{
			print "Outfile: $outfile\n";
			open($ofh, ">", $outfile);
			print $ofh "$_\n" for(@tr_sentences);
			close($ofh);
		}
		if($f_split == 1)
		{
			open($ofh, ">", $infile);
			print $ofh "$_\n" for(@sentences);
			close($ofh);
		}
		
		next if($l_jt == 0);
		#Read in aligned text from _corps file.  make_aligncorps, run_giza and make_corpfiles need to be run to make _corps.		
		
		my $corpfile = $outfile;
		$corpfile =~ s/_raw/_corps/g;
		if((not -e $corpfile) or $l_compare ne "giza")
		{
			$l_compare = "en" if($l_compare eq "giza");
			goto SKIP_GIZA;
		}
		open($cfh, "<", $corpfile);
		@t_words = ();
		@s_words = ();
		@s_nums = ();
		@sg_words = ();
		@sg_nums = ();		
		print "Reading $corpfile into memory\n";
		while(<$cfh>)
		{
			$line = $_;
			chomp($line);
			chop($line) if(substr($line, -1, 1) eq "\n" or substr($line, -1, 1) eq "\r");		
			if($line =~ /# Sentence pair \((\d+)\) source length (\d+) target length (\d+)/)
			{
				#$sid = $1;
				#$s_len = $2;
				#$t_len = $3;
				$flag = 1;
				next;
			}
			if($flag == 1)
			{
				@temp = split /\s+/, $line;
				for my $word (@temp)
				{
					push @t_words, $word;
				}			
				$t_senlen = scalar @temp;
				$flag = 2;
				next;
			}
			if($flag == 2)
			{
				#NER
				@temp = split(/\) /, $line);    #why does it behave this way
				#@temp = split(/\}\) /, $line);    #why does it behave this way
				#$temp_str = (join "|", @temp);
				#print "temp: $temp_str\n";				
				@wordnums = ();
				for($i = 0; $i < scalar @temp; $i++)
				{
					@parts = split(/ \(/, $temp[$i]); #why does it behave this way
					#@parts = split(/ \(\{/, $temp[$i]); #why does it behave this way
					#$parts_str = (join "|", @parts);
					#print "parts: $parts_str\n";					
					push @wordnums, $temp[$i] if($parts[0] ne "NULL");
				}
				for($i = 0; $i < scalar @wordnums; $i++)
				{
					@s_parts = split(/ \(/, $wordnums[$i]); #why does it behave this way
					#@s_parts = split(/ \(\{/, $wordnums[$i]); #why does it behave this way
					#$s_parts_str = (join "|", @s_parts);
					#print "before s_parts: $s_parts_str\n";

					#$s_parts[1] =~ s/^ //g;
					#$s_parts[1] =~ s/\s*$//g;
					$s_parts[1] =~ s/\{ //g;
					$s_parts[1] =~ s/\s*\}//g;

					next if($s_parts[0] eq "" or $s_parts[0] eq "-LRB-" or $s_parts[0] eq "-RRB-" or $s_parts[0] eq "s" or $s_parts[0] !~ /\w+/);
					$s_parts[0] =~ s/\W+//g;					
					push @s_words, $s_parts[0];
					#push @sg_words, $s_parts[0];
					#print "s_parts: |$s_parts[0]|$s_parts[1]|\nnum: ";
					@nums = split /\s+/, $s_parts[1];
					$ind_str = "";
					for my $num (@nums)
					{
						#print "|$num| ";
						$ind = scalar(@t_words) - $t_senlen;
						$ind += $num - 1;
						$ind_str = ($ind_str eq "") ? $ind : $ind_str . " $ind";
					}
					#print "\n";
					#print "ind_str: |$ind_str|\n";
					push @s_nums, $ind_str;
					#push @sg_nums, $ind_str;
				}
				
				#COREF
				#@temp = split(/\) /, $line);    #why does it behave this way
				@temp = split(/\}\) /, $line);    #why does it behave this way
				#$temp_str = (join "|", @temp);
				#print "temp: $temp_str\n";				
				@wordnums = ();
				for($i = 0; $i < scalar @temp; $i++)
				{
					#@parts = split(/ \(/, $temp[$i]); #why does it behave this way
					@parts = split(/ \(\{/, $temp[$i]); #why does it behave this way
					#$parts_str = (join "|", @parts);
					#print "parts: $parts_str\n";					
					push @wordnums, $temp[$i] if($parts[0] ne "NULL");
				}
				for($i = 0; $i < scalar @wordnums; $i++)
				{
					#@s_parts = split(/ \(/, $wordnums[$i]); #why does it behave this way
					@s_parts = split(/ \(\{/, $wordnums[$i]); #why does it behave this way
					#$s_parts_str = (join "|", @s_parts);
					#print "before s_parts: $s_parts_str\n";

					$s_parts[1] =~ s/^ //g;
					$s_parts[1] =~ s/\s*$//g;
					#$s_parts[1] =~ s/\{ //g;
					#$s_parts[1] =~ s/\s*\}//g;

					next if($s_parts[0] eq "" or $s_parts[0] eq "-LRB-" or $s_parts[0] eq "-RRB-" or $s_parts[0] eq "s" or $s_parts[0] !~ /\w+/);
					$s_parts[0] =~ s/\W+//g;					
					#push @s_words, $s_parts[0];
					push @sg_words, $s_parts[0];
					#print "s_parts: |$s_parts[0]|$s_parts[1]|\nnum: ";
					@nums = split /\s+/, $s_parts[1];
					$ind_str = "";
					for my $num (@nums)
					{
						#print "|$num| ";
						$ind = scalar(@t_words) - $t_senlen;
						$ind += $num - 1;
						$ind_str = ($ind_str eq "") ? $ind : $ind_str . " $ind";
					}
					#print "\n";
					#print "ind_str: |$ind_str|\n";
					#push @s_nums, $ind_str;
					push @sg_nums, $ind_str;
				}				
				next;
			}		
		}
		close($cfh);
		#for($k = 0; $k < scalar @src_tokens; $k++)
		#{
		#	print "Swh: |$src_tokens[$k]|\t|$t_words[$k]|\n";
		#}
		#for($k = 0; $k < scalar @s_words; $k++)
		#{
		#	print "Eng: |$k|$s_words[$k]|\n";
		#}

SKIP_GIZA:		
		#Run NER, make table
		
		#lbj("lbj", $outfile);
		stanford("stanford", $outfile);
		my $tfile = $outfile;
		$tfile =~ s/_raw/_table/g;
		my @temps = make_table("make_table", $tfile, $outfile); 
		my $stan_table = $temps[0];
		my @table = split "\n", $stan_table;
		#@table = split "\n", $lbj_table;

		#**************
		#Run coref, make LBJ_list
		if($l_coref == 1)
		{
			lbj_coref("lbj_coref", $outfile);  #makes .LBJ file
			#extract_lbj("extract_lbj", $outfile);   #makes .LBJ_list file

			my $ctfile = "$outfile.LBJ_list";
			open(my $ctfh, "<", $ctfile);
			my %temp_table = ();
			@coref_table = ();
			while(<$ctfh>)
			{
				$line = $_;
				chomp($line);
				my @temp = split "\t", $line;
				$temp_table{$temp[0]} = $line;
			}
			close($ctfh);
			for my $key (sort {$a <=> $b} keys %temp_table)
			{
				push @coref_table, $temp_table{$key};
			}
		}
		#**************
		
		#Make lists of NE, align s_words and s_nums. Read t_* for src or tl_* for en
		
		@indices = ();
		@entities = ();
		@en_entities = ();
		@t_indices = ();
		@t_entities = ();
		@t_types = ();
		@en_t_indices = ();
		@en_t_entities = ();
		@en_t_types = ();
		my %en_t_list = ();
		
		print "Making lists of named entities in English and source language\n";
		warn "Making lists of ", scalar @table, " entities\n";
		my $counter = 0;
		for my $record (@table)
		{
			@fields = split "\t", $record;
			$index = $fields[1];
			$tag = $fields[2];
			$type = $fields[4];
			next if ($tag eq "");
			$counter++;
			warn "entity no. $counter\n" if($counter % 10 == 0);
			#print "Entity:$tag\n";
			push @en_entities, $tag;
			@en_words = split /\s+/, $tag;	
			for($i = 0; $i < scalar @en_words; $i++)
			{
				$n = $index + $i;
				push @en_t_indices, $n;
				push @en_t_entities, $en_words[$i];	
				$en_t_list{lc $en_words[$i]}++;
				push @en_t_types, $type;
				$en_words[$i] =~ s/\W+//g;
				if($en_words[$i] ne $s_words[$n])
				{
					if($en_words[$i] eq $s_words[$n-1])
					{
						splice @s_words, $n - 1, 0, "NULL";
						splice @s_nums, $n - 1, 0, "NULL";
					}
				}
				#print "English: $n\t$en_words[$i]\t$s_words[$n]\n";					
			}
		
			#Make t_* for src
			next if($l_compare ne "src");			
			if($translator eq "g" or $lang ne "ar")
			{
				if($lang eq "ar")
				{
					do
					{
						$res = REST::Google::Translate->new(
								q => $tag,
								langpair => 'en|ar'
						);
					} until($res->responseStatus == 200);
					$entity = $res->responseData->translatedText;
				}
				else
				{
					do
					{
						$result = $service->translate($tag, 'dest' => 'sw');
					} while ($result->error and $result->code != 400);
					$entity = $result->translation;
				}
			}
			else
			{
				$command = "./bing ar infile.txt outfile.txt";
				open($bing_ifh, ">", "infile.txt");
				print $bing_ifh $tag;
				close($bing_ifh);
				`$command`;
				open($bing_ofh, "<", "outfile.txt");
				$entity = "";
				while(<$bing_ofh>)
				{
					$entity = ($entity eq "") ? $_ : $entity . " " . $_;
				}
				close($bing_ofh);
				#printf "Bing: %s\n", $translated;
			}
			
			push @entities, $entity;
			@words = split /\s+/, $entity;	
			for($i = 0; $i < scalar @words; $i++)
			{
				$n = $index + $i;
				push @t_indices, $n;
				push @t_entities, $words[$i];
				push @t_types, $type;
			}			
		}
		
		#Make tl_* for en
		#goto ALIGN_METHODS if($l_compare ne "en");
ALIGN_METHODS:
		#Alignment methods
		
		%ne_list = ();
		%ne_list2 = ();
		%ne_type2 = ();
		$window = 30;
		
		#This used to be the default method
		if($l_compare eq "en" or $l_coref == 1)
		{
			@tl_tokens = ();
			@tl_words = ();
			@tl_indices = ();
			print "Transliterating document from source to English\n";
			warn "Transliterating ", scalar @src_tokens, " tokens\n";
			$counter = 0;
			$trans_file = $outfile;
			$trans_file =~ s/_raw/_translit/g;
			if(-e $trans_file)
			{
				print "Translit file exists\n";
				open(my $trfh, "<", $trans_file);
				my $i = 0;
				while(<$trfh>)
				{
					my $translated = $_;
					chomp($translated);
					chop($translated) if(substr($translated, -1, 1) eq "\n" or substr($translated, -1, 1) eq "\r");
					push @tl_words, $translated;
					@words = split /\s+/, $translated;							
					for my $word (@words)
					{
						push @tl_indices, $i;
						push @tl_tokens, $word;
					}
					$i++;
				}
				close($trfh);
			}
			else
			{
				open (my $trfh, ">", $trans_file);
				for($i = 0; $i < scalar @src_tokens; $i++)
				{

					$src_token = $src_tokens[$i];
					next if($src_token eq "");
					$counter++;
					warn "token no. $counter\n" if($counter % 200 == 0);
					if($translator eq "g" or $lang ne "ar")
					{
						if($lang eq "ar")
						{
							do
							{
								$res = REST::Google::Translate->new(
										q => $src_token,
										langpair => 'ar|en'
								);
								print "Waiting for translation ", $res->responseStatus, "\n";
							} until($res->responseStatus == 200);
							$translated = $res->responseData->translatedText;
							#printf "Source: $src_token Google: %s\n", $translated;
						}
						else
						{
							do
							{
								$result = $service->translate($src_token);
								print "Waiting for translation ", $result->error, "\n";
							} while ($result->error and $result->code != 400);
							$translated = $result->translation;
							#printf "Source: $src_token Google: %s\n", $translated;
						}
					}
					else
					{
						$command = "./bing en infile.txt outfile.txt";
						open($bing_ifh, ">", "infile.txt");
						print $bing_ifh $src_token;
						close($bing_ifh);
						`$command`;
						open($bing_ofh, "<", "outfile.txt");
						$translated = "";
						while(<$bing_ofh>)
						{
							$translated = ($translated eq "") ? $_ : $translated . " " . $_;
						}
						close($bing_ofh);
						#printf "Bing: %s\n", $translated;
					}
					$translated =~ s/&quot;/\"/g;
					$translated =~ s/&#39;//g;
					$translated =~ s/<.+>/ /g;
					$translated =~ s/\///g; 
					$translated =~ s/-(\d+)-/$1/g;
					$translated =~ s/(\d+)-(\d+)/$1$2/g;
					$translated =~ s/(\w+):(\w+)/$1$2/g;
					$translated =~ s/(\w+)\.(\w+)/$1$2/g;							
					push @tl_words, $translated;
					@words = split /\s+/, $translated;			
					print $trfh "$translated\n";
					for my $word (@words)
					{
						push @tl_indices, $i;
						push @tl_tokens, $word;
					}
				}
				close($trfh);		
			}
			for($i = 0; $i < scalar @en_t_entities; $i++)
			{
				$index = $en_t_indices[$i];
				$start = ($index < $window) ? 0 : $index - $window;
				$end = ($index > scalar @tl_tokens - $window) ? scalar @tl_tokens : $index + $window;
				@st2 = Text::English::stem($en_t_entities[$i]);	
				$s2 = $en_t_entities[$i];
				#next if($commons{$st2[0]} ne "" and $propers{$st2[0]} eq ""); # and $lkey eq $key);
				#next if($swh_commons{$st2[0]} ne "" and $swh_propers{$st2[0]} eq ""); # and $lkey eq $key);				
				for($j = $start; $j < $end; $j++)
				{
					@st1 = Text::English::stem($tl_tokens[$j]);

					$s1 = $tl_tokens[$j];

					if($lang ne "ar")
					{
						$s1 =~ s/\W+//g;
						$s2 =~ s/\W+//g;
					}
					#print "index $en_t_indices[$i] $en_t_entities[$i] j $j $tl_tokens[$j]\n";
					if(lc $s1 eq lc $s2)
					{
						$ne_list{$i}++;	
						$ne_list2{$tl_indices[$j]}++;
						$ne_type2{$tl_indices[$j]} = $en_t_types[$i];
						#print "Matched here $en_t_indices[$i] $j\n";
					}
					#print "Matched s1-s2 $en_t_indices[$i] $j\n" if($s1 eq $s2);
				}
			}
		}
		#elsif($l_compare eq "src")
		if($l_compare eq "src")
		{
			for($i = 0; $i < scalar @t_entities; $i++)
			{
				$index = $t_indices[$i];
				$start = ($index < $window) ? 0 : $index - $window;
				$end = ($index > scalar @src_tokens - $window) ? scalar @src_tokens : $index + $window;
				for($j = $start; $j < $end; $j++)
				{
					@st1 = Text::English::stem($src_tokens[$j]);
					@st2 = Text::English::stem($t_entities[$i]);
					$s1 = $src_tokens[$j];
					$s2 = $t_entities[$i];
					if($lang ne "ar")
					{
						$s1 =~ s/\W+//g;
						$s2 =~ s/\W+//g;
					}
					#print "index $en_t_indices[$i] $en_t_entities[$i] j $j $tl_tokens[$j]\n";
					if(lc $st1[0] eq lc $st2[0])
					{
						$ne_list{$i}++;	
						$ne_list2{$j}++;
						$ne_type2{$j} = $t_types[$i];						
					}					
				}
			}
		}
		if($l_compare eq "giza")	# giza
		{
			print "Giza: ", scalar @en_t_entities, "\n";
			for($i = 0; $i < scalar @en_t_entities; $i++)
			{
				$entity = $en_t_entities[$i];
				$index = $en_t_indices[$i];
				$t_idlist = $s_nums[$index]; 
				next if($t_idlist eq "");	#no alignment
				$ne_list{$i}++;				
				@t_ids = split /\s+/, $t_idlist;
				for my $t_id (@t_ids)
				{
					$ne_list2{$t_id}++;
					$ne_type2{$t_id} = $en_t_types[$i];
				}
			}
		}
		@nelist = sort {$a <=> $b} keys %ne_list;
		#print "Rematched:\n";
		#for my $key (@nelist)
		#{
			#print "$en_t_indices[$key]\t$en_t_entities[$key]\n";
		#}
		#print "Overall:\n";
		#for($i = 0; $i < scalar @en_t_entities; $i++)
		#{
			#print "$en_t_indices[$i]\t$en_t_entities[$i]\n";
		#}

		print "Rematched ", scalar @nelist, " out of ", scalar @en_t_entities, " = ", (scalar @nelist / scalar @en_t_entities), " Extractor-tagged entities\n";
		
		if($pp == 1)
		{
			for($i = 0; $i < scalar @src_tokens; $i++)
			{
				$ne_list2{$i}++ if($lang eq "sw" and $swh_propers{lc $src_tokens[$i]} ne "" and $swh_commons{lc $src_tokens[$i]} eq "");
				delete $ne_list2{$i} if($lang eq "sw" and $swh_propers{lc $src_tokens[$i]} eq "" and $swh_commons{lc $src_tokens[$i]} ne "" and $ne_list2{$i} ne "");
				$ne_list2{$i}++ if($lang eq "ar" and $arb_propers{$src_tokens[$i]} ne "" and $arb_commons{$src_tokens[$i]} eq "");
				delete $ne_list2{$i} if($lang eq "ar" and $arb_propers{$src_tokens[$i]} eq "" and $arb_commons{$src_tokens[$i]} ne "" and $ne_list2{$i} ne "");
				
			}
		}
		#for($i = 0; $i < scalar @tl_indices; $i++)
		#{
		#	$ne_list2{$tl_indices[$i]}++ if($propers{lc $tl_tokens[$i]} ne "" and $commons{lc $tl_tokens[$i]} eq "");
		#	delete $ne_list2{$tl_indices[$i]} if($propers{lc $tl_tokens[$i]} eq "" and $commons{lc $tl_tokens[$i]} ne "" and $ne_list2{$tl_indices[$i]} ne "");
		#}

		#**************		
		#Make LBJ_list files for native language
		if($l_coref == 1)
		{
			my $counter = 0;
			my @new_table = ();
			#my %names = ();
			#my %p_ids = ();
#=temp			
			for my $record (@coref_table)
			{
				#print "Record: |$record|\n";
				@fields = split "\t", $record;
				$index = $fields[0];
				$tag = $fields[2];
				next if ($tag eq "");
				$counter++;
				warn "entity no. $counter\n" if($counter % 10 == 0);
				#print "Entity:$tag\n";
				
				@en_words = split /\s+/, $tag;	
				for($i = 0; $i < scalar @en_words; $i++)
				{
					$n = $index + $i;
					$en_words[$i] =~ s/\W+//g;
					if($en_words[$i] ne $sg_words[$n])
					{
						if($en_words[$i] eq $sg_words[$n-1])
						{
							splice @sg_words, $n - 1, 0, "NULL";
							splice @sg_nums, $n - 1, 0, "NULL";
						}
					}
					#print "Coref: $n\t$en_words[$i]\t$sg_words[$n]\n";					
				}
			}
#=cut			
			$counter = 0;
			for my $record (@coref_table)
			{
				print "Coref Record: $record\n";
				@fields = split "\t", $record;
				$index = $fields[0];
				$length = $fields[1];
				$entity = $fields[2];
				$e_id = $fields[3];
				next if ($entity eq "");
				$counter++;
				#print "Entity:|$entity|\n";
				my %ids = ();
				
				@en_words = split /\s+/, $entity;	
				for($i = 0; $i < scalar @en_words; $i++)
				{
					$n = $index + $i;
					#print "En word: $en_words[$i]\n";
					#$names{$en_words[$i]} = $e_id if($lang eq "ar" and $arb_propers{$en_words[$i]} ne "" and $arb_commons{$en_words[$i]} eq "");					
#=temp					
					#Transliteration begins
					$start = ($n < $window) ? 0 : $n - $window;
					$end = ($n > scalar @tl_tokens - $window) ? scalar @tl_tokens : $n + $window;
					@st2 = Text::English::stem($en_words[$i]);	
					$s2 = $en_words[$i];
					#next if($commons{$st2[0]} ne "" and $propers{$st2[0]} eq ""); # and $lkey eq $key);
					#next if($swh_commons{$st2[0]} ne "" and $swh_propers{$st2[0]} eq ""); # and $lkey eq $key);				
					for($j = $start; $j < $end; $j++)
					{
						@st1 = Text::English::stem($tl_tokens[$j]);

						$s1 = $tl_tokens[$j];

						if($lang eq "ar")
						{
							$s1 =~ s/\W+//g;
							$s2 =~ s/\W+//g;
						}
						#print "index $en_t_indices[$i] $en_t_entities[$i] j $j $tl_tokens[$j]\n";
						if(lc $st1[0] eq lc $st2[0])
						{
							$ids{$tl_indices[$j]}++;
							#$p_ids{$tl_indices[$j]}++;
							#print "Matched here $en_t_indices[$i] $j\n";
						}
						#print "Matched s1-s2 $en_t_indices[$i] $j\n" if($s1 eq $s2);
					}
					#Transliteration ends
#=cut					
#=temp
					#Giza begins

					$t_idlist = $sg_nums[$n]; 
					next if($t_idlist eq "");	#no alignment exists
					@t_ids = split /\s+/, $t_idlist;
					for my $t_id (@t_ids)
					{
						$ids{$t_id}++;
						#$p_ids{$tl_indices[$j]}++;
						#print "Src: $src_tokens[$t_id]\n";
					}
					#Giza ends
#=cut					
				}
				next if(scalar keys %ids == 0);	#no alignment
				my $minid = min keys %ids;
				my $maxid = max keys %ids;
				$keys = join ",", sort {$a <=> $b} keys %ids;
				print "Keys: $keys\n";
				#make new record for native language
				my $new_index = $minid;
				my $new_length = ($maxid - $minid) + 1;
				#try using t_words instead of src_tokens if it doesn't work
				#my $new_entity = $src_tokens[$minid];
				my $new_entity = "";
				#for($i = $minid + 1; $i <= $maxid; $i++)
				for my $i (sort {$a <=> $b} keys %ids)
				{
					$new_entity = ($new_entity eq "") ? $src_tokens[$i] : $new_entity . " " . $src_tokens[$i];
				}
				my $new_e_id = $e_id;
				my $new_record = "$new_index\t$new_length\t$new_entity\t$new_e_id";
				print "New Record: $new_record\n";
				push @new_table, $new_record;
			}
=temp			
			#NE post-processing, didnt work at all. numbers went down.
			for($i = 0; $i < scalar @src_tokens; $i++)
			{
				if($p_ids{$i} eq "" and $lang eq "ar" and $arb_propers{$src_tokens[$i]} ne "" and $arb_commons{$src_tokens[$i]} eq "")
				{
					my $new_index = $i;
					my $new_length = 1;
					my $new_entity = $src_tokens[$i];
					my $new_e_id = $names{$src_tokens[$i]};
					my $new_record = "$new_index\t$new_length\t$new_entity\t$new_e_id";
					push @new_table, $new_record;
				}
			}
=cut
			my $ctfile = "$outfile.LBJ_list";
			my $new_ctfile = $ctfile;
			($lang eq "ar") ? ($new_ctfile =~ s/_raw/_arb/g) : ($new_ctfile =~ s/_raw/_swh/g);
			open($ntfh, ">", $new_ctfile) || die "Cannot open ctfile $ctfile new_ctfile: $new_ctfile\n";
			for my $new_record (@new_table)
			{
				print $ntfh "$new_record\n";
			}
			close($ntfh);
			
			#here, an extract_gold fn to make the GOLD_LIST file. plus something to align LBJ_LIST and GOLD_LIST
			make_imatrix("make_imatrix", $infile);
			($co_pren, $co_pred, $co_recn, $co_recd) = b3_score("b3_score", $infile);
			$s_co_pren += $co_pren;
			$s_co_pred += $co_pred;
			$s_co_recn += $co_recn;
			$s_co_recd += $co_recd;
			$co_pre = $s_co_pren / $s_co_pred;
			$co_rec = $s_co_recn / $s_co_recd;
			$co_f1 = 2 * $co_pre * $co_rec / ($co_pre + $co_rec);
			print "Cumulative Coref: Prec: $co_pre Recl: $co_rec F1: $co_f1\n";
			warn "Cumulative Coref: Prec: $co_pre Recl: $co_rec F1: $co_f1\n";

		}				
		#**************		
		
		@nelist2 = sort {$a <=> $b} keys %ne_list2;
		$picked = scalar @nelist2;
		%type_picked = ();
		$tp = 0;
		%type_tp = ();
		#$ecount = 0;
		#$ecount2 = 0;
		
		#write picked file
		$pickedfile = $outfile;
		$pickedfile =~ s/raw/picked/g;
		$pickedfile =~ s/sw_alasiri_/alasiri_/g if($labeled == 1);
		$pickedhtm = $pickedfile;
		$pickedhtm =~ s/\.(txt)|(sgm)|(text)/\.html/g;
		open($pfh, ">", $pickedfile);
		open($hfh, ">", $pickedhtm);
		print $hfh "<html><FONT SIZE=5>\n";
		for($i = 0; $i < scalar @src_tokens; $i++)
		{
			if($ne_type2{$i} eq "")
			{
				print $pfh "$src_tokens[$i] ";
				print $hfh "$src_tokens[$i] ";
			}
			else
			{
				$starttag = $ne_type2{$i};
				$j = $i+1;
				$j++ while($ne_type2{$j} eq $starttag and $j < scalar @src_tokens);
				$start = $i;
				$end = $j;
				$i = $j - 1;
				$entity = "";
				for($k = $start; $k < $end; $k++)
				{
					$entity = ($entity eq "") ? $src_tokens[$k] : $entity . " " . $src_tokens[$k];
				}
				#entity is $entity, tag is $ne_type2{$start} or $starttag
				print $pfh "[$starttag $entity  ] ";
				if($starttag eq "PER")
				{
					print $hfh "<FONT COLOR=RED>$entity</FONT> ";
				}
				if($starttag eq "LOC")
				{
					print $hfh "<FONT COLOR=BLUE>$entity</FONT> ";
				}
				if($starttag eq "ORG")
				{
					print $hfh "<FONT COLOR=GREEN>$entity</FONT> ";
				}
				if($starttag eq "MIS")
				{
					print $hfh "<FONT COLOR=PURPLE>$entity</FONT> ";
				}
				
			}
		}
		print $pfh "\n";
		print $hfh "\n";
		print $hfh "\n</FONT></html>\n";
		close($pfh);
		close($hfh);
		
		next if($labeled == 0);

		#evaluation part begins here

		#Here, compare %ne_list2 and %src_trans.
		if($l_compare ne "src" and $lang eq "sw")
		{
			my %comb_list = ();
			$comb_list{$_}++ foreach(keys %en_t_list);
			$comb_list{$_}++ foreach(keys %src_trans);						
			@co_result = compare_lists("compare_lists", \%src_trans, \%en_t_list, \%comb_list); #($tp, $fp, $fn)
			$ie_tp += $co_result[0];
			$ie_fp += $co_result[1];
			$ie_fn += $co_result[2];
			$ietw_tp += $co_result[3];
			$ietw_fp += $co_result[4];
			$ietw_fn += $co_result[5];
			
		}

		#write correct file
		$correctfile = $outfile;
		$correctfile =~ s/raw/correct/g;
		$correctfile =~ s/sw_alasiri_/alasiri_/g;
		$correcthtm = $correctfile;
		$correcthtm =~ s/\.(txt)|(sgm)|(text)/\.html/g;
		open($cfh, ">", $correctfile);
		open($hfh, ">", $correcthtm);
		print $hfh "<html><FONT SIZE=5>\n";	
		for($i = 0; $i < scalar @src_tokens; $i++)
		{
			if($src_poss[$i] eq "O")
			{
				print $cfh "$src_tokens[$i] ";
				print $hfh "$src_tokens[$i] ";
			}
			else
			{
				$starttag = $src_poss[$i];
				$j = $i+1;
				$j++ while($src_poss[$j] eq $starttag and $j < scalar @src_tokens);
				$start = $i;
				$end = $j;
				$i = $j - 1;
				$entity = "";
				for($k = $start; $k < $end; $k++)
				{
					$entity = ($entity eq "") ? $src_tokens[$k] : $entity . " " . $src_tokens[$k];
				}
				#entity is $entity, tag is $src_poss[$start] or $starttag
				print $cfh "[$starttag $entity  ] ";
				print $hfh "<FONT COLOR=RED>$entity</FONT> ";				
			}
		}
		print $cfh "\n";
		print $hfh "\n";
		print $hfh "\n</FONT></html>\n";
		close($cfh);		
		close($hfh);
		
		for my $key (@nelist2)
		{
			#print $src_poss[$key], "\n";
			
			#$tp++ if($src_poss[$key] =~ /NOUN_PROP/);
			$tp++ if($src_poss[$key] ne "O");
			if($lang eq "ar")
			{
				$type_tp{PER}++ if($src_poss[$key] =~ /PER/ and $ne_type2{$key} =~ /PER/);
				$type_tp{LOC}++ if($src_poss[$key] =~ /LOC/ and $ne_type2{$key} =~ /LOC/);
				$type_tp{ORG}++ if($src_poss[$key] =~ /ORG/ and $ne_type2{$key} =~ /ORG/);
				$type_tp{MIS}++ if($src_poss[$key] =~ /MIS/ and $ne_type2{$key} =~ /MIS/);
				$type_picked{PER}++ if($ne_type2{$key} =~ /PER/);
				$type_picked{LOC}++ if($ne_type2{$key} =~ /LOC/);
				$type_picked{ORG}++ if($ne_type2{$key} =~ /ORG/);
				$type_picked{MIS}++ if($ne_type2{$key} =~ /MIS/);
				print "ORG picked: $key|$tl_words[$key]\n" if($ne_type2{$key} =~ /ORG/);
				print "MIS picked: $key|$tl_words[$key]\n" if($ne_type2{$key} =~ /MIS/);
			}
			#$ecount++ if($src_poss[$key+1] =~ /NOUN_PROP/ and not grep {$_ == $key + 1} @nelist2);
			#$ecount2++ if($src_poss[$key+2] =~ /NOUN_PROP/ and not grep {$_ == $key + 2} @nelist2);
		}

		#print "tp: ", $tp, "\n";
		#print "ecount: ", $ecount, "\n";
		#print "ecount2: ", $ecount2, "\n";

		$correct = 0;
		%type_correct = ();
		
		for($i = 0; $i < scalar @src_tokens; $i++)
		{
			#$correct++ if($src_poss[$i] =~ /NOUN_PROP/);
			$correct++ if($src_poss[$i] ne "O");
			if($lang eq "ar")
			{
				$type_correct{PER}++ if($src_poss[$i] =~ /PER/);
				$type_correct{LOC}++ if($src_poss[$i] =~ /LOC/);
				$type_correct{ORG}++ if($src_poss[$i] =~ /ORG/);
				$type_correct{MIS}++ if($src_poss[$i] =~ /MIS/);
				print "ORG correct: $i|$tl_words[$i]\n" if($src_poss[$i] =~ /ORG/);
				print "MIS correct: $i|$tl_words[$i]\n" if($src_poss[$i] =~ /MIS/);
			}
		}
		#print "total correct: ", $total_correct, "\n";

		$prec = sprintf("%.3f", ($picked == 0) ? 0 : $tp / $picked);
		$recl = sprintf("%.3f", ($correct == 0) ? 0 : $tp / $correct);
		$f1 = sprintf("%.3f", ($prec + $recl == 0) ? 0 : 2 * $prec * $recl / ($prec + $recl));
		print "$infile Precision: $prec Recall: $recl F1: $f1\n";
		warn "$infile Precision: $prec Recall: $recl F1: $f1\n";

		$s_tp += $tp;
		$s_picked +=$picked;
		$s_correct += $correct;

		$s_prec = sprintf("%.3f", ($s_picked == 0) ? 0 : $s_tp / $s_picked);
		$s_recl = sprintf("%.3f", ($s_correct == 0) ? 0 : $s_tp / $s_correct);
		$s_f1 = sprintf("%.3f", ($s_prec + $s_recl == 0) ? 0 : 2 * $s_prec * $s_recl / ($s_prec + $s_recl));
		print "Cumulative Tokens: $n_tokens TP: $s_tp Picked: $s_picked Correct: $s_correct\n";	
		print "Cumulative Precision: $s_prec Recall: $s_recl F1: $s_f1\n";	
		warn "Cumulative Tokens: $n_tokens TP: $s_tp Picked: $s_picked Correct: $s_correct\n";	
		warn "Cumulative Precision: $s_prec Recall: $s_recl F1: $s_f1\n";	

		if($lang eq "sw")
		{
			$ie_picked = $ie_tp + $ie_fp;
			$ie_correct = $ie_tp + $ie_fn;
			$ie_prec = sprintf("%.3f", ($ie_picked == 0) ? 0 : $ie_tp / $ie_picked);
			$ie_recl = sprintf("%.3f", ($ie_correct == 0) ? 0 : $ie_tp / $ie_correct);
			$ie_f1 = sprintf("%.3f", ($ie_prec + $ie_recl == 0) ? 0 : 2 * $ie_prec * $ie_recl / ($ie_prec + $ie_recl));
			print "\n\nIE:\nCumulative Tokens: $n_tokens TP: $ie_tp Picked: $ie_picked Correct: $ie_correct\n";	
			print "Cumulative Precision: $ie_prec Recall: $ie_recl F1: $ie_f1\n";	
			warn "IE:\nCumulative Tokens: $n_tokens TP: $ie_tp Picked: $ie_picked Correct: $ie_correct\n";	
			warn "Cumulative Precision: $ie_prec Recall: $ie_recl F1: $ie_f1\n";	

			$ietw_picked = $ietw_tp + $ietw_fp;
			$ietw_correct = $ietw_tp + $ietw_fn;
			$ietw_prec = sprintf("%.3f", ($ietw_picked == 0) ? 0 : $ietw_tp / $ietw_picked);
			$ietw_recl = sprintf("%.3f", ($ietw_correct == 0) ? 0 : $ietw_tp / $ietw_correct);
			$ietw_f1 = sprintf("%.3f", ($ietw_prec + $ietw_recl == 0) ? 0 : 2 * $ietw_prec * $ietw_recl / ($ietw_prec + $ietw_recl));
			print "\n\nIETW:\nCumulative Tokens: $n_tokens TP: $ietw_tp Picked: $ietw_picked Correct: $ietw_correct\n";	
			print "Cumulative Precision: $ietw_prec Recall: $ietw_recl F1: $ietw_f1\n";	
			warn "IETW:\nCumulative Tokens: $n_tokens TP: $ietw_tp Picked: $ietw_picked Correct: $ietw_correct\n";	
			warn "Cumulative Precision: $ietw_prec Recall: $ietw_recl F1: $ietw_f1\n";
		}

		if($lang eq "ar")
		{
			$avg_f1 = 0;
			$s_avg_f1 = 0;
			for my $type (keys %mytypes)
			{
				$prec = sprintf("%.3f", ($type_picked{$type} == 0) ? 0 : $type_tp{$type} / $type_picked{$type});
				$recl = sprintf("%.3f", ($type_correct{$type} == 0) ? 0 : $type_tp{$type} / $type_correct{$type});
				$f1 = sprintf("%.3f", ($prec + $recl == 0) ? 0 : 2 * $prec * $recl / ($prec + $recl));
				$avg_f1 += $f1;
				print "$type TP: $type_tp{$type} Picked: $type_picked{$type} Correct: $type_correct{$type}\n";	
				warn "$type TP: $type_tp{$type} Picked: $type_picked{$type} Correct: $type_correct{$type}\n";	
				print "$infile $type Precision: $prec Recall: $recl F1: $f1\n";
				warn "$infile $type Precision: $prec Recall: $recl F1: $f1\n";

				$s_type_tp{$type} += $type_tp{$type};
				$s_type_picked{$type} +=$type_picked{$type};
				$s_type_correct{$type} += $type_correct{$type};

				$s_prec = sprintf("%.3f", ($s_type_picked{$type} == 0) ? 0 : $s_type_tp{$type} / $s_type_picked{$type});
				$s_recl = sprintf("%.3f", ($s_type_correct{$type} == 0) ? 0 : $s_type_tp{$type} / $s_type_correct{$type});
				$s_f1 = sprintf("%.3f", ($s_prec + $s_recl == 0) ? 0 : 2 * $s_prec * $s_recl / ($s_prec + $s_recl));
				$s_avg_f1+= $s_f1;
				print "Cumulative $type TP: $s_type_tp{$type} Picked: $s_type_picked{$type} Correct: $s_type_correct{$type}\n";	
				print "Cumulative $type Precision: $s_prec Recall: $s_recl F1: $s_f1\n";	
				warn "Cumulative $type TP: $s_type_tp{$type} Picked: $s_type_picked{$type} Correct: $s_type_correct{$type}\n";	
				warn "Cumulative $type Precision: $s_prec Recall: $s_recl F1: $s_f1\n";			
			}
			$avg_f1 /= 4;
			$s_avg_f1 /= 4;
			print "avg F1 score: $avg_f1\n";
			warn "avg F1 score: $avg_f1\n";
			print "Cumulative avg F1 score: $s_avg_f1\n";
			warn "Cumulative avg F1 score: $s_avg_f1\n";
		}
		
	}
}

sub getdata_aner()
{

	my $line = <$ifh>;
	chomp($line);
	if(!($line))
	{
		print $debug "Getdata_aner: File done: $infile\n";
		return ();
	}
	
	my @parts = split /\s+/, $line;
	#print scalar @parts, ": ", join(", ", @parts); print "\n";
	#if(scalar @parts == 3)
	#{
	#	do
	#	{
	#		$result = $service->translate($parts[2]);
	#	}  while ($result->error and $result->code != 400);
	#	$parts[2] = $result->translation;
	#}
	#if(substr($parts[0], -1, 1) eq "." and $parts[0] ne ".")	#remove it
	#{
	#	$parts[0] = substr($parts[0], 0, length($parts[0]) - 1);
	#}
	
	if($parts[0] eq ".")	
	{
		return (scalar @parts == 2) ? ($parts[0], $parts[1], 0) : ($parts[0], $parts[1], 0, $parts[2]);
	}
	else
	{
		return (scalar @parts == 2) ? ($parts[0], $parts[1], 1) : ($parts[0], $parts[1], 1, $parts[2]);
	}
	
}

sub getdata
{

	if($in_flag == 0)
	{
		while(1)
		{
			$line = <$ifh>;
			goto GLOOP if(! ($line) or $line =~ /token listing end (.+)\.tree.+/);
			if($line =~ /token listing start (.+)\.tree.+/)
			{
				
				@tokens = ();
				@lemmas = ();
				@ltot = ();
				@llemmas = ();
				@morphs = ();
				
				if($1 ne $filename)
				{
					print $debug "Getdata: File done: $filename\n" if($filename ne "XXX");
					$filename = $1;
				}
				next;		
			}
			if($line =~ /#source Tokens:(.+)/)
			{		
				$treesize = $1;
				next;
			}
			
			if($line =~ /s:([0-9]+).+·(.+)·(.+)·(.+)·(.+)·(.+)/)
			{
				$tokens[$1] = (substr($2, length($2) - 1, 1) eq "Â") ? substr($2, 0, length($2) - 1) : $2;
				@endpts = split "-", $4;

				for($i = $endpts[0]; $i <= $endpts[1]; $i++)
				{
					$ltot[$i] = $1;
				}
				next;
			}

			if($line =~ /t:([0-9]+).+·(.+)·(.+)·(.+)·(.+)·(.+)·(.+)·(.+)·(.+)/)
			{	
				$temp = (substr($4, length($4) - 1, 1) eq "Â") ? substr($4, 0, length($4) - 1) : $4;
				if($lemmas[$ltot[$1]] eq "")
				{
					$lemmas[$ltot[$1]] = $temp;
				}
				else
				{
					$lemmas[$ltot[$1]] = $lemmas[$ltot[$1]] . "+" . $temp;
				}

				$temp = (substr($7, length($7) - 1, 1) eq "Â") ? substr($7, 0, length($7) - 1) : $7;
				if($temp eq "[TBupdate]" or $temp eq "dummylemma" or $temp eq "nolemma")
				{
					$temp = "[DEFAULT]";
				}
				$llemmas[$ltot[$1]] = $temp if($temp ne "[clitics]" or ($temp eq "[clitics]" and $llemmas[$ltot[$1]] eq ""));
				$llemmas[$ltot[$1]] =~ tr/[]//d;
				
				$temp = (substr($5, length($5) - 1, 1) eq "Â") ? substr($5, 0, length($5) - 1) : $5;
				if($morphs[$ltot[$1]] eq "")
				{
					$morphs[$ltot[$1]] = $temp;
				}
				else
				{
					$morphs[$ltot[$1]] = $morphs[$ltot[$1]] . "+" . $temp;
				}		
				next;
			}
		}
GLOOP:
		$in_flag = 1;
	}

	if(!($line))
	{
		print $debug "Getdata: File done: $filename\n";
		$in_flag = 0;
		$filename = "XXX";
		return ();
	}
	
	$token = $tokens[$treesize_i];
	$lemma = $lemmas[$treesize_i];
	$llemma = $llemmas[$treesize_i];
	$morph = $morphs[$treesize_i];
	$treesize_i++;
	if($treesize_i == $treesize)
	{
		$treesize_i = 0;
		$in_flag = 0;
	}
	
	@return_arr = ($token, $lemma, $llemma, $morph, $in_flag);
	return @return_arr;
}

1;
