=introduction SYNERGY NER system Author: Rushin Shah Language Technologies Institute Carnegie Mellon University Copyright (c) 2010 All Rights Reserved. Permission is hereby granted, free of charge, to use and distribute this software and its documentation without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of this work, and to permit persons to whom this work is furnished to do so, subject to the following conditions: 1. The code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Any modifications must be clearly marked as such. 3. Original authors' names are not deleted. 4. The authors' names are not used to endorse or promote products derived from this software without specific prior written permission. CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. =cut package NER; BEGIN { push @INC, "/ldc/lib/perl5/site_perl" unless ( grep m{/ldc/lib/perl5/site_perl}, @INC ) } use List::Util qw(max); use List::Util qw(min); use Text::English; use WebService::Google::Language; use Encode; use Encode::Buckwalter; use Encode qw/encode decode is_utf8/; use REST::Google::Translate; use Algorithm::Diff qw(LCS LCS_length LCSidx diff sdiff compact_diff traverse_sequences traverse_balanced ); use String::LCSS_XS qw(lcss lcss_all); #use WWW::Babelfish; #use SAMA; #use DB_File; #use Fcntl; #require Encode::Detect; #use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; =documentation This is the documentation for the SYNERGY NER system for Arabic and Swahili, first presented in this paper. The system is implemented in Perl. It requires the following Perl modules to be installed: List::Util qw(max) List::Util qw(min) Text::English WebService::Google::Language Encode Encode::Buckwalter Encode qw/encode decode is_utf8/ REST::Google::Translate Algorithm::Diff qw(LCS LCS_length LCSidx diff sdiff compact_diff traverse_sequences traverse_balanced ) String::LCSS_XS qw(lcss lcss_all) To use the system, first include the following line in your code: use NER; Then instantiate an object using the following command: $ner = NER->(%args); The arguments hash can take the following optional parameters: align - specifies which alignment to use from {"giza", "en", "src"}. default "giza" pp - post-processing (0 - off, 1 - on). default 1. coref - whether to also perform CRR (1) or not (0). default 0. translate - which translator to use: google ("g") or bing ("b"). default "g". Thus a sample usage would look like this: $ner = NER->new(align => "giza", pp => 1, coref => 0, translate => "g"); Then, to use any function in this module, say X, call it as follows: $ner->X(@args); The function to call to perfrom NER is translate, and must be supplied an array of file names. e.g. $ner->translate("file1_arb.txt", "file2_arb.txt"); File names must end in either "_arb.txt" for Arabic, or "_swh.txt" for Swahili. Other useful functions include make_table, lbj and stanford. Sample usage: ($stan_table, $lbj_table, $corr_table, $union_table) = $ner->make_table($outfile, @infiles); First argument must be the name of the outfile, subsequent arguments must be raw input files. By raw input files, we mean English text files without any labeling. They must have _raw at the end, i.e. be named in the following format: xyz_raw.txt Before using make_table function for some $infile, following commands must be run: $ner->lbj($infile); $ner->stanford($infile); This will produce the lbj and stanford tagged versions of the raw file. You must have lbj and stanford systems installed, and the folders LbjNerTagger1.11.release and stanford-ner-2009-01-16 must be in your current folder. LBJ files produced will be of the form _lbj.txt and Stanford files produced will be of the form _tempstan.txt and _stanford.txt . Do not modify this. So, a full use case would look like this: for $file(@infiles) { $ner->lbj($file); $ner->stanford($file); } ($stan_table, $lbj_table, $corr_table, $union_table) = $ner->make_table("table_all.txt", @infiles); Now you can either use the tables returned in $stan_table, etc. or the output file table_all.txt If labeled version of the raw file exists, it must be of the form _correct.txt . If so, then $corr_table will be non-empty, otherwise empty string. =cut my $in_flag = 0; my $treesize_i; my $treesize; my $ifh; my $filename = "XXX"; $mytypes{PER}++; $mytypes{LOC}++; $mytypes{ORG}++; $mytypes{MIS}++; #webservice::google::language - all except ar->en #rest::google::translate - all except en->sw $service = WebService::Google::Language->new('http://example.com/'); REST::Google::Translate->http_referer('http://example.com'); $debug = STDOUT; sub new { my $name = shift; my %opt = @_; #control variables: $lang, $translator, $l_compare $l_compare = (exists($opt{align})) ? $opt{align} : "giza"; $l_coref = (exists($opt{coref})) ? $opt{coref} : 0; $pp = (exists($opt{pp})) ? $opt{pp} : 1; $translator = (exists($opt{translate})) ? $opt{translate} : "g"; $l_jt = 1; binmode STDOUT, ":utf8"; %commons = (); open(my $cfh, "<", "commons.txt"); while(<$cfh>) { $word = $_; chomp($word); chop($word) if(substr($word, -1, 1) eq "\n" or substr($word, -1, 1) eq "\r"); next if($word eq ""); $commons{$word}++; } close($cfh); %swh_commons = (); open($cfh, "<", "swh_commons.txt"); while(<$cfh>) { $word = $_; chomp($word); chop($word) if(substr($word, -1, 1) eq "\n" or substr($word, -1, 1) eq "\r"); next if($word eq ""); $swh_commons{$word}++; } close($cfh); %arb_commons = (); open($cfh, "<", "arb_commons.txt"); while(<$cfh>) { $word = $_; chomp($word); chop($word) if(substr($word, -1, 1) eq "\n" or substr($word, -1, 1) eq "\r"); next if($word eq ""); $arb_commons{$word}++; } close($cfh); %propers = (); open(my $pfh, "<", "propers.txt"); while(<$pfh>) { $word = $_; chomp($word); chop($word) if(substr($word, -1, 1) eq "\n" or substr($word, -1, 1) eq "\r"); next if($word eq ""); $propers{lc $word}++; } close($pfh); %swh_propers = (); open($pfh, "<", "swh_propers.txt"); while(<$pfh>) { $word = $_; chomp($word); chop($word) if(substr($word, -1, 1) eq "\n" or substr($word, -1, 1) eq "\r"); next if($word eq ""); $swh_propers{lc $word}++; } close($pfh); %arb_propers = (); open($pfh, "<", "arb_propers.txt"); while(<$pfh>) { $word = $_; chomp($word); chop($word) if(substr($word, -1, 1) eq "\n" or substr($word, -1, 1) eq "\r"); next if($word eq ""); $arb_propers{$word}++; } close($pfh); warn "NER 1.0 Rushin Shah\n"; bless { align => $l_compare, coref => $l_coref, pp => , $pp, translate => $translator }; } sub evaluate { #basically, an augmented version of calcstats my $name = shift; #my $outfile = shift; my @files = @_; #open($ofh, ">", $outfile); my $sum_tp = 0; my $sum_fp = 0; my $sum_fn = 0; my $sum_tn = 0; %c_s = (); %c_l = (); %c_u = (); my $sum_tokens = 0; for my $infile (@files) { ($num, $str_tokens, $str_stan, $str_lbj, $str_union, $str_corr) = sync("sync", $infile); if($str_corr eq "" or $str_tcorr eq "") { print "Correct file doesn't exist for $infile, skipping\n"; next; } $sum_tokens += $num; @stan_tokens = split "\n", $str_tokens; @lbj_tokens = split "\n", $str_tokens; @union_tokens = split "\n", $str_tokens; @corr_tokens = split "\n", $str_tokens; @stan_tags = split "\n", $str_stan; @lbj_tags = split "\n", $str_lbj; @union_tags = split "\n", $str_union; @corr_tags = split "\n", $str_corr; $tp = 0; $fp = 0; $fn = 0; $tn = 0; $s_tp = 0; $s_fp = 0; $s_fn = 0; $s_tn = 0; $u_tp = 0; $u_fp = 0; $u_fn = 0; $u_tn = 0; for($i = 0; $i < $num; $i++) { if($stan_tags[$i] eq $lbj_tags[$i]) { $union_tags[$i] = $stan_tags[$i]; } if($stan_tags[$i] eq "NOT" and $lbj_tags[$i] ne "NOT") { $union_tags[$i] = $lbj_tags[$i]; } if($stan_tags[$i] ne "NOT" and $lbj_tags[$i] eq "NOT") { $union_tags[$i] = $stan_tags[$i]; } if($stan_tags[$i] ne "NOT" and $lbj_tags[$i] ne "NOT" and $stan_tags[$i] ne $lbj_tags[$i]) { $union_tags[$i] = $lbj_tags[$i]; } $tp++ if($corr_tags[$i] ne "NOT" and $union_tags[$i] ne "NOT"); $fp++ if($corr_tags[$i] eq "NOT" and $union_tags[$i] ne "NOT"); $fn++ if($corr_tags[$i] ne "NOT" and $union_tags[$i] eq "NOT"); $tn++ if($corr_tags[$i] eq "NOT" and $union_tags[$i] eq "NOT"); $s_tp++ if($corr_tags[$i] ne "NOT" and $stan_tags[$i] ne "NOT"); $s_fp++ if($corr_tags[$i] eq "NOT" and $stan_tags[$i] ne "NOT"); $s_fn++ if($corr_tags[$i] ne "NOT" and $stan_tags[$i] eq "NOT"); $s_tn++ if($corr_tags[$i] eq "NOT" and $stan_tags[$i] eq "NOT"); $u_tp++ if($corr_tags[$i] ne "NOT" and $lbj_tags[$i] ne "NOT"); $u_fp++ if($corr_tags[$i] eq "NOT" and $lbj_tags[$i] ne "NOT"); $u_fn++ if($corr_tags[$i] ne "NOT" and $lbj_tags[$i] eq "NOT"); $u_tn++ if($corr_tags[$i] eq "NOT" and $lbj_tags[$i] eq "NOT"); $c_s{PER}{tp}++ if($corr_tags[$i] eq "PER" and $stan_tags[$i] eq "PER"); $c_s{PER}{fp}++ if($corr_tags[$i] ne "PER" and $stan_tags[$i] eq "PER"); $c_s{PER}{fn}++ if($corr_tags[$i] eq "PER" and $stan_tags[$i] ne "PER"); $c_s{PER}{tn}++ if($corr_tags[$i] ne "PER" and $stan_tags[$i] ne "PER"); $c_s{LOC}{tp}++ if($corr_tags[$i] eq "LOC" and $stan_tags[$i] eq "LOC"); $c_s{LOC}{fp}++ if($corr_tags[$i] ne "LOC" and $stan_tags[$i] eq "LOC"); $c_s{LOC}{fn}++ if($corr_tags[$i] eq "LOC" and $stan_tags[$i] ne "LOC"); $c_s{LOC}{tn}++ if($corr_tags[$i] ne "LOC" and $stan_tags[$i] ne "LOC"); $c_s{ORG}{tp}++ if($corr_tags[$i] eq "ORG" and $stan_tags[$i] eq "ORG"); $c_s{ORG}{fp}++ if($corr_tags[$i] ne "ORG" and $stan_tags[$i] eq "ORG"); $c_s{ORG}{fn}++ if($corr_tags[$i] eq "ORG" and $stan_tags[$i] ne "ORG"); $c_s{ORG}{tn}++ if($corr_tags[$i] ne "ORG" and $stan_tags[$i] ne "ORG"); $c_s{MIS}{tp}++ if($corr_tags[$i] eq "MIS" and $stan_tags[$i] eq "MIS"); $c_s{MIS}{fp}++ if($corr_tags[$i] ne "MIS" and $stan_tags[$i] eq "MIS"); $c_s{MIS}{fn}++ if($corr_tags[$i] eq "MIS" and $stan_tags[$i] ne "MIS"); $c_s{MIS}{tn}++ if($corr_tags[$i] ne "MIS" and $stan_tags[$i] ne "MIS"); $c_l{PER}{tp}++ if($corr_tags[$i] eq "PER" and $lbj_tags[$i] eq "PER"); $c_l{PER}{fp}++ if($corr_tags[$i] ne "PER" and $lbj_tags[$i] eq "PER"); $c_l{PER}{fn}++ if($corr_tags[$i] eq "PER" and $lbj_tags[$i] ne "PER"); $c_l{PER}{tn}++ if($corr_tags[$i] ne "PER" and $lbj_tags[$i] ne "PER"); $c_l{LOC}{tp}++ if($corr_tags[$i] eq "LOC" and $lbj_tags[$i] eq "LOC"); $c_l{LOC}{fp}++ if($corr_tags[$i] ne "LOC" and $lbj_tags[$i] eq "LOC"); $c_l{LOC}{fn}++ if($corr_tags[$i] eq "LOC" and $lbj_tags[$i] ne "LOC"); $c_l{LOC}{tn}++ if($corr_tags[$i] ne "LOC" and $lbj_tags[$i] ne "LOC"); $c_l{ORG}{tp}++ if($corr_tags[$i] eq "ORG" and $lbj_tags[$i] eq "ORG"); $c_l{ORG}{fp}++ if($corr_tags[$i] ne "ORG" and $lbj_tags[$i] eq "ORG"); $c_l{ORG}{fn}++ if($corr_tags[$i] eq "ORG" and $lbj_tags[$i] ne "ORG"); $c_l{ORG}{tn}++ if($corr_tags[$i] ne "ORG" and $lbj_tags[$i] ne "ORG"); $c_l{MIS}{tp}++ if($corr_tags[$i] eq "MIS" and $lbj_tags[$i] eq "MIS"); $c_l{MIS}{fp}++ if($corr_tags[$i] ne "MIS" and $lbj_tags[$i] eq "MIS"); $c_l{MIS}{fn}++ if($corr_tags[$i] eq "MIS" and $lbj_tags[$i] ne "MIS"); $c_l{MIS}{tn}++ if($corr_tags[$i] ne "MIS" and $lbj_tags[$i] ne "MIS"); $c_u{PER}{tp}++ if($corr_tags[$i] eq "PER" and $union_tags[$i] eq "PER"); $c_u{PER}{fp}++ if($corr_tags[$i] ne "PER" and $union_tags[$i] eq "PER"); $c_u{PER}{fn}++ if($corr_tags[$i] eq "PER" and $union_tags[$i] ne "PER"); $c_u{PER}{tn}++ if($corr_tags[$i] ne "PER" and $union_tags[$i] ne "PER"); $c_u{LOC}{tp}++ if($corr_tags[$i] eq "LOC" and $union_tags[$i] eq "LOC"); $c_u{LOC}{fp}++ if($corr_tags[$i] ne "LOC" and $union_tags[$i] eq "LOC"); $c_u{LOC}{fn}++ if($corr_tags[$i] eq "LOC" and $union_tags[$i] ne "LOC"); $c_u{LOC}{tn}++ if($corr_tags[$i] ne "LOC" and $union_tags[$i] ne "LOC"); $c_u{ORG}{tp}++ if($corr_tags[$i] eq "ORG" and $union_tags[$i] eq "ORG"); $c_u{ORG}{fp}++ if($corr_tags[$i] ne "ORG" and $union_tags[$i] eq "ORG"); $c_u{ORG}{fn}++ if($corr_tags[$i] eq "ORG" and $union_tags[$i] ne "ORG"); $c_u{ORG}{tn}++ if($corr_tags[$i] ne "ORG" and $union_tags[$i] ne "ORG"); $c_u{MIS}{tp}++ if($corr_tags[$i] eq "MIS" and $union_tags[$i] eq "MIS"); $c_u{MIS}{fp}++ if($corr_tags[$i] ne "MIS" and $union_tags[$i] eq "MIS"); $c_u{MIS}{fn}++ if($corr_tags[$i] eq "MIS" and $union_tags[$i] ne "MIS"); $c_u{MIS}{tn}++ if($corr_tags[$i] ne "MIS" and $union_tags[$i] ne "MIS"); } $prec = $s_tp / ($s_tp + $s_fp); $recl = $s_tp / ($s_tp + $s_fn); $f1 = 2 * $prec * $recl / ($prec + $recl); print "Tokens: $num Cumulative: $sum_tokens\n"; printf "Stan: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $prec, $recl, $f1; $prec = $u_tp / ($u_tp + $u_fp); $recl = $u_tp / ($u_tp + $u_fn); $f1 = 2 * $prec * $recl / ($prec + $recl); printf "LBJ: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $prec, $recl, $f1; $prec = $tp / ($tp + $fp); $recl = $tp / ($tp + $fn); $f1 = 2 * $prec * $recl / ($prec + $recl); printf "Union: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n\n", $prec, $recl, $f1; $sum_s_tp += $s_tp; $sum_s_fp += $s_fp; $sum_s_fn += $s_fn; $sum_s_tn += $s_tn; $sum_u_tp += $u_tp; $sum_u_fp += $u_fp; $sum_u_fn += $u_fn; $sum_u_tn += $u_tn; $sum_tp += $tp; $sum_fp += $fp; $sum_fn += $fn; $sum_tn += $tn; } $c_s{PER}{prec} = $c_s{PER}{tp} / ($c_s{PER}{tp} + $c_s{PER}{fp}); $c_s{PER}{recl} = $c_s{PER}{tp} / ($c_s{PER}{tp} + $c_s{PER}{fn}); $c_s{PER}{f1} = 2 * $c_s{PER}{prec} * $c_s{PER}{recl} / ($c_s{PER}{prec} + $c_s{PER}{recl}); $c_s{LOC}{prec} = $c_s{LOC}{tp} / ($c_s{LOC}{tp} + $c_s{LOC}{fp}); $c_s{LOC}{recl} = $c_s{LOC}{tp} / ($c_s{LOC}{tp} + $c_s{LOC}{fn}); $c_s{LOC}{f1} = 2 * $c_s{LOC}{prec} * $c_s{LOC}{recl} / ($c_s{LOC}{prec} + $c_s{LOC}{recl}); $c_s{ORG}{prec} = $c_s{ORG}{tp} / ($c_s{ORG}{tp} + $c_s{ORG}{fp}); $c_s{ORG}{recl} = $c_s{ORG}{tp} / ($c_s{ORG}{tp} + $c_s{ORG}{fn}); $c_s{ORG}{f1} = 2 * $c_s{ORG}{prec} * $c_s{ORG}{recl} / ($c_s{ORG}{prec} + $c_s{ORG}{recl}); $c_s{MIS}{prec} = $c_s{MIS}{tp} / ($c_s{MIS}{tp} + $c_s{MIS}{fp}); $c_s{MIS}{recl} = $c_s{MIS}{tp} / ($c_s{MIS}{tp} + $c_s{MIS}{fn}); $c_s{MIS}{f1} = 2 * $c_s{MIS}{prec} * $c_s{MIS}{recl} / ($c_s{MIS}{prec} + $c_s{MIS}{recl}); printf "Stan PER: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_s{PER}{prec}, $c_s{PER}{recl}, $c_s{PER}{f1}; printf "Stan LOC: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_s{LOC}{prec}, $c_s{LOC}{recl}, $c_s{LOC}{f1}; printf "Stan ORG: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_s{ORG}{prec}, $c_s{ORG}{recl}, $c_s{ORG}{f1}; printf "Stan MIS: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_s{MIS}{prec}, $c_s{MIS}{recl}, $c_s{MIS}{f1}; $c_s{AVG}{f1} = ($c_s{PER}{f1} + $c_s{LOC}{f1} + $c_s{ORG}{f1} + $c_s{MIS}{f1}) / 4; print "Stan AVG f1 score: ", $c_s{AVG}{f1}, "\n"; $prec = $sum_s_tp / ($sum_s_tp + $sum_s_fp); $recl = $sum_s_tp / ($sum_s_tp + $sum_s_fn); $f1 = 2 * $prec * $recl / ($prec + $recl); printf "Overall Stan Precision: %.3f Recall: %.3f F1: %.3f\n", $prec, $recl, $f1; $c_l{PER}{prec} = $c_l{PER}{tp} / ($c_l{PER}{tp} + $c_l{PER}{fp}); $c_l{PER}{recl} = $c_l{PER}{tp} / ($c_l{PER}{tp} + $c_l{PER}{fn}); $c_l{PER}{f1} = 2 * $c_l{PER}{prec} * $c_l{PER}{recl} / ($c_l{PER}{prec} + $c_l{PER}{recl}); $c_l{LOC}{prec} = $c_l{LOC}{tp} / ($c_l{LOC}{tp} + $c_l{LOC}{fp}); $c_l{LOC}{recl} = $c_l{LOC}{tp} / ($c_l{LOC}{tp} + $c_l{LOC}{fn}); $c_l{LOC}{f1} = 2 * $c_l{LOC}{prec} * $c_l{LOC}{recl} / ($c_l{LOC}{prec} + $c_l{LOC}{recl}); $c_l{ORG}{prec} = $c_l{ORG}{tp} / ($c_l{ORG}{tp} + $c_l{ORG}{fp}); $c_l{ORG}{recl} = $c_l{ORG}{tp} / ($c_l{ORG}{tp} + $c_l{ORG}{fn}); $c_l{ORG}{f1} = 2 * $c_l{ORG}{prec} * $c_l{ORG}{recl} / ($c_l{ORG}{prec} + $c_l{ORG}{recl}); $c_l{MIS}{prec} = $c_l{MIS}{tp} / ($c_l{MIS}{tp} + $c_l{MIS}{fp}); $c_l{MIS}{recl} = $c_l{MIS}{tp} / ($c_l{MIS}{tp} + $c_l{MIS}{fn}); $c_l{MIS}{f1} = 2 * $c_l{MIS}{prec} * $c_l{MIS}{recl} / ($c_l{MIS}{prec} + $c_l{MIS}{recl}); printf "LBJ PER: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_l{PER}{prec}, $c_l{PER}{recl}, $c_l{PER}{f1}; printf "LBJ LOC: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_l{LOC}{prec}, $c_l{LOC}{recl}, $c_l{LOC}{f1}; printf "LBJ ORG: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_l{ORG}{prec}, $c_l{ORG}{recl}, $c_l{ORG}{f1}; printf "LBJ MIS: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_l{MIS}{prec}, $c_l{MIS}{recl}, $c_l{MIS}{f1}; $c_l{AVG}{f1} = ($c_l{PER}{f1} + $c_l{LOC}{f1} + $c_l{ORG}{f1} + $c_l{MIS}{f1}) / 4; print "LBJ AVG f1 score: ", $c_l{AVG}{f1}, "\n"; $prec = $sum_u_tp / ($sum_u_tp + $sum_u_fp); $recl = $sum_u_tp / ($sum_u_tp + $sum_u_fn); $f1 = 2 * $prec * $recl / ($prec + $recl); printf "Overall LBJ Precision: %.3f Recall: %.3f F1: %.3f\n", $prec, $recl, $f1; $c_u{PER}{prec} = $c_u{PER}{tp} / ($c_u{PER}{tp} + $c_u{PER}{fp}); $c_u{PER}{recl} = $c_u{PER}{tp} / ($c_u{PER}{tp} + $c_u{PER}{fn}); $c_u{PER}{f1} = 2 * $c_u{PER}{prec} * $c_u{PER}{recl} / ($c_u{PER}{prec} + $c_u{PER}{recl}); $c_u{LOC}{prec} = $c_u{LOC}{tp} / ($c_u{LOC}{tp} + $c_u{LOC}{fp}); $c_u{LOC}{recl} = $c_u{LOC}{tp} / ($c_u{LOC}{tp} + $c_u{LOC}{fn}); $c_u{LOC}{f1} = 2 * $c_u{LOC}{prec} * $c_u{LOC}{recl} / ($c_u{LOC}{prec} + $c_u{LOC}{recl}); $c_u{ORG}{prec} = $c_u{ORG}{tp} / ($c_u{ORG}{tp} + $c_u{ORG}{fp}); $c_u{ORG}{recl} = $c_u{ORG}{tp} / ($c_u{ORG}{tp} + $c_u{ORG}{fn}); $c_u{ORG}{f1} = 2 * $c_u{ORG}{prec} * $c_u{ORG}{recl} / ($c_u{ORG}{prec} + $c_u{ORG}{recl}); $c_u{MIS}{prec} = $c_u{MIS}{tp} / ($c_u{MIS}{tp} + $c_u{MIS}{fp}); $c_u{MIS}{recl} = $c_u{MIS}{tp} / ($c_u{MIS}{tp} + $c_u{MIS}{fn}); $c_u{MIS}{f1} = 2 * $c_u{MIS}{prec} * $c_u{MIS}{recl} / ($c_u{MIS}{prec} + $c_u{MIS}{recl}); printf "Union PER: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_u{PER}{prec}, $c_u{PER}{recl}, $c_u{PER}{f1}; printf "Union LOC: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_u{LOC}{prec}, $c_u{LOC}{recl}, $c_u{LOC}{f1}; printf "Union ORG: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_u{ORG}{prec}, $c_u{ORG}{recl}, $c_u{ORG}{f1}; printf "Union MIS: File: $infile Precision: %.3f Recall: %.3f F1: %.3f\n", $c_u{MIS}{prec}, $c_u{MIS}{recl}, $c_u{MIS}{f1}; $c_u{AVG}{f1} = ($c_u{PER}{f1} + $c_u{LOC}{f1} + $c_u{ORG}{f1} + $c_u{MIS}{f1}) / 4; print "Union AVG f1 score: ", $c_u{AVG}{f1}, "\n"; $prec = $sum_tp / ($sum_tp + $sum_fp); $recl = $sum_tp / ($sum_tp + $sum_fn); $f1 = 2 * $prec * $recl / ($prec + $recl); printf "Overall Union Precision: %.3f Recall: %.3f F1: %.3f\n\n", $prec, $recl, $f1; print "Cumulative: $sum_tokens\n"; #close($ofh); } sub sync { my $name = shift; my ($infile) = @_; $stanfile = $infile; $stanfile =~ s/_raw/_tempstan/g; $lbjfile = $infile; $lbjfile =~ s/_raw/_lbj/g; $corrfile = $infile; $corrfile =~ s/_raw/_correct/g; $coreffile = $infile . ".LBJ"; { open(RAW, "<", $infile) || die "cannot open file $infile"; local $/; $doc = ; close(RAW); } @raw_tokens = split /\s+/, $doc; $num = scalar @raw_tokens; $str_traw = join "\n", @raw_tokens; @raw_stags = (); @raw_ltags = (); @raw_utags = (); @raw_ctags = (); if(-e $stanfile) { { open($ifh, "<", $stanfile) || warn "sync: cannot open file $stanfile"; local $/; $s_str = <$ifh>; close($ifh); } @stan_tagstokens = split /\s+/, $s_str; @stan_tokens = (); @stan_tags = (); for(@stan_tagstokens) { ($token, $tag) = split "/", $_; #\ | ( ) [ { ^ $ * + ? . or $token =~ /\s+/ or $token =~/[:;,\.\[|"']+/ #next if($l_compare eq "giza" and ($token eq "" or $token eq "-LRB-" or $token eq "-RRB-")); #or $token eq "s" or $token !~ /\w+/ #next if($token eq "" or $token eq "-LRB-" or $token eq "-RRB-" or $token eq "'s" or $token eq "s" or $token !~ /\w+/); #$token =~ s/\W//g; $tag = ($tag eq "O") ? "NOT" : substr($tag, 0, 3); $tag = "MIS" if($lang eq "sw" and (lc $token eq "january" or lc $token eq "february" or lc $token eq "march" or lc $token eq "april" or lc $token eq "may" or lc $token eq "june" or lc $token eq "july" or lc $token eq "august" or lc $token eq "september" or lc $token eq "october" or lc $token eq "november" or lc $token eq "december" or lc $token eq "sunday" or lc $token eq "monday" or lc $token eq "tuesday" or lc $token eq "wednesday" or lc $token eq "thursday" or lc $token eq "friday" or lc $token eq "saturday")); #$tag = "MIS" if($swh_propers{lc $token} ne "" and $swh_commons{lc $token} eq ""); #$tag = "MIS" if($propers{lc $token} ne "" and $commons{lc $token} eq ""); push @stan_tokens, $token; push @stan_tags, $tag; } for (my $i = 0; $i < scalar @raw_tokens; $i++) { if($raw_tokens[$i] eq $stan_tokens[$i]) { $raw_stags[$i] = $stan_tags[$i]; } if($stan_tokens[$i] eq "-LRB-" or $stan_tokens[$i] eq "-RRB-") { splice @stan_tokens, $i, 1; splice @stan_tags, $i, 1; } if($raw_tokens[$i] ne $stan_tokens[$i]) { $temptag = "NOT"; $tempstr = ""; $flag = 0; for($j = 0; $j <= 5; $j++) { $temptag = $stan_tags[$i+$j] if($stan_tags[$i+$j] ne "NOT"); $tempstr = $tempstr . $stan_tokens[$i + $j]; if($tempstr eq $raw_tokens[$i] or "(" . $tempstr eq $raw_tokens[$i]) { $flag = 1; last; } } if($flag == 1) { $raw_stags[$i] = $temptag; $stan_tokens[$i] = $raw_tokens[$i]; $stan_tags[$i] = $raw_stags[$i]; splice @stan_tokens, $i+1, $j; splice @stan_tags, $i+1, $j; } else { $temptag = "NOT"; for($j = 1; $j <= 5; $j++) { $temptag = $stan_tags[$i+$j] if($stan_tags[$i+$j] ne "NOT"); if($raw_tokens[$i] eq $stan_tokens[$i+$j]) { $flag = 1; last; } } if($flag == 1) { splice @stan_tokens, $i, $j; splice @stan_tags, $i, $j; $raw_stags[$i] = $temptag; } else { $raw_stags[$i] = $stan_tags[$i]; } } } #print "stan_sync: |$raw_tokens[$i]|$stan_tokens[$i]|\n"; } $str_sraw = join "\n", @raw_stags; } else { $str_sraw = ""; } if(-e $lbjfile) { { open($ifh, "<", $lbjfile) || warn "sync: cannot open file $lbjfile"; local $/; $l_str = <$ifh>; close($ifh); } @lbj_tagstext = split /[\[\]]/, $l_str; @lbj_tokens = (); @lbj_tags = (); for my $chunk (@lbj_tagstext) { if($chunk =~ /^PER / or $chunk =~ /^LOC / or $chunk =~ /^ORG / or $chunk =~ /^MISC /) { $tag = substr($chunk, 0, 3); $chunk = ($chunk=~ /MISC/) ? substr($chunk, 5, -2) : substr($chunk, 4, -2); } else { $tag = "NOT"; } $chunk =~ s/—/ /g; @parts = split /\s+/, $chunk; for my $token(@parts) { #next if($token eq "" or $token eq "s" or $token !~ /\w+/); #$token =~ s/\W//g; push @lbj_tokens, $token; push @lbj_tags, $tag; } } for (my $i = 0; $i < scalar @raw_tokens; $i++) { if($raw_tokens[$i] eq $lbj_tokens[$i]) { $raw_ltags[$i] = $lbj_tags[$i]; } if($lbj_tokens[$i] eq "-LRB-" or $lbj_tokens[$i] eq "-RRB-") { splice @lbj_tokens, $i, 1; splice @lbj_tags, $i, 1; } if($raw_tokens[$i] ne $lbj_tokens[$i]) { $temptag = "NOT"; $tempstr = ""; $flag = 0; for($j = 0; $j <= 5; $j++) { $temptag = $lbj_tags[$i+$j] if($lbj_tags[$i+$j] ne "NOT"); $tempstr = $tempstr . $lbj_tokens[$i + $j]; if($tempstr eq $raw_tokens[$i] or "(" . $tempstr eq $raw_tokens[$i]) { $flag = 1; last; } } if($flag == 1) { $raw_ltags[$i] = $temptag; $lbj_tokens[$i] = $raw_tokens[$i]; $lbj_tags[$i] = $raw_ltags[$i]; splice @lbj_tokens, $i+1, $j; splice @lbj_tags, $i+1, $j; } else { $temptag = "NOT"; for($j = 1; $j <= 5; $j++) { $temptag = $lbj_tags[$i+$j] if($lbj_tags[$i+$j] ne "NOT"); if($raw_tokens[$i] eq $lbj_tokens[$i+$j]) { $flag = 1; last; } } if($flag == 1) { splice @lbj_tokens, $i, $j; splice @lbj_tags, $i, $j; $raw_ltags[$i] = $temptag; } else { $raw_ltags[$i] = $lbj_tags[$i]; } } } #print "lbj_sync: |$raw_tokens[$i]|$lbj_tokens[$i]|\n"; } $str_lraw = join "\n", @raw_ltags; } else { $str_lraw = ""; } if(-e $stanfile and -e $lbjfile) { for (my $i = 0; $i < scalar @raw_tokens; $i++) { if($raw_stags[$i] ne "NOT" or $raw_ltags[$i] ne "NOT") { $raw_utags[$i] = $raw_stags[$i] if($raw_stags[$i] ne "NOT"); $raw_utags[$i] = $raw_ltags[$i] if($raw_ltags[$i] ne "NOT"); } else { $raw_utags[$i] = "NOT"; } } $str_uraw = join "\n", @raw_utags; } else { $str_uraw = ""; } if(-e $corrfile) { { open($cfh, "<", $corrfile); local $/; $c_str = <$cfh>; close($cfh); } @corr_tagstext = split /[\[\]]/, $c_str; @corr_tokens = (); @corr_tags = (); for my $chunk (@corr_tagstext) { if($chunk =~ /^PER / or $chunk =~ /^LOC / or $chunk =~ /^ORG / or $chunk =~ /^MISC /) { $tag = substr($chunk, 0, 3); $chunk = ($chunk=~ /MISC/) ? substr($chunk, 5, -2) : substr($chunk, 4, -2); } else { $tag = "NOT"; } $chunk =~ s/—/ /g; @parts = split /\s+/, $chunk; for my $token(@parts) { #next if($token eq "" or $token eq "s" or $token !~ /\w+/); #$token =~ s/\W//g; push @corr_tokens, $token; push @corr_tags, $tag; } } for (my $i = 0; $i < scalar @raw_tokens; $i++) { if($raw_tokens[$i] eq $corr_tokens[$i]) { $raw_ctags[$i] = $corr_tags[$i]; } if($corr_tokens[$i] eq "-LRB-" or $corr_tokens[$i] eq "-RRB-") { splice @corr_tokens, $i, 1; splice @corr_tags, $i, 1; } if($raw_tokens[$i] ne $corr_tokens[$i]) { $temptag = "NOT"; $tempstr = ""; $flag = 0; for($j = 0; $j <= 5; $j++) { $temptag = $lbj_tags[$i+$j] if($corr_tags[$i+$j] ne "NOT"); $tempstr = $tempstr . $corr_tokens[$i + $j]; if($tempstr eq $raw_tokens[$i] or "(" . $tempstr eq $raw_tokens[$i]) { $flag = 1; last; } } if($flag == 1) { $raw_ctags[$i] = $temptag; $corr_tokens[$i] = $raw_tokens[$i]; $corr_tags[$i] = $raw_ctags[$i]; splice @corr_tokens, $i+1, $j; splice @corr_tags, $i+1, $j; } else { $temptag = "NOT"; for($j = 1; $j <= 5; $j++) { $temptag = $corr_tags[$i+$j] if($corr_tags[$i+$j] ne "NOT"); if($raw_tokens[$i] eq $corr_tokens[$i+$j]) { $flag = 1; last; } } if($flag == 1) { splice @corr_tokens, $i, $j; splice @corr_tags, $i, $j; $raw_ctags[$i] = $temptag; } else { $raw_ctags[$i] = $corr_tags[$i]; } } } #print "corr_sync: |$raw_tokens[$i]|$corr_tokens[$i]|\n"; } $str_craw = join "\n", @raw_ctags; } else { $str_craw = ""; } if(-e $coreffile) { my $viewoutput_LBJ = "$infile.LBJ_list"; my $mapfile = "$infile.MAP"; warn "Extract_lbj: Processing... $infile\n"; my %hash = (); my %counta = (); my @row; @coref_tokens = (); ############################################ ############# -- Load LBJ -- ############### ############################################ open(LM, "<", $coreffile); open(TMP, ">", "$coreffile.tmp"); while () { @row = split(//, $_); } my $i = 0; #print scalar @row, "\n"; while ($i < scalar(@row) -1) { # -- * if (($row[$i] eq "*") && ($row[$i+1] ne "_")) { print TMP "\n"; } # -- *_ elsif (($row[$i] eq "*") && ($row[$i+1] eq "_")) { print TMP "\n scalar(@row) -1); print TMP $row[$i+1]; $i++; } print TMP ">"; } # -- normal case elsif ($row[$i] eq " ") { print TMP "\n"; } else { print TMP $row[$i]; } $i++; #print "i $i\n"; } close(LM); close(TMP); #print "got out\n"; my $id = ""; my $word = ""; my $name = ""; my @stack = (); my $pos = 0; my $wnum = 0; my $num = 0; my $result; my $found; open(LM, "<", "$coreffile.tmp"); while () { chomp; if (index($_,"") >= 0) { #$id = substr($_, index($_,"")+11); #$id = substr($id, 0, index($id,">")); #print $id."\n"; #push(@stack, $id); push(@stack, length($name)); push(@stack, $wnum); } else { if (index($_,"0) { $name = $name.$word." "; } } } if (index($_, "= 0) { #push(@stack, $id); $num = pop(@stack); $pos = pop(@stack); #$id = pop(@stack); $id = substr($_, index($_,"")); $hash{$num."\t".($wnum - $num)."\t".substr($name, $pos)."\t".$id} = $num; $counta{$id} ++; # print $num."\t".($wnum - $num)."\t".substr($name, $pos)."\t".$id."\n"; } } } close(LM); #***************************NEW CODE *************************** $k = 0; @track = (); for (my $i = 0; $i < scalar @raw_tokens; $i++) { if($raw_tokens[$i] eq $coref_tokens[$k]) { #$raw_ctags[$i] = $corr_tags[$i]; $track[$k] = $i; } if($raw_tokens[$i] ne $coref_tokens[$k]) { #$temptag = "NOT"; $tempstr = ""; $flag = 0; for($j = 0; $j <= 5; $j++) { #$temptag = $lbj_tags[$i+$j] if($corr_tags[$i+$j] ne "NOT"); $tempstr = $tempstr . $coref_tokens[$k + $j]; if($tempstr eq $raw_tokens[$i] or "(" . $tempstr eq $raw_tokens[$i]) { $flag = 1; last; } } if($flag == 1) { #$raw_ctags[$i] = $temptag; $track[$k] = $i; for($l = 1; $l <= $j; $l++) { $k++; $track[$k] = $i; } #$corr_tags[$i] = $raw_ctags[$i]; #splice @coref_tokens, $i+1, $j; #print "Case 1\n"; #splice @corr_tags, $i+1, $j; } else { #$temptag = "NOT"; if($raw_tokens[$i] eq $coref_tokens[$k-1]) { $k--; $track[$k] = $i; #splice @coref_tokens, $i-1, 0, "NULL"; #print "Case 2\n"; } else { for($j = 1; $j <= 5; $j++) { #$temptag = $corr_tags[$i+$j] if($corr_tags[$i+$j] ne "NOT"); if($raw_tokens[$i] eq $coref_tokens[$k+$j]) { $flag = 1; last; } } if($flag == 1) { $track[$k] = $i; for($l = 1; $l <= $j; $l++) { $k++; $track[$k] = $i; } #splice @coref_tokens, $i, $j; #print "Case 3\n"; #splice @corr_tags, $i, $j; #$raw_ctags[$i] = $temptag; } else { #$raw_ctags[$i] = $corr_tags[$i]; $track[$k] = $i; } } } } #print "coref_sync: |$raw_tokens[$i]|$coref_tokens[$k]|\n"; $k++; } open(my $mfh, ">", $mapfile); for (my $i = 0; $i < scalar @coref_tokens; $i++) { print $mfh "$i\t$track[$i]\t$coref_tokens[$i]\t$raw_tokens[$track[$i]]\n"; } close($mfh); #print "File ends " . scalar @raw_tokens . " $k\n"; #print $wnum."\n"; #print $tokens[324]."\n"; #print scalar(@tokens)."\n"; #print "LBJ Loaded\n"; ############################################ ########## -- Formatting List -- ########### ############################################ my $key; open(OUTFILE, ">", $viewoutput_LBJ) || die "cannot write file\n"; foreach $key ( sort { $hash{$a} <=> $hash{$b} } keys %hash ) { my @tmpk = split(/\t/,$key); $num = $tmpk[0]; $start = $track[$num]; $len = $tmpk[1]; $id = $tmpk[3]; $end = $track[$num+$len-1]; $word = ""; for($i = $start; $i <= $end; $i++) { $word = ($word eq "") ? $raw_tokens[$i] : $word . " " . $raw_tokens[$i]; } $newlen = $end - $start + 1; print OUTFILE "$start\t$newlen\t$word\t$id\n"; #print OUTFILE $key. "\n"; #old line } close(OUTFILE); #*******************************NEW CODE ENDS ***************** } return ($num, $str_traw, $str_sraw, $str_lraw, $str_uraw, $str_craw); } sub make_list { my $name = shift; my ($en_file, $src_file) = @_; ($en_num, $en_str_tokens, $en_str_stan, $en_str_lbj, $en_str_union, $en_str_corr) = sync("sync", $en_file); @en_stan_tokens = split "\n", $en_str_tokens; @en_lbj_tokens = split "\n", $en_str_tokens; @en_union_tokens = split "\n", $en_str_tokens; @en_corr_tokens = split "\n", $en_str_tokens; @en_stan_tags = split "\n", $en_str_stan; @en_lbj_tags = split "\n", $en_str_lbj; @en_union_tags = split "\n", $en_str_union; @en_corr_tags = split "\n", $en_str_corr; ($src_num, $src_str_tokens, $src_str_stan, $src_str_lbj, $src_str_union, $src_str_corr) = sync("sync", $src_file); @src_stan_tokens = split "\n", $src_str_tokens; @src_lbj_tokens = split "\n", $src_str_tokens; @src_union_tokens = split "\n", $src_str_tokens; @src_corr_tokens = split "\n", $src_str_tokens; @src_stan_tags = split "\n", $src_str_stan; @src_lbj_tags = split "\n", $src_str_lbj; @src_union_tags = split "\n", $src_str_union; @src_corr_tags = split "\n", $src_str_corr; #en_corr and src_union my %en_list = (); my %src_list = (); my %comb_list = (); for($i = 0; $i < $src_num; $i++) { if($src_union_tags[$i] ne "NOT") { #print "src: $src_union_tokens[$i] Stem: " , Text::English::stem($src_union_tokens[$i]), "\n"; @stem = Text::English::stem($src_union_tokens[$i]); $src_list{$stem[0]}++; $comb_list{$stem[0]}++; } } $srclist = join ",", keys %src_list; if($en_str_corr eq "" or $en_str_tcorr eq "") { print "Correct file doesn't exist for $en_file, so using stan file\n"; for($i = 0; $i < $en_num; $i++) { if($en_union_tags[$i] ne "NOT") { #print "en: $en_corr_tokens[$i] Stem: " , Text::English::stem($en_corr_tokens[$i]), "\n"; @stem = Text::English::stem($en_union_tokens[$i]); $en_list{$stem[0]}++; $comb_list{$stem[0]}++; } } #return (0,0,0); } else { for($i = 0; $i < $en_num; $i++) { if($en_corr_tags[$i] ne "NOT") { #print "en: $en_corr_tokens[$i] Stem: " , Text::English::stem($en_corr_tokens[$i]), "\n"; @stem = Text::English::stem($en_corr_tokens[$i]); $en_list{$stem[0]}++; $comb_list{$stem[0]}++; } } } $enlist = join ",", keys %en_list; return(compare_lists("compare_lists", \%en_list, \%src_list, \%comb_list)); } sub giza { my $name = shift; my $lang = shift; my $in_cfile = shift; #in_corpus.txt my $out_cfile = shift; #out_corpus.txt my $align_pref = shift; my @files = @_; #make parallel input and output corpora unlink($in_cfile) if(-e $in_cfile); unlink($out_cfile) if(-e $out_cfile); if(lc $^O eq "mswin32") { open($ofh_i, ">>", $in_cfile); open($ofh_o, ">>", $out_cfile); } for my $file (@files) { my $incorp = $file; if($lang eq "sw") { $incorp =~ s/\/alasiri/\/sw_alasiri/g; $incorp =~ s/_swh/_raw/g; } else { ($incorp =~ /_arb/) ? $incorp =~ s/_arb/_raw/g : $incorp =~ s/-normalized-derivations/_raw/g; } if(lc $^O eq "mswin32") { $tc = 0; open($ifh, "<",$incorp); while(<$ifh>) { print $ofh_i $_; $tc++; } push @tcs1, $tc; close($ifh); } else { `cat $incorp >> $in_cfile`; } } for my $file (@files) { my $outcorp = $file; if($lang eq "sw") { $outcorp =~ s/\/alasiri/\/sw_alasiri/g; } else { $outcorp =~ s/-normalized-derivations/_arb/g if($outcorp !~ /_arb/); } if(lc $^O eq "mswin32") { open($ifh, "<",$outcorp); $tc2 = 0; while(<$ifh>) { print $ofh_o $_; $tc2++; } push @tcs2, $tc2; close($ifh); } else { `cat $outcorp >> $out_cfile`; } } if(lc $^O eq "mswin32") { close($ofh_i); close($ofh_o); } for($tc = 0; $tc < 50; $tc++) { print "file: $files[$tc] 1: $tcs1[$tc] 2: $tcs2[$tc]\n" if($tcs1[$tc] != $tcs2[$tc]); } #run GIZA++. by default we've used in_corpus3, out_corpus3, dic3 (i.e. dic3.AA3.final) $inc_pref = $in_cfile; $outc_pref = $out_cfile; $inc_pref =~ s/\.txt//g; $outc_pref =~ s/\.txt//g; system("giza-pp/GIZA++-v2/plain2snt.out $inc_pref.txt $outc_pref.txt"); system("giza-pp/mkcls-v2/mkcls -p$inc_pref.txt -V$inc_pref.vcb.classes"); system("giza-pp/mkcls-v2/mkcls -p$outc_pref.txt -V$outc_pref.vcb.classes"); system("giza-pp/GIZA++-v2/GIZA++ -S $inc_pref.vcb -T $outc_pref.vcb -C $inc_pref\_$outc_pref.snt -O $align_pref"); #print "Error: $!\n"; #break AA3.final into files into individual files my $cfile = $align_pref . ".AA3.final"; open($cfh, "<", $cfile); #"dic3.AA3.final" my $l_total; for my $file (@files) { my $infile = $file; if($lang eq "sw") { $infile =~ s/\/alasiri/\/sw_alasiri/g; } else { $infile =~ s/-normalized-derivations/_arb/g if($infile !~ /_arb/); } my $l_count = 0; open(my $ifh, "<", $infile); while(<$ifh>) { $l_count++; } close($ifh); $l_total += $l_count; #print "$file: $l_count lines, $l_total total\n"; my $outfile = $file; if($lang eq "sw") { $outfile =~ s/\/alasiri/\/sw_alasiri/g; $outfile =~ s/_swh/_corps/g; } else { ($outfile =~ /_arb/) ? $outfile =~ s/_arb/_corps/g : $outfile =~ s/-normalized-derivations/_corps/g; } open(my $ofh, ">", $outfile); for($i = 0; $i < 3 * $l_count; $i++) { $line = <$cfh>; print $ofh $line; } close($ofh); } close($cfh); } sub compare_lists { my $name = shift; my(%en_list) = %{$_[0]}; my(%src_list) = %{$_[1]}; my(%comb_list) = %{$_[2]}; my $tp = 0; my $fp = 0; my $fn = 0; my $tn = 0; my $tw_tp = 0; my $tw_fp = 0; my $tw_fn = 0; for my $key (keys %comb_list) { $en_list{$key} = 0 if($en_list{$key} eq ""); $src_list{$key} = 0 if($src_list{$key} eq ""); #print "KEY $key in correct: $en_list{$key} in translated: $src_list{$key}\n"; $lkey = lc $key; $src_list{$key} = 0 if($commons{$lkey} ne "" and $propers{$key} eq ""); # and $lkey eq $key); $src_list{$key} = 0 if($swh_commons{$lkey} ne "" and $swh_propers{$key} eq ""); # and $lkey eq $key); if($en_list{$key} == $src_list{$key}) { $tp += $en_list{$key}; } if($en_list{$key} > $src_list{$key}) { $tp += $src_list{$key}; $fn += $en_list{$key} - $src_list{$key}; print "Recl Error: KEY $key in correct: $en_list{$key} in translated: $src_list{$key}\n"; } if($en_list{$key} < $src_list{$key}) { $tp += $en_list{$key}; $fp += $src_list{$key} - $en_list{$key}; print "Prec Error: KEY $key in correct: $en_list{$key} in translated: $src_list{$key}\n"; } #type-wise $tw_tp++ if($en_list{$key} != 0 and $src_list{$key} != 0); $tw_fp++ if($en_list{$key} == 0 and $src_list{$key} != 0); $tw_fn++ if($en_list{$key} != 0 and $src_list{$key} == 0); } my $prec = $tp / ($tp + $fp); my $recl = $tp / ($tp + $fn); my $f1 = 2 * $prec * $recl / ($prec + $recl); print "Sentence-level Precision: $prec Recall: $recl F1 $f1\n"; $prec = $tw_tp / ($tw_tp + $tw_fp); $recl = $tw_tp / ($tw_tp + $tw_fn); $f1 = 2 * $prec * $recl / ($prec + $recl); print "\nTypeWise Sentence-level Precision: $prec Recall: $recl F1 $f1\n"; return ($tp, $fp, $fn, $tw_tp, $tw_fp, $tw_fn); } sub make_table { my $name = shift; my $outfile = shift; my @files = @_; if($outfile ne "") # and not -e $outfile) { print "Creating table file $outfile\n"; open($ofh, ">", $outfile); } else { #print "Running make_table in silent mode\n"; $ofh = ""; } print $ofh "Filename\tPosition\tEntity\tSystem\tNE_Type\tCoref_ID\n"; $stan_table = ""; $lbj_table = ""; $corr_table = ""; $union_table = ""; for my $infile (@files) { ($num, $str_tokens, $str_stan, $str_lbj, $str_union, $str_corr) = sync("sync", $infile); $filename = $infile; $filename =~ s/.+\\//g if($filename =~ /\\/); $filename =~ s/.+\///g if($filename =~ /\//); #$filename =~ s/_raw.*//g; @stan_tokens = split "\n", $str_tokens; @lbj_tokens = split "\n", $str_tokens; @union_tokens = split "\n", $str_tokens; @corr_tokens = split "\n", $str_tokens; @stan_tags = split "\n", $str_stan; @lbj_tags = split "\n", $str_lbj; @union_tags = split "\n", $str_union; @corr_tags = split "\n", $str_corr; if($str_stan ne "") { $hfile = $infile; $hfile =~ s/_raw/_stanford/g; $hfile =~ s/\.(txt)|(sgm)|(text)/\.html/g; open($hfh, ">", $hfile); print $hfh "\n"; for($i = 0; $i < $num; $i++) { if($stan_tags[$i] eq "NOT") { print $hfh "$stan_tokens[$i] "; } if($stan_tags[$i] ne "NOT") { $starttag = $stan_tags[$i]; $j = $i+1; $j++ while($stan_tags[$j] eq $starttag and $j < $num and $stan_tokens[$j-1] !~ /,$/); $start = $i; $end = $j; $i = $j - 1; $entity = ""; for($k = $start; $k < $end; $k++) { $entity = ($entity eq "") ? $stan_tokens[$k] : $entity . " " . $stan_tokens[$k]; } print $ofh "$filename\t$start\t$entity\tNE_STAN\t$stan_tags[$start]\tNULL\n"; $stan_table = $stan_table . "$filename\t$start\t$entity\tNE_STAN\t$stan_tags[$start]\tNULL\n"; if($starttag eq "PER") { print $hfh "$entity "; } if($starttag eq "LOC") { print $hfh "$entity "; } if($starttag eq "ORG") { print $hfh "$entity "; } if($starttag eq "MIS") { print $hfh "$entity "; } } } print $hfh "\n"; print $hfh "\n\n"; close($hfh); } if($str_lbj ne "") { $hfile = $infile; $hfile =~ s/_raw/_lbj/g; $hfile =~ s/\.(txt)|(sgm)|(text)/\.html/g; open($hfh, ">", $hfile); print $hfh "\n"; for($i = 0; $i < $num; $i++) { if($lbj_tags[$i] eq "NOT") { print $hfh "$lbj_tokens[$i] "; } if($lbj_tags[$i] ne "NOT") { $starttag = $lbj_tags[$i]; $j = $i+1; $j++ while($lbj_tags[$j] eq $starttag and $j < $num); $start = $i; $end = $j; $i = $j - 1; $entity = ""; for($k = $start; $k < $end; $k++) { $entity = ($entity eq "") ? $lbj_tokens[$k] : $entity . " " . $lbj_tokens[$k]; } print $ofh "$filename\t$start\t$entity\tNE_LBJ\t$lbj_tags[$start]\tNULL\n"; $lbj_table = $lbj_table . "$filename\t$start\t$entity\tNE_LBJ\t$lbj_tags[$start]\tNULL\n"; if($starttag eq "PER") { print $hfh "$entity "; } if($starttag eq "LOC") { print $hfh "$entity "; } if($starttag eq "ORG") { print $hfh "$entity "; } if($starttag eq "MIS") { print $hfh "$entity "; } } } print $hfh "\n"; print $hfh "\n\n"; close($hfh); } if($str_corr ne "") { $hfile = $infile; $hfile =~ s/_raw/_correct/g; $hfile =~ s/\.(txt)|(sgm)|(text)/\.html/g; open($hfh, ">", $hfile); print $hfh "\n"; for($i = 0; $i < $num; $i++) { if($corr_tags[$i] eq "NOT") { print $hfh "$corr_tokens[$i] "; } if($corr_tags[$i] ne "NOT") { $starttag = $corr_tags[$i]; $j = $i+1; $j++ while($corr_tags[$j] eq $starttag and $j < $num); $start = $i; $end = $j; $i = $j - 1; $entity = ""; for($k = $start; $k < $end; $k++) { $entity = ($entity eq "") ? $corr_tokens[$k] : $entity . " " . $corr_tokens[$k]; } print $ofh "$filename\t$start\t$entity\tNE_CORR\t$corr_tags[$start]\tNULL\n"; $corr_table = $corr_table . "$filename\t$start\t$entity\tNE_CORR\t$corr_tags[$start]\tNULL\n"; if($starttag eq "PER") { print $hfh "$entity "; } if($starttag eq "LOC") { print $hfh "$entity "; } if($starttag eq "ORG") { print $hfh "$entity "; } if($starttag eq "MIS") { print $hfh "$entity "; } } } print $hfh "\n"; print $hfh "\n\n"; close($hfh); } if($str_union ne "") { $hfile = $infile; $hfile =~ s/raw/union/g; $hfile =~ s/\.(txt)|(sgm)|(text)/\.html/g; open($hfh, ">", $hfile); print $hfh "\n"; for($i = 0; $i < $num; $i++) { if($union_tags[$i] eq "NOT") { print $hfh "$union_tokens[$i] "; } if($union_tags[$i] ne "NOT") { $starttag = $union_tags[$i]; $j = $i+1; $j++ while($union_tags[$j] eq $starttag and $j < $num); $start = $i; $end = $j; $i = $j - 1; $entity = ""; for($k = $start; $k < $end; $k++) { $entity = ($entity eq "") ? $union_tokens[$k] : $entity . " " . $union_tokens[$k]; } print $ofh "$filename\t$start\t$entity\tNE_UNION\t$union_tags[$start]\tNULL\n"; $union_table = $union_table . "$filename\t$start\t$entity\tNE_UNION\t$union_tags[$start]\tNULL\n"; if($starttag eq "PER") { print $hfh "$entity "; } if($starttag eq "LOC") { print $hfh "$entity "; } if($starttag eq "ORG") { print $hfh "$entity "; } if($starttag eq "MIS") { print $hfh "$entity "; } } } print $hfh "\n"; print $hfh "\n\n"; close($hfh); } } close($ofh); return ($stan_table, $lbj_table, $corr_table, $union_table); } sub email_pre { my $name = shift; my ($infile) = @_; open($ifh, "<", $infile); $str = ""; while(<$ifh>) { $str = ($str eq "") ? $_ : $str . $_; } $str =~ s/=20(\n|\r)/ /g; $str =~ s/=01;(\n|\r)/ /g; $str =~ s/=01(;|,|&|\d+)/ /g; $str =~ s/=(\n|\r)+//g; $str =~ s/\//\\/g; $str =~ s/<.+>/ /g; close($ifh); open($ifh, ">", $infile); print $ifh $str; close($ifh); } sub lbj { my $name = shift; my ($infile) = @_; die "Enter an input file\n" if ($infile eq ""); $ofile = $infile; $ofile =~ s/_raw/_lbj/g; $tf = "true"; #unlink($ofile) if(-e $ofile); return if(-e $ofile); print "Running lbj on $infile\n"; if(lc $^O eq "linux") { $command = "java -classpath LBJ2.jar:LBJ2Library.jar:LbjNerTagger.jar -Xmx1500m LbjTagger.NerTagger -annotate ../$infile ../$ofile $tf Config/allFeaturesBigTrainingSet.config"; } else { $command = "java -classpath \"LBJ2.jar;LBJ2Library.jar;LbjNerTagger.jar;LBJ2.jar:LBJ2Library.jar:LbjNerTagger.jar;\" -Xmx1500m LbjTagger.NerTagger -annotate ../$infile ../$ofile $tf Config/allFeaturesBigTrainingSet.config"; } if(lc $^O eq "mswin32") { chdir("LbjNerTagger1.11.release"); system($command); chdir(".."); } else { `cd LbjNerTagger1.11.release\n$command\ncd ..\n`; } } sub stanford { my $name = shift; my ($infile) = @_; die "Enter an input file\n" if ($infile eq ""); $ofile = $infile; $ofile =~ s/_raw/_tempstan/g; #unlink($ofile) if(-e $ofile); return if(-e $ofile); print "Running stanford on $infile\n"; if(lc $^O eq "linux") { $command = "java -mx1000m -cp stanford-ner.jar edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier classifiers/ner-eng-ie.crf-4-conll-distsim.ser.gz -textFile ../$infile > ../$ofile"; } else { $command = "java -mx1000m -cp \"stanford-ner.jar;\" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier classifiers/ner-eng-ie.crf-4-conll-distsim.ser.gz -textFile ../$infile > ../$ofile"; } if(lc $^O eq "mswin32") { chdir("stanford-ner-2009-01-16"); system($command); chdir(".."); } else { `cd stanford-ner-2009-01-16\n$command\ncd ..\n`; } open($ifh, "<", $ofile); $count = 0; $str = ""; while(<$ifh>) { $str = $str . " " . $_; $count++; } close($ifh); $o2file = $infile; $o2file =~ s/_raw/_stanford/g; $o2htm = $o2file; $o2htm =~ s/stanford/stanford2/g; $o2htm =~ s/\.(txt)|(sgm)|(text)/\.html/g; open($ofh, ">", $o2file); open($hfh, ">", $o2htm); @tokens = split /\s+/, $str; $count = 0; @formatted_tokens = (); @hformatted_tokens = (); $prevtag = ""; for(@tokens) { ($token, $tag) = split "/", $_; $formatted = $token; $hformatted = $token; $formatted = "[PER $formatted ]" if($tag eq "PERSON"); $hformatted = "$hformatted" if($tag eq "PERSON"); $formatted = "[ORG $formatted ]" if($tag eq "ORGANIZATION"); $hformatted = "$hformatted" if($tag eq "ORGANIZATION"); $formatted = "[LOC $formatted ]" if($tag eq "LOCATION"); $hformatted = "$hformatted" if($tag eq "LOCATION"); $formatted = "[MIS $formatted ]" if($tag eq "MISC"); $hformatted = "$hformatted" if($tag eq "MISC"); if($tag eq $prevtag and ($tag eq "PERSON" or $tag eq "ORGANIZATION" or $tag eq "LOCATION" or $tag eq "MISC")) { $formatted_tokens[$count - 1] = substr($formatted_tokens[$count - 1], 0, -3); $hformatted_tokens[$count - 1] = substr($hformatted_tokens[$count - 1], 0, -7); #$formatted = ($tag eq "MISC") ? substr($formatted, 6) : substr($formatted, 5); #deprecated $formatted = substr($formatted, 5); if($tag eq "PERSON") { $hformatted = substr($hformatted, 16); } if($tag eq "ORGANIZATION") { $hformatted = substr($hformatted, 18); } if($tag eq "LOCATION") { $hformatted = substr($hformatted, 17); } if($tag eq "MISC") { $hformatted = substr($hformatted, 19); } } push @formatted_tokens, $formatted; push @hformatted_tokens, $hformatted; $prevtag = $tag; $count++; } print $hfh "\n"; for(@formatted_tokens) { print $ofh "$_ "; } for(@hformatted_tokens) { print $hfh "$_ "; } print $ofh "\n"; print $hfh "\n"; print $hfh "\n\n"; close($ofh); close($hfh); } sub hcs_xml2txt { my $name = shift; my ($infile) = @_; die "Enter an input file\n" if ($infile eq ""); open($ifh, "<", $infile); $ofile = $infile; $ofile =~ s/\.xml/_swh\.txt/g; print "hcs_xml2txt: $infile -> $ofile\n"; open($ofh, ">", $ofile); $num = 0; $doc = ""; while(<$ifh>) { $line = $_; chomp($line); $line =~ s/"/"/g; $newline = ""; @lineparts = split "<", $line; for my $linepart (@lineparts) { if($linepart =~ /w .+ type="(.+?)".+>(.+)/) { $type = $1; $word = $2; $trans = ""; if($type =~ /PROP/) { $netype = "NE"; if($linepart =~ /trans="(.+?)"/) { $trans = $1; } else { $trans = $word; } #print "Trans: $trans\n"; } else { $netype = "O"; } #print "Word $word Type $type\n"; if($trans ne "") { print $ofh "$word\t$netype\t$trans\n"; } else { print $ofh "$word\t$netype\n"; } #@parts = split /\s+/, $2; #$num += scalar @parts; #$tag = $type . " ". $2; #$word = "[" . $type . " " . $2 . " ]"; #$newline = $newline . $word . " "; #print "$tag\n"; } } #$newline = substr($newline, 0, -1); #$newline = $newline . "\n"; #$doc = $doc . $newline; } #print $ofh $doc; close($ifh); close($ofh); } sub lingpipe_xml2txt { my $name = shift; my ($infile) = @_; die "Enter an input file\n" if ($infile eq ""); print "Starting lingpipe_xml2txt on $infile\n"; open($ifh, "<", $infile); $num = 0; $doc = ""; while(<$ifh>) { $line = $_; chomp($line); $line =~ s/"/"/g; $newline = ""; @lineparts = split "<", $line; for my $linepart (@lineparts) { if($linepart =~ /s i=.+>(.+)/) { $newline = $newline . $1. " "; } if($linepart =~ /ENAMEX TYPE="(.+)">(.+)/) { $type = ($1 =~ /MISC/) ? substr($1, 0, 4) : substr($1, 0, 3); @parts = split /\s+/, $2; $num += scalar @parts; $tag = $type . " ". $2; $word = "[" . $type . " " . $2 . " ]"; $newline = $newline . $word . " "; #print "$tag\n"; } if($linepart =~ /\/ENAMEX>(.+)/) { $newline = $newline . $1 . " "; } } $newline = substr($newline, 0, -1); #$newline = $newline . "\n"; $doc = $doc . $newline; } $ofile = $infile; $ofile =~ s/xml/txt/g; open($ofh, ">", $ofile); print $ofh $doc; close($ofh); } sub mix { my $name = shift; my ($infile) = @_; open(my $ifh, "<", $infile); my @part = (); my $tnum = 0; #takes a _mix file and produces a table_ file and a _stanford file $mlang = "swh"; while(<$ifh>) { $line = $_; chop($line) if(substr($line, -1, 1) eq "\n" or substr($line, -1, 1) eq "\r"); chop($line) if(substr($line, -1, 1) eq "\n" or substr($line, -1, 1) eq "\r"); if($line ne "") { if($line !~ /[a-zA-Z]/) { $mlang = "arb"; $tnum = 0; } else { $result = $service->detect($line); #printf "Detected language: %s\n", $result->language; if($result->language eq "en") { $tnum = 1; } else { $tnum = 0; } } } #print "|$line|\n"; #if($line eq "***") #{ # print "Toggled\n"; # $tnum = 1 - $tnum; #} $part[$tnum] = $part[$tnum] . "\n" . $line; } close($ifh); my $part0 = $infile; $part0 =~ s/_mix.txt//g; $part0 = $part0 . "_part0_$mlang.txt"; my $part1 = $infile; $part1 =~ s/_mix.txt//g; $part1 = $part1 . "_part1_raw.txt"; open(my $ofh, ">", $part0); print $ofh $part[0]; close($ofh); open($ofh, ">", $part1); print $ofh $part[1]; close($ofh); my $split = $infile; $split =~ s/_mix.txt/_split.html/g; open(my $sfh, ">", $split); print $sfh "\n"; print $sfh "\n\n"; print $sfh $part[0]; print $sfh "\n\n"; print $sfh $part[1]; print $sfh "\n\n"; close($sfh); if($^O =~ /Win/) { $part0 =~ s/.+\\//g; $part1 =~ s/.+\\//g; } else { $part0 =~ s/.+\///g; $part1 =~ s/.+\///g; } warn "New filenames: |$part0| |$part1|\n"; translate("translate", $part0); stanford("stanford", $part1); make_table("make_table", "", $part1); my $outfile = $infile; $outfile =~ s/_mix/_picked/g; my $part0_p = $part0; $part0_p =~ s/_(swh|arb)/_stanford/g; my $part1_p = $part1; $part1_p =~ s/_raw/_stanford/g; open($ofh, ">", $outfile); open($ifh, "<", $part0_p); while(<$ifh>) { print $ofh $_; } close($ifh); open($ifh, "<", $part1_p); while(<$ifh>) { print $ofh $_; } close($ifh); close($ofh); #for html $outfile =~ s/\.(txt)|(sgm)|(text)/\.html/g; $part0_p =~ s/\.(txt)|(sgm)|(text)/\.html/g; $part1_p =~ s/\.(txt)|(sgm)|(text)/\.html/g; open($ofh, ">", $outfile); open($ifh, "<", $part0_p); while(<$ifh>) { $line = $_; print $ofh $line if($line !~ /\/html/); } close($ifh); open($ifh, "<", $part1_p); <$ifh>; print $ofh "

"; while(<$ifh>) { print $ofh $_; } close($ifh); close($ofh); } ## -- Running LBJ Coref System -- sub lbj_coref { my $name = shift; my ($infile) = @_; die "Enter an input file\n" if ($infile eq ""); warn "Running lbj coref on $infile\n"; return if (-e "$infile.LBJ" or $^O ne "linux"); chdir("LBJ"); system("java -Xmx1300m -jar LBJCoref.jar ../$infile > ../$infile.LBJ"); chdir(".."); warn "Done\n"; } ## -- ## -- Running List Maker -- sub extract_lbj { my $name = shift; my ($infile, $ext) = @_; $ext = "raw" if(not defined $ext); warn "Perl script make_lists called\n"; return if (!(-e "$infile.LBJ")); system("perl Make_Lists.pl -i $infile -l $ext"); } ## -- ## -- Running Eval Matrix Maker -- sub make_imatrix { my $name = shift; (my $infile) = @_; return if (!(-e "$infile")); system("perl Make_Eval.pl -i $infile"); } ## -- sub b3_score { my $name = shift; (my $infile) = @_; return if(not -e $infile); #system("perl B3Scorer.pl -i $infile"); my $file = "$infile.eval"; my $precision; my $recall; my $fscore; my $pre_denom = 0; my $rec_denom = 0; my @a; my @b; open(FILE, $file) || die "cannot open file"; my (@lines) = ; my @rec; my $n; my $m; my $i; my $j; ## process first line @rec = split(" ", $lines[0]); $n = $rec[0]; $m = $rec[1]; ## process last two lines @a = split(" ",$lines[$n+1]); foreach (@a) { $rec_denom += $_; } @b = split(" ",$lines[$n+2]); foreach (@b) { $pre_denom += $_; } ## process the matrix for ($i=0;$i<$n;$i++){ @rec = split(" ",$lines[$i+1]); for ($j=0;$j<$m;$j++) { if ($rec[$j] ne 0) { $recall += ($rec[$j]*$rec[$j])/$a[$i]; $precision += ($rec[$j]*$rec[$j])/$b[$j]; } } } ## output the B3 score my $pre_num = $precision; my $rec_num = $recall; $precision /= $pre_denom; $recall /= $rec_denom; $fscore = ($precision + $recall == 0) ? 0 : (2*$precision*$recall)/($precision+$recall); print "Coref: $file: $precision\t$recall\t$fscore.\n"; close(FILE); return ($pre_num, $pre_denom, $rec_num, $rec_denom); } sub strip_sgm { my $name = shift; (my $sgmfile) = @_; my $ext; if(uc $sgmfile =~ /ARABIC/) { $ext = "arb"; } elsif(uc $sgmfile =~ /CHINESE/) { $ext = "chi"; } else { $ext = "raw"; } my $rawfile = $sgmfile; $rawfile =~ s/\.(SGM|sgm)/_$ext\.txt/g; #print "SGMfile: $sgmfile Rawfile: $rawfile\n"; return $rawfile if(-e $rawfile); my $doc = ""; my $sentence = ""; my $ifh; #if($ext ne "raw") #problematic for other languages coz dunno if

and

are counted or not, for the moment replace with dummy false if($ext eq "xyz") { open($ifh, "<", $sgmfile); while(<$ifh> !~ //) { next; } while(<$ifh>) { $line = $_; chomp($line); #print "$line\n"; if($line =~ /<\/TEXT>/) { if($doc eq "") { $sentence = substr($sentence, 0, length($sentence) - 2) . " ."; $doc = $sentence; } $doc = $doc . "\n"; last; } if($line =~ /

/) { $sentence = ""; next; } if($line =~ /<\/P>/) { $sentence = substr($sentence, 0, length($sentence) - 2) . " ."; $doc = ($doc eq "") ? $sentence : $doc . "\n" . $sentence; next; } $sentence = ($sentence eq "") ? $line : $sentence . " " . $line; } close($ifh); } else { { open($ifh, "<", $sgmfile) || die "cannot open file $sgmfile"; local $/; $doc = <$ifh>; close($ifh); } $doc = substr($doc, index($doc, "")+7); $doc = substr($doc, 0, index($doc, "")); } open(my $ofh, ">", $rawfile); print $ofh $doc; close($ofh); return $rawfile; } sub make_golds { my $name = shift; my $sgmfile = $_[0]; my $ext; if(uc $sgmfile =~ /ARABIC/) { $ext = "arb"; } elsif(uc $sgmfile =~ /CHINESE/) { $ext = "chi"; } else { $ext = "raw"; } my $rawfile = $sgmfile; $rawfile =~ s/\.(SGM|sgm)/_$ext\.txt/g; my $dtd; $dtd = ($rawfile =~ /LDC2005/) ? "ACE04" : (($rawfile =~ /LDC2006/) ? "ACE05" : "ACE01"); my $glistfile = "$rawfile.Gold_list"; my $gnlistfile = "$rawfile.Gold_nerlist"; #return if (-e $glistfile and -e $gnlistfile); my $xmlfile = $rawfile; if($dtd eq "ACE04") { $xmlfile =~ s/_$ext\.txt/\_APF\.XML/g; } elsif($dtd eq "ACE05") { $xmlfile =~ s/_$ext\.txt/\.apf\.XML/g; } else { $xmlfile =~ s/_$ext\.txt/\.sgm\.tmx\.rdc\.xml/g; } if (not -e $xmlfile) { print "No gold standard xml file $xmlfile available for $rawfile\n"; return; } print "Running make_golds on $rawfile\n"; open(GTEXT, "<", $xmlfile); open(OUTFILE, ">", $glistfile); open(OUTNFILE, ">", $gnlistfile); my $offset; my $NE; my $start; my $end; my $flag = 0; my $class =0; my @spaces; my $string; my $position; my $doc; { open(SGM, "<", $sgmfile) || die "cannot open file $sgmfile"; local $/; $doc = ; close(SGM); } my $docText = substr($doc, index($doc, "")+7); $docText = substr($docText, 0, index($docText, "")); my $charIt = 0; my $tokenCharIt = 0; my $tokenIt = 0; my @charTokenMap = (); my @tokens = split /\s+/, $docText; my %ghash = (); my %ghash2 = (); my %gnhash = (); my %gnhash2 = (); while (($charIt < length($docText)) && ($tokenIt < scalar(@tokens))) { push(@charTokenMap, $tokenIt); if (substr($docText,$charIt,1) eq substr($tokens[$tokenIt],$tokenCharIt,1)) { if ($tokenCharIt+1 < length($tokens[$tokenIt])) { $tokenCharIt ++; } else { $tokenCharIt = 0; $tokenIt ++; } } $charIt ++; } my $eflag = 0; my $nflag = 0; if($dtd eq "ACE01") #ACE 2 English { $offset = 143; while () { my $line = $_; chomp($line); if (($flag == 1) && ($eflag == 1)) { $start = substr($line, index($line, "") + 7); $start = substr($start, 0, index($start, "")); $end = substr($line, index($line, "") + 5); $end = substr($end, 0, index($end, "")); #DEB if ($NE ne substr($doc, $start+143, $end-$start+1)) { #for(my $tc = 0; $tc < 20; $tc++) #{ # print "$tc: " . $NE." vs ".substr($doc, $start+$offset+$tc, $end-$start+1), "\n"; #} #DEB print $NE." vs ".substr($doc, $start+143, $end-$start+1); #DEB print "--ERROR--\n"; #DEB } #print $posi."\n"; my $tNE = $NE; @spaces = ($tNE =~ /\s/g); $start= ($start+$offset-index($doc, "")-7); #print "$charTokenMap[$start]\t"; #print $charTokenMap[$start]."\t".(scalar(@spaces)+1)."\t".$NE."\t".$class."\n"; if (!defined $ghash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"}) { #print OUTFILE "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n"; $ghash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"} = 1; $ghash2{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t". $class."\n"} = $charTokenMap[$start]; } if($nflag == 1) { if (!defined $gnhash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"}) { #print OUTNFILE "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n"; #print "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n"; $gnhash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"} = 1; $gnhash2{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t". $class."\n"} = $charTokenMap[$start]; } } } $flag = 0; if (index($_, "")); #print $NE."\t"; } $nflag = 1 if($line =~ /") > 0); $eflag = 0 if (index($line, "") > 0); if (index($line, " 0) { last; } if (index($line, "") > 0) { $class++; } } } else #ACE 4 and 5 English only { $offset = ($dtd eq "ACE04") ? 126 : 111; #print "DTD: $dtd offset: $offset\n"; while () { my $line = $_; chomp($line); #print "GTEXT line: |$line|\n"; if($flag == 1 and $eflag == 1) { #$start = $1; #$end = $2; #$NE = $3; #print "|$start|$end|$NE|\n"; #DEB if ($NE ne substr($doc, $start+143, $end-$start+1)) { #for(my $tc = 0; $tc < 20; $tc++) #{ # print "$tc: " . $NE." vs ".substr($doc, $start+$offset, $end-$start+1), "\n"; #} #DEB print "--ERROR--\n"; #DEB } #print $posi."\n"; my $tNE = $NE; @spaces = ($tNE =~ /\s/g); $start= ($start+$offset-index($doc, "")-7); #print "start: $start ctmap $charTokenMap[$start] scalar ", scalar @charTokenMap, "\n"; #print "$charTokenMap[$start]\t"; #print $charTokenMap[$start]."\t".(scalar(@spaces)+1)."\t".$NE."\t".$class."\n"; if (!defined $ghash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"}) { #print OUTFILE "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n"; #print "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n"; $ghash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"} = 1; $ghash2{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t". $class."\n"} = $charTokenMap[$start]; } if($nflag == 1) { if (!defined $gnhash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"}) { #print OUTNFILE "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n"; #print "$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t",$class,"\n"; $gnhash{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t"} = 1; $gnhash2{"$charTokenMap[$start]\t".(scalar(@spaces)+1)."\t$NE\t". $class."\n"} = $charTokenMap[$start]; } } } $flag = 0; if($line =~ /(.+)<\/charseq>/) { $start = $1; $end = $2; $NE = $3; $flag = 1; } elsif($line =~ /(.+)/) { $start = $1; $end = $2; $NE = $3; #print $line . "\n"; $line = ; chomp($line); if($line =~ /(.+)<\/charseq>/) { $NE = $NE . " " . $1; } #print "$NE\n"; $flag = 1; } $nflag = 1 if($line =~ //); $eflag = 0 if ($line =~ /<\/extent>/); if ($line =~ //) { $class++; } } } close(GTEXT); for my $key (sort {$ghash2{$a} <=> $ghash2{$b}} keys %ghash2) { print OUTFILE $key; } close(OUTFILE); for my $key (sort {$gnhash2{$a} <=> $gnhash2{$b}} keys %gnhash2) { print OUTNFILE $key; } close(OUTNFILE); } sub make_clusters { my $name = shift; my $rawfile = $_[0]; my $tfile = $rawfile; $tfile =~ s/_raw/_table/g; my $outfile = $rawfile; $outfile =~ s/_raw/_cls/g; #return if(-e $outfile); print "Running make_clusters on $rawfile\n"; my @temps = make_table("make_table", $tfile, $rawfile); my $stan_table = $temps[0]; #print $stan_table, "\n"; my @in_table = split "\n", $stan_table; $color = 0; @colors = (); push @colors, $color; #Now, cluster stan_table for (my $i = 1; $i < scalar @in_table; $i++) { $record = $in_table[$i]; @strs = split "\t", $record; $str = lc $strs[2]; $str =~ s/'s//g; #$str =~ s/[^ \.\-\w]//g; $flag = 0; for($j = 0; $j < $i; $j++) { $prev_record = $in_table[$j]; @prev_strs = split "\t", $prev_record; $prev_str = $prev_strs[2]; $prev_str = lc $prev_strs[2]; $prev_str =~ s/'s//g; #$prev_str =~ s/[^ \.\-\w]//g; my $lcs_str = lcss($str, $prev_str); #$s1 =~ /\Q$s2\E/ #print "|$str|$prev_str|$lcs_str|\n"; if($str eq $prev_str or $str =~ /\Q$prev_str\E/ or $prev_str =~ /\Q$str\E/ or (length($lcs_str) > 5 and length($lcs_str) > (min(length($str), length($prev_str)) / 2.0))) #add more conditions here { if($str !~ /-/ and $prev_str !~ /-/) { push @colors, $colors[$j]; $flag = 1; #print "str has hpyhen\n" if($str =~ /-/); #print "prev_str has hpyhen\n" if($prev_str =~ /-/); #print "|$str|$prev_str|$lcs_str|\n" if($str =~ /\Q$prev_str\E/ or $prev_str =~ /\Q$str\E/); #print "Case 1\n" if($str =~ /\Q$prev_str\E/); #print "Case 2\n" if($prev_str =~ /\Q$str\E/); last; } } } if($flag == 0) { $color++; push @colors, $color; } } #Now, make out_table using in_table and colors open(my $ofh, ">", $outfile); for ($i = 0; $i < scalar @in_table; $i++) { $record = $in_table[$i]; @strs = split "\t", $record; $str = $strs[2]; @tokens = split " ", $str; $length = scalar @tokens; $new_record = "$strs[1]\t$length\t$strs[2]\t$colors[$i]\n"; print $ofh $new_record; } close($ofh); } sub english_nercoref { my $name = shift; return if($^O ne "linux"); #for sake of coref my $outfile = $_[0]; #_raw #NER my $tfile = $outfile; $tfile =~ s/_raw/_table/g; lbj("lbj", $outfile); stanford("stanford", $outfile); make_table("make_table", $tfile, $outfile); #COREF lbj_coref("lbj_coref", $outfile); #makes .LBJ file #extract_lbj("extract_lbj", $outfile); #makes .LBJ_list file make_imatrix("make_imatrix", $outfile); b3_score("b3_score", $outfile); } sub setT { my $name = shift; my $val = $_[0]; $l_jt = $val; } sub translate { my $name = shift; @infiles = @_; #%arb_propers = (); #%arb_commons = (); for my $infile (sort @infiles) { #start with infile, infile -> outfile, outfile -> tfile, trans_file, picked_file, correct_file print "Translating $infile\n"; $lang = ($infile =~ /normalized-derivations/ or $infile =~ /_arb/) ? "ar" : "sw"; open($ifh, "<", $infile); @sentences = (); @tr_sentences = (); $document = ""; @src_tokens = (); %src_trans = (); @src_poss = (); @src_lemmas = (); $labeled = 0; if($infile =~ /normalized-derivations/ or ($infile =~ /alasiri/ and $infile !~ /sw_alasiri/)) { $infile =~ s/-normalized-derivations/_arb/g; $infile =~ s/alasiri_/sw_alasiri_/g; $labeled = 1; while(1) { $inflag = 1; $sentence = ""; #get buckwalter data, convert to utf8 #so each tree corresponds to a sentence while($inflag == 1) { ##@got_arr = getdata(); @got_arr = getdata_aner(); goto LOOP if(! @got_arr); ##($token, $lemma, $llemma, $morph, $inflag) = @got_arr; $token = $got_arr[0]; $lemma = $got_arr[1]; $inflag = $got_arr[2]; $trans = (scalar @got_arr == 4) ? $got_arr[3] : ""; #print "t: |$token|$lemma|\n"; #next if($token eq "" or $lemma eq "" or $token =~ /\s+/ or $lemma =~ /\s+/ or $llemma eq "" or $llemma =~ /\s+/); next if($token eq "" or $lemma eq "" or $token =~ /\s+/ or $lemma =~ /\s+/); $sentence = ($sentence eq "") ? $token : $sentence . " ". $token; ##$src_token = decode("Buckwalter", $token); $src_token = $token; #print "s: $src_token\n"; push @src_tokens, $src_token; push @src_poss, $lemma; $src_trans{lc $trans}++ if($trans ne ""); #push @src_lemmas, $llemma; #if($lemma =~ /NOUN_PROP/) #{ # $arb_propers{$src_token}++; #} #else #{ # $arb_commons{$src_token}++; #} #print "Arb: |$src_token|$lemma|\n"; } ##$sentence = decode("Buckwalter", $sentence); #we got the sentence. now let's break it into parts if(length($sentence) > 700) { print "Big Length: ", length($sentence), "\n"; $f_split = 1; $remainder = length($sentence) % 700; $times = (length($sentence) - $remainder) / 700; for($temp1 = 0; $temp1 < $times; $temp1++) { for($tempc = 650; $tempc < 700; $tempc++) { if(substr($sentence, $tempc, 1) eq " ") { last; } } $part_sent[$temp1] = substr($sentence, 0, $tempc) . "."; print "split: ", length($part_sent[$temp1]), "\n"; $sentence = substr($sentence, $tempc); } if(length($sentence) > 700) { for($tempc = 650; $tempc < 700; $tempc++) { if(substr($sentence, $tempc, 1) eq " ") { last; } } $part_sent[$times] = substr($sentence, 0, $tempc) . "."; $part_sent[$times + 1] = substr($sentence, $tempc); print "split: ", length($part_sent[$times]), "\n"; print "split: ", length($part_sent[$times+1]), "\n"; } else { $part_sent[$times] = $sentence; print "split: ", length($part_sent[$times]), "\n"; } } else { $part_sent[0] = $sentence; } for($tempc = 0; $tempc < scalar @part_sent; $tempc++) { next if($part_sent[$tempc] eq "."); push @sentences, $part_sent[$tempc]; $document = ($document eq "") ? $part_sent[$tempc] : $document . "\n" . $part_sent[$tempc]; } #part-breaking ends here } LOOP: print scalar @src_tokens, " tokens, ", scalar @sentences, " sentences in $infile\n"; close($ifh); if(not -e $infile) { open($ofh, ">", $infile); print $ofh "$_\n" for(@sentences); close($ofh); } open($ifh, "<", $infile); } #next; $outfile = $infile; ($lang eq "ar") ? ($outfile =~ s/_arb/_raw/g) : ($outfile =~ s/_swh/_raw/g); #for unlabeled docs, make sure there is one sentence per line, and it ends with a . also, no blank sentences. $f_split = 0; $linenum = 0; @sentences = (); $document = ""; while(<$ifh>) { $sentence = $_; $linenum++; chop($sentence) if(substr($sentence, -1, 1) eq "\n" or substr($sentence, -1, 1) eq "\r"); chop($sentence) if(substr($sentence, -1, 1) eq "\n" or substr($sentence, -1, 1) eq "\r"); next if($sentence eq "" or $sentence =~ /^\s+$/); #print "|$sentence|\n"; #if($sentence =~ /^\s+$/); #src_tokens, src_poss, src_lemmas @temp_tokens = split /\s+/, $sentence; for(@temp_tokens) { #s/\W+//g if($lang ne "ar"); next if($_ eq ""); push @src_tokens, $_ if($labeled == 0); } #translate to english next if(-e $outfile); @part_sent = (); if(length($sentence) > 700) { print "Big Length: ", length($sentence), "\n"; $f_split = 1; warn "***********************f_split activated on labeled data, shouldn't have.*********************\n" if($labeled == 1); $remainder = length($sentence) % 700; $times = (length($sentence) - $remainder) / 700; for($temp1 = 0; $temp1 < $times; $temp1++) { for($tempc = 650; $tempc < 700; $tempc++) { if(substr($sentence, $tempc, 1) eq " ") { last; } } $part_sent[$temp1] = substr($sentence, 0, $tempc) . "."; print "split: ", length($part_sent[$temp1]), "\n"; $sentence = substr($sentence, $tempc); } if(length($sentence) > 700) { for($tempc = 650; $tempc < 700; $tempc++) { if(substr($sentence, $tempc, 1) eq " ") { last; } } $part_sent[$times] = substr($sentence, 0, $tempc) . "."; $part_sent[$times + 1] = substr($sentence, $tempc); print "split: ", length($part_sent[$times]), "\n"; print "split: ", length($part_sent[$times+1]), "\n"; } else { $part_sent[$times] = $sentence; print "split: ", length($part_sent[$times]), "\n"; } } else { $part_sent[0] = $sentence; } for($tempc = 0; $tempc < scalar @part_sent; $tempc++) { #print "line no: $linenum length: ", length($part_sent[$tempc]), "\n" if($linenum % 100 == 0); #print "Sentence: $part_sent[$tempc]\n"; if($translator eq "g" or $lang ne "ar") { if($lang eq "ar") { do { $res = REST::Google::Translate->new( q => $part_sent[$tempc], langpair => 'ar|en' ); print "Waiting for translation ", $res->responseStatus, "\n"; } until($res->responseStatus == 200); $translated = $res->responseData->translatedText; } else { do { $result = $service->translate($part_sent[$tempc]); print "Waiting for translation ", $result->error, "\n"; } while ($result->error and $result->code != 400); $translated = $result->translation; } } else { $command = "./bing en infile.txt outfile.txt"; open($bing_ifh, ">", "infile.txt"); print $bing_ifh $part_sent[$tempc]; close($bing_ifh); `$command`; open($bing_ofh, "<", "outfile.txt"); $translated = ""; while(<$bing_ofh>) { $translated = ($translated eq "") ? $_ : $translated . " " . $_; } close($bing_ofh); } $translated =~ s/"/\"/g; $translated =~ s/'//g; $translated =~ s/<.+>/ /g; $translated =~ s/\///g; $translated =~ s/-(\d+)-/$1/g; $translated =~ s/(\d+)-(\d+)/$1$2/g; $translated =~ s/(\w+):(\w+)/$1$2/g; $translated =~ s/(\w+)\.(\w+)/$1$2/g; substr($translated, -1, 1) = " "; $translated = $translated . "."; printf "%s\n", $translated; push @tr_sentences, $translated; push @sentences, $part_sent[$tempc]; $document = ($document eq "") ? $part_sent[$tempc] : $document . "\n" . $part_sent[$tempc]; } } $n_tokens += scalar @src_tokens; close($ifh); if(not -e $outfile) { print "Outfile: $outfile\n"; open($ofh, ">", $outfile); print $ofh "$_\n" for(@tr_sentences); close($ofh); } if($f_split == 1) { open($ofh, ">", $infile); print $ofh "$_\n" for(@sentences); close($ofh); } next if($l_jt == 0); #Read in aligned text from _corps file. make_aligncorps, run_giza and make_corpfiles need to be run to make _corps. my $corpfile = $outfile; $corpfile =~ s/_raw/_corps/g; if((not -e $corpfile) or $l_compare ne "giza") { $l_compare = "en" if($l_compare eq "giza"); goto SKIP_GIZA; } open($cfh, "<", $corpfile); @t_words = (); @s_words = (); @s_nums = (); @sg_words = (); @sg_nums = (); print "Reading $corpfile into memory\n"; while(<$cfh>) { $line = $_; chomp($line); chop($line) if(substr($line, -1, 1) eq "\n" or substr($line, -1, 1) eq "\r"); if($line =~ /# Sentence pair \((\d+)\) source length (\d+) target length (\d+)/) { #$sid = $1; #$s_len = $2; #$t_len = $3; $flag = 1; next; } if($flag == 1) { @temp = split /\s+/, $line; for my $word (@temp) { push @t_words, $word; } $t_senlen = scalar @temp; $flag = 2; next; } if($flag == 2) { #NER @temp = split(/\) /, $line); #why does it behave this way #@temp = split(/\}\) /, $line); #why does it behave this way #$temp_str = (join "|", @temp); #print "temp: $temp_str\n"; @wordnums = (); for($i = 0; $i < scalar @temp; $i++) { @parts = split(/ \(/, $temp[$i]); #why does it behave this way #@parts = split(/ \(\{/, $temp[$i]); #why does it behave this way #$parts_str = (join "|", @parts); #print "parts: $parts_str\n"; push @wordnums, $temp[$i] if($parts[0] ne "NULL"); } for($i = 0; $i < scalar @wordnums; $i++) { @s_parts = split(/ \(/, $wordnums[$i]); #why does it behave this way #@s_parts = split(/ \(\{/, $wordnums[$i]); #why does it behave this way #$s_parts_str = (join "|", @s_parts); #print "before s_parts: $s_parts_str\n"; #$s_parts[1] =~ s/^ //g; #$s_parts[1] =~ s/\s*$//g; $s_parts[1] =~ s/\{ //g; $s_parts[1] =~ s/\s*\}//g; next if($s_parts[0] eq "" or $s_parts[0] eq "-LRB-" or $s_parts[0] eq "-RRB-" or $s_parts[0] eq "s" or $s_parts[0] !~ /\w+/); $s_parts[0] =~ s/\W+//g; push @s_words, $s_parts[0]; #push @sg_words, $s_parts[0]; #print "s_parts: |$s_parts[0]|$s_parts[1]|\nnum: "; @nums = split /\s+/, $s_parts[1]; $ind_str = ""; for my $num (@nums) { #print "|$num| "; $ind = scalar(@t_words) - $t_senlen; $ind += $num - 1; $ind_str = ($ind_str eq "") ? $ind : $ind_str . " $ind"; } #print "\n"; #print "ind_str: |$ind_str|\n"; push @s_nums, $ind_str; #push @sg_nums, $ind_str; } #COREF #@temp = split(/\) /, $line); #why does it behave this way @temp = split(/\}\) /, $line); #why does it behave this way #$temp_str = (join "|", @temp); #print "temp: $temp_str\n"; @wordnums = (); for($i = 0; $i < scalar @temp; $i++) { #@parts = split(/ \(/, $temp[$i]); #why does it behave this way @parts = split(/ \(\{/, $temp[$i]); #why does it behave this way #$parts_str = (join "|", @parts); #print "parts: $parts_str\n"; push @wordnums, $temp[$i] if($parts[0] ne "NULL"); } for($i = 0; $i < scalar @wordnums; $i++) { #@s_parts = split(/ \(/, $wordnums[$i]); #why does it behave this way @s_parts = split(/ \(\{/, $wordnums[$i]); #why does it behave this way #$s_parts_str = (join "|", @s_parts); #print "before s_parts: $s_parts_str\n"; $s_parts[1] =~ s/^ //g; $s_parts[1] =~ s/\s*$//g; #$s_parts[1] =~ s/\{ //g; #$s_parts[1] =~ s/\s*\}//g; next if($s_parts[0] eq "" or $s_parts[0] eq "-LRB-" or $s_parts[0] eq "-RRB-" or $s_parts[0] eq "s" or $s_parts[0] !~ /\w+/); $s_parts[0] =~ s/\W+//g; #push @s_words, $s_parts[0]; push @sg_words, $s_parts[0]; #print "s_parts: |$s_parts[0]|$s_parts[1]|\nnum: "; @nums = split /\s+/, $s_parts[1]; $ind_str = ""; for my $num (@nums) { #print "|$num| "; $ind = scalar(@t_words) - $t_senlen; $ind += $num - 1; $ind_str = ($ind_str eq "") ? $ind : $ind_str . " $ind"; } #print "\n"; #print "ind_str: |$ind_str|\n"; #push @s_nums, $ind_str; push @sg_nums, $ind_str; } next; } } close($cfh); #for($k = 0; $k < scalar @src_tokens; $k++) #{ # print "Swh: |$src_tokens[$k]|\t|$t_words[$k]|\n"; #} #for($k = 0; $k < scalar @s_words; $k++) #{ # print "Eng: |$k|$s_words[$k]|\n"; #} SKIP_GIZA: #Run NER, make table #lbj("lbj", $outfile); stanford("stanford", $outfile); my $tfile = $outfile; $tfile =~ s/_raw/_table/g; my @temps = make_table("make_table", $tfile, $outfile); my $stan_table = $temps[0]; my @table = split "\n", $stan_table; #@table = split "\n", $lbj_table; #************** #Run coref, make LBJ_list if($l_coref == 1) { lbj_coref("lbj_coref", $outfile); #makes .LBJ file #extract_lbj("extract_lbj", $outfile); #makes .LBJ_list file my $ctfile = "$outfile.LBJ_list"; open(my $ctfh, "<", $ctfile); my %temp_table = (); @coref_table = (); while(<$ctfh>) { $line = $_; chomp($line); my @temp = split "\t", $line; $temp_table{$temp[0]} = $line; } close($ctfh); for my $key (sort {$a <=> $b} keys %temp_table) { push @coref_table, $temp_table{$key}; } } #************** #Make lists of NE, align s_words and s_nums. Read t_* for src or tl_* for en @indices = (); @entities = (); @en_entities = (); @t_indices = (); @t_entities = (); @t_types = (); @en_t_indices = (); @en_t_entities = (); @en_t_types = (); my %en_t_list = (); print "Making lists of named entities in English and source language\n"; warn "Making lists of ", scalar @table, " entities\n"; my $counter = 0; for my $record (@table) { @fields = split "\t", $record; $index = $fields[1]; $tag = $fields[2]; $type = $fields[4]; next if ($tag eq ""); $counter++; warn "entity no. $counter\n" if($counter % 10 == 0); #print "Entity:$tag\n"; push @en_entities, $tag; @en_words = split /\s+/, $tag; for($i = 0; $i < scalar @en_words; $i++) { $n = $index + $i; push @en_t_indices, $n; push @en_t_entities, $en_words[$i]; $en_t_list{lc $en_words[$i]}++; push @en_t_types, $type; $en_words[$i] =~ s/\W+//g; if($en_words[$i] ne $s_words[$n]) { if($en_words[$i] eq $s_words[$n-1]) { splice @s_words, $n - 1, 0, "NULL"; splice @s_nums, $n - 1, 0, "NULL"; } } #print "English: $n\t$en_words[$i]\t$s_words[$n]\n"; } #Make t_* for src next if($l_compare ne "src"); if($translator eq "g" or $lang ne "ar") { if($lang eq "ar") { do { $res = REST::Google::Translate->new( q => $tag, langpair => 'en|ar' ); } until($res->responseStatus == 200); $entity = $res->responseData->translatedText; } else { do { $result = $service->translate($tag, 'dest' => 'sw'); } while ($result->error and $result->code != 400); $entity = $result->translation; } } else { $command = "./bing ar infile.txt outfile.txt"; open($bing_ifh, ">", "infile.txt"); print $bing_ifh $tag; close($bing_ifh); `$command`; open($bing_ofh, "<", "outfile.txt"); $entity = ""; while(<$bing_ofh>) { $entity = ($entity eq "") ? $_ : $entity . " " . $_; } close($bing_ofh); #printf "Bing: %s\n", $translated; } push @entities, $entity; @words = split /\s+/, $entity; for($i = 0; $i < scalar @words; $i++) { $n = $index + $i; push @t_indices, $n; push @t_entities, $words[$i]; push @t_types, $type; } } #Make tl_* for en #goto ALIGN_METHODS if($l_compare ne "en"); ALIGN_METHODS: #Alignment methods %ne_list = (); %ne_list2 = (); %ne_type2 = (); $window = 30; #This used to be the default method if($l_compare eq "en" or $l_coref == 1) { @tl_tokens = (); @tl_words = (); @tl_indices = (); print "Transliterating document from source to English\n"; warn "Transliterating ", scalar @src_tokens, " tokens\n"; $counter = 0; $trans_file = $outfile; $trans_file =~ s/_raw/_translit/g; if(-e $trans_file) { print "Translit file exists\n"; open(my $trfh, "<", $trans_file); my $i = 0; while(<$trfh>) { my $translated = $_; chomp($translated); chop($translated) if(substr($translated, -1, 1) eq "\n" or substr($translated, -1, 1) eq "\r"); push @tl_words, $translated; @words = split /\s+/, $translated; for my $word (@words) { push @tl_indices, $i; push @tl_tokens, $word; } $i++; } close($trfh); } else { open (my $trfh, ">", $trans_file); for($i = 0; $i < scalar @src_tokens; $i++) { $src_token = $src_tokens[$i]; next if($src_token eq ""); $counter++; warn "token no. $counter\n" if($counter % 200 == 0); if($translator eq "g" or $lang ne "ar") { if($lang eq "ar") { do { $res = REST::Google::Translate->new( q => $src_token, langpair => 'ar|en' ); print "Waiting for translation ", $res->responseStatus, "\n"; } until($res->responseStatus == 200); $translated = $res->responseData->translatedText; #printf "Source: $src_token Google: %s\n", $translated; } else { do { $result = $service->translate($src_token); print "Waiting for translation ", $result->error, "\n"; } while ($result->error and $result->code != 400); $translated = $result->translation; #printf "Source: $src_token Google: %s\n", $translated; } } else { $command = "./bing en infile.txt outfile.txt"; open($bing_ifh, ">", "infile.txt"); print $bing_ifh $src_token; close($bing_ifh); `$command`; open($bing_ofh, "<", "outfile.txt"); $translated = ""; while(<$bing_ofh>) { $translated = ($translated eq "") ? $_ : $translated . " " . $_; } close($bing_ofh); #printf "Bing: %s\n", $translated; } $translated =~ s/"/\"/g; $translated =~ s/'//g; $translated =~ s/<.+>/ /g; $translated =~ s/\///g; $translated =~ s/-(\d+)-/$1/g; $translated =~ s/(\d+)-(\d+)/$1$2/g; $translated =~ s/(\w+):(\w+)/$1$2/g; $translated =~ s/(\w+)\.(\w+)/$1$2/g; push @tl_words, $translated; @words = split /\s+/, $translated; print $trfh "$translated\n"; for my $word (@words) { push @tl_indices, $i; push @tl_tokens, $word; } } close($trfh); } for($i = 0; $i < scalar @en_t_entities; $i++) { $index = $en_t_indices[$i]; $start = ($index < $window) ? 0 : $index - $window; $end = ($index > scalar @tl_tokens - $window) ? scalar @tl_tokens : $index + $window; @st2 = Text::English::stem($en_t_entities[$i]); $s2 = $en_t_entities[$i]; #next if($commons{$st2[0]} ne "" and $propers{$st2[0]} eq ""); # and $lkey eq $key); #next if($swh_commons{$st2[0]} ne "" and $swh_propers{$st2[0]} eq ""); # and $lkey eq $key); for($j = $start; $j < $end; $j++) { @st1 = Text::English::stem($tl_tokens[$j]); $s1 = $tl_tokens[$j]; if($lang ne "ar") { $s1 =~ s/\W+//g; $s2 =~ s/\W+//g; } #print "index $en_t_indices[$i] $en_t_entities[$i] j $j $tl_tokens[$j]\n"; if(lc $s1 eq lc $s2) { $ne_list{$i}++; $ne_list2{$tl_indices[$j]}++; $ne_type2{$tl_indices[$j]} = $en_t_types[$i]; #print "Matched here $en_t_indices[$i] $j\n"; } #print "Matched s1-s2 $en_t_indices[$i] $j\n" if($s1 eq $s2); } } } #elsif($l_compare eq "src") if($l_compare eq "src") { for($i = 0; $i < scalar @t_entities; $i++) { $index = $t_indices[$i]; $start = ($index < $window) ? 0 : $index - $window; $end = ($index > scalar @src_tokens - $window) ? scalar @src_tokens : $index + $window; for($j = $start; $j < $end; $j++) { @st1 = Text::English::stem($src_tokens[$j]); @st2 = Text::English::stem($t_entities[$i]); $s1 = $src_tokens[$j]; $s2 = $t_entities[$i]; if($lang ne "ar") { $s1 =~ s/\W+//g; $s2 =~ s/\W+//g; } #print "index $en_t_indices[$i] $en_t_entities[$i] j $j $tl_tokens[$j]\n"; if(lc $st1[0] eq lc $st2[0]) { $ne_list{$i}++; $ne_list2{$j}++; $ne_type2{$j} = $t_types[$i]; } } } } if($l_compare eq "giza") # giza { print "Giza: ", scalar @en_t_entities, "\n"; for($i = 0; $i < scalar @en_t_entities; $i++) { $entity = $en_t_entities[$i]; $index = $en_t_indices[$i]; $t_idlist = $s_nums[$index]; next if($t_idlist eq ""); #no alignment $ne_list{$i}++; @t_ids = split /\s+/, $t_idlist; for my $t_id (@t_ids) { $ne_list2{$t_id}++; $ne_type2{$t_id} = $en_t_types[$i]; } } } @nelist = sort {$a <=> $b} keys %ne_list; #print "Rematched:\n"; #for my $key (@nelist) #{ #print "$en_t_indices[$key]\t$en_t_entities[$key]\n"; #} #print "Overall:\n"; #for($i = 0; $i < scalar @en_t_entities; $i++) #{ #print "$en_t_indices[$i]\t$en_t_entities[$i]\n"; #} print "Rematched ", scalar @nelist, " out of ", scalar @en_t_entities, " = ", (scalar @nelist / scalar @en_t_entities), " Extractor-tagged entities\n"; if($pp == 1) { for($i = 0; $i < scalar @src_tokens; $i++) { $ne_list2{$i}++ if($lang eq "sw" and $swh_propers{lc $src_tokens[$i]} ne "" and $swh_commons{lc $src_tokens[$i]} eq ""); delete $ne_list2{$i} if($lang eq "sw" and $swh_propers{lc $src_tokens[$i]} eq "" and $swh_commons{lc $src_tokens[$i]} ne "" and $ne_list2{$i} ne ""); $ne_list2{$i}++ if($lang eq "ar" and $arb_propers{$src_tokens[$i]} ne "" and $arb_commons{$src_tokens[$i]} eq ""); delete $ne_list2{$i} if($lang eq "ar" and $arb_propers{$src_tokens[$i]} eq "" and $arb_commons{$src_tokens[$i]} ne "" and $ne_list2{$i} ne ""); } } #for($i = 0; $i < scalar @tl_indices; $i++) #{ # $ne_list2{$tl_indices[$i]}++ if($propers{lc $tl_tokens[$i]} ne "" and $commons{lc $tl_tokens[$i]} eq ""); # delete $ne_list2{$tl_indices[$i]} if($propers{lc $tl_tokens[$i]} eq "" and $commons{lc $tl_tokens[$i]} ne "" and $ne_list2{$tl_indices[$i]} ne ""); #} #************** #Make LBJ_list files for native language if($l_coref == 1) { my $counter = 0; my @new_table = (); #my %names = (); #my %p_ids = (); #=temp for my $record (@coref_table) { #print "Record: |$record|\n"; @fields = split "\t", $record; $index = $fields[0]; $tag = $fields[2]; next if ($tag eq ""); $counter++; warn "entity no. $counter\n" if($counter % 10 == 0); #print "Entity:$tag\n"; @en_words = split /\s+/, $tag; for($i = 0; $i < scalar @en_words; $i++) { $n = $index + $i; $en_words[$i] =~ s/\W+//g; if($en_words[$i] ne $sg_words[$n]) { if($en_words[$i] eq $sg_words[$n-1]) { splice @sg_words, $n - 1, 0, "NULL"; splice @sg_nums, $n - 1, 0, "NULL"; } } #print "Coref: $n\t$en_words[$i]\t$sg_words[$n]\n"; } } #=cut $counter = 0; for my $record (@coref_table) { print "Coref Record: $record\n"; @fields = split "\t", $record; $index = $fields[0]; $length = $fields[1]; $entity = $fields[2]; $e_id = $fields[3]; next if ($entity eq ""); $counter++; #print "Entity:|$entity|\n"; my %ids = (); @en_words = split /\s+/, $entity; for($i = 0; $i < scalar @en_words; $i++) { $n = $index + $i; #print "En word: $en_words[$i]\n"; #$names{$en_words[$i]} = $e_id if($lang eq "ar" and $arb_propers{$en_words[$i]} ne "" and $arb_commons{$en_words[$i]} eq ""); #=temp #Transliteration begins $start = ($n < $window) ? 0 : $n - $window; $end = ($n > scalar @tl_tokens - $window) ? scalar @tl_tokens : $n + $window; @st2 = Text::English::stem($en_words[$i]); $s2 = $en_words[$i]; #next if($commons{$st2[0]} ne "" and $propers{$st2[0]} eq ""); # and $lkey eq $key); #next if($swh_commons{$st2[0]} ne "" and $swh_propers{$st2[0]} eq ""); # and $lkey eq $key); for($j = $start; $j < $end; $j++) { @st1 = Text::English::stem($tl_tokens[$j]); $s1 = $tl_tokens[$j]; if($lang eq "ar") { $s1 =~ s/\W+//g; $s2 =~ s/\W+//g; } #print "index $en_t_indices[$i] $en_t_entities[$i] j $j $tl_tokens[$j]\n"; if(lc $st1[0] eq lc $st2[0]) { $ids{$tl_indices[$j]}++; #$p_ids{$tl_indices[$j]}++; #print "Matched here $en_t_indices[$i] $j\n"; } #print "Matched s1-s2 $en_t_indices[$i] $j\n" if($s1 eq $s2); } #Transliteration ends #=cut #=temp #Giza begins $t_idlist = $sg_nums[$n]; next if($t_idlist eq ""); #no alignment exists @t_ids = split /\s+/, $t_idlist; for my $t_id (@t_ids) { $ids{$t_id}++; #$p_ids{$tl_indices[$j]}++; #print "Src: $src_tokens[$t_id]\n"; } #Giza ends #=cut } next if(scalar keys %ids == 0); #no alignment my $minid = min keys %ids; my $maxid = max keys %ids; $keys = join ",", sort {$a <=> $b} keys %ids; print "Keys: $keys\n"; #make new record for native language my $new_index = $minid; my $new_length = ($maxid - $minid) + 1; #try using t_words instead of src_tokens if it doesn't work #my $new_entity = $src_tokens[$minid]; my $new_entity = ""; #for($i = $minid + 1; $i <= $maxid; $i++) for my $i (sort {$a <=> $b} keys %ids) { $new_entity = ($new_entity eq "") ? $src_tokens[$i] : $new_entity . " " . $src_tokens[$i]; } my $new_e_id = $e_id; my $new_record = "$new_index\t$new_length\t$new_entity\t$new_e_id"; print "New Record: $new_record\n"; push @new_table, $new_record; } =temp #NE post-processing, didnt work at all. numbers went down. for($i = 0; $i < scalar @src_tokens; $i++) { if($p_ids{$i} eq "" and $lang eq "ar" and $arb_propers{$src_tokens[$i]} ne "" and $arb_commons{$src_tokens[$i]} eq "") { my $new_index = $i; my $new_length = 1; my $new_entity = $src_tokens[$i]; my $new_e_id = $names{$src_tokens[$i]}; my $new_record = "$new_index\t$new_length\t$new_entity\t$new_e_id"; push @new_table, $new_record; } } =cut my $ctfile = "$outfile.LBJ_list"; my $new_ctfile = $ctfile; ($lang eq "ar") ? ($new_ctfile =~ s/_raw/_arb/g) : ($new_ctfile =~ s/_raw/_swh/g); open($ntfh, ">", $new_ctfile) || die "Cannot open ctfile $ctfile new_ctfile: $new_ctfile\n"; for my $new_record (@new_table) { print $ntfh "$new_record\n"; } close($ntfh); #here, an extract_gold fn to make the GOLD_LIST file. plus something to align LBJ_LIST and GOLD_LIST make_imatrix("make_imatrix", $infile); ($co_pren, $co_pred, $co_recn, $co_recd) = b3_score("b3_score", $infile); $s_co_pren += $co_pren; $s_co_pred += $co_pred; $s_co_recn += $co_recn; $s_co_recd += $co_recd; $co_pre = $s_co_pren / $s_co_pred; $co_rec = $s_co_recn / $s_co_recd; $co_f1 = 2 * $co_pre * $co_rec / ($co_pre + $co_rec); print "Cumulative Coref: Prec: $co_pre Recl: $co_rec F1: $co_f1\n"; warn "Cumulative Coref: Prec: $co_pre Recl: $co_rec F1: $co_f1\n"; } #************** @nelist2 = sort {$a <=> $b} keys %ne_list2; $picked = scalar @nelist2; %type_picked = (); $tp = 0; %type_tp = (); #$ecount = 0; #$ecount2 = 0; #write picked file $pickedfile = $outfile; $pickedfile =~ s/raw/picked/g; $pickedfile =~ s/sw_alasiri_/alasiri_/g if($labeled == 1); $pickedhtm = $pickedfile; $pickedhtm =~ s/\.(txt)|(sgm)|(text)/\.html/g; open($pfh, ">", $pickedfile); open($hfh, ">", $pickedhtm); print $hfh "\n"; for($i = 0; $i < scalar @src_tokens; $i++) { if($ne_type2{$i} eq "") { print $pfh "$src_tokens[$i] "; print $hfh "$src_tokens[$i] "; } else { $starttag = $ne_type2{$i}; $j = $i+1; $j++ while($ne_type2{$j} eq $starttag and $j < scalar @src_tokens); $start = $i; $end = $j; $i = $j - 1; $entity = ""; for($k = $start; $k < $end; $k++) { $entity = ($entity eq "") ? $src_tokens[$k] : $entity . " " . $src_tokens[$k]; } #entity is $entity, tag is $ne_type2{$start} or $starttag print $pfh "[$starttag $entity ] "; if($starttag eq "PER") { print $hfh "$entity "; } if($starttag eq "LOC") { print $hfh "$entity "; } if($starttag eq "ORG") { print $hfh "$entity "; } if($starttag eq "MIS") { print $hfh "$entity "; } } } print $pfh "\n"; print $hfh "\n"; print $hfh "\n\n"; close($pfh); close($hfh); next if($labeled == 0); #evaluation part begins here #Here, compare %ne_list2 and %src_trans. if($l_compare ne "src" and $lang eq "sw") { my %comb_list = (); $comb_list{$_}++ foreach(keys %en_t_list); $comb_list{$_}++ foreach(keys %src_trans); @co_result = compare_lists("compare_lists", \%src_trans, \%en_t_list, \%comb_list); #($tp, $fp, $fn) $ie_tp += $co_result[0]; $ie_fp += $co_result[1]; $ie_fn += $co_result[2]; $ietw_tp += $co_result[3]; $ietw_fp += $co_result[4]; $ietw_fn += $co_result[5]; } #write correct file $correctfile = $outfile; $correctfile =~ s/raw/correct/g; $correctfile =~ s/sw_alasiri_/alasiri_/g; $correcthtm = $correctfile; $correcthtm =~ s/\.(txt)|(sgm)|(text)/\.html/g; open($cfh, ">", $correctfile); open($hfh, ">", $correcthtm); print $hfh "\n"; for($i = 0; $i < scalar @src_tokens; $i++) { if($src_poss[$i] eq "O") { print $cfh "$src_tokens[$i] "; print $hfh "$src_tokens[$i] "; } else { $starttag = $src_poss[$i]; $j = $i+1; $j++ while($src_poss[$j] eq $starttag and $j < scalar @src_tokens); $start = $i; $end = $j; $i = $j - 1; $entity = ""; for($k = $start; $k < $end; $k++) { $entity = ($entity eq "") ? $src_tokens[$k] : $entity . " " . $src_tokens[$k]; } #entity is $entity, tag is $src_poss[$start] or $starttag print $cfh "[$starttag $entity ] "; print $hfh "$entity "; } } print $cfh "\n"; print $hfh "\n"; print $hfh "\n\n"; close($cfh); close($hfh); for my $key (@nelist2) { #print $src_poss[$key], "\n"; #$tp++ if($src_poss[$key] =~ /NOUN_PROP/); $tp++ if($src_poss[$key] ne "O"); if($lang eq "ar") { $type_tp{PER}++ if($src_poss[$key] =~ /PER/ and $ne_type2{$key} =~ /PER/); $type_tp{LOC}++ if($src_poss[$key] =~ /LOC/ and $ne_type2{$key} =~ /LOC/); $type_tp{ORG}++ if($src_poss[$key] =~ /ORG/ and $ne_type2{$key} =~ /ORG/); $type_tp{MIS}++ if($src_poss[$key] =~ /MIS/ and $ne_type2{$key} =~ /MIS/); $type_picked{PER}++ if($ne_type2{$key} =~ /PER/); $type_picked{LOC}++ if($ne_type2{$key} =~ /LOC/); $type_picked{ORG}++ if($ne_type2{$key} =~ /ORG/); $type_picked{MIS}++ if($ne_type2{$key} =~ /MIS/); print "ORG picked: $key|$tl_words[$key]\n" if($ne_type2{$key} =~ /ORG/); print "MIS picked: $key|$tl_words[$key]\n" if($ne_type2{$key} =~ /MIS/); } #$ecount++ if($src_poss[$key+1] =~ /NOUN_PROP/ and not grep {$_ == $key + 1} @nelist2); #$ecount2++ if($src_poss[$key+2] =~ /NOUN_PROP/ and not grep {$_ == $key + 2} @nelist2); } #print "tp: ", $tp, "\n"; #print "ecount: ", $ecount, "\n"; #print "ecount2: ", $ecount2, "\n"; $correct = 0; %type_correct = (); for($i = 0; $i < scalar @src_tokens; $i++) { #$correct++ if($src_poss[$i] =~ /NOUN_PROP/); $correct++ if($src_poss[$i] ne "O"); if($lang eq "ar") { $type_correct{PER}++ if($src_poss[$i] =~ /PER/); $type_correct{LOC}++ if($src_poss[$i] =~ /LOC/); $type_correct{ORG}++ if($src_poss[$i] =~ /ORG/); $type_correct{MIS}++ if($src_poss[$i] =~ /MIS/); print "ORG correct: $i|$tl_words[$i]\n" if($src_poss[$i] =~ /ORG/); print "MIS correct: $i|$tl_words[$i]\n" if($src_poss[$i] =~ /MIS/); } } #print "total correct: ", $total_correct, "\n"; $prec = sprintf("%.3f", ($picked == 0) ? 0 : $tp / $picked); $recl = sprintf("%.3f", ($correct == 0) ? 0 : $tp / $correct); $f1 = sprintf("%.3f", ($prec + $recl == 0) ? 0 : 2 * $prec * $recl / ($prec + $recl)); print "$infile Precision: $prec Recall: $recl F1: $f1\n"; warn "$infile Precision: $prec Recall: $recl F1: $f1\n"; $s_tp += $tp; $s_picked +=$picked; $s_correct += $correct; $s_prec = sprintf("%.3f", ($s_picked == 0) ? 0 : $s_tp / $s_picked); $s_recl = sprintf("%.3f", ($s_correct == 0) ? 0 : $s_tp / $s_correct); $s_f1 = sprintf("%.3f", ($s_prec + $s_recl == 0) ? 0 : 2 * $s_prec * $s_recl / ($s_prec + $s_recl)); print "Cumulative Tokens: $n_tokens TP: $s_tp Picked: $s_picked Correct: $s_correct\n"; print "Cumulative Precision: $s_prec Recall: $s_recl F1: $s_f1\n"; warn "Cumulative Tokens: $n_tokens TP: $s_tp Picked: $s_picked Correct: $s_correct\n"; warn "Cumulative Precision: $s_prec Recall: $s_recl F1: $s_f1\n"; if($lang eq "sw") { $ie_picked = $ie_tp + $ie_fp; $ie_correct = $ie_tp + $ie_fn; $ie_prec = sprintf("%.3f", ($ie_picked == 0) ? 0 : $ie_tp / $ie_picked); $ie_recl = sprintf("%.3f", ($ie_correct == 0) ? 0 : $ie_tp / $ie_correct); $ie_f1 = sprintf("%.3f", ($ie_prec + $ie_recl == 0) ? 0 : 2 * $ie_prec * $ie_recl / ($ie_prec + $ie_recl)); print "\n\nIE:\nCumulative Tokens: $n_tokens TP: $ie_tp Picked: $ie_picked Correct: $ie_correct\n"; print "Cumulative Precision: $ie_prec Recall: $ie_recl F1: $ie_f1\n"; warn "IE:\nCumulative Tokens: $n_tokens TP: $ie_tp Picked: $ie_picked Correct: $ie_correct\n"; warn "Cumulative Precision: $ie_prec Recall: $ie_recl F1: $ie_f1\n"; $ietw_picked = $ietw_tp + $ietw_fp; $ietw_correct = $ietw_tp + $ietw_fn; $ietw_prec = sprintf("%.3f", ($ietw_picked == 0) ? 0 : $ietw_tp / $ietw_picked); $ietw_recl = sprintf("%.3f", ($ietw_correct == 0) ? 0 : $ietw_tp / $ietw_correct); $ietw_f1 = sprintf("%.3f", ($ietw_prec + $ietw_recl == 0) ? 0 : 2 * $ietw_prec * $ietw_recl / ($ietw_prec + $ietw_recl)); print "\n\nIETW:\nCumulative Tokens: $n_tokens TP: $ietw_tp Picked: $ietw_picked Correct: $ietw_correct\n"; print "Cumulative Precision: $ietw_prec Recall: $ietw_recl F1: $ietw_f1\n"; warn "IETW:\nCumulative Tokens: $n_tokens TP: $ietw_tp Picked: $ietw_picked Correct: $ietw_correct\n"; warn "Cumulative Precision: $ietw_prec Recall: $ietw_recl F1: $ietw_f1\n"; } if($lang eq "ar") { $avg_f1 = 0; $s_avg_f1 = 0; for my $type (keys %mytypes) { $prec = sprintf("%.3f", ($type_picked{$type} == 0) ? 0 : $type_tp{$type} / $type_picked{$type}); $recl = sprintf("%.3f", ($type_correct{$type} == 0) ? 0 : $type_tp{$type} / $type_correct{$type}); $f1 = sprintf("%.3f", ($prec + $recl == 0) ? 0 : 2 * $prec * $recl / ($prec + $recl)); $avg_f1 += $f1; print "$type TP: $type_tp{$type} Picked: $type_picked{$type} Correct: $type_correct{$type}\n"; warn "$type TP: $type_tp{$type} Picked: $type_picked{$type} Correct: $type_correct{$type}\n"; print "$infile $type Precision: $prec Recall: $recl F1: $f1\n"; warn "$infile $type Precision: $prec Recall: $recl F1: $f1\n"; $s_type_tp{$type} += $type_tp{$type}; $s_type_picked{$type} +=$type_picked{$type}; $s_type_correct{$type} += $type_correct{$type}; $s_prec = sprintf("%.3f", ($s_type_picked{$type} == 0) ? 0 : $s_type_tp{$type} / $s_type_picked{$type}); $s_recl = sprintf("%.3f", ($s_type_correct{$type} == 0) ? 0 : $s_type_tp{$type} / $s_type_correct{$type}); $s_f1 = sprintf("%.3f", ($s_prec + $s_recl == 0) ? 0 : 2 * $s_prec * $s_recl / ($s_prec + $s_recl)); $s_avg_f1+= $s_f1; print "Cumulative $type TP: $s_type_tp{$type} Picked: $s_type_picked{$type} Correct: $s_type_correct{$type}\n"; print "Cumulative $type Precision: $s_prec Recall: $s_recl F1: $s_f1\n"; warn "Cumulative $type TP: $s_type_tp{$type} Picked: $s_type_picked{$type} Correct: $s_type_correct{$type}\n"; warn "Cumulative $type Precision: $s_prec Recall: $s_recl F1: $s_f1\n"; } $avg_f1 /= 4; $s_avg_f1 /= 4; print "avg F1 score: $avg_f1\n"; warn "avg F1 score: $avg_f1\n"; print "Cumulative avg F1 score: $s_avg_f1\n"; warn "Cumulative avg F1 score: $s_avg_f1\n"; } } } sub getdata_aner() { my $line = <$ifh>; chomp($line); if(!($line)) { print $debug "Getdata_aner: File done: $infile\n"; return (); } my @parts = split /\s+/, $line; #print scalar @parts, ": ", join(", ", @parts); print "\n"; #if(scalar @parts == 3) #{ # do # { # $result = $service->translate($parts[2]); # } while ($result->error and $result->code != 400); # $parts[2] = $result->translation; #} #if(substr($parts[0], -1, 1) eq "." and $parts[0] ne ".") #remove it #{ # $parts[0] = substr($parts[0], 0, length($parts[0]) - 1); #} if($parts[0] eq ".") { return (scalar @parts == 2) ? ($parts[0], $parts[1], 0) : ($parts[0], $parts[1], 0, $parts[2]); } else { return (scalar @parts == 2) ? ($parts[0], $parts[1], 1) : ($parts[0], $parts[1], 1, $parts[2]); } } sub getdata { if($in_flag == 0) { while(1) { $line = <$ifh>; goto GLOOP if(! ($line) or $line =~ /token listing end (.+)\.tree.+/); if($line =~ /token listing start (.+)\.tree.+/) { @tokens = (); @lemmas = (); @ltot = (); @llemmas = (); @morphs = (); if($1 ne $filename) { print $debug "Getdata: File done: $filename\n" if($filename ne "XXX"); $filename = $1; } next; } if($line =~ /#source Tokens:(.+)/) { $treesize = $1; next; } if($line =~ /s:([0-9]+).+·(.+)·(.+)·(.+)·(.+)·(.+)/) { $tokens[$1] = (substr($2, length($2) - 1, 1) eq "Â") ? substr($2, 0, length($2) - 1) : $2; @endpts = split "-", $4; for($i = $endpts[0]; $i <= $endpts[1]; $i++) { $ltot[$i] = $1; } next; } if($line =~ /t:([0-9]+).+·(.+)·(.+)·(.+)·(.+)·(.+)·(.+)·(.+)·(.+)/) { $temp = (substr($4, length($4) - 1, 1) eq "Â") ? substr($4, 0, length($4) - 1) : $4; if($lemmas[$ltot[$1]] eq "") { $lemmas[$ltot[$1]] = $temp; } else { $lemmas[$ltot[$1]] = $lemmas[$ltot[$1]] . "+" . $temp; } $temp = (substr($7, length($7) - 1, 1) eq "Â") ? substr($7, 0, length($7) - 1) : $7; if($temp eq "[TBupdate]" or $temp eq "dummylemma" or $temp eq "nolemma") { $temp = "[DEFAULT]"; } $llemmas[$ltot[$1]] = $temp if($temp ne "[clitics]" or ($temp eq "[clitics]" and $llemmas[$ltot[$1]] eq "")); $llemmas[$ltot[$1]] =~ tr/[]//d; $temp = (substr($5, length($5) - 1, 1) eq "Â") ? substr($5, 0, length($5) - 1) : $5; if($morphs[$ltot[$1]] eq "") { $morphs[$ltot[$1]] = $temp; } else { $morphs[$ltot[$1]] = $morphs[$ltot[$1]] . "+" . $temp; } next; } } GLOOP: $in_flag = 1; } if(!($line)) { print $debug "Getdata: File done: $filename\n"; $in_flag = 0; $filename = "XXX"; return (); } $token = $tokens[$treesize_i]; $lemma = $lemmas[$treesize_i]; $llemma = $llemmas[$treesize_i]; $morph = $morphs[$treesize_i]; $treesize_i++; if($treesize_i == $treesize) { $treesize_i = 0; $in_flag = 0; } @return_arr = ($token, $lemma, $llemma, $morph, $in_flag); return @return_arr; } 1;