#!/usr/local/bin/perl

## Driver for preparing an English/Urdu parallel corpus.
##
## Usage: <script> <top-directory>
##
## The script walks <top-directory> and applies one processing stage to every
## file it finds.  Stages are switched on and off by (un)commenting the calls
## in the "Stages" section below; as shipped, only the English
## re-segmentation stage (resegEnglish) is active.  Outputs of the mirroring
## stages are written next to <top-directory> (e.g. <parent>/reseg/...).

use strict;
use warnings;

use File::Path;
use File::Spec;
use File::Temp "tempfile";

use File::chdir;

binmode STDOUT, ":utf8";

my $topDir = shift;
die("Usage: $0 <top-directory>\n") unless (defined $topDir and $topDir ne "");

# visitAndMirrorDir() changes the working directory while it walks, so all
# paths handed to it must be absolute.
$topDir = File::Spec->rel2abs($topDir);
$topDir =~ /^(.+)\/(.+?)\/?$/
    or die("Couldn't split $topDir into a parent directory and a name\n");
my $parentDir = $1;

# ---------------------------------------------------------------------------
# Stages
# ---------------------------------------------------------------------------

# # Flatten
# visitAndMirrorDir($topDir,"$parentDir/flat",\&flattenLtf);

# # English Segmentation
visitAndMirrorDir($topDir, "$parentDir/reseg", \&resegEnglish);

# # Urdu Segmentation
# %dict maps a word to the number of times it was seen directly before a
# sentence terminator; resegUrdu() reads it as a file-level variable.
my %dict = ();
# my $dictDir = '/usr0/users/abhaya/data/Nist-DVD/LCTL_Urdu_v1.0/Parallel_Text/flat/reseg/Train';
# visitDir($dictDir,\&collectSenEndStats,\%dict);
# foreach (keys %dict){
#     delete $dict{$_} if($dict{$_} < 2);
# }
#
# visitAndMirrorDir($topDir,"$parentDir/reseg",\&resegUrdu);

# # Collect all files to align directory for sentence alignment
# visitDir($topDir,\&collectForAlign);

# # Check data for anomalies
# visitDir($topDir,\&checkDataForAligner);

# Collect aligned sentences
# visitDir($topDir,\&collectAlignedData,"$parentDir/aligned-1");

# Count the number of sentences
# my %sCounts = ();
# visitDir($topDir,\&collectSentenceCounts,\%sCounts);
#
# print "Total\tEnglish\t${$sCounts{'eng'}}{'-1'}\tUrdu\t${$sCounts{'urd'}}{'-1'}\n";
# foreach ( sort { $a <=> $b } keys %{$sCounts{'eng'}}){
#     print "$_\tEnglish\t${$sCounts{'eng'}}{$_}\tUrdu\t${$sCounts{'urd'}}{$_}\n";
# }

# Collect the unaligned files
# visitDir($topDir,\&checkAndCopyUnaligned,"$parentDir/unaligned");

# Collect the unaligned sentences and copy to new directory
# visitDir($topDir,\&collectUnaligned,"$parentDir/align-1");

# Apply champollion
# visitDir($topDir,\&applyChamp,"$parentDir/aligned.champ.eval");

# Compute length ratio for Chinese
# visitDir($topDir,\&lengthRatio,'');

# Collect sentences
# visitDir($topDir,\&collectData,"$parentDir/lm");

# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

# Read a whole file into one string.
#   $path   - file to read
#   $errMsg - message to die with if the file cannot be opened
#   $layer  - optional PerlIO layer, e.g. ':encoding(utf-8)'
# Returns the content ("" for an empty file).
sub slurpFile {
    my ($path, $errMsg, $layer) = @_;

    my $mode = defined $layer ? "<$layer" : '<';
    open(my $fh, $mode, $path) or die($errMsg);
    # undef (not "") is slurp mode; "" would be paragraph mode and stop at
    # the first blank line.
    local $/;
    my $content = <$fh>;
    close($fh);
    return defined $content ? $content : "";
}

# Read a file as a list of lines (line endings kept); dies with $errMsg if
# the file cannot be opened.
sub readLines {
    my ($path, $errMsg) = @_;

    open(my $fh, '<', $path) or die($errMsg);
    my @lines = <$fh>;
    close($fh);
    return @lines;
}

# Append the raw bytes of $srcFile to $dstFile (in-process `cat src >> dst`).
# Warns and returns 0 on failure, returns 1 on success.
sub appendFile {
    my ($srcFile, $dstFile) = @_;

    my ($in, $out);
    unless (open($in, '<', $srcFile)) {
        warn "Couldn't open $srcFile for reading: $!\n";
        return 0;
    }
    unless (open($out, '>>', $dstFile)) {
        warn "Couldn't open $dstFile for appending: $!\n";
        close($in);
        return 0;
    }
    binmode($in);
    binmode($out);

    local $/ = \65536;    # copy in fixed-size chunks
    while (my $chunk = <$in>) {
        print {$out} $chunk;
    }
    close($in);
    unless (close($out)) {
        warn "Couldn't finish writing $dstFile: $!\n";
        return 0;
    }
    return 1;
}

# Run an external command without a shell, discarding its standard output
# (as the backticks this replaces did).  Warns and returns 0 on failure.
sub runCommand {
    my @cmd = @_;

    my $pid = open(my $out, '-|', @cmd);
    unless ($pid) {
        warn "Couldn't run @cmd: $!\n";
        return 0;
    }
    while (defined(my $discard = <$out>)) { }
    unless (close($out)) {
        warn "Command failed (status $?): @cmd\n";
        return 0;
    }
    return 1;
}

# Quote a string so a POSIX shell treats it as a single word.
sub shellQuote {
    my $word = shift;
    $word =~ s/'/'\\''/g;
    return "'$word'";
}

# Rename a file pair to *.mismatch so that later stages skip them.
sub markMismatch {
    my @files = @_;

    foreach my $path (@files) {
        rename($path, "$path.mismatch")
            or warn "Couldn't rename $path to $path.mismatch: $!\n";
    }
    return;
}

# Return the content of $path with the lines listed in @$alignedRef removed.
# Aligned lines are consumed in order: a line is dropped only when it equals
# the next not-yet-matched aligned line.
sub dropAligned {
    my ($path, $alignedRef, $errMsg) = @_;

    my @pending = @{$alignedRef};
    my $rest    = "";
    open(my $fh, '<', $path) or die($errMsg);
    while (my $line = <$fh>) {
        if (@pending and $line eq $pending[0]) {
            shift @pending;
            next;
        }
        $rest .= $line;
    }
    close($fh);
    return $rest;
}

# ---------------------------------------------------------------------------
# Directory walkers
# ---------------------------------------------------------------------------

# Recursively walk $cdir and call $callback->($path, $cbarg) for every
# non-directory entry.  Dies if a directory cannot be opened.
sub visitDir {
    my ($cdir, $callback, $cbarg) = @_;

    opendir(my $dirHandle, $cdir) or die("Couldn't open the dir: $cdir\n");
    while (defined(my $entry = readdir($dirHandle))) {
        next if ($entry eq '.' or $entry eq '..');
        if (-d "$cdir/$entry") {
            print STDERR "visting $cdir/$entry..\n";
            visitDir("$cdir/$entry", $callback, $cbarg);
        }
        else {
            $callback->("$cdir/$entry", $cbarg);
        }
    }
    closedir($dirHandle);
    return;
}

# Recursively walk $inDir, re-create its directory structure below $outDir
# and call $callback->($inPath, $outPath, $cbarg) for every non-directory
# entry.  Entries named exactly "English" are skipped.
#
# While a directory is being processed the working directory is the parent
# of that directory (restored on return), so callbacks run from there.
sub visitAndMirrorDir {
    my ($inDir, $outDir, $callback, $cbarg) = @_;

    # Resolve both paths before the chdir below; otherwise relative paths
    # would be re-interpreted against the new working directory.
    $inDir  = File::Spec->rel2abs($inDir);
    $outDir = File::Spec->rel2abs($outDir);

    # Start by going to the directory one level above the input directory.
    # That is where we will mirror the directory.
    $inDir =~ /^(.+)\/(.+?)\/?$/
        or die("Couldn't split $inDir into a parent directory and a name\n");
    my ($parent, $cdir) = ($1, $2);
    local $CWD = $parent;

    mkpath("$outDir/$cdir", 0, 0755);
    opendir(my $dirHandle, $cdir) or die("Couldn't open the dir: $cdir\n");
    while (defined(my $entry = readdir($dirHandle))) {
        next if ($entry eq '.' or $entry eq '..');
        # NOTE(review): this also hides English input from resegEnglish;
        # confirm that skipping "English" is still intended.
        next if ($entry =~ /^English$/);
        if (-d "$cdir/$entry") {
            print STDERR "visting $cdir/$entry..\n";
            visitAndMirrorDir("$inDir/$entry", "$outDir/$cdir", $callback, $cbarg);
        }
        else {
            $callback->("$inDir/$entry", "$outDir/$cdir/$entry", $cbarg);
        }
    }
    closedir($dirHandle);
    return;
}

# ---------------------------------------------------------------------------
# Callbacks for visitDir ($path, $cbarg)
# ---------------------------------------------------------------------------

# Append $fileName to "$collect.urd" when its name contains "_URD_",
# otherwise to "$collect.eng".
sub collectData {
    my ($fileName, $collect) = @_;

    my $collectFile = $collect . ($fileName =~ /_URD_/ ? '.urd' : '.eng');
    appendFile($fileName, $collectFile);
    return;
}

# For a "*.ch.*" file and its "*.en.*" twin, print "<lastEnglishIndex>
# <ratio>" per line pair, where ratio = English tokens / other-side tokens.
# Pairs whose ratio is exactly 1 are also reported on STDERR.
# NOTE(review): a name without ".ch." is compared against itself, as before.
sub lengthRatio {
    my $uFileName = shift;

    print STDERR $uFileName, "\n";
    my $eFileName = $uFileName;
    $eFileName =~ s/\.ch\./\.en\./g;

    open(my $uFile, '<', $uFileName) or die("Couldn't open the file $uFileName\n");
    open(my $eFile, '<', $eFileName) or die("Couldn't open the file $eFileName\n");
    my $snum = 0;
    while (my $urd = <$uFile>) {
        my $eng = <$eFile>;
        $snum++;
        unless (defined $eng) {
            warn "$eFileName ends before $uFileName (at line $snum)\n";
            last;
        }
        chomp($urd);
        chomp($eng);
        my @ul = split /\s+/, $urd;
        my @el = split /\s+/, $eng;
        unless (@ul) {
            # An empty line would make the ratio a division by zero.
            warn "$uFileName line $snum is empty, skipping\n";
            next;
        }
        my $r = ($#el + 1) / ($#ul + 1);
        print "$#el $r\n";
        print STDERR "$urd $snum $#el $#ul\n" if ($r == 1);
    }
    close($uFile);
    close($eFile);
    return;
}

# For "*_urd.snt" and its "*_eng.snt" twin, write the sentences that do NOT
# occur in the corresponding "*.aligned" files into $outDir (same base
# names).  If nothing was aligned the files are copied unchanged; if either
# side has nothing left, nothing is written.  Dies when the two aligned
# files differ in length or a file is missing.
sub collectUnaligned {
    my ($file, $outDir) = @_;

    return unless ($file =~ /_urd\.snt$/);
    my $twinFile = $file;
    $twinFile =~ s/_urd\.snt$/_eng\.snt/;

    my @alignedU = readLines("$file.aligned",
                             "Couldn't find the aligned file for $file\n");
    my @alignedE = readLines("$twinFile.aligned",
                             "Couldn't find the aligned file for $twinFile\n");
    my $ulen = $#alignedU;
    my $elen = $#alignedE;
    die("Something is worng in $file $twinFile | $ulen $elen\n") if ($ulen != $elen);

    if ($ulen == -1) {
        runCommand('cp', $file,     "$outDir/");
        runCommand('cp', $twinFile, "$outDir/");
        return;
    }

    my $unalignedU = dropAligned($file,     \@alignedU, "Couldn't open $file\n");
    my $unalignedE = dropAligned($twinFile, \@alignedE, "Couldn't open $twinFile\n");
    return if ($unalignedU eq "" or $unalignedE eq "");

    foreach my $pair ([$file, $unalignedU], [$twinFile, $unalignedE]) {
        my ($path, $text) = @{$pair};
        my ($base) = $path =~ /([^\/]+)$/;
        open(my $out, '>', "$outDir/$base") or die("Couldn't open $outDir/$base\n");
        print {$out} $text;
        close($out);
    }
    return;
}

# Copy "*_urd.snt" and its "*_eng.snt" twin to $outDir when the
# "*_urd.snt.aligned" file is empty, and print the file name.
sub checkAndCopyUnaligned {
    my ($file, $outDir) = @_;

    return unless ($file =~ /_urd\.snt$/);
    open(my $aFile, '<', "$file.aligned")
        or die("Couldn't find alignment file for $file\n");
    my $firstLine = <$aFile>;    # one line is enough to know it is non-empty
    close($aFile);
    return if (defined $firstLine);

    print $file, "\n";
    my $twinFile = $file;
    $twinFile =~ s/_urd\.snt$/_eng\.snt/;
    runCommand('cp', $file,     "$outDir/");
    runCommand('cp', $twinFile, "$outDir/");
    return;
}

# Accumulate sentence statistics of a "*.snt" file into %$countHashRef:
#   {eng|urd}{'-1'}  total number of lines for that language
#   {eng|urd}{N}     number of lines whose last token index is N
#                    (i.e. token count minus one)
# Every N seen is initialised to 0 for both languages so the two tables can
# be printed side by side.
sub collectSentenceCounts {
    my ($file, $countHashRef) = @_;

    return unless ($file =~ /\.snt$/);
    open(my $iFile, "<:encoding(utf-8)", $file) or die("Couldn't open file $file\n");
    my @lines = <$iFile>;
    close($iFile);

    my $lang = "";
    if ($file =~ /(_ENG_|_eng)/) {
        $lang = 'eng';
    }
    elsif ($file =~ /(_URD_|_urd)/) {
        $lang = 'urd';
    }
    $countHashRef->{$lang}{'-1'} += scalar(@lines) if ($lang ne "");

    foreach my $line (@lines) {
        chomp($line);
        my @tokens = split /\s+/, $line;
        my $bucket = "$#tokens";
        $countHashRef->{'eng'}{$bucket} = 0 unless (defined $countHashRef->{'eng'}{$bucket});
        $countHashRef->{'urd'}{$bucket} = 0 unless (defined $countHashRef->{'urd'}{$bucket});
        $countHashRef->{$lang}{$bucket}++;
    }
    return;
}

# Append "*_urd.snt.aligned" to "$outFile.urd" and its English twin to
# "$outFile.eng".
sub collectAlignedData {
    my ($ufile, $outFile) = @_;

    return unless ($ufile =~ /_urd\.snt\.aligned$/);
    my $efile = $ufile;
    $efile =~ s/_urd\.snt\.aligned$/_eng.snt.aligned/;
    appendFile($ufile, "$outFile.urd");
    appendFile($efile, "$outFile.eng");
    return;
}

# Sanity-check a "*_urd.snt" / "*_eng.snt" pair before alignment.  The pair
# is renamed to *.mismatch (and a message printed) when either file has no
# non-blank content or when the token counts differ by more than roughly a
# factor of four.
sub checkDataForAligner {
    my $file = shift;

    return unless ($file =~ /_urd\.snt$/);
    my $twinFile = $file;
    $twinFile =~ s/_urd\.snt$/_eng.snt/;

    my $urd = slurpFile($file, "Couldn't open $file\n");
    unless ($urd =~ /\S/) {
        print $file, " - Empty urdu file\n";
        markMismatch($file, $twinFile);
        return;
    }
    my @urdCnt = split /\s+/, $urd;

    my $eng = slurpFile($twinFile, "Couldn't open $twinFile\n");
    unless ($eng =~ /\S/) {
        print $file, " - Empty english file\n";
        markMismatch($file, $twinFile);
        return;
    }
    my @engCnt = split /\s+/, $eng;

    if ($#urdCnt - $#engCnt > 3 * $#engCnt or $#engCnt - $#urdCnt > 3 * $#urdCnt) {
        print $file, " - size mismatch\n";
        markMismatch($file, $twinFile);
    }
    return;
}

# Align "*_urd.snt" with its "*_eng.snt" twin using champollion and append
# the aligned sentence groups to $outFile: a header line with the file name,
# then for every alignment one line of English sentences followed by one
# line of Urdu sentences.
# Side effects: writes "$fileName.clean" (input without trailing
# whitespace) and "$fileName.champ" (champollion's output).
sub applyChamp {
    my ($fileName, $outFile) = @_;

    return unless ($fileName =~ /_urd\.snt$/);
    my $twinFile = $fileName;
    $twinFile =~ s/_urd\.snt$/_eng\.snt/;

    my $data = slurpFile($fileName, "Couldn't open $fileName\n");
    $data =~ s/\s+$//;
    open(my $clean, '>', "$fileName.clean")
        or die("Couldn't open $fileName for writing\n");
    print {$clean} $data, "\n";
    close($clean);

    my $champDir = '/afs/cs.cmu.edu/user/abhayaa/abhaya-local/downloads/champollion-1.1';
    {
        local $ENV{CTK} = $champDir;    # champollion locates its data via CTK
        runCommand("$champDir/bin/champollion.EA",
                   $twinFile, "$fileName.clean", "$fileName.champ");
    }

    my @urdu    = readLines("$fileName.clean", "Urdu file not found\n");
    my @english = readLines($twinFile,         "English file not found\n");
    chomp(@urdu);
    chomp(@english);

    open(my $oFile, '>>', $outFile) or die("$outFile couldn't be opened\n");
    open(my $aFile, '<', "$fileName.champ") or die("No alignment file found\n");
    print {$oFile} "\n$fileName\n";
    while (my $alignment = <$aFile>) {
        chomp($alignment);
        next if ($alignment =~ /omitted/);

        # Each line is "<english indexes> <=> <urdu indexes>", indexes being
        # 1-based and comma separated.
        my ($eng, $urd) = split /<=>/, $alignment;
        next unless (defined $eng and defined $urd);
        $eng =~ s/\s+//g;
        $urd =~ s/\s+//g;
        # Keep only indexes that address an existing sentence; an empty or
        # zero index would otherwise wrap around to the last sentence.
        my @ei = grep { /^\d+$/ and $_ >= 1 and $_ <= @english } split /,/, $eng;
        my @ui = grep { /^\d+$/ and $_ >= 1 and $_ <= @urdu } split /,/, $urd;

        # Token totals are only needed by the disabled filter below.
        my $ulen = 0;
        my $elen = 0;
        foreach my $eind (@ei) {
            my @ew = split /\s+/, $english[$eind - 1];
            $elen += @ew;
            print {$oFile} "$english[$eind-1] ";
        }
        print {$oFile} "\n";
        foreach my $uind (@ui) {
            my @uw = split /\s+/, $urdu[$uind - 1];
            $ulen += @uw;
            print {$oFile} "$urdu[$uind-1] ";
        }
        print {$oFile} "\n";

        # unless(($elen+1 > 100) or ($ulen+1 > 100) or ($ulen > 20 and $elen > 2*$ulen) or ($elen > 20 and $ulen > 2*$elen)){
        #     print OFILE "eng\n";
        #     foreach my $eind (@ei){
        #         print OFILE "$english[$eind-1]\n";
        #     }
        #     print OFILE "urd\n";
        #     foreach my $uind (@ui){
        #         print OFILE "$urdu[$uind-1]\n";
        #     }
        #     print OFILE "#\n";
        # }
    }
    close($aFile);
    close($oFile);
    return;
}

# Copy a "*.ltf.xml.flat.reseg" file into the alignment directory under the
# name "<Section>_<name>_{eng,urd}.snt", where <Section> is Found,
# Translations or Special_Corpora (taken from the path) and the language
# markers are removed from <name>.
#   $alignDir - optional destination directory; defaults to the align-eval
#               directory this script has always used.
sub collectForAlign {
    my ($fileName, $alignDir) = @_;

    $alignDir = '/usr0/users/abhaya/data/Nist-DVD/LCTL_Urdu_v1.0/Parallel_Text/flat/reseg/align-eval'
        unless (defined $alignDir and $alignDir ne "");

    my ($name) = $fileName =~ /([^\/]+)\.ltf\.xml\.flat\.reseg$/;
    return unless (defined $name);
    my ($section) = $fileName =~ /(Found|Translations|Special_Corpora)/;
    unless (defined $section) {
        warn "Couldn't tell which section $fileName belongs to, skipping\n";
        return;
    }
    $name = $section . "_" . $name;

    if ($fileName =~ /(_ENG_)|(English)/) {
        $name =~ s/_ENG_/_/;
        $name =~ s/English_//;
        $name .= "_eng.snt";
    }
    else {
        $name =~ s/_URD_/_/;
        $name =~ s/Urdu_//;
        $name .= "_urd.snt";
    }
    runCommand('cp', $fileName, "$alignDir/$name");
    return;
}

# Count, in %$dictRef, every word of a non-English file that directly
# precedes a sentence terminator (Urdu full stop U+06D4, Arabic question
# mark U+061F or "!").
sub collectSenEndStats {
    my ($file, $dictRef) = @_;

    return if ($file =~ /_ENG_|English/);
    open(my $iFile, "<:encoding(utf-8)", $file) or die("Couldn't open $file\n");
    while (my $line = <$iFile>) {
        # Pad EVERY terminator with spaces so the scan below sees each word
        # that precedes one, not only the first on the line.
        $line =~ s/([\x{06D4}\x{061F}!])/ $1 /g;
        while ($line =~ /(\S+)\s+[\x{06D4}\x{061F}!]/g) {
            $dictRef->{$1}++;
        }
    }
    close($iFile);
    return;
}

# ---------------------------------------------------------------------------
# Callbacks for visitAndMirrorDir ($inFile, $outFile, $cbarg)
# ---------------------------------------------------------------------------

# Run the OpenNLP English sentence detector on $inFile and write the result
# to "$outFile.reseg".
sub resegEnglish {
    my ($inFile, $outFile) = @_;

    my $onlpDir   = "/usr0/users/abhaya/downloads/opennlp-tools-1.3.0";
    my $classPath = join(':',
        "$onlpDir/lib/trove.jar",
        "$onlpDir/lib/maxent-2.4.0.jar",
        "$onlpDir/output/opennlp-tools-1.3.0.jar");

    # The redirections need a shell, so the file names are quoted.
    my $command = "/usr0/users/abhaya/downloads/java/bin/java -cp $classPath"
        . " opennlp.tools.lang.english.SentenceDetector $onlpDir/EnglishSD.bin.gz"
        . " < " . shellQuote($inFile)
        . " > " . shellQuote("$outFile.reseg");
    system($command) == 0
        or warn "Sentence detector failed (status $?) for $inFile\n";
    return;
}

# Normalise and re-segment an Urdu file; the result goes to
# "$outFile.reseg" (UTF-8), one sentence per line.
#   * Arabic-Indic digits and Arabic punctuation are mapped to ASCII.
#   * A line that ends in a word known from %dict as a sentence-final word
#     gets an Urdu full stop appended.
#   * The text is then split after every full stop, "?" or "!".
# Nothing is written for an empty input file.
sub resegUrdu {
    my ($inFile, $outFile) = @_;

    my $converted = slurpFile($inFile, "Couldn't open $inFile\n", ':encoding(utf-8)');
    return if ($converted eq "");

    # Convert the numbers (extended and plain Arabic-Indic digits)
    $converted =~ tr/\x{06F0}-\x{06F9}/0-9/;
    $converted =~ tr/\x{0660}-\x{0669}/0-9/;

    # Convert the punct
    $converted =~ s/\x{060C}/,/g;                  # Arabic comma
    $converted =~ s/\x{060D}/-/g;                  # Arabic date separator
    $converted =~ s/\x{061B}/;/g;
    $converted =~ s/\x{061F}/?/g;
    $converted =~ s/\x{0640}//g;                   # tatweel (justification filler)
    $converted =~ s/\x{066A}/%/g;
    $converted =~ s/\x{066B}/./g;                  # decimal separator
    $converted =~ s/\x{066C}/,/g;                  # thousands separator
    $converted =~ s/\x{066D}/*/g;                  # five point star
    $converted =~ s/\x{06D4}/ \x{06D4} /g;         # full stop

    # Convert the direction quotes to neutral quotes
    $converted =~ s/[\x{2018}\x{2019}]/'/g;
    $converted =~ s/[\x{201C}\x{201D}]/"/g;

    $converted =~ s/^\s+//;
    $converted =~ s/\s+$//;

    my @lines = split /\n/, $converted;
    # Drop the first line when it is repeated verbatim as the second one.
    shift @lines if (@lines > 1 and $lines[0] eq $lines[1]);

    $converted = "";
    foreach my $line (@lines) {
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        if ($line !~ /[\x{06D4}\?!]$/
            and $line =~ /([^\x{06D4}\?! ]+)$/
            and $dict{$1})
        {
            # Unterminated line ending in a known sentence-final word.
            $converted .= $line . "\x{06D4} ";
        }
        else {
            $converted .= $line . " ";
        }
    }
    $converted =~ s/([\x{06D4}\?!])\s+/$1\n/g;

    open(my $oFile, ">:encoding(utf-8)", $outFile . ".reseg")
        or die("Couldn't open file for output\n");
    print {$oFile} $converted;
    close($oFile);
    return;
}

# Flatten an LTF XML file: write one line per <SEG> element to
# "$outFile.flat", made of the text of its <TOKEN> elements separated by
# blanks, with the blank in front of a "." removed.
sub flattenLtf {
    my ($inFile, $outFile) = @_;

    open(my $outp, '>', "$outFile.flat")
        or die("can't open output file ($outFile)!");
    my $file = slurpFile($inFile, "can't open \"$inFile\"");

    # NOTE(review): the opening-tag patterns were reconstructed as
    # "<SEG ...>" / "<TOKEN ...>" with arbitrary attributes; confirm against
    # the LTF files in use.
    while ($file =~ /<SEG\b[^>]*>(.*?)<\/SEG>/gs) {
        my $seg  = $1;
        my $sent = "";
        while ($seg =~ /<TOKEN\b[^>]*>(.*?)<\/TOKEN>/gs) {
            $sent .= "$1 ";
        }
        $sent =~ s/\s+\././g;
        print {$outp} "$sent\n";
    }
    close($outp);
    return;
}