#!/usr/local/bin/perl
# This script reads in a lexicon file given in llf format and generates
# a xfer style lexicon from that.
#
# Usage: <script> llf-file [type [output-dir]]
#   type        'xfer' (default) writes one Xfer lexicon file per POS tag
#               into output-dir.  Any type matching /dict/ walks the lexicon
#               as a bilingual dictionary and prints the number of pairs;
#               'dict-giza' additionally prints one pair per line.
#   output-dir  where the Xfer lexicon files are written (defaults to the
#               project data directory).
#
# In-memory lexicon layout (shared by the loader and both printers):
#   $lexicon{$srcWord}{'id'}  = entry id
#   $lexicon{$srcWord}{$POS}  = [ english gloss, ... ]

use strict;
use warnings;

my $fileName = shift;
my $type     = shift || 'xfer';
my $outDir   = shift || '/afs/cs/project/avenue-1/Avenue/Urdu-MT/data';

die("Usage: $0 llf-file [xfer|dict-giza [output-dir]]\n")
    unless defined $fileName;

my %lexicon = ();

#[TODO] Read the pos mapping from docs/POS-mapping.txt
# Maps the (lower-cased) POS names used in the LLF file to Xfer POS tags.
my %posMap = (
    noun  => 'N',
    verb  => 'V',
    adj   => 'JJ',
    adv   => 'RB',
    none  => 'LEX',
    pron  => 'PRP',
    tmp   => 'IN',
    loc   => 'IN',
    num   => 'CD',
    whq   => 'WH',
    infl  => 'INFL',
    dem   => 'DEM',
    conj  => 'CC',
    det   => 'DET',
    other => 'LEX',
);

loadLLFLexiconFromFile($fileName, \%lexicon);
printXferLexicon(\%lexicon, $outDir) if($type eq 'xfer');
printBiLingualDict(\%lexicon)        if($type =~ /dict/);

# Takes in a hash and prints that out as a simple
# bilingual dict with one pair on every line:
# English word, followed by the urdu word and the POS tag.
# The pair lines are only printed for type 'dict-giza'; the total number
# of pairs is always printed last.
# printBiLingualDict(hashRef)
sub printBiLingualDict {
    my $lexiconRef = shift;

    # Must start at zero; 'my $count++' would start the tally at one.
    my $count = 0;

    foreach my $word (sort keys %{$lexiconRef}) {
        my $entry = $lexiconRef->{$word};
        foreach my $pos (sort keys %{$entry}) {
            next if($pos eq 'id');    # bookkeeping key, not a POS
            foreach my $eWord (@{ $entry->{$pos} }) {
                $count++;
                print "$eWord $word $pos\n" if($type eq 'dict-giza');
                # print "$eWord @ $word\n" if($type eq 'dict-hun');
            }
        }
    }
    print "$count\n";
}

# Takes in a hash and prints out a Xfer format lexicon, one file per POS
# tag.  All closed-class tags share the single file fullform-dvd-others.lex.
# Dies if an output file cannot be opened or closed.
# printXferLexicon(hashRef, outDir)
sub printXferLexicon {
    my $lexiconRef = shift;
    my $outDir     = shift;

    my %closedClass = map { $_ => 1 } qw(WH CD INFL PRP DEM CC DET);

    # Several LLF names map to the same tag; visit every tag once, in a
    # stable order.
    my %donePOS = ();
    my @posTags = grep { !$donePOS{$_}++ } map { $posMap{$_} } sort keys %posMap;

    # Files already written during this call.  The shared closed-class file
    # is truncated only the first time and appended to afterwards, so one
    # tag no longer wipes out what the previous tags wrote.
    my %started = ();

    foreach my $pos (@posTags) {
        print "Writing $pos lexicon..\n";

        my ($path, $label) = $closedClass{$pos}
            ? ("$outDir/fullform-dvd-others.lex", 'closed class')
            : ("$outDir/fullform-dvd-$pos.lex",   $pos);
        my $mode = $started{$path}++ ? '>>' : '>';
        open(my $out, $mode, $path)
            or die("couldn't open file $path to write $label lexicon: $!\n");

        foreach my $word (sort keys %{$lexiconRef}) {
            my $entry = $lexiconRef->{$word};
            next unless(ref($entry->{$pos}) eq 'ARRAY');

            my $id    = defined $entry->{'id'} ? $entry->{'id'} : '';
            my $count = 1;
            my %done  = ();    # Keep track of repeated entries

            foreach my $gloss (@{ $entry->{$pos} }) {
                # Work on a copy so the stored lexicon is left untouched,
                # and drop a trailing parenthesised note, e.g. "bank (river)".
                my $eWord = $gloss;
                $eWord =~ s/\s*\(.*\)$//;
                next if($done{$eWord}++);

                print {$out} "{$id-$pos,$count}\n";
                print {$out} "${pos}::${pos} |: ";
                print {$out} "[\"$word\"] -> ";
                print {$out} "[\"$eWord\"]\n";
                print {$out} "(\n\t(X1::Y1)\n";
                print {$out} "\t((x0 lex) = $word)\n";
                print {$out} ")\n\n";
                $count++;
            }
        }
        close($out) or die("couldn't finish writing $path: $!\n");
    }
}

# Load the lexicon from a LLF format file into the given hash.
# Dies if the file cannot be opened.
# loadLLFLexiconFromFile(fileName, hashRef)
#
# NOTE(review): the tag literals of the original loader were lost from the
# source text; only the closing tags </ENTRY>, </WORD>, </POS> and </GLOSS>
# survived.  The parsing below is rebuilt from those and from how the
# printers read the lexicon -- confirm against a real LLF file.
sub loadLLFLexiconFromFile {
    my $fileName   = shift;
    my $lexiconRef = shift;

    open(my $in, '<', $fileName)
        or die("Couldn't open the file $fileName: $!\n");

    my $entryNo = 0;
    while((my $entry = readEntry($in)) ne "") {
        $entryNo++;

        # Read off the header: anything in front of the opening ENTRY tag
        # (only expected before the very first entry) is discarded.
        unless($entry =~ s/\A.*?(?=<ENTRY\b)//s) {
            print STDERR "Skipping chunk $entryNo: no opening ENTRY tag\n";
            next;
        }

        # NOTE(review): assumes the entry id is an id="..." attribute of the
        # ENTRY tag; falls back to the running entry number otherwise.
        my $entryId = $entry =~ /<ENTRY\b[^>]*?\bid\s*=\s*["']?([^"'\s>]+)/
            ? $1
            : $entryNo;

        # Get the entry word
        my ($word) = $entry =~ /<WORD\b[^>]*>\s*(.+?)\s*<\/WORD>/;
        unless(defined $word) {
            print STDERR "Skipping entry $entryId: no WORD element\n";
            next;
        }
        $lexiconRef->{$word}{'id'} = $entryId;

        # Every remaining line carrying a POS/GLOSS pair contributes glosses.
        foreach my $line (split /\r?\n/, $entry) {
            my ($pos, $src, $engWords) = parseGlossLine($line) or next;
            push @{ $lexiconRef->{$word}{$pos} }, @{$engWords};
        }
    }
    close($in);
}

# Parse one POS/GLOSS line of an entry.
# Returns ($pos, $src, \@engWords), or an empty list when the line does not
# carry a POS/GLOSS pair.  $pos is already mapped through %posMap.
# parseGlossLine(line)
#
# NOTE(review): the original name of this sub and the markup that carried
# the $src field were lost from the source text.  $src is returned here as
# whatever sits between </POS> and the opening GLOSS tag; the loader does
# not use it.
sub parseGlossLine {
    my $line = shift;
    return unless(defined $line);
    return unless($line =~
        /<POS\b[^>]*>\s*(.+?)\s*<\/POS>\s*(.*?)\s*<GLOSS\b[^>]*>(.+)<\/GLOSS>/);
    my ($pos, $src, $eString) = ($1, $2, $3);

    $pos = 'other' if($pos =~ /\s/);    # Handle "Adj or Adv"
    $pos = lc($pos);
    unless(defined $posMap{$pos}) {
        # Report the unknown POS name, then file the glosses under 'other'
        # rather than under an undefined hash key.
        print STDERR "$pos $line\n";
        $pos = 'other';
    }
    $pos = $posMap{$pos};

    # Comma separated glosses; trim the blanks around the commas and drop
    # empty items.
    my @engWords = ();
    foreach my $eWord (split /,/, $eString) {
        $eWord =~ s/^\s+//;
        $eWord =~ s/\s+$//;
        push @engWords, $eWord if(length $eWord);
    }
    return ($pos, $src, \@engWords);
}

# Read lines from the given file handle up to and including the next line
# that contains </ENTRY> and return them as one string.  Returns "" at the
# end of the file; a trailing unterminated entry is discarded.
# readEntry(File)
sub readEntry {
    my $file = shift;
    die("readEntry: no input file handle given\n") unless(defined $file);

    my $entry = "";
    # defined() so that a final line consisting of "0" is not mistaken for
    # the end of the file.
    while(defined(my $line = <$file>)) {
        $entry .= $line;
        return $entry if($line =~ /<\/ENTRY>/);
    }
    return "";
}