#!/usr/local/bin/perl

# This script reads in a rules file and a input file and romanizes the urdu
# unicode input.

# Author: Abhaya Agarwal abhayaa@cs.cmu.edu
# 10-29-2007

use strict;
use warnings;
use Getopt::Long;

binmode(STDOUT,":utf8");

use vars qw($opt_out $opt_in $opt_original $opt_noDict);
GetOptions( "in=s", "out=s", "original", "noDict");

my $ruleFile = "romanization-mappings.txt";
my $dictFile = "romanization-dict.txt";

unless(defined $opt_in){
	printUsage();
	die("\nNo input file !\n") 
}

my %rules = ();
open(RFILE, $ruleFile) or die("Couldn't open the file $ruleFile\n");

while(<RFILE>){
	chomp;
	(my $urd, my $eng) = split /\-/;
	$urd =~ s/_0x/}\\x{/g;
	$urd =~ s/^0x/\\x{/;
	$urd .= '}';
	$rules{$urd} = $eng;
}
close RFILE;

my %dict = ();
unless(defined $opt_noDict){
	open(DFILE, "<:encoding(utf8)", $dictFile) or die("Couldn't open the file $dictFile\n");
	
	while(<DFILE>){
		chomp;
		(my $eng, my $urd) = split /\s+/;
		$dict{$urd} = $eng;
	}
	close DFILE;
}

open(OUT,">$opt_out") or die("Couldn't open the output file: $opt_out\n") if(defined $opt_out);
open(IFILE, "<:encoding(UTF-8)",$opt_in) or die("Couldn't open the file $opt_in\n");

while(<IFILE>){
	if(defined $opt_out){
		print OUT romanizeUrdu($_,1);
	}
	else{
		print romanizeUrdu($_,1);
	}
}
close IFILE;
close OUT;

# take in one line of text and romanize it
# romanizeUrdu(String)
sub romanizeUrdu{
	my $text = shift;
	
	my $rtext = "";
	my @lines = split /\n/; 
	foreach my $line(@lines){
		$line =~ s/(\x{06D4})/ $1 /g; # seperate out urdu sentence marker
		my @words = split /\s+/,$line;
		foreach my $word (@words){
			my $ew = $word;
			unless (defined $dict{$ew}){
				$ew =~ s/(.)\x{0651}/$1$1/g; #gemination
				
				$ew =~ s/[\x{06CC}\x{06D2}]\x{0670}$/a/; #word end yeh + superscript alef
				$ew =~ s/^[\x{06CC}\x{06D2}]/y/; #word initial big and small yeh forms a consonant
				
				$ew =~ s/\x{0627}\x{064B}/an/g; # alef+do zabar changes to -an
				$ew =~ s/^\x{0648}/v/; # word initial vao forms a consonant
				
# Apply rules in decreasing order of context length
				foreach my $rule (sort {length($b) <=> length($a)} keys %rules){
					$ew =~ s/$rule/$rules{$rule}/g;
	# 				last if($ew !~ /[\x{0600}-\x{0647}\x{0649}-\x{06FF}]/);
				}
				
				$ew =~ s/([bptjhdkg])[\x{0647}\x{06C1}]/$1_h/g; # Aspirated characters are formed only by dochashmi heh and not the other heh
				$ew =~ s/[\x{0647}\x{06c1}]/h/g; # Rest of the cases
				
				$ew =~ s/\x{0648}\x{0648}/vo/g; # special case of two vao together
				$ew =~ s/\x{0648}N$/oN/g; # At the end of the word, followed by nasalization, vao has to be a vowel.
				$ew =~ s/\x{06CC}\x{0648}/yo/g; # following yeh, it is a vowel
				$ew =~ s/([aeiou])\x{0648}/$1v/g; # otherwise following a vowel (starting a syllable), voa is a consonant
				$ew =~ s/\x{0648}/o/g; # else it is a vowel
				
				$ew =~ s/([aeiou])\x{06CC}/$1y/g; # farsi yeh is used for y and I, so after a vowel, it will be y.
				$ew =~ s/\x{06CC}o/yo/g; # yeh followed by vao that is realized as a vowel is a consonant
				$ew =~ s/\x{06CC}/i/g; # other places, let us hope it is I.
								
				$ew =~ s/\x{0654}//g; # hamza is unproductive in remaining cases
				
				# Convert the punctuation
				$ew =~ s/\x{060C}/,/;
				$ew =~ s/\x{061F}/?/;
				$ew =~ s/\x{061B}/;/;
				$ew =~ s/\x{06D4}/./;
				$ew =~ s/\x{066A}/%/;
				$ew =~ s/\x{066B}/./;
				$ew =~ s/\x{066C}/,/;
				
				$dict{$word} = $ew;
			}
			$rtext .= "$dict{$word} ";
			$rtext .= "/$word " if (defined($opt_original));
		}
		$rtext .= "\n";
	}
	return $rtext;
}

sub printUsage{
	print "romanize.pl: Convert unicode urdu text in romanazied Urdu.\n";
	print "Options: -i <input file> [-output<output file>] [-original] [-noDict]\n";
	print "This script uses a rule file and optionally a dict file. They should be present in the same directory\n";
}