#!/usr/local/bin/perl

# Conversion of Arabic punctuation and numerals

use strict;
use warnings;

# Main driver: reads the UTF-8 file named by the first command-line argument,
# normalises every line and prints the result to STDOUT, one line per input line.
my $fileName = shift;
defined $fileName or die("Usage: $0 <utf-8 input file>\n");

# The input is decoded from UTF-8 and only selected punctuation and digits are
# replaced, so other non-ASCII characters reach the output unchanged. Without
# an explicit encoding layer perl warns "Wide character in print" and the
# bytes written depend on which code points a given string happens to contain.
binmode(STDOUT, ":encoding(UTF-8)");

open(my $inputFh, "<:encoding(utf-8)", $fileName)
	or die("Couldn't open the file $fileName: $!\n");

while (my $rawLine = <$inputFh>) {
	my $line = convertUrduPunctuations($rawLine);
	$line = convertUrduNumbers($line);
	# tokenizePunctuation strips trailing whitespace, which includes the
	# input newline, so the line terminator is added back when printing.
	$line = tokenizePunctuation($line);
	print $line, "\n";
}
close($inputFh);


# Separates the characters < > " = ? from neighbouring text with spaces, then
# collapses every whitespace run to a single space and removes leading and
# trailing whitespace. Takes one string and returns the normalised copy.
sub tokenizePunctuation{
	my ($text) = @_;

	for ($text) {
		s{([<>"=?])}{ $1 }g;
		s{\s+}{ }g;
		# Whitespace runs are single spaces by now, so at most one space can
		# remain at either end of the string.
		s{\A }{};
		s{ \z}{};
	}

	return $text;
}

# Replaces Arabic-script punctuation marks and typographic quotation marks
# with ASCII punctuation, and deletes the tatweel character.
# Takes one string and returns the converted copy.
sub convertUrduPunctuations{
	my ($text) = @_;

	# Every key is a non-ASCII character and every value is plain ASCII, so a
	# single pass produces the same result as one substitution per character.
	my %asciiFor = (
		"\x{060C}" => ' ,',   # Arabic comma
		"\x{060D}" => '-',    # Arabic date separator
		"\x{061B}" => ' ;',   # Arabic semicolon
		"\x{061F}" => ' ?',   # Arabic question mark
		"\x{0640}" => '',     # tatweel (elongation used for justification)
		"\x{066A}" => '%',    # Arabic percent sign
		"\x{066B}" => '.',    # decimal separator
		"\x{066C}" => ',',    # thousands separator
		"\x{066D}" => '*',    # five pointed star
		"\x{06D4}" => ' .',   # full stop
		"\x{2018}" => " ' ",  # left single quotation mark
		"\x{2019}" => " ' ",  # right single quotation mark
		"\x{201C}" => ' " ',  # left double quotation mark
		"\x{201D}" => ' " ',  # right double quotation mark
	);

	$text =~ s/([\x{060C}\x{060D}\x{061B}\x{061F}\x{0640}\x{066A}-\x{066D}\x{06D4}\x{2018}\x{2019}\x{201C}\x{201D}])/$asciiFor{$1}/g;

	return $text;
}

# Converts Arabic-Indic digits (U+0660..U+0669) and Extended Arabic-Indic
# digits (U+06F0..U+06F9) to the ASCII digits 0-9.
# Takes one string and returns the converted copy.
sub convertUrduNumbers{
	my ($text) = @_;

	# Both ten-character ranges map position by position onto 0-9.
	$text =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}/0-90-9/;

	return $text;
}