#!/usr/bin/perl5.8.5

# Command line: <in_dir> <out_dir>
#   $in_dir  - directory whose entries are resegmented
#   $out_dir - directory that receives one output file per input file
# Both are kept as package globals because the rest of the script reads them.
$in_dir  = shift(@ARGV);
$out_dir = shift(@ARGV);
unless (defined($in_dir) && defined($out_dir)) {
    die "usage: $0 <in_dir> <out_dir>\n";
}

# List the input directory. Fail loudly here: an unreadable directory would
# otherwise leave @files empty and the script would silently do nothing.
opendir(my $in_dir_handle, $in_dir)
    or die "cannot open input directory '$in_dir': $!\n";
@files = readdir($in_dir_handle);
closedir($in_dir_handle);

# load in ldc dict as well (only english side)
# %dict_english maps a lowercased word to 1. The first whitespace-separated
# token of each dictionary line is taken as the English word; words that
# contain a digit are skipped.
%dict_english = ();
my $dict_path = "/afs/cs/project/avenue-1/Avenue/Urdu-MT/data/ldc-dict-hun";
if (open(my $dict_fh, '<', $dict_path)) {
    while (my $line = <$dict_fh>) {
        # Skip lines that do not have at least two tokens; without this
        # guard $1 would still hold the value from a previous match.
        next unless $line =~ /(\S+)\s+\S+/;
        my $english = lc($1);
        next if $english =~ /[0-9]/;
        $dict_english{$english} = 1;
    }
    close($dict_fh);
}
else {
    # The dictionary is only consulted through %dict_english, so a missing
    # file is reported but does not stop the run.
    warn "cannot open dictionary '$dict_path': $!\n";
}

# Words added by hand on top of the dictionary file.
foreach my $extra_word (qw(
    outstanding understand understood barnard alipur
    preferred goodwill ceasefire everyone tipped
    usefulness newspaper newspapers cannot wellbeing
    crossroad crossroads mindset mindsets backdrop
)) {
    $dict_english{$extra_word} = 1;
}

# for all files in all directories, resegment
#
# Every regular file in $in_dir is read, passed through resegment_text(),
# and written under the same name into $out_dir.

# Ordered abbreviation rules applied to lowercased text, each anchored at a
# line start or a preceding space. Entry: [ regex fragment, replacement ].
# Abbreviations that keep their periods are swapped for __XX__ placeholders so
# the period-based resegmentation below does not split on them; the others
# are expanded to the full word.
my @abbreviation_rules = (
    [ 'a \. b \.', '__AB__' ],
    [ 'a \. d \.', '__AD__' ],
    [ 'a \. m \.', '__AM__' ],
    [ 'b \. c \.', '__BC__' ],
    [ 'b \. s \.', '__BS__' ],
    [ 'd \. c \.', '__DC__' ],
    [ 'aug \.',    ' august' ],
    [ 'sep \.',    ' september' ],
    [ 'oct \.',    ' october' ],
    [ 'nov \.',    ' november' ],
    [ 'dec \.',    ' december' ],
    [ 'jan \.',    ' january' ],
    [ 'feb \.',    ' february' ],
    [ 'mar \.',    ' march' ],
    [ 'apr \.',    ' april' ],
    [ 'jun \.',    ' june' ],
    [ 'jul \.',    ' july' ],
    [ 'ave \.',    ' avenue' ],
    [ 'jr \.',     ' junior' ],
    [ 'abbr \.',   ' abbreviation' ],
    [ 'capt \.',   ' captain' ],
    [ 'col \.',    ' colonel' ],
    [ 'comdr \.',  ' commander' ],
    [ 'corp \.',   ' corporation' ],
    [ 'cpl \.',    ' corporal' ],
    [ 'gen \.',    ' general' ],
    [ 'lt \.',     ' lieutenant' ],
    [ 'dept \.',   ' department' ],
    [ 'inc \.',    ' incorporated' ],
    [ 'dr \.',     '__DR__' ],
    [ 'mr \.',     '__MR__' ],
    [ 'mrs \.',    '__MRS__' ],
    [ 'ms \.',     '__MS__' ],
    [ 'u \. s \.', '__US__' ],
    [ 'm \. d \.', '__MD__' ],
);

# Placeholder -> canonical written form, applied after punctuation splitting.
my @placeholder_restorations = (
    [ '__AB__',  ' a.b.' ],
    [ '__AD__',  ' a.d.' ],
    [ '__AM__',  ' a.m.' ],
    [ '__US__',  ' u.s.' ],
    [ '__BC__',  ' b.c.' ],
    [ '__BS__',  ' b.s.' ],
    [ '__DC__',  ' d.c.' ],
    [ '__MD__',  ' m.d.' ],
    [ '__DR__',  ' dr.' ],
    [ '__MR__',  ' mr.' ],
    [ '__MRS__', ' mrs.' ],
    [ '__MS__',  ' ms.' ],
);

# Normalize one document and resegment it so that every period ends a line.
#   $text   - full contents of one input file
#   returns - lowercased, tokenized text with one period-terminated segment
#             per line and no trailing newline
sub resegment_text {
    my ($text) = @_;

    # split weird pairs (two capitalized words run together, e.g. "FooBar")
    $text =~ s/(^| )([A-Za-z][a-z]+)([A-Z][a-z]+)/$1$2 $3/gm;
    # to lowercase
    $text = lc($text);
    # remove one of first two lines if identical
    if ($text =~ /^(.*)\n(.*)\n/m) {
        my ($line_1, $line_2) = ($1, $2);
        if ($line_1 eq $line_2) {
            $text =~ s/^.*\n//;
        }
    }

    # collapse known abbreviations
    foreach my $rule (@abbreviation_rules) {
        my ($abbrev, $replacement) = @$rule;
        $text =~ s/(?:^| )$abbrev/$replacement/gm;
    }
    # The prefix group is non-capturing so that $1/$2 are the letter/digits.
    # Previously $1 was the "(^| )" prefix, which dropped the initial letter
    # and the second digit from the placeholder.
    $text =~ s/(?:^| )([a-z]) \./__$1__/gm;                    # initials
    $text =~ s/(?:^| )([0-9]) \. ([0-9]) ?lak/__$1_$2__/gm;    # money

    # resegment according to period
    # NOTE(review): newlines are removed without inserting a space, so the
    # last word of a line is joined to the first word of the next line unless
    # the input already has a space there - confirm this is intended.
    $text =~ s/\n+//gm;
    $text =~ s/\./\.\n/gm;

    # normalize quotes
    $text =~ s/\`\`/\"/gm;
    $text =~ s/\'\'/\"/gm;

    # break up punctuation from other tokens:
    # period, comma, question mark, semicolon, double quote
    foreach my $mark ('\.', ',', '\?', ';', '"') {
        $text =~ s{(\S)($mark)}{$1 $2}gm;
        $text =~ s{($mark)(\S)}{$1 $2}gm;
    }
    # single quote and backtick are only split off at a token edge
    foreach my $quote ("'", '`') {
        $text =~ s{(\S)($quote)(\s)}{$1 $2$3}gm;
        $text =~ s{(\s)($quote)(\S)}{$1$2 $3}gm;
    }
    # brackets, forward slash, backslash
    foreach my $mark ('\[', '\]', '/', '\\\\') {
        $text =~ s{(\S)($mark)}{$1 $2}gm;
        $text =~ s{($mark)(\S)}{$1 $2}gm;
    }

    # split 's from token
    $text =~ s/(\S)\'s/$1 \'s/gm;
    # rejoin 's
    # NOTE(review): this also glues a free-standing quote to any following
    # word that starts with "s" (e.g. "' said" -> "'said") - confirm.
    $text =~ s/\'\s+s/\'s/gm;

    # abbreviations to canonical form
    foreach my $rule (@placeholder_restorations) {
        my ($placeholder, $canonical) = @$rule;
        $text =~ s/$placeholder/$canonical/gm;
    }
    $text =~ s/__(.)__/ $1./gm;                  # initials
    $text =~ s/__(.)_(.)__/ ${1}.${2}lak/gm;     # money

    # drop whitespace at the start of each new segment
    $text =~ s/\n\s+/\n/gm;
    # Strip a single trailing newline only. The earlier test
    # ($file[length($file) - 1] == '\n') was always true and therefore
    # chopped the final character even when it was not a newline.
    $text =~ s/\n\z//;

    return $text;
}

print "resegmenting ..\n";
# look at contents of directory
foreach my $filename (@files) {
    next if $filename eq "." || $filename eq "..";

    my $in_path  = "$in_dir/$filename";
    my $out_path = "$out_dir/$filename";

    # readdir also returns subdirectories and other non-file entries
    next unless -f $in_path;
    print "  $filename!\n";

    # Read the input first so that an unreadable input never truncates an
    # existing output file.
    my $in_fh;
    unless (open($in_fh, '<', $in_path)) {
        warn "cannot read '$in_path': $!\n";
        next;
    }
    my $text = do { local $/; <$in_fh> };
    close($in_fh);
    $text = "" unless defined $text;

    my $out_fh;
    unless (open($out_fh, '>', $out_path)) {
        warn "cannot write '$out_path': $!\n";
        next;
    }
    print {$out_fh} resegment_text($text);
    # buffered write errors only surface when the handle is closed
    close($out_fh) or warn "error closing '$out_path': $!\n";

    # A disabled experiment used to follow here: it walked every token of
    # every output line and, for tokens not found in %dict_english, tried
    # each split point; when both halves were dictionary words longer than
    # three characters (suffixes "fully" and "ally" excluded) it printed the
    # pair and died. It was commented out in full and never affected output.
}


