#!/usr/bin/perl5.8.5

use strict;
use warnings;
use Encode;
binmode STDOUT, ":utf8";

# Usage: <script> VOCAB_RESEG_FILE OUT_DIR DIR [DIR ...]
#
# Reads every regular file in each DIR as UTF-8, resegments its text so that
# each sentence sits on its own line with numbers and punctuation split off
# as separate tokens, and writes the result to OUT_DIR under the same file
# name.

# First positional argument; consumed so the remaining arguments line up,
# but not used anywhere in this script.
my $vocab_reseg_filename = shift(@ARGV);
my $out_dir = shift(@ARGV);

# resegment_text($text) -> $resegmented
#
# Pure text transformation applied to the whole content of one file:
#   1. map Extended Arabic-Indic (U+06F0..U+06F9) and Arabic-Indic
#      (U+0660..U+0669) digits to ASCII 0-9;
#   2. drop the first line when the first two lines are identical;
#   3. put a space on both sides of every run of ASCII digits;
#   4. remove all original line breaks, then start a new line after every
#      U+06D4 (full stop) and U+066B;
#   5. put a space between each punctuation mark and any adjacent
#      non-whitespace character;
#   6. strip whitespace at the start of lines and one trailing newline.
# Returns the transformed string; the argument is not modified.
sub resegment_text {
    my ($text) = @_;

    # Both digit blocks map onto the same ASCII digits.
    $text =~ tr/\x{06F0}-\x{06F9}\x{0660}-\x{0669}/0-90-9/;

    # Remove one of the first two lines if they are identical.
    if ($text =~ /\A(.*)\n(.*)\n/ && $1 eq $2) {
        $text =~ s/\A.*\n//;
    }

    # Split numbers from other tokens. Only spaces are inserted, so the
    # characters around a number are never consumed.
    $text =~ s/([0-9]+)/ $1 /g;

    # Resegment: discard the original line structure, then break after each
    # sentence terminator.
    # NOTE(review): lines are joined with no separator, so words on either
    # side of an original line break are fused -- confirm this is intended.
    $text =~ s/\n+//g;
    $text =~ s/([\x{06D4}\x{066B}])/$1\n/g;

    # Break punctuation away from neighbouring tokens. Marks are handled one
    # at a time, in this order, so adjacent marks are separated too.
    my @marks = (
        "\x{060C}",    # comma
        "\x{066C}",
        "\x{061F}",    # question mark
        "\x{061B}",    # semicolon
        "\x{06D4}",    # full stop
        "\x{066B}",
        "\x{066A}",    # percent
    );
    for my $mark (@marks) {
        $text =~ s/(\S)($mark)/$1 $2/g;
        $text =~ s/($mark)(\S)/$1 $2/g;
    }

    # Every full stop already ends its line at this point, so no second
    # sentence-splitting pass is needed.

    # Drop whitespace (including blank lines) that follows a line break.
    $text =~ s/\n\s+/\n/g;

    # Remove a single trailing newline, and only a newline.
    $text =~ s/\n\z//;

    return $text;
}

# for all files in all directories, resegment
print "resegmenting ..\n";
while (defined(my $dir = shift(@ARGV))) {
    print ".. $dir\n";

    my $dh;
    unless (opendir($dh, $dir)) {
        warn "cannot open directory $dir: $!\n";
        next;
    }
    my @files = readdir($dh);
    closedir($dh);

    foreach my $filename (@files) {
        next if $filename eq "." || $filename eq "..";
        # Sub-directories and other non-regular entries cannot be resegmented.
        next unless -f "$dir/$filename";
        print "  $filename\n";

        # Read the input first so that an unreadable file does not leave an
        # empty output file behind.
        my $in;
        unless (open($in, "<:utf8", "$dir/$filename")) {
            warn "cannot read $dir/$filename: $!\n";
            next;
        }
        my $file = join("", <$in>);
        close($in);

        my $out;
        open($out, ">:utf8", "$out_dir/$filename")
            or die "cannot write $out_dir/$filename: $!\n";
        print $out resegment_text($file);
        close($out)
            or warn "error closing $out_dir/$filename: $!\n";
    }
}
