#!/usr/local/bin/perl

require "convert-utf8-hebrom.pl";


# Driver: copy the Hebrew-Arabic lexicon to wiki.trf line by line.
# On every line that contains a `|: ["...` field, that field's text is passed
# through convertHebrew() (loaded by the require above), and the text of a
# `-> ["...` field on the same line, if present, is passed through
# to_buckwalter().  Lines without a `|: ["` field are copied unchanged.
my $lex_file = "wikilex-20070402-he-ar.txt";
my $trf_file = "wiki.trf";

# Three-argument open with lexical handles: the file name can never be
# misread as an open mode, and the handles are not package globals.
open(my $lex, '<', $lex_file) or die "Cannot open $lex_file: $!";
open(my $trf, '>', $trf_file) or die "Cannot open $trf_file: $!";

# Number of lines written to the output; reported on STDERR at the end.
my $total = 0;

while (my $line = <$lex>) {
    # The second substitution is attempted only when the first one matched.
    # Each capture runs up to (not including) the next `]`.
    if ($line =~ s{(\|:\s+\[")([^\]]+)}{$1 . convertHebrew($2)}e) {
        $line =~ s{(\->\s+\[")([^\]]+)}{$1 . to_buckwalter($2)}e;
        #if ($line =~ m/[\x{80}-\x{ffff}]/) { print TRF "; UTF-8\n"; }
    }

    print {$trf} $line or die "Cannot write to $trf_file: $!";
    $total++;
}

close($lex) or warn "Error closing $lex_file: $!";
# Buffered write errors only surface when the handle is closed, so check it.
close($trf) or die "Error closing $trf_file: $!";

print STDERR "Total $total\n";


# to_buckwalter($utf8_bytes)
#
# Decodes a UTF-8 encoded byte string and transliterates the Arabic letters
# in it to single ASCII characters using the table below, then normalises
# the result:
#   * the characters F N K a u i ~ o ` (the transliterated diacritics) are
#     deleted,
#   * A > < | are all folded to A,
#   * Y and y are both folded to y,
#   * every remaining non-ASCII character is deleted.
# The normalisation runs on the transliterated string, so ASCII characters
# that were already present in the input are affected by it as well.
#
# Returns the resulting string, which contains only ASCII characters.
sub to_buckwalter {
  my $input = shift;

  # Load Encode here and call decode() by its full name so the sub does not
  # depend on some other file having imported decode() into this package.
  require Encode;
  my $arabic = Encode::decode("utf8", $input);

  # Character Substitution
  $arabic =~ tr/\x{060C}\x{061F}\x{0635}\x{064E}\x{0637}\x{064C}\x{064D}\x{062F}\x{0649}\x{0639}\x{062C}\x{064A}\x{0638}\x{064F}\x{0643}\x{063A}\x{0625}\x{062A}\x{0686}\x{0622}\x{0623}\x{062B}\x{0633}\x{062D}\x{0642}\x{0636}\x{0628}\x{0624}\x{06AF}\x{0670}\x{0632}\x{064B}\x{0648}\x{0634}\x{0631}\x{062E}\x{0630}\x{06A4}\x{0651}\x{0647}\x{0641}\x{0650}\x{0627}\x{0646}\x{0671}\x{067E}\x{0645}\x{0640}\x{0644}\x{0629}\x{0621}\x{0626}\x{0652}/,?SaTNKdYEjyZukg<tJ|>vsHqDb\&G\`zFw$rx*V~hfiAn{Pm_lp\'}o/;

  $arabic =~ s/[FNKaui~o\`]//g;  # Remove "diacritics"
  $arabic =~ s/[A\>\<\|]/A/g;    # "Ambiguate" Alif Hamza
  $arabic =~ s/[Yy]/y/g;         # "Ambiguate" Alif maqsura forms

  # Drop everything that is still outside ASCII.  The class is negated rather
  # than written as \x{80}-\x{ffff} so that code points above U+FFFF are
  # removed too; a leftover wide character would otherwise end up in the
  # caller's output line.
  $arabic =~ s/[^\x{00}-\x{7f}]//g;

  return $arabic;
}
