#!/usr/local/bin/perl -w

# Read the Unicode Character Database.  A lexical handle is used instead of
# the bareword DATA, which Perl reserves for a script's __DATA__ section.
open(my $unicode_data, '<', 'UnicodeData.txt')
    or die "Cannot open UnicodeData.txt: $!";

# Each line of UnicodeData.txt is a list of semicolon-separated fields:
#
# 0. Unicode code point (hex)
# 1. Name
# 2. General Category
# 3. Canonical combining classes
# 4. Bidirectional category
# 5. Character Decomposition Mapping
# 6. Decimal digit value
# 7. Digit value
# 8. Numeric value
# 9. Mirrored (for bidi)
# 10. Unicode 1.0 Name
# 11. 10646 Comment field
# 12. Uppercase mapping
# 13. Lowercase mapping
# 14. Titlecase mapping

# Build the tables behind isLetter, isNumber, isPunct, isWS and upcase:
#   %l2ucase  lowercase letter (category "Ll")  => its uppercase mapping
#   %letter   characters whose category starts with "L"
#   %ws       characters whose category starts with "Z"
#   %punct    characters whose category starts with "P"
#   %num      characters whose category starts with "N"
# Keys (and the values of %l2ucase) are the UTF-8 byte strings returned by
# hex2utf8.  Only code points written with exactly four hex digits are used.
#
# NOTE(review): every line is treated as one code point.  If the input
# abbreviates a block as a "<..., First>" / "<..., Last>" pair, only those
# two endpoints are recorded - confirm whether the range should be expanded.

while (my $line = <$unicode_data>) {
    chomp $line;

    # Limit -1 keeps trailing empty fields, so field 12 is always defined
    # on a well-formed line even after the newline has been removed.
    my @fields = split(/;/, $line, -1);
    next if @fields < 13 or length($fields[0]) != 4;

    my $category = $fields[2];
    my $char     = hex2utf8($fields[0]);

    if ($category eq "Ll" and $fields[12] ne "") {
        $l2ucase{$char} = hex2utf8($fields[12]);
    }

    if ($category =~ m/^L/) {
        $letter{$char} = 1;
    } elsif ($category =~ m/^Z/) {
        $ws{$char} = 1;
    } elsif ($category =~ m/^P/) {
        $punct{$char} = 1;
    } elsif ($category =~ m/^N/) {
        $num{$char} = 1;
    }
}

close($unicode_data);

# All generated C++ goes into this single file; the sections that follow
# keep appending to the UNI handle opened here.
open(UNI, '>', 'UnicodeTools.cpp')
    or die "Cannot create UnicodeTools.cpp: $!";
#open(UNIL, "> UnicodeLetter.cpp") or die $!;
#open(UNIN, "> UnicodeNumber.cpp") or die $!;
#open(UNIW, "> UnicodeWS.cpp") or die $!;
#open(UNIP, "> UnicodePunct.cpp") or die $!;
#open(UNILC, "> UnicodeLC.cpp") or die $!;

print UNI '#include "Unicode.hpp"';

# UnicodeTools::upcase walks the string one UTF-8 sequence at a time (the
# sequence length is chosen from the lead byte) and replaces every sequence
# found in the lcase map with its mapped value.
# NOTE(review): the emitted code has no branch for lead bytes above 0xF7, so
# testchar keeps its previous value for such a byte.
print UNI '
string UnicodeTools::upcase(string utf8str) {
  string testchar, output;

  for (int i = 0; i < utf8str.length(); i++) {
     if ((unsigned char)utf8str[i] <= 0x7f) {
       testchar = utf8str.substr(i, 1);
     } else if ((unsigned char)utf8str[i] <= 0xDF) {
       testchar = utf8str.substr(i, 2);
       i+=1;
     } else if ((unsigned char)utf8str[i] <= 0xEF) {
       testchar = utf8str.substr(i, 3);
       i+=2;
     } else if ((unsigned char)utf8str[i] <= 0xF7) {
       testchar = utf8str.substr(i, 4);
       i+=3;
     }

     if (lcase.find(testchar) != lcase.end()) {
       output += lcase[testchar];
     } else {
       output += testchar;
     }
  }
  return output;
}
';

# UnicodeTools::initlCase fills the lcase map, one assignment per entry of
# %l2ucase.  The keys are sorted so that the generated file is identical
# from run to run; Perl's hash order is not stable.
print UNI 'void UnicodeTools::initlCase() {' . "\n";
foreach my $lower (sort keys %l2ucase) {
    print UNI ' lcase["' . $lower . '"] = "' . $l2ucase{$lower} . "\";\n";
}
print UNI "}\n";


#print UNI '#include "Unicode.hpp"';

# UnicodeTools::isLetter: membership test against the letters set.
print UNI '
bool UnicodeTools::isLetter(string testchar) {
  if (letters.find(testchar) != letters.end()) {
    return true;
  }
  return false;
}

';

# The letters set is filled by a series of functions initLetter0,
# initLetter1, ... each inserting at most $chunk_size entries, plus an
# initLetter() that calls them all in order.
# NOTE(review): presumably Unicode.hpp has to declare every initLetterN that
# is emitted here; the count depends on the input data - verify.
my $chunk_size = 1000;

# Sorted so the generated file is reproducible; hash order is not stable.
my @letter_keys = sort keys %letter;

# Number of initLetterN functions written so far.  Nothing is emitted for an
# empty table, so no unmatched closing brace can end up in the output.
my $inits = 0;
while (my @chunk = splice(@letter_keys, 0, $chunk_size)) {
    print UNI "void UnicodeTools::initLetter${inits}() {\n";
    foreach my $char (@chunk) {
        print UNI ' letters.insert("' . $char . "\");\n";
    }
    print UNI "}\n\n";
    $inits++;
}

print UNI 'void UnicodeTools::initLetter() {' . "\n";
foreach my $n (0 .. $inits - 1) {
    print UNI "\tinitLetter$n();\n";
}
print UNI "}\n";


#print UNI '#include "Unicode.hpp"';

# UnicodeTools::isNumber: membership test against the numbers set.
print UNI '
bool UnicodeTools::isNumber(string testchar) {
  if (numbers.find(testchar) != numbers.end()) {
    return true;
  }
  return false;
}
';

# UnicodeTools::initNumber inserts every key of %num into the numbers set.
# Keys are sorted so the generated file is reproducible; Perl's hash order
# is not stable between runs.
print UNI 'void UnicodeTools::initNumber() {' . "\n";
foreach my $char (sort keys %num) {
    print UNI ' numbers.insert("' . $char . "\");\n";
}
# No trailing newline here: the text printed next starts with one.
print UNI '}';

#print UNI '#include "Unicode.hpp"';
# UnicodeTools::isPunct: membership test against the punctuation set.
print UNI '
bool UnicodeTools::isPunct(string testchar) {
  if (punctuation.find(testchar) != punctuation.end()) {
    return true;
  }
  return false;
}
';

# UnicodeTools::initPunct inserts every key of %punct into the punctuation
# set.  Keys are sorted so the generated file is reproducible; Perl's hash
# order is not stable between runs.
print UNI 'void UnicodeTools::initPunct() {'. "\n";
foreach my $mark (sort keys %punct) {
    # Backslash and double quote must be escaped inside a C++ string
    # literal.  Every occurrence is escaped, not only a key that consists of
    # exactly that one character; neither byte can occur inside a multi-byte
    # UTF-8 sequence, so other keys are left untouched.
    (my $escaped = $mark) =~ s/(["\\])/\\$1/g;
    print UNI ' punctuation.insert("' . $escaped . "\");\n";
}
# No trailing newline here: the text printed next starts with one.
print UNI '}';

#print UNIW '#include "Unicode.hpp"';

# UnicodeTools::isWS: membership test against the ws set.
print UNI '
bool UnicodeTools::isWS(string testchar) {
  if (ws.find(testchar) != ws.end()) {
    return true;
  }
  return false;
}

';

# UnicodeTools::initWS inserts every key of %ws into the ws set.  Keys are
# sorted so the generated file is reproducible; Perl's hash order is not
# stable between runs.
print UNI 'void UnicodeTools::initWS() {' . "\n";
foreach my $char (sort keys %ws) {
    print UNI ' ws.insert("' . $char . "\");\n";
}
print UNI "}\n";

# This is the last section written to UnicodeTools.cpp.  Output is buffered,
# so a failed write (e.g. a full disk) may only be reported by close.
close(UNI) or die "Cannot finish writing UnicodeTools.cpp: $!";




# hex2utf8:  Take a string of hex digits (0-9A-F, with or without a leading
# "0x") naming a Unicode code point and convert it to the corresponding
# (1, 2, 3, or 4 byte) UTF-8 representation.
#
# Argument: the code point as a hex string, e.g. "0041" or "0x20AC".
# Returns:  a byte string holding the UTF-8 encoding of that code point.
# Dies if the value is above 10FFFF, which no UTF-8 sequence can encode.
#
# All working variables are lexical, so calling this sub does not touch any
# package globals.

sub hex2utf8 {
    my ($hexchar) = @_;

    # hex() accepts the digits with or without a "0x" prefix.
    my $code_point = hex($hexchar);

    # One byte: 0xxxxxxx
    if ($code_point <= 0x7F) {
        return pack("C", $code_point);
    }

    # Two bytes: 110xxxxx 10xxxxxx
    if ($code_point <= 0x7FF) {
        return pack("C2",
            0xC0 | ($code_point >> 6),
            0x80 | ($code_point & 0x3F));
    }

    # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ($code_point <= 0xFFFF) {
        return pack("C3",
            0xE0 | ($code_point >> 12),
            0x80 | (($code_point >> 6) & 0x3F),
            0x80 | ($code_point & 0x3F));
    }

    # Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if ($code_point <= 0x10FFFF) {
        return pack("C4",
            0xF0 | ($code_point >> 18),
            0x80 | (($code_point >> 12) & 0x3F),
            0x80 | (($code_point >> 6) & 0x3F),
            0x80 | ($code_point & 0x3F));
    }

    die "hex2utf8: code point '$hexchar' is above 10FFFF\n";
}
