#!/usr/local/bin/perl -w

# Read UnicodeData.txt and build the lookup tables that the rest of this
# script turns into C++ source.  Each record is one line of
# semicolon-separated fields:
#
# 0. Unicode point
# 1. Name
# 2. General Category
# 3. Canonical combining classes
# 4. Bidirectional category
# 5. Character Decomposition Mapping
# 6. Decimal digit value
# 7. Digit value
# 8. Numeric value
# 9. Mirrored (for bidi)
# 10. Unicode 1.0 Name
# 11. 10646 Comment field
# 12. Uppercase mapping
# 13. Lower case mapping
# 14. Title case mapping
#
# The tables are package-level hashes keyed by the UTF-8 encoding of the
# code point:
#   %l2ucase                     "Ll" character -> UTF-8 of its uppercase mapping
#   %letter, %ws, %punct, %num   membership sets (every value is 1)
#
# NOTE(review): if the data file describes a block of characters with a
# pair of "First"/"Last" marker records, only those two code points are
# recorded here -- TODO confirm whether such ranges should be expanded.

my $unicode_data_path = "UnicodeData.txt";
open(my $unicode_data, '<', $unicode_data_path)
    or die "Cannot open $unicode_data_path: $!";

while (my $record = <$unicode_data>) {
    chomp $record;

    # A limit of -1 keeps trailing empty fields, so the case-mapping
    # columns are always defined on a complete record.
    my @fields = split(/;/, $record, -1);
    next if @fields < 13;                # blank or truncated record
    next if length($fields[0]) != 4;     # only 4-hex-digit code points are handled

    my ($code_point, $category, $uppercase) = @fields[0, 2, 12];

    # Encode the code point once and reuse it as the key of every table.
    my $utf8 = hex2utf8($code_point);

    if ($category eq "Ll" and $uppercase ne "") {
        $l2ucase{$utf8} = hex2utf8($uppercase);
    }

    # The first letter of the general category selects the character class.
    if ($category =~ m/^L/) {
        $letter{$utf8} = 1;
    } elsif ($category =~ m/^Z/) {
        $ws{$utf8} = 1;
    } elsif ($category =~ m/^P/) {
        $punct{$utf8} = 1;
    } elsif ($category =~ m/^N/) {
        $num{$utf8} = 1;
    }
}
close($unicode_data);

# Create the generated C++ source file and write its #include preamble.
# The bareword handle UNI is shared by every print statement further down,
# so its name must stay as it is.  The three-argument form keeps the open
# mode separate from the file name.
open(UNI, '>', "UnicodeTools.cpp")
    or die "Cannot open UnicodeTools.cpp for writing: $!";

print UNI <<'END_CPP';
#include "Unicode.hpp"
#include <ctype.h>

END_CPP

# Emit UnicodeTools::upcase() and UnicodeTools::lowercase().
#
# Both generated functions walk the input one UTF-8 sequence at a time,
# choosing the sequence length from the lead byte.  upcase() replaces each
# sequence found in the lcase map with its mapped value; lowercase() only
# folds single-byte (ASCII) characters through tolower().
#
# Bytes above 0xF7 cannot start a UTF-8 sequence.  They used to fall
# through every branch, which left the previous character in testchar and
# emitted it a second time while dropping the offending byte.  The final
# else branch now copies such a byte through unchanged, and lowercase()
# only calls tolower() on bytes <= 0x7f so that a passed-through high byte
# is never handed to tolower() as a negative char.
#
# The heredoc is single-quoted: the C++ text is written verbatim.
print UNI <<'END_CPP';

string UnicodeTools::upcase(string utf8str) {
  string testchar, output;

  for (unsigned int i = 0; i < utf8str.length(); i++) {
     if ((unsigned char)utf8str[i] <= 0x7f) {
       testchar = utf8str.substr(i, 1);
     } else if ((unsigned char)utf8str[i] <= 0xDF) {
       testchar = utf8str.substr(i, 2);
       i+=1;
     } else if ((unsigned char)utf8str[i] <= 0xEF) {
       testchar = utf8str.substr(i, 3);
       i+=2;
     } else if ((unsigned char)utf8str[i] <= 0xF7) {
       testchar = utf8str.substr(i, 4);
       i+=3;
     } else {
       // Not a valid UTF-8 lead byte: pass it through as-is.
       testchar = utf8str.substr(i, 1);
     }

     if (lcase.find(testchar) != lcase.end()) {
       output += lcase[testchar];
     } else {
       output += testchar;
     }
  }
  return output;
}

string UnicodeTools::lowercase(string utf8str) {
  string testchar, output;

  for (unsigned int i = 0; i < utf8str.size(); i++) {
     if ((unsigned char)utf8str[i] <= 0x7f) {
       testchar = utf8str.substr(i, 1);
     } else if ((unsigned char)utf8str[i] <= 0xDF) {
       testchar = utf8str.substr(i, 2);
       i+=1;
     } else if ((unsigned char)utf8str[i] <= 0xEF) {
       testchar = utf8str.substr(i, 3);
       i+=2;
     } else if ((unsigned char)utf8str[i] <= 0xF7) {
       testchar = utf8str.substr(i, 4);
       i+=3;
     } else {
       // Not a valid UTF-8 lead byte: pass it through as-is.
       testchar = utf8str.substr(i, 1);
     }

     /*if (lcase.find(testchar) != lcase.end()) {
       output += lcase[testchar];
     } else {
       output += testchar;
       }*/

     if (testchar.length() == 1 && (unsigned char)testchar[0] <= 0x7f) {
       testchar[0] = tolower(testchar[0]);
       output += testchar;
     } else {
       output += testchar;
     }
  }
  return output;
}

END_CPP

# Emit UnicodeTools::addSet(): the generated helper splits its string
# argument into UTF-8 sequences (length chosen from the lead byte) and
# inserts each one into the given set.  The init*() functions emitted
# further down call it with long literals of concatenated characters.
# Single-quoted heredoc: the C++ text is written verbatim.
print UNI <<'END_CPP';
void UnicodeTools::addSet(set<string> &tmpset, string utf8str) {
  for (unsigned int i = 0; i < utf8str.length(); i++) {
     if ((unsigned char)utf8str[i] <= 0x7f) {
       tmpset.insert(utf8str.substr(i, 1));
     } else if ((unsigned char)utf8str[i] <= 0xDF) {
       tmpset.insert(utf8str.substr(i, 2));
       i+=1;
     } else if ((unsigned char)utf8str[i] <= 0xEF) {
       tmpset.insert(utf8str.substr(i, 3));
       i+=2;
     } else if ((unsigned char)utf8str[i] <= 0xF7) {
       tmpset.insert(utf8str.substr(i, 4));
       i+=3;
     }
   }
}

END_CPP

# Emit UnicodeTools::addMap(): the generated helper reads its string
# argument as alternating key/value UTF-8 sequences (key first, then the
# value that follows it) and stores each pair in the given map.
# initlCase(), emitted below, feeds it literals built that way.
# Single-quoted heredoc: the C++ text is written verbatim.
print UNI <<'END_CPP';
void UnicodeTools::addMap(map<string,string> &tmpmap, string utf8str) {
  string key, value;

  for (unsigned int i = 0; i < utf8str.length(); i++) {
     if ((unsigned char)utf8str[i] <= 0x7f) {
       key = utf8str.substr(i, 1);
     } else if ((unsigned char)utf8str[i] <= 0xDF) {
       key = utf8str.substr(i, 2);
       i+=1;
     } else if ((unsigned char)utf8str[i] <= 0xEF) {
       key = utf8str.substr(i, 3);
       i+=2;
     } else if ((unsigned char)utf8str[i] <= 0xF7) {
       key = utf8str.substr(i, 4);
       i+=3;
     }
     i++;

     if ((unsigned char)utf8str[i] <= 0x7f) {
       value = utf8str.substr(i, 1);
     } else if ((unsigned char)utf8str[i] <= 0xDF) {
       value = utf8str.substr(i, 2);
       i+=1;
     } else if ((unsigned char)utf8str[i] <= 0xEF) {
       value = utf8str.substr(i, 3);
       i+=2;
     } else if ((unsigned char)utf8str[i] <= 0xF7) {
       value = utf8str.substr(i, 4);
       i+=3;
     }
     
     tmpmap[key] = value;
   }
}

END_CPP

# Emit UnicodeTools::initlCase(): one addMap(lcase, "...") call per group
# of up to 25 entries.  Each entry is written as the lowercase character
# immediately followed by its uppercase mapping, which is the key/value
# layout addMap() decodes.
@l2ucases = sort keys %l2ucase;

print UNI "void UnicodeTools::initlCase() {\n";
for (my $first = 0; $first < @l2ucases; $first += 25) {
    # Clamp the final group to the end of the list.
    my $last = $first + 24;
    $last = $#l2ucases if $last > $#l2ucases;

    print UNI '  addMap(lcase, "',
        (map { $_ . $l2ucase{$_} } @l2ucases[$first .. $last]),
        "\");\n";
}
print UNI "}\n\n";


#print UNI '#include "Unicode.hpp"';

# Emit UnicodeTools::isLetter(), a membership test on the letters set.
print UNI <<'END_CPP';

bool UnicodeTools::isLetter(string testchar) {
  if (letters.find(testchar) != letters.end()) {
    return true;
  }
  return false;
}

END_CPP

# Emit UnicodeTools::initLetter(): the sorted letters are written as
# addSet(letters, "...") calls holding up to 40 characters each.
@letters = sort keys %letter;

print UNI "void UnicodeTools::initLetter() {\n";
for (my $first = 0; $first < @letters; $first += 40) {
    # Clamp the final group to the end of the list.
    my $last = $first + 39;
    $last = $#letters if $last > $#letters;

    print UNI '  addSet(letters, "', @letters[$first .. $last], "\");\n";
}
print UNI "}\n\n";

# Emit UnicodeTools::isNumber(), a membership test on the numbers set.
print UNI <<'END_CPP';

bool UnicodeTools::isNumber(string testchar) {
  if (numbers.find(testchar) != numbers.end()) {
    return true;
  }
  return false;
}

END_CPP

# Emit UnicodeTools::initNumber(): the sorted number characters are
# written as addSet(numbers, "...") calls holding up to 40 characters each.
@numbers = sort keys %num;

print UNI "void UnicodeTools::initNumber() {\n";
for (my $first = 0; $first < @numbers; $first += 40) {
    # Clamp the final group to the end of the list.
    my $last = $first + 39;
    $last = $#numbers if $last > $#numbers;

    print UNI '  addSet(numbers, "', @numbers[$first .. $last], "\");\n";
}
print UNI "}\n";

#print UNI '#include "Unicode.hpp"';
# Emit UnicodeTools::isPunct(), a membership test on the punctuation set.
print UNI <<'END_CPP';

bool UnicodeTools::isPunct(string testchar) {
  if (punctuation.find(testchar) != punctuation.end()) {
    return true;
  }
  return false;
}
END_CPP

@puncts = sort keys %punct;

# A lone backslash or double quote would break the C++ string literal it is
# written into, so those two entries are replaced by their escaped forms.
# The loop variable aliases the array elements, so @puncts itself is updated.
for my $mark (@puncts) {
    $mark =~ s/^\\$/\\\\/;
    $mark =~ s/^\"$/\\\"/;
}

# Emit UnicodeTools::initPunct(): addSet(punctuation, "...") calls holding
# up to 40 entries each.
print UNI "void UnicodeTools::initPunct() {\n";
for (my $first = 0; $first < @puncts; $first += 40) {
    # Clamp the final group to the end of the list.
    my $last = $first + 39;
    $last = $#puncts if $last > $#puncts;

    print UNI '  addSet(punctuation, "', @puncts[$first .. $last], "\");\n";
}
print UNI "}\n";

#print UNIW '#include "Unicode.hpp"';

# Emit UnicodeTools::isWS(), a membership test on the ws set.
print UNI <<'END_CPP';

bool UnicodeTools::isWS(string testchar) {
  if (ws.find(testchar) != ws.end()) {
    return true;
  }
  return false;
}

END_CPP

# Emit UnicodeTools::initWS(): every character whose general category
# starts with "Z" goes into a single addSet(ws, "...") call.
print UNI "void UnicodeTools::initWS() {\n";
print UNI '  addSet(ws, "', (sort keys %ws), "\");\n";
print UNI "}\n";

# This is the last write to the generated file.  Closing the handle
# explicitly flushes buffered output and reports a failed write (full disk,
# I/O error) instead of silently leaving a truncated UnicodeTools.cpp.
close(UNI) or die "Cannot finish writing UnicodeTools.cpp: $!";




# hex2utf8:  Take a string of hex digits (0-9A-F, with or without a
# leading "0x") naming a Unicode code point and return the corresponding
# UTF-8 representation as a byte string.
#
#   U+0000  .. U+007F    -> 1 byte
#   U+0080  .. U+07FF    -> 2 bytes
#   U+0800  .. U+FFFF    -> 3 bytes
#   U+10000 .. U+10FFFF  -> 4 bytes
#
# Dies when the value is above U+10FFFF, which has no UTF-8 encoding.

sub hex2utf8 {
    my ($hexchar) = @_;

    # hex() accepts the digits with or without the "0x" prefix.
    my $code_point = hex($hexchar);

    if ($code_point <= 0x7F) {
        return pack("C", $code_point);
    }

    if ($code_point <= 0x7FF) {
        return pack("C2",
            0xC0 | ($code_point >> 6),
            0x80 | ($code_point & 0x3F));
    }

    if ($code_point <= 0xFFFF) {
        return pack("C3",
            0xE0 | ($code_point >> 12),
            0x80 | (($code_point >> 6) & 0x3F),
            0x80 | ($code_point & 0x3F));
    }

    if ($code_point <= 0x10FFFF) {
        return pack("C4",
            0xF0 | ($code_point >> 18),
            0x80 | (($code_point >> 12) & 0x3F),
            0x80 | (($code_point >> 6) & 0x3F),
            0x80 | ($code_point & 0x3F));
    }

    die "hex2utf8: code point $hexchar is above U+10FFFF\n";
}
