#!/usr/local/bin/perl -w
#
# Generate UnicodeTools.cpp from UnicodeData.txt.
#
# The generated C++ file defines the UnicodeTools methods upcase(),
# isLetter(), isNumber(), isPunct() and isWS(), together with the init*()
# methods that fill the lookup tables those methods consult.  Every table
# entry is the raw UTF-8 byte sequence of one character.
#
# UnicodeData.txt holds one character per line, fields separated by ';':
#    0. Unicode point
#    1. Name
#    2. General Category
#    3. Canonical combining classes
#    4. Bidirectional category
#    5. Character Decomposition Mapping
#    6. Decimal digit value
#    7. Digit value
#    8. Numeric value
#    9. Mirrored (for bidi)
#   10. Unicode 1.0 Name
#   11. 10646 Comment field
#   12. Uppercase mapping
#   13. Lower case mapping
#   14. Title case mapping

use strict;
use warnings;

use constant INPUT_FILE       => 'UnicodeData.txt';
use constant OUTPUT_FILE      => 'UnicodeTools.cpp';
use constant LETTERS_PER_INIT => 1000;   # insert() calls per initLetterN()

# Run the generator only when executed as a script, so that the file can
# also be loaded with require/do to reach hex2utf8() without touching disk.
main() unless caller;

# main: read the character database and write the C++ source file.
sub main {
    my $tables = read_unicode_data(INPUT_FILE);

    my $out_path = OUTPUT_FILE;
    open my $out, '>', $out_path or die "Cannot write $out_path: $!\n";

    emit_upcase($out, $tables->{upper_of});

    emit_membership_test($out, 'isLetter', 'letters');
    emit_letter_inits($out, [ sort keys %{ $tables->{letter} } ]);

    emit_membership_test($out, 'isNumber', 'numbers');
    emit_set_init($out, 'initNumber', 'numbers',
        [ sort keys %{ $tables->{num} } ]);

    emit_membership_test($out, 'isPunct', 'punctuation');
    emit_set_init($out, 'initPunct', 'punctuation',
        [ sort keys %{ $tables->{punct} } ]);

    emit_membership_test($out, 'isWS', 'ws');
    emit_set_init($out, 'initWS', 'ws', [ sort keys %{ $tables->{ws} } ]);

    # Buffered write errors only surface when the handle is closed.
    close $out or die "Cannot close $out_path: $!\n";
    return;
}

# read_unicode_data: parse $path and classify every 4-hex-digit code point.
#
# Returns a hash reference with these keys, each a hash keyed by the UTF-8
# bytes of a character:
#   upper_of - lowercase letter (category Ll) => UTF-8 of its uppercase mapping
#   letter   - characters whose General Category starts with L
#   ws       - characters whose General Category starts with Z
#   punct    - characters whose General Category starts with P
#   num      - characters whose General Category starts with N
#
# Dies when the file cannot be opened.
sub read_unicode_data {
    my ($path) = @_;

    my %tables = map { $_ => {} } qw(upper_of letter ws punct num);

    open my $in, '<', $path or die "Cannot open $path: $!\n";
    while (my $line = <$in>) {
        $line =~ s/\r?\n\z//;

        # Limit -1 keeps trailing empty fields, so field 12 is defined on
        # every well-formed line even when the case mappings are empty.
        my @fields = split /;/, $line, -1;
        my ($code, $category, $upper) = @fields[ 0, 2, 12 ];

        next if !defined $category;      # blank or malformed line
        next if length($code) != 4;      # only 4-digit code points are used

        # NOTE(review): ranges given as "<..., First>" / "<..., Last>" pairs
        # contribute only their two end points here -- confirm that the
        # interior of those ranges is meant to be left out.
        my $char = hex2utf8($code);

        if ($category eq 'Ll' and defined $upper and $upper ne '') {
            $tables{upper_of}{$char} = hex2utf8($upper);
        }

        if    ($category =~ /^L/) { $tables{letter}{$char} = 1; }
        elsif ($category =~ /^Z/) { $tables{ws}{$char}     = 1; }
        elsif ($category =~ /^P/) { $tables{punct}{$char}  = 1; }
        elsif ($category =~ /^N/) { $tables{num}{$char}    = 1; }
    }
    close $in;

    return \%tables;
}

# cpp_string_literal: return $bytes wrapped in double quotes with every
# backslash and double quote escaped, ready to paste into C++ source.
sub cpp_string_literal {
    my ($bytes) = @_;
    (my $escaped = $bytes) =~ s/(["\\])/\\$1/g;
    return qq{"$escaped"};
}

# emit_upcase: write the #include line, UnicodeTools::upcase() and
# UnicodeTools::initlCase(), which fills lcase from the lower => upper map.
sub emit_upcase {
    my ($out, $upper_of) = @_;

    # NOTE(review): in the emitted upcase(), a lead byte above 0xF7 matches
    # no branch, so testchar keeps the value from the previous iteration.
    print {$out} <<'END_CPP';
#include "Unicode.hpp"

string UnicodeTools::upcase(string utf8str) {
    string testchar, output;

    for (int i = 0; i < utf8str.length(); i++) {
        if ((unsigned char)utf8str[i] <= 0x7f) {
            testchar = utf8str.substr(i, 1);
        } else if ((unsigned char)utf8str[i] <= 0xDF) {
            testchar = utf8str.substr(i, 2);
            i+=1;
        } else if ((unsigned char)utf8str[i] <= 0xEF) {
            testchar = utf8str.substr(i, 3);
            i+=2;
        } else if ((unsigned char)utf8str[i] <= 0xF7) {
            testchar = utf8str.substr(i, 4);
            i+=3;
        }
        if (lcase.find(testchar) != lcase.end()) {
            output += lcase[testchar];
        } else {
            output += testchar;
        }
    }
    return output;
}

END_CPP

    print {$out} "void UnicodeTools::initlCase() {\n";
    # Sorted so that the generated file is identical from run to run.
    for my $lower (sort keys %{$upper_of}) {
        print {$out} '    lcase['
            . cpp_string_literal($lower) . '] = '
            . cpp_string_literal($upper_of->{$lower}) . ";\n";
    }
    print {$out} "}\n";
    return;
}

# emit_membership_test: write "bool UnicodeTools::$method(string testchar)",
# which returns true when testchar is found in the container named $set.
sub emit_membership_test {
    my ($out, $method, $set) = @_;

    print {$out} <<"END_CPP";

bool UnicodeTools::${method}(string testchar) {
    if (${set}.find(testchar) != ${set}.end()) {
        return true;
    }
    return false;
}

END_CPP
    return;
}

# emit_set_init: write "void UnicodeTools::$method()", which inserts every
# element of @$members into the container named $set.
sub emit_set_init {
    my ($out, $method, $set, $members) = @_;

    print {$out} "void UnicodeTools::${method}() {\n";
    for my $member (@{$members}) {
        print {$out} "    ${set}.insert(" . cpp_string_literal($member) . ");\n";
    }
    print {$out} "}\n";
    return;
}

# emit_letter_inits: write initLetter0(), initLetter1(), ... each holding at
# most LETTERS_PER_INIT insertions, then initLetter(), which calls them all.
# An empty letter list yields just an empty initLetter().
sub emit_letter_inits {
    my ($out, $letters) = @_;

    my @pending = @{$letters};
    my $chunks  = 0;
    while (my @chunk = splice @pending, 0, LETTERS_PER_INIT) {
        emit_set_init($out, "initLetter$chunks", 'letters', \@chunk);
        print {$out} "\n";
        $chunks++;
    }

    print {$out} "void UnicodeTools::initLetter() {\n";
    for my $index (0 .. $chunks - 1) {
        print {$out} "\tinitLetter${index}();\n";
    }
    print {$out} "}\n";
    return;
}

# hex2utf8: Take a string of hex digits (0-9A-F, optionally prefixed with
# "0x") naming a Unicode code point and return the corresponding 1, 2, 3 or
# 4 byte UTF-8 representation as a byte string.
sub hex2utf8 {
    my ($hexchar) = @_;

    my $code = hex $hexchar;    # hex() accepts the digits with or without 0x

    if ($code <= 0x7F) {
        return pack 'C', $code;
    }
    if ($code <= 0x7FF) {
        return pack 'C2',
            0xC0 | ($code >> 6),
            0x80 | ($code & 0x3F);
    }
    if ($code <= 0xFFFF) {
        return pack 'C3',
            0xE0 | ($code >> 12),
            0x80 | (($code >> 6) & 0x3F),
            0x80 | ($code & 0x3F);
    }
    # Code points above U+FFFF need four bytes.
    return pack 'C4',
        0xF0 | ($code >> 18),
        0x80 | (($code >> 12) & 0x3F),
        0x80 | (($code >> 6) & 0x3F),
        0x80 | ($code & 0x3F);
}