#!/usr/local/bin/perl58

use Encode;

use utf8;

use strict;
use warnings;

# Convert the ADSO SQL dump into a plain text file with one
# "chinese<TAB>english" pair per line.
#
# Only lines that start with "INSERT INTO `expanded_unified" are examined.
# Every value tuple on such a line is matched; capture 2 is used as the
# Chinese text, capture 5 as the part-of-speech field and capture 6 as the
# English text.

# Release identifiers: both are interpolated into the input path, and $date
# also names the output file.
my $date    = "071001";
my $version = "5.007";

# Chinese entries to skip.  Nothing in this script fills the table, so no
# entry is currently ignored; the lookup is kept as a hook.
my %ignore;

my $txtfile  = "adso$date.txt";
my $adsofile = "adso-v$version/database/adso-$date-latin1.sql";

# Output goes through a UTF-8 layer because $chinese is a character string.
open(my $txt, ">:utf8", $txtfile) or die "Cannot write $txtfile: $!";

# Input is read as raw bytes; individual fields are decoded below.
open(my $adso, "<", $adsofile) or die "Cannot read $adsofile: $!";

while (my $line = <$adso>) {
    $line =~ s/[\r\n]*$//;
    next if $line !~ m/^INSERT INTO \`expanded_unified/;

    while ($line =~ m/\(\d+,\'([^\']+)\',\'([^\']+)\',\'([^\']+)\',\'([^\']+)\',\'([^\']+)\',\'([^\']+)\',/g) {
        # Copy the captures right away so later calls cannot disturb them.
        my ($raw_chinese, $pos, $raw_english) = ($2, $5, $6);

        my $chinese = decode("utf8", $raw_chinese);
        next if $ignore{$chinese};

        # cleanup() encodes its argument to UTF-8 before matching byte
        # sequences, so it needs a character string.  Passing the raw bytes
        # from the file would encode them a second time and turn every
        # accented letter into the wrong ASCII letter (e.g. UTF-8 "e-acute"
        # came out as "A").  The field is assumed to be UTF-8, like the
        # Chinese field taken from the same tuple.
        my $english = cleanup(decode("utf8", $raw_english));

        next if $chinese eq "";
        next if $chinese =~ m/[a-z0-9\?]/;   # skip entries with latin letters, digits or '?'
        next if $english eq "";              # nothing left after cleanup
        next if $english =~ m/\"/;
        next if $pos eq "";

        # Drop the backslashes left over from SQL string escaping.
        $english =~ s/\\//g;

        print {$txt} "$chinese\t$english\n";
    }
}

close($adso);
# Buffered write errors only surface when the handle is closed.
close($txt) or die "Cannot finish writing $txtfile: $!";


# Lookup table for cleanup(): UTF-8 byte sequence of an accented or special
# letter => plain ASCII replacement.
#
# The table is filled in a BEGIN block so that it is ready at compile time;
# cleanup() is called by code that runs before execution reaches this point
# of the file.
my %ascii_for;
BEGIN {
    %ascii_for = (
        # U+00C0 .. U+00DF
        "\xc3\x80" => 'A',  "\xc3\x81" => 'A',  "\xc3\x82" => 'A',
        "\xc3\x83" => 'A',  "\xc3\x84" => 'A',  "\xc3\x85" => 'A',
        "\xc3\x86" => 'AE', "\xc3\x87" => 'C',
        "\xc3\x88" => 'E',  "\xc3\x89" => 'E',  "\xc3\x8a" => 'E',
        "\xc3\x8b" => 'E',
        "\xc3\x8c" => 'I',  "\xc3\x8d" => 'I',  "\xc3\x8e" => 'I',
        "\xc3\x8f" => 'I',
        "\xc3\x90" => 'D',  "\xc3\x91" => 'N',
        "\xc3\x92" => 'O',  "\xc3\x93" => 'O',  "\xc3\x94" => 'O',
        "\xc3\x95" => 'O',  "\xc3\x96" => 'O',
        "\xc3\x97" => 'x',  "\xc3\x98" => 'O',
        "\xc3\x99" => 'U',  "\xc3\x9a" => 'U',  "\xc3\x9b" => 'U',
        "\xc3\x9c" => 'U',
        "\xc3\x9d" => 'Y',  "\xc3\x9e" => 'Th', "\xc3\x9f" => 'ss',

        # U+00E0 .. U+00FF (U+00F7 has no entry and is simply removed)
        "\xc3\xa0" => 'a',  "\xc3\xa1" => 'a',  "\xc3\xa2" => 'a',
        "\xc3\xa3" => 'a',  "\xc3\xa4" => 'a',  "\xc3\xa5" => 'a',
        "\xc3\xa6" => 'ae', "\xc3\xa7" => 'c',
        "\xc3\xa8" => 'e',  "\xc3\xa9" => 'e',  "\xc3\xaa" => 'e',
        "\xc3\xab" => 'e',
        "\xc3\xac" => 'i',  "\xc3\xad" => 'i',  "\xc3\xae" => 'i',
        "\xc3\xaf" => 'i',
        "\xc3\xb0" => 'd',  "\xc3\xb1" => 'n',
        "\xc3\xb2" => 'o',  "\xc3\xb3" => 'o',  "\xc3\xb4" => 'o',
        "\xc3\xb5" => 'o',  "\xc3\xb6" => 'o',  "\xc3\xb8" => 'o',
        "\xc3\xb9" => 'u',  "\xc3\xba" => 'u',  "\xc3\xbb" => 'u',
        "\xc3\xbc" => 'u',
        "\xc3\xbd" => 'y',  "\xc3\xbe" => 'th', "\xc3\xbf" => 'y',

        # Vowels with macron or breve
        "\xc4\x81" => 'a',  "\xc4\x83" => 'a',
        "\xc4\x93" => 'e',  "\xc4\x95" => 'e',
        "\xc4\xab" => 'i',  "\xc4\xad" => 'i',
        "\xc5\x8d" => 'o',  "\xc5\x8f" => 'o',
        "\xc5\xab" => 'u',  "\xc5\xad" => 'u',

        # U+01D6, U+01D8, U+01DA, U+01DC
        "\xc7\x96" => 'u',  "\xc7\x98" => 'u',
        "\xc7\x9a" => 'u',  "\xc7\x9c" => 'u',
    );
}

# cleanup($text)
#
# Reduce $text to plain ASCII.  The argument is encoded to UTF-8 bytes, the
# letters listed in %ascii_for are replaced by their unaccented
# equivalents, and every byte that is still outside the ASCII range is
# deleted.  Returns the resulting string.
sub cleanup {
    my ($text) = @_;

    my $bytes = encode("utf8", $text);

    # Every table key is a lead byte in C3..C7 followed by one continuation
    # byte, so one pass over such pairs covers all entries.  Pairs without
    # an entry are put back unchanged and removed by the next statement.
    $bytes =~ s/([\xc3-\xc7][\x80-\xbf])/exists $ascii_for{$1} ? $ascii_for{$1} : $1/ge;

    # Whatever is still non-ASCII has no replacement: drop it.
    $bytes =~ s/[\x80-\xff]//g;

    return $bytes;
}
