#!/usr/local/bin/perl58

# Translation table from the part-of-speech tags found in the ADSO
# dictionary to the category labels used in the generated lexicon.
# Tags that are not listed here are handled by the code that consults
# this table.
%posmap = (
    NOUN => "N",
    VERB => "V",
    ADJT => "ADJ",
    OTHR => "LEX",
    ADVB => "ADV",
    AUXV => "AUX",
);


# Build %word2pos from the ADSO dictionary.
#
# Each line of adso.gb (GBK encoded) is split on "* ".  Column 1 is used
# as the Chinese word and column 4 as its ADSO part-of-speech tag.  The
# tag is translated through %posmap (anything unmapped becomes "LEX") and
# appended to the word's entry, so that $word2pos{$word} ends up as a
# space-terminated list of distinct categories such as "N V ".
my $adso_path = "adso.gb";
open(my $adso_fh, "<:encoding(gbk)", $adso_path)
    or die "Cannot open $adso_path: $!";

while (my $entry = <$adso_fh>) {
    $entry =~ s/[\r\n]*$//;                 # strip CR/LF line endings
    my @cols = split(/\* /, $entry);
    my $word = $cols[1];

    # Lines without a word column carry nothing usable.
    next if !defined($word) or $word eq "";

    # Skip words already present in %chineselex.  Nothing visible in this
    # script fills that hash, so as written this guard never fires.
    next if defined($chineselex{$word});

    # Map the ADSO tag; missing or unknown tags fall back to "LEX".
    my $tag;
    $tag = $posmap{$cols[4]} if defined $cols[4];
    $tag = "LEX" unless defined $tag;

    # Record each category at most once per word.
    $word2pos{$word} = "" unless defined $word2pos{$word};
    if ($word2pos{$word} !~ m/\b\Q$tag\E\b/) {
        $word2pos{$word} .= $tag . " ";
    }
}
close($adso_fh);

# STDOUT is only used for ad-hoc debugging output; emit it as UTF-8.
# binmode() takes a bare layer list (":utf8").  A leading ">" belongs only
# in an open() mode string and makes binmode() fail without applying any
# layer, so it is omitted here and the result is checked.
binmode(STDOUT, ":utf8") or die "Cannot set :utf8 layer on STDOUT: $!";

# Convert the LDC lexicon into transfer-rule entries.
#
# Input  (GBK): "#"-separated records; column 1 is the Chinese side,
#               column 2 the English side, column 3 a probability (unused).
# Output (GBK): one rule per (word, category) pair, restricted to words
#               that %word2pos tags as "N" or "V".
my $lex_path = "/usr0/eepeter/Chinese/ldc-augmented.trf";
my $ldc_path = "ldc.augmented-All.seg.prob.td";

open(my $lex_fh, ">:encoding(gbk)", $lex_path)
    or die "Cannot open $lex_path for writing: $!";
print $lex_fh "; -*- coding: cn-gb -*- \n\n";

open(my $ldc_fh, "<:encoding(gbk)", $ldc_path)
    or die "Cannot open $ldc_path: $!";

while (my $record = <$ldc_fh>) {
    $record =~ s/[\r\n]*$//;                # strip CR/LF line endings

    # Re-join segmented Chinese: drop a single space that sits between two
    # Han characters.  Lookarounds do not consume the neighbours, so one
    # pass handles runs like "A B C" that a consuming pattern needs two
    # passes for.
    $record =~ s/(?<=\p{Han}) (?=\p{Han})//g;

    # Remove whitespace on either side of the "#" column separator.
    $record =~ s/\s*\#\s*/\#/g;

    my @cols = split(/\#/, $record);        # 1 is Chinese, 2 is English, 3 prob.
    my ($chinese, $english) = @cols[1, 2];

    # A record without a Chinese column cannot match any known word.
    next unless defined $chinese;
    $chinese =~ s/ //g;

    # A missing English column is written out as an empty string.
    $english = "" unless defined $english;

    next unless defined($word2pos{$chinese});

    foreach my $tag (split(' ', $word2pos{$chinese})) {
        # Only noun and verb entries are emitted.
        next unless $tag eq "V" or $tag eq "N";

        print $lex_fh
            qq{${tag}::${tag} |: ["$chinese"] -> ["$english"]\n},
            "   (\n",
            "     (X1::Y1)\n",
            "   )\n";
    }

    # Disabled fallback: words with no known category used to be written
    # as generic LEX entries.
    #else {
    #    print $lex_fh "LEX::LEX |: [\"$chinese\"] -> [\"$english\"]
    #   (
    #     (X1::Y1)
    #   )
    #";
    #}
}
close($ldc_fh);

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the close of the output file must be checked.
close($lex_fh) or die "Error writing $lex_path: $!";

=comment
open(LDC, "package/chinese_english_translation_lexicon/data/ldc_cedict.gb.v3") or die $!;
while ($line = <LDC>) {
    $line =~ s/[\r\n]*$//;
    $line =~ s/\s*\([^\)]+\)\s*//g;
    ($chinese, $english) = split(/\t/, $line);
    $english =~ s!^/(.*?)/$!$1!;
    (@defs) = split(/\//, $english);

    foreach $def (@defs) {
	print LEX "LEX::LEX |: [\"$chinese\"] -> [\"$def\"]
   (
     (X1::Y1)
   )
";
    }

}
close(LDC);
=cut
