#!/usr/local/bin/perl58

use strict;
use warnings;

# Clean every *shadow.sgm file in the current directory: drop tag-only
# lines, strip <seg> wrappers and $type_(...) annotations, normalise a few
# punctuation characters, and write the result, encoded as EUC-CN, to
# <name>-clean.txt next to the input.
#
# To process the translation files instead, glob '*trans.txt' here.
my @files = glob('*shadow.sgm');

foreach my $file (@files) {
    print "Processing $file\n";

    my $cleanfile = clean_filename($file);

    # Opening the output truncates it, so it must never alias the input.
    die "Cannot derive an output name for $file\n" if $cleanfile eq $file;

    open(my $in, "<:utf8", $file)
        or die "Cannot open $file for reading: $!";
    open(my $out, ">:encoding(euc-cn)", $cleanfile)
        or die "Cannot open $cleanfile for writing: $!";

    while (my $line = <$in>) {
        my $cleaned = clean_line($line);
        next unless defined $cleaned;
        print {$out} $cleaned
            or die "Cannot write to $cleanfile: $!";
    }

    close($in);

    # Buffered write errors only surface when the handle is closed.
    close($out) or die "Cannot finish writing $cleanfile: $!";
}

# Map an input file name to its output name by replacing the final
# ".sgm" or ".txt" extension with "-clean.txt".  Only the extension at the
# very end of the name is touched.  A name with neither extension is
# returned unchanged, which the caller treats as an error.
sub clean_filename {
    my ($file) = @_;

    (my $clean = $file) =~ s/\.(?:sgm|txt)\z/-clean.txt/;

    return $clean;
}

# Clean one input line (already decoded to characters).
# Returns the cleaned line, or undef when the line consists solely of a
# single tag and must be omitted from the output.
# The order of the substitutions matters: "@@" becomes a space before the
# Han-spacing pass, while the spaces produced from U+30FB and U+2665 are
# introduced after it and therefore survive.
sub clean_line {
    my ($line) = @_;

    # A line that is nothing but one <...> tag carries no text.
    return if $line =~ m/^<[^>]+>$/;

    # Strip the <seg ...> ... </seg> wrapper and the whitespace around it.
    $line =~ s/^\s*<seg[^>]*>\s*//;
    $line =~ s/\s*<\/seg>[ ]*$//;

    # $type_(text) and $type_(text||alternative) both reduce to "text".
    $line =~ s/\$[a-z]+_\(([^\|\)]+)\)/$1/g;
    $line =~ s/\$[a-z]+_\(([^\|\)]+)\|\|[^\)]+\)/$1/g;

    $line =~ s/\@\@/ /g;

    # Remove a single space between two Han characters.  Lookarounds do not
    # consume the neighbours, so runs such as "A B C" collapse in one pass.
    $line =~ s/(?<=\p{Han}) (?=\p{Han})//g;

    $line =~ s/\x{00b7}/./g;        # MIDDLE DOT
    $line =~ s/\x{30fb}/ /g;        # KATAKANA MIDDLE DOT
    $line =~ s/\x{2022}/./g;        # BULLET
    $line =~ s/\x{2122}/TM/g;       # TRADE MARK SIGN
    $line =~ s/\x{2665}/ love /g;   # BLACK HEART SUIT

    # Dropped outright; presumably not representable in the EUC-CN output
    # (U+E5D8 and U+E5D9 are private-use code points) -- TODO confirm.
    $line =~ s/\x{8b13}//g;
    $line =~ s/\x{7421}//g;
    $line =~ s/\x{e5d8}//g;
    $line =~ s/\x{e5d9}//g;

    $line =~ s/\x{8a36}/\x{8bc3}/g;

    return $line;
}
