#!/usr/local/bin/perl

use bytes;
use strict;
# use warnings;
# binmode(STDOUT,":utf8");
# Check for correct usage. Four positional arguments are required: the script
# opens $ARGV[3] (the POS dictionary) unconditionally further down, so
# accepting only three would just fail later with a less helpful message.
if(@ARGV < 4)
{
    print STDERR "Usage: perl intersect-aligns.pl <e-f.aln> <f-e.aln> <e-pos-file> <pos-dict>\n";
    print STDERR "Output goes to standard out.\n";
    # Non-zero status so that calling scripts can detect the usage error.
    exit 1;
}


# Open input files (all output goes to STDOUT/STDERR).
# Three-argument open is used so that a file name taken from the command line
# is always treated as a plain path to read, never as a mode prefix or a pipe.
open(EFILE, '<', $ARGV[0]) or die "Can't open input E-F aligns file: $!";
open(FFILE, '<', $ARGV[1]) or die "Can't open input F-E aligns file: $!";
open(EPFILE, '<', $ARGV[2]) or die "Can't open input E POS file: $!";
open(PDFILE, "<:encoding(utf8)", $ARGV[3]) or die "Can't open input POS dict: $!";

# Load the POS dictionary into a hash of hashes: the first whitespace-separated
# token on each line is the outer key, and every remaining token on that line
# becomes an inner key with value 1.
# NOTE(review): presumably each line is "<word> <tag> <tag> ..." - confirm
# against the dictionary file format.
my %dict = ();
while(my $line = <PDFILE>)
{
    chomp $line;
    my ($entry, @ptags) = split /\s+/, $line;
    foreach my $ptag (@ptags)
    {
        $dict{$entry}{$ptag} = 1;
    }
}
close PDFILE;

# Read the alignment lines of one sentence from the given filehandle, stopping
# at the first blank line or at end of file.  Lines of the form
# "<index> ({ <n1> <n2> ... })" are parsed; any other line is skipped.
# Returns a list of [index, n] array refs, one for each number listed inside
# the braces.
sub read_alignment_links
{
    my ($fh) = @_;
    my @links = ();
    while(my $line = <$fh>)
    {
        chomp $line;
        last if($line =~ /^\s*$/);
        # Capture the whole brace list as a single group.  A quantified
        # capture such as (\d+ )* only retains its final repetition, which
        # would silently drop every link on the line except the last one.
        if($line =~ /(\d+) \(\{ ((?:\d+ )*)\}\)/)
        {
            my ($index, $list) = ($1, $2);
            push(@links, [$index, $_]) for split(' ', $list);
        }
    }
    return @links;
}

# Main loop: one iteration per sentence.  Each iteration consumes one record
# from each alignment file (a header line, two sentence lines, then alignment
# lines up to a blank line) and one line from the E POS file.
my $sCount = 0;
while(defined(my $num_e = <EFILE>) && defined(my $num_f = <FFILE>))
{
    chomp $num_e;
    chomp $num_f;

    # Get sentence strings.  The E-F file lists the E line first, the F-E
    # file lists the F line first.
    my $eLineFromE = <EFILE>;
    my $fLineFromF = <FFILE>;
    my $fLineFromE = <EFILE>;
    my $eLineFromF = <FFILE>;

    # Get the E POS tags
    my $eposStr = <EPFILE>;
    chomp($eposStr);
    chomp $eLineFromE;
    chomp $fLineFromF;
    chomp $fLineFromE;
    chomp $eLineFromF;
    $sCount++;

    # Sanity check: both files should carry the same sentence pair.  A
    # mismatch is reported but processing continues.
    if($eLineFromE ne $eLineFromF)
    {
        print STDERR "E-F source line doesn't match F-E target line for sentence num_e:$num_e, num_f:$num_f\n";
    }
    if($fLineFromF ne $fLineFromE)
    {
        print STDERR "F-E source line doesn't match E-F target line for sentence num_e:$num_e, num_f:$num_f\n";
    }

    # Separate strings into words; position 0 is the artificial NULL token.
    my @EString = ("NULL", split(/\s+/, $eLineFromE));
    my @FString = ("NULL", split(/\s+/, $fLineFromF));
    my @epos    = ('NULL', split(/\s+/, $eposStr));

    # Create two-dimensional array for alignments: one row per E word, one
    # column per F word, every cell starting at "0".
    my @Aligns = map { [ ("0") x @FString ] } @EString;

    # Extract alignments and put in two-dimensional array.  In the E-F file
    # the leading index addresses the row and the listed numbers the columns;
    # in the F-E file it is the other way round.  A cell that both files
    # agree on therefore ends up with the value 2.
    for my $link (read_alignment_links(\*EFILE))
    {
        my ($eIndex, $fIndex) = @$link;
        $Aligns[$eIndex][$fIndex] += 1;
    }
    for my $link (read_alignment_links(\*FFILE))
    {
        my ($fIndex, $eIndex) = @$link;
        $Aligns[$eIndex][$fIndex] += 1;
    }

    # Sanity check on content and bounds of the array.  An out-of-range index
    # in the input grows the matrix, which is detected here.  These are fatal
    # errors, so exit with a non-zero status.
    if($#Aligns != $#EString)
    {
        print STDERR "Error filling in alignments: Sentence array has $#Aligns English words when it should have $#EString.\n";
        exit 1;
    }
    for my $i (0..$#Aligns)
    {
        if($#{$Aligns[$i]} != $#FString)
        {
            print STDERR "Error filling in alignments: Sentence array has $#{$Aligns[$i]} French words in row $i when it should have $#FString.\n";
            exit 1;
        }
        for my $j (0..$#{$Aligns[$i]})
        {
            if($Aligns[$i][$j] > 2)
            {
                print STDERR "Error in alignment matrix: Value more than 2.\n";
                exit 1;
            }
        }
    }

    # Print out the alignment matrix for this sentence, one row per line.
    print "__SENTENCE__ $sCount\n";
    my $isize = 0;
    for my $i (0..$#Aligns)
    {
        print "@{$Aligns[$i]}\n";
        for my $j (0..$#{$Aligns[$i]})
        {
            # Disabled: POS-filtered printing of the intersection (cells == 2).
#             (my $eword, my $etag) = split /_/,$epos[$i];
#             $etag = 'PUNCT' if($eword =~ m/^[,\.;\?\!\(\)\-:"'#@*]+$/);
#             if($Aligns[$i][$j] == 2 and ($etag eq 'NUM' or $etag eq 'PUNCT' or $etag eq 'FW' or !defined($dict{$EString[$i]}) or ${$dict{$EString[$i]}}{$etag} == 1)) {
#                 print "$FString[$j]\t$EString[$i]\t$etag\n";
#                 $isize++ unless($eword =~ m/^[,\.;\?\!\(\)\-:"'#@*]+$/);
#             }
#             elsif($Aligns[$i][$j] == 2){
#                 print "Bad POS $EString[$i] $etag\n";
#             }
        }
    }
    # NOTE(review): this reads and discards one line from STDIN after every
    # sentence, so an interactive run pauses until Enter is pressed.  It looks
    # like a debugging aid - confirm whether it should be removed for batch use.
    <STDIN>;
#     print STDERR "$sCount\n" if($isize == 0);
}
