#!/usr/local/bin/perl5

use bytes;
use strict;


# Check for correct usage. Three input files are required: $ARGV[0],
# $ARGV[1] and $ARGV[2] are all opened further down in this script.
if (@ARGV < 3)
{
    print STDERR "Usage: perl $0 <A3-file-1> <A3-file-2> <POS-file>\n";
    print STDERR "Filtered output goes to <input>.ffilt and <input>.ftxt;\n";
    print STDERR "numbers of sentence pairs dropped for devset overlap go to standard out.\n";
    # A usage error is a failure, so report it with a non-zero exit status.
    exit 1;
}

# Load the English side of the development set (devset.eng.norm).
# Every line is normalised and stored as a key of %devset; the filtering
# loop further down does substring comparisons against these keys.
my %devset = ();

{
    my $devset_path = "/afs/cs/project/avenue-1/Avenue/Urdu-MT/eval/devset.eng.norm";
    open(my $dfile, "<:encoding(utf-8)", $devset_path)
        or die("Couldn't open devset file $devset_path: $!\n");

    while (my $sentence = <$dfile>) {
        chomp $sentence;

        # The training-side line is lower-cased before it is compared with
        # these keys, so lower-case here as well; otherwise a devset sentence
        # containing capitals could never match.
        $sentence = lc($sentence);

        # Blank out everything that is not an ASCII letter, digit or space.
        # (This also covers the punctuation characters , - " ' . ? ! ( ) [ ]
        # so no separate punctuation pass is needed.)
        $sentence =~ s/[^A-Za-z0-9 ]/ /g;

        # Collapse runs of whitespace and trim both ends.
        $sentence =~ s/\s+/ /g;
        $sentence =~ s/^\s+//;
        $sentence =~ s/\s+$//;

        # Lines with no remaining content can never take part in a match.
        next if $sentence eq '';

        $devset{$sentence} = 1;
    }

    close $dfile;
}

# All three paths must be present before opening anything: three-argument
# open() treats an undefined path as a request for an anonymous temporary
# file, which would hide a missing command-line argument.
@ARGV >= 3
    or die "Expected three input files: <A3-file-1> <A3-file-2> <POS-file>\n";

# Open the input files. Three-argument open() is used throughout so that
# characters in a file name (leading '>' or '<', trailing '|', surrounding
# whitespace) are never interpreted as an open mode.
# Only the first A3 file is read through the UTF-8 decoding layer.
open(my $I1FILE, "<:encoding(utf-8)", $ARGV[0])
    or die "Can't open input A3 file $ARGV[0]: $!";
open(my $I2FILE, "<", $ARGV[1])
    or die "Can't open input A3 file $ARGV[1]: $!";
open(my $POSFILE, "<", $ARGV[2])
    or die "Can't open POS file $ARGV[2]: $!";

# Open the filtered outputs: one ".ffilt" file next to each input file.
open(my $O1FILE, ">", "$ARGV[0].ffilt")
    or die "Can't open output A3 file $ARGV[0].ffilt: $!";
open(my $O2FILE, ">", "$ARGV[1].ffilt")
    or die "Can't open output A3 file $ARGV[1].ffilt: $!";
open(my $POSOFILE, ">", "$ARGV[2].ffilt")
    or die "Can't open output POS file $ARGV[2].ffilt: $!";

# Open the text outputs: one ".ftxt" file next to each A3 input file.
open(my $Y1FILE, ">", "$ARGV[0].ftxt")
    or die "Can't open output txt file $ARGV[0].ftxt: $!";
open(my $Y2FILE, ">", "$ARGV[1].ftxt")
    or die "Can't open output txt file $ARGV[1].ftxt: $!";

# Walk the two A3 files and the POS file in lock-step.
#
# Every sentence pair consumes three lines from each A3 file (a header of
# the form
#   # Sentence pair (1) source length 11 target length 9 alignment score :
# followed by two more lines) and one line from the POS file.
#
# A pair is kept only when
#   * the lengths in the two headers mirror each other (source length of
#     one file equals target length of the other, and vice versa), and
#   * the line following the header in the first A3 file does not overlap
#     with any sentence in %devset.
# Kept pairs are copied to the ".ffilt" outputs, and the line following each
# header is also copied to the ".ftxt" outputs. The running number of every
# pair dropped because of devset overlap is printed to standard out.
my $header_re = qr/^# Sentence pair \((\d+)\) source length (\d+) target length (\d+) alignment score/;

my $senCount = 0;
while (my $st = <$I1FILE>) {
    $senCount++;

    my $ts = <$I2FILE>;
    defined $ts
        or die "Second A3 file ended before sentence pair $senCount";

    # Capture the lengths into named variables straight away instead of
    # relying on $2/$3 surviving from one match to the next.
    my (undef, $sl, $tl) = $st =~ $header_re
        or die "Bad header for sentence pair $senCount in first A3 file";
    my (undef, $rev_sl, $rev_tl) = $ts =~ $header_re
        or die "Bad header for sentence pair $senCount in second A3 file";

    # Exactly two further lines per A3 file and one POS line belong to this
    # pair whatever the outcome, so read them all up front.
    my $text1 = <$I1FILE>;
    my $rest1 = <$I1FILE>;
    my $text2 = <$I2FILE>;
    my $rest2 = <$I2FILE>;
    my $pos   = <$POSFILE>;
    foreach my $piece ($text1, $rest1, $text2, $rest2, $pos) {
        # A missing line means the inputs are out of step; writing a partial
        # record would leave the outputs silently misaligned.
        defined $piece
            or die "Input ended in the middle of sentence pair $senCount";
    }

    # Drop pairs whose headers do not mirror each other.
    next unless $sl == $rev_tl and $tl == $rev_sl;

    # Normalise the first file's text line the same way as the devset:
    # lower-case, blank out everything except ASCII letters, digits and
    # spaces (this already covers punctuation), squeeze whitespace, trim.
    my $mline = $text1;
    chomp($mline);
    $mline = lc($mline);
    $mline =~ s/[^A-Za-z0-9 ]/ /g;
    $mline =~ s/\s+/ /g;
    $mline =~ s/^\s+//;
    $mline =~ s/\s+$//;

    # Overlap test: only lines longer than 20 and devset sentences longer
    # than 25 are compared (lengths are byte counts under "use bytes"), and
    # a hit is either string containing the other. The test on $mline does
    # not depend on the devset entry, so it is made once, outside the loop.
    my $flag = 0;
    if (length($mline) > 20) {
        foreach my $dev_sentence (keys %devset) {
            next unless length($dev_sentence) > 25;
            if (index($mline, $dev_sentence) != -1
                or index($dev_sentence, $mline) != -1) {
                $flag = 1;
                last;
            }
        }
    }

    if ($flag) {
        # Report the dropped pair; its lines have already been consumed.
        print "$senCount\n";
        next;
    }

    # Keep the pair: full record to the filtered A3 files, text line to the
    # ".ftxt" files, POS line to the filtered POS file.
    print $O1FILE $st;
    print $O1FILE $text1;
    print $Y1FILE $text1;
    print $O1FILE $rest1;

    print $O2FILE $ts;
    print $O2FILE $text2;
    print $Y2FILE $text2;
    print $O2FILE $rest2;

    print $POSOFILE $pos;
}

# Close the output handles and check the result: buffered write errors
# (for example a full disk) are only reported when the handle is closed.
close($O1FILE)   or die "Can't close output A3 file $ARGV[0].ffilt: $!";
close($O2FILE)   or die "Can't close output A3 file $ARGV[1].ffilt: $!";
close($Y1FILE)   or die "Can't close output txt file $ARGV[0].ftxt: $!";
close($Y2FILE)   or die "Can't close output txt file $ARGV[1].ftxt: $!";
close($POSOFILE) or die "Can't close output POS file $ARGV[2].ffilt: $!";

# Input handles carry no buffered writes, so a plain close is enough.
close $I1FILE;
close $I2FILE;
close $POSFILE;
