#!/usr/local/bin/perl

## Break Urdu sentences and clean up the English side of a parallel corpus

use strict;
use warnings;

# Usage: <script> <corpus-file> <pos-file>
#
# Corpus layout, as consumed below:
#   * the first line is skipped;
#   * each group starts with one line that is read and discarded, followed
#     by zero or more records, and ends at an empty line (or end of file);
#   * a record is: a line 'eng', the English lines, a line 'urd', the Urdu
#     lines, and a closing line '#'.
# One line is read from the POS file for every record, printed or not.
#
# Only records with exactly two English lines and one Urdu line are printed:
#   "<last eng index> <last urd index> <eng lengths>  <urd length>"
#   the two English lines, the Urdu line, and the POS line.
# All "lengths" are last token indices (token count minus one) after
# splitting on whitespace.
my $fileName = shift;
my $posFileName = shift;
die("Usage: $0 <corpus-file> <pos-file>\n")
	unless(defined $fileName and defined $posFileName);

# Three-argument open with lexical handles: the file name can no longer be
# interpreted as a mode or a pipe, and the handles are not package globals.
open(my $corpus, '<', $fileName) or die("Couldn't open the file $fileName: $!\n");
open(my $posIn, '<', $posFileName) or die("Couldn't open the file $posFileName: $!\n");

# Skip the first line of the corpus.
<$corpus>;

# Each iteration consumes the line that precedes a group of records.
while(defined(my $groupLead = <$corpus>)){
	while(1){
		my $line = <$corpus>;
		# End of file between records is a normal way for the input to end.
		last unless(defined $line);
		chomp($line);
		last if($line eq '');
		print "Something off with eng\n" unless($line eq 'eng');

		my @eng = _read_block($corpus, 'urd', $fileName);
		my @urd = _read_block($corpus, '#', $fileName);

		# Consumed for every record so both files stay aligned.
		my $upos = <$posIn>;

		if($#eng == 1 and $#urd == 0){
			defined($upos)
				or die("POS file $posFileName has fewer lines than $fileName has sentence pairs\n");
			# Every English length is followed by a space, so the output
			# keeps two spaces between the last English length and $ulen.
			my $elen = join('', map { _last_token_index($_) . ' ' } @eng);
			my $ulen = _last_token_index($urd[0]);
			print "$#eng $#urd $elen $ulen\n";
			print "$eng[0]\n$eng[1]\n";
			print "$urd[0]\n$upos";
		}
	}
}
close($corpus);
close($posIn);

# Return the chomped lines read from $fh up to, but not including, the first
# line equal to $terminator. Dies if the file ends before the terminator is
# seen; without that check a truncated record would loop forever.
#   $fh         - open input handle
#   $terminator - exact line content that ends the block
#   $name       - file name, used only in the error message
sub _read_block {
	my ($fh, $terminator, $name) = @_;
	my @lines = ();
	while(defined(my $line = <$fh>)){
		chomp($line);
		return @lines if($line eq $terminator);
		push @lines, $line;
	}
	die("Unexpected end of file in $name while looking for '$terminator'\n");
}

# Return the index of the last whitespace-separated token of $sentence
# (token count minus one; -1 for an empty string).
sub _last_token_index {
	my ($sentence) = @_;
	my @tokens = split /\s+/, $sentence;
	return $#tokens;
}
