#!/usr/local/bin/perl

## Filter data for LM: print only the input lines that do not overlap with any dev-set sentence.

use strict;
use warnings;

# Usage: <script> CORPUS_FILE > filtered_corpus
#
# Reads the dev-set sentences from $devsetFile, then copies CORPUS_FILE to
# STDOUT line by line, dropping every line whose normalized text contains,
# or is contained in, a normalized dev-set sentence.  The 1-based line number
# of each dropped line is written to STDERR.
my $fileName = shift;
defined $fileName or die("Usage: $0 <corpus-file>\n");

my $devsetFile = '/afs/cs/user/abhayaa/work/eng.lc';
# Alternative dev set:
# my $devsetFile = '/afs/cs/project/avenue-1/Avenue/Urdu-MT/eval/devset.eng.norm';

# Length thresholds (in characters of normalized text) below which a sentence
# is never used for matching.
# NOTE(review): presumably these keep short, generic phrases from triggering
# a substring match -- confirm the intended values.
my $minDevLength  = 25;	# dev-set sentence must be longer than this
my $minLineLength = 20;	# corpus line must be longer than this

# Normalize one line of text for comparison: strip the line ending, lowercase,
# turn every run of non-alphanumeric characters into a single space, and trim
# leading/trailing spaces.  Returns the normalized string.
#
# The same routine is applied to both the dev set and the corpus so the two
# sides are always compared in the same form (previously only the corpus side
# was lowercased, so a dev sentence with uppercase letters could never match).
sub normalizeSentence {
	my ($text) = @_;
	chomp($text);
	$text = lc($text);
	# One pass replaces the former punctuation / non-alphanumeric / whitespace
	# substitutions: the result contains only [A-Za-z0-9] and single spaces.
	$text =~ s/[^A-Za-z0-9]+/ /g;
	$text =~ s/^ //;
	$text =~ s/ $//;
	return $text;
}

# Load the dev set; the hash de-duplicates identical normalized sentences.
my %devset = ();
open(my $devFh, '<', $devsetFile) or die("Couldn't open file $devsetFile: $!\n");
while (my $devLine = <$devFh>) {
	$devset{ normalizeSentence($devLine) } = 1;
}
close $devFh;

# Only sufficiently long dev sentences can ever match, so select them once
# instead of re-testing every key for every corpus line.
my @devSentences = grep { length($_) > $minDevLength } keys %devset;

# Three-argument open: the file name is taken literally, so characters such as
# '|', '<', '>' or surrounding whitespace in it are not interpreted as a mode.
open(my $inFh, '<', $fileName) or die("Couldn't open the file $fileName: $!\n");
my $senCount = 0;
while (my $line = <$inFh>) {
	$senCount++;
	my $mline = normalizeSentence($line);

	my $flag = 0;
	# Short corpus lines are always kept, so skip the scan for them.
	if (length($mline) > $minLineLength) {
		foreach my $devSentence (@devSentences) {
			# Overlap in either direction: the corpus line contains the dev
			# sentence, or the dev sentence contains the corpus line.
			if (index($mline, $devSentence) != -1 or index($devSentence, $mline) != -1) {
				$flag = 1;
				last;
			}
		}
	}

	if ($flag) {
		# Report the dropped line number; the line itself is not printed.
		print STDERR "$senCount\n";
		next;
	}
	print $line;
}
close $inFh;