#!/usr/bin/perl -w -CD
# -CD (on the interpreter line above) makes UTF-8 the default PerlIO layer for
# input and output streams; the two binmode calls below put the same layer on
# the already-open STDOUT and STDIN.
binmode STDOUT, ":utf8";
binmode STDIN, ":utf8";
# Lexically switch warnings off for the rest of the file. Inside this scope the
# pragma takes precedence over the -w switch given on the interpreter line, so
# uses of undefined values further down (e.g. a missing $ARGV[0]) are silent.
no warnings;


#-----Description------------------------------------------------------
#
# Program:    urdu-segmenter.pl
# Written by: Danish Munir
# Purpose:    breaks urdu text into sentences
#
# Syntax: urdu-segmenter.pl [-x] filename
#      or program_that_outputs_urdu_text | urdu-segmenter.pl -s [-x] [docid(optional)]
# (-s must be the first argument; -x, when used, comes directly after it)
# This script takes a utf8 encoded file with Urdu text as input
# and outputs to STDOUT, the text after segmenting it into sentences.
#
# Without -x the sentences are printed one per line.
# With -x the output is wrapped in xml as follows
# <DOC docid = "Filename" lang = "URD">
# <SEG id="1">Urdu Sentence 1</SEG>
# <SEG id="2">Urdu Sentence 2</SEG>
# <SEG id="3">Urdu Sentence 3</SEG>
# </DOC>
#
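# This script breaks urdu sentences based on the following delimiters:
# [full stop]    Unicode 06D4 (Urdu full stop, looks like a dash)
# [question]     Unicode 061F
# [ellipsis]     Unicode 2026
# [bullet]       Unicode 2022
# [period]       Unicode 002E
# [exclamation]  Unicode 0021
# newline characters (every newline is doubled before splitting, so each
# line break ends a sentence)
# runs of two or more whitespace characters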
#-----------------------------------------------------------------------

# Print the usage message and exit when the first argument is -h or any
# dashed spelling of "help" (-help, --help, ...).
#
# The text is kept in a non-interpolating here-document so the quotes in the
# xml example need no escaping.
if (defined $ARGV[0] && $ARGV[0] =~ m/^(?:-h|-+help)$/) {
    print <<'END_OF_HELP';


urdu-segmenter.pl
-----------------
Syntax: urdu-segmenter.pl filename
     or urdu-segmenter.pl -x filename
     or program_that_outputs_urdu_text | urdu-segmenter.pl -s [docid(optional)]
     or program_that_outputs_urdu_text | urdu-segmenter.pl -s -x [docid(optional)]
     eg: more sourcefile1.txt | urdu-segmenter.pl -s Title

The -s option must be the first argument; -x, when used, comes directly after it.
The xml tags shown below are output if and only if the -x option is used;
otherwise the sentences are printed one per line.

This script takes a utf8 encoded file with Urdu text as input and outputs to STDOUT, the text after segmenting it into sentences.

The xml format of the output is as follows
<DOC docid = "Filename" lang = "URD">
<SEG id="1">Urdu Sentence 1</SEG>
<SEG id="2">Urdu Sentence 2</SEG>
</DOC>

This script breaks urdu sentences based on the following delimiters:

 newline characters (every line break ends a sentence)
 runs of two or more whitespace characters
 [full stop]    Unicode 06D4 (Urdu full stop, looks like a dash)
 [question]     Unicode 061F
 [ellipsis]     Unicode 2026
 [bullet]       Unicode 2022
 [period]       Unicode 002E
 [exclamation]  Unicode 0021



END_OF_HELP
    exit;
}


#Code Starts here
# Parse the command line, set the globals $printxml and $filename that the
# rest of the script reads, and slurp the complete input text into $_.
#
# Accepted forms (the flags are positional):
#   -s [-x] [docid]   read the text from STDIN; docid is used verbatim
#   [-x] filename     read the text from the named utf8 file; the docid is
#                     the file name without its directory and last extension
my @cli_args = @ARGV;

my $read_stdin = (defined $cli_args[0] && $cli_args[0] =~ m/^-s$/) ? 1 : 0;
shift @cli_args if $read_stdin;

$printxml = (defined $cli_args[0] && $cli_args[0] =~ m/^-x$/) ? 1 : 0;
shift @cli_args if $printxml;

my $source_name = $cli_args[0];

if ($read_stdin) {
    # The docid is optional here; fall back to an empty string.
    $filename = defined $source_name ? $source_name : '';

    # Undefine the input record separator only while reading, so the whole
    # stream comes back from a single read.
    $_ = do { local $/; <STDIN> };
}
else {
    # Fail loudly when no file was named. An undefined name handed to a
    # three-argument open() is documented as a request for an anonymous
    # temporary file, so it would not trigger the "Cannot open" error below.
    die "No input file given; run urdu-segmenter.pl -h for usage\n"
        unless defined $source_name;

    open(my $input, "<:utf8", $source_name)
        or die "Cannot open file $source_name: $!";

    # Derive the docid: drop everything up to the last "/" and then the
    # last ".extension".
    ($filename = $source_name) =~ s/.*\///;
    $filename =~ s/\.[^\.]*$//;

    $_ = do { local $/; <$input> };
    close($input);
}
# _normalize_text($text)
#
# Prepares raw input for sentence splitting and returns the rewritten copy:
#   * carriage returns are removed;
#   * every newline is doubled, so each line break becomes a sentence break;
#   * a bullet (U+2022) and the whitespace around it become a run of newlines;
#   * trailing blanks on a line collapse to one space, and lines holding only
#     spaces/tabs become sentence breaks;
#   * pipe characters are removed.
# An undefined argument is treated as the empty string.
sub _normalize_text {
    my ($text) = @_;
    $text = '' unless defined $text;

    $text =~ s/\r//g;
    $text =~ s/\n/\n\n/g;

    $text =~ s/\s*\x{2022}\s*/\n\n\n\n\n/g;                 #Replace bullets with sentence breaks.

    # /m lets $ and ^ anchor at every line instead of only at the boundaries
    # of the whole (slurped) text, so the clean-up applies to each line.
    $text =~ s/\t* +\t*$/ /gm;
    $text =~ s/[\n\x{000D}][ ]+[\n\x{000d}]/\n\n/g;
    $text =~ s/^[\t\x{0020}]+$/\n\n/gm;

    # The pipe has to be escaped: a bare | is an alternation of two empty
    # patterns, which matches everywhere and removes nothing.
    $text =~ s/\|//g;

    return $text;
}

# $printxml and $filename are globals set while the command line is parsed.
# The opening tag goes out before any sentence is printed.
if ($printxml) {
    print "<DOC docid = \"$filename\" lang = \"URD\">\n";
}

$_ = _normalize_text($_);


# _collect_segments($text)
#
# Splits normalised text into sentences and returns them as a list of
# strings, each one a trimmed sentence followed by its terminating
# punctuation mark (if that mark is meant to be shown).
#
# split() is given a capturing pattern, so its result alternates between a
# piece of text (even index) and the delimiter that ended it (odd index).
# Delimiters are: two or more newlines, "!", U+061F, U+06D4, U+2022, a
# carriage return, two or more whitespace characters, U+2026 and ".".
#
# A piece is dropped, together with its delimiter, when after trimming it is
# three characters or shorter or consists of white space only.
#
# NOTE(review): the text is written into the <SEG> element without escaping
# "&", "<" or ">"; confirm whether consumers expect escaped xml.
sub _collect_segments {
    my ($text) = @_;
    return () unless defined $text;

    my @parts = split(/(\n{2,}|!|\x{061f}|\x{06D4}|\x{2022}|\x{000d}|\s{2,}|\x{2026}|\x{002e})/, $text);

    my @segments;
    for (my $k = 0; $k < @parts; $k += 2) {
        my $body = $parts[$k];
        $body =~ s/^\s*(.*?)\s*$/$1/g;                      #Trim white space at both ends.

        next if length($body) <= 3 || $body =~ m/^\s+$/;

        # The last piece may have no delimiter after it. Newlines, carriage
        # returns and bullets are never shown; neither is any delimiter made
        # of white space only (e.g. two spaces or tabs), which would
        # otherwise be printed as trailing blanks after the sentence.
        my $mark = $parts[$k + 1];
        if (!defined $mark
            || $mark =~ m/[\n\x{000d}\x{2022}]/
            || $mark =~ m/^\s+$/) {
            $mark = '';
        }

        push @segments, $body . $mark;
    }
    return @segments;
}

my @sentences = _collect_segments($_);

my $j = 1;                                                  #Segment counter, starts at 1.
for my $sentence (@sentences) {
    if ($printxml) {
        print "<SEG id=\"$j\">$sentence</SEG>\n";
    }
    else {
        print "$sentence\n";
    }
    $j++;
}

# Close the DOC element when xml output was requested.
if ($printxml) {
    print "</DOC>\n";                                       #Close DOC tag.
}

# A close() without an argument acts on the currently selected output handle,
# which is STDOUT here because the script never calls select(). Name the
# handle and check the result: errors from buffered writes are only reported
# when the handle is closed.
close(STDOUT)
    or die "Cannot close STDOUT: $!";
