#!/usr/bin/perl
use strict;
use warnings;

#-----Description------------------------------------------------------
#
# Program: urdu-segmenter.pl
# Written by: Danish Munir
# Purpose: breaks Urdu text into sentences
#
# Syntax: urdu-segmenter.pl [-x] filename
#     or  program_that_outputs_urdu_text | urdu-segmenter.pl -s [-x] [docid]
#
# This script takes UTF-8 encoded Urdu text (from a file, or from STDIN
# when -s is given) and writes the text to STDOUT, one sentence per line.
#
# With -x each sentence is wrapped in markup:
#
#   <DOC docid="...">
#   <SEG id="1">Urdu Sentence 1</SEG>
#   <SEG id="2">Urdu Sentence 2</SEG>
#   </DOC>
#
# NOTE(review): the tag text was missing from the copy of the script this
# revision was made from; the DOC/SEG markup above was reconstructed from
# the surviving comments ("Close segment tag", "Close DOC tag") and from
# the otherwise unused docid and segment counter. Confirm the exact tag
# and attribute names against downstream consumers.
#
# Sentences are split on the following delimiters:
#   U+06D4, U+061F, U+2026 (ellipsis), U+2022 (bullet),
#   "!" , "." (U+002E), any newline, and any run of two or more
#   whitespace characters.
#-----------------------------------------------------------------------

# Fragments shorter than this many characters (after trimming) are
# discarded together with their terminating delimiter.
use constant MIN_SENTENCE_LENGTH => 4;

# Returns the help text printed for -h / -help / --help.
sub help_text {
    return <<'END_HELP';

 urdu-segmenter.pl
 ----------------
 Syntax: urdu-segmenter.pl [filename]
     or  urdu-segmenter.pl -x [filename]
     or  program_that_outputs_urdu_text | urdu-segmenter.pl -s [docid(optional)]
     or  program_that_outputs_urdu_text | urdu-segmenter.pl -s -x [docid(optional)]

 eg: more sourcefile1.txt | urdu-segmenter.pl -s Title

 The -x option is used to output xml tags, if and only if the -x option is used

 This script takes a utf8 encoded file with Urdu text as input
 and outputs to STDOUT, the text after segmenting it into sentences.

 The xml format of the output is as follows

   <DOC docid="...">
   <SEG id="1">Urdu Sentence 1</SEG>
   <SEG id="2">Urdu Sentence 2</SEG>
   </DOC>

 This script breaks urdu sentences based on the following punctuations:
   multiple newline characters
   [dash] Unicode 06D4
   [question] Unicode 061F
   [ellipsis] Unicode 2026
   [bullet] Unicode 2022

END_HELP
}

# Parses the command line.
#
# Accepts -h / -help / --help as the first argument, then the flags -s
# (read STDIN) and -x (emit markup) once each in either order, then one
# positional argument: the input file name, or with -s an optional docid.
#
# Returns a hash reference with keys help, stdin, xml, source and docid.
# Dies when no input file is given in file mode.
sub parse_args {
    my @args = @_;
    my %opt = (help => 0, stdin => 0, xml => 0, source => undef, docid => '');

    if (@args && $args[0] =~ m/^(?:-h|-+help)$/) {
        $opt{help} = 1;
        return \%opt;
    }

    my %flag_name = (s => 'stdin', x => 'xml');
    while (@args && $args[0] =~ m/^-([sx])$/ && !$opt{ $flag_name{$1} }) {
        $opt{ $flag_name{$1} } = 1;
        shift @args;
    }

    $opt{source} = shift @args;

    if ($opt{stdin}) {
        # With -s the positional argument is only a document id.
        $opt{docid} = defined $opt{source} ? $opt{source} : '';
    }
    else {
        die "No input file given; run with -h for usage.\n"
            unless defined $opt{source};

        # The docid is the file name without its path and extension.
        my $docid = $opt{source};
        $docid =~ s/.*\///;
        $docid =~ s/\.[^\.]*$//;
        $opt{docid} = $docid;
    }

    return \%opt;
}

# Reads the whole input at once, from STDIN when $from_stdin is true,
# otherwise from the UTF-8 file at $path. Dies if the file cannot be
# opened. Returns the text ('' for empty input).
sub read_input {
    my ($from_stdin, $path) = @_;

    local $/ = undef;    # slurp mode, restored when this sub returns

    my $text;
    if ($from_stdin) {
        $text = <STDIN>;
    }
    else {
        open(my $in, '<:utf8', $path)
            or die "Cannot open file $path: $!";
        $text = <$in>;
        close($in);
    }

    return defined $text ? $text : '';
}

# Prepares raw text for splitting: strips carriage returns, turns every
# newline and every bullet into a sentence break, and removes pipes.
sub normalize_text {
    my ($text) = @_;

    $text =~ s/\r//g;
    $text =~ s/\n/\n\n/g;      # every line break becomes a sentence break

    $text =~ s/\s*\x{2022}\s*/\n\n\n\n\n/g;    # bullets -> sentence breaks

    # NOTE(review): the first and third substitutions below have no /m, so
    # their "$" / "^" anchors only match at the ends of the whole text, not
    # per line. Left as they were; the later split on \s{2,} and the
    # trimming of each fragment cover whitespace-only lines anyway.
    $text =~ s/\t* +\t*$/ /g;
    $text =~ s/[\n\x{000D}][ ]+[\n\x{000d}]/\n\n/sg;
    $text =~ s/^[\t\x{0020}]+$/\n\n/g;

    # Remove pipe characters. The pipe must be escaped: a bare "|" is an
    # alternation of two empty patterns and removes nothing.
    $text =~ s/\|//g;

    return $text;
}

# Splits normalized text into sentences. Returns a list of strings, each
# a trimmed sentence followed by its delimiter, except that delimiters
# containing a newline, carriage return or bullet are not appended.
sub split_sentences {
    my ($text) = @_;

    # Because the delimiter pattern is captured, split returns the pieces
    # alternately: sentence at even indexes, its delimiter at the next
    # odd index.
    my @pieces = split(
        /(\n{2,}|!|\x{061f}|\x{06D4}|\x{2022}|\x{000d}|\s{2,}|\x{2026}|\x{002e})/,
        $text
    );

    my @sentences;
    for (my $i = 0; $i < @pieces; $i += 2) {
        my $body = $pieces[$i];
        $body =~ s/^\s+//;
        $body =~ s/\s+$//;

        # Empty and very short fragments are dropped.
        next if length($body) < MIN_SENTENCE_LENGTH;

        # The last sentence may have no delimiter after it.
        my $terminator = defined $pieces[$i + 1] ? $pieces[$i + 1] : '';
        $terminator = '' if $terminator =~ m/[\n\x{000d}\x{2022}]/;

        push @sentences, $body . $terminator;
    }

    return @sentences;
}

# Normalizes $text and returns the list of sentences found in it.
sub segment_text {
    my ($text) = @_;
    return split_sentences(normalize_text($text));
}

# Formats the sentences for output: one per line, and when $printxml is
# true wrapped in SEG elements inside a DOC element carrying $docid.
# Sentence text is inserted as-is (no XML escaping). Returns the output
# as one string.
sub render_output {
    my ($sentences, $printxml, $docid) = @_;

    my $out = '';
    $out .= qq{<DOC docid="$docid">\n} if $printxml;

    my $segment_id = 1;
    for my $sentence (@$sentences) {
        $out .= $printxml
            ? qq{<SEG id="$segment_id">$sentence</SEG>\n}
            : "$sentence\n";
        $segment_id++;
    }

    $out .= "</DOC>\n" if $printxml;
    return $out;
}

# Program entry point. Returns the process exit status.
sub main {
    my @args = @_;

    # The standard streams get their UTF-8 layer here, and files get it in
    # open(), so no -C switch is needed on the #! line (where it would
    # abort with "Too late for -C" when run as "perl urdu-segmenter.pl").
    binmode STDOUT, ":utf8";
    binmode STDIN,  ":utf8";

    my $opt = parse_args(@args);
    if ($opt->{help}) {
        print help_text();
        return 0;
    }

    my $text      = read_input($opt->{stdin}, $opt->{source});
    my @sentences = segment_text($text);
    print render_output(\@sentences, $opt->{xml}, $opt->{docid});

    return 0;
}

# Run only when executed as a script, so the subs can be loaded and
# called without side effects.
exit main(@ARGV) unless caller;

1;