#!/usr/bin/python

# Linguistic Data Consortium

import sys, re, codecs, string, os, popen2, getopt, string

Encode, Decode, Reader, Writer = codecs.lookup("UTF-8")

class UrduTokenizer:
    """Whitespace tokenizer for Urdu text that splits punctuation into
    separate tokens while protecting SGML character entities."""

    def tokenize(self, segment):
        """Return a list of tokens for *segment*.

        Pads ASCII punctuation and several Indic punctuation/sign
        characters with spaces, re-joins entities such as "&amp;" that
        the ";" split broke apart, then splits on whitespace.  Returns
        [] for an empty or all-whitespace segment.
        """
        segment = re.sub(r",", " , ", segment)
        segment = re.sub(r"\.", " . ", segment)
        segment = re.sub(r"\|", " | ", segment)
        segment = re.sub(r"-", " - ", segment)
        segment = re.sub(r"\?", " ? ", segment)
        segment = re.sub(r"!", " ! ", segment)
        segment = re.sub(r"\"", " \" ", segment)
        segment = re.sub(r"'", " ' ", segment)
        segment = re.sub(r"\(", " ( ", segment)
        segment = re.sub(r"\)", " ) ", segment)
        segment = re.sub(r"\{", " { ", segment)
        # BUG FIX: the original repeated the "{" pattern here with a "}"
        # replacement, which rewrote every "{" (just padded above) into
        # "}" and never split literal "}".  Match "}" as intended.
        segment = re.sub(r"\}", " } ", segment)
        segment = re.sub(r"<", " < ", segment)
        segment = re.sub(r">", " > ", segment)
        segment = re.sub(r";", " ; ", segment)
        segment = re.sub(r":", " : ", segment)
        # Indic punctuation/sign characters get the same space padding.
        segment = re.sub(u"\u0964", " " + u"\u0964" + " ", segment)
        segment = re.sub(u"\u0965", " " + u"\u0965" + " ", segment)
        segment = re.sub(u"\u09F7", " " + u"\u09F7" + " ", segment)
        segment = re.sub(u"\u09FB", " " + u"\u09FB" + " ", segment)

        segment = re.sub(r"\s+", " ", segment)
        # The ";" split above breaks SGML entities; stitch them back.
        segment = re.sub("&amp ;", "&amp;", segment)
        segment = re.sub("&quot ;", "&quot;", segment)
        segment = re.sub("&quote ;", "&quote;", segment)

        tokens = segment.split()
        return tokens

class TokenizeUrdu:
    """Drive UrduTokenizer over a line-oriented UTF-8 stream and write
    one line of space-separated tokens per input line to stdout."""

    def __init__(self, *args):
        # Unused placeholders; kept for interface compatibility.
        self.lang = ""
        self.file = ""

    def processFile(self, fs):
        """Read *fs* as UTF-8, tokenize each line with UrduTokenizer,
        and emit the tokens space-joined (one output line per input
        line) to stdout, also UTF-8 encoded."""
        input = Reader(fs)
        out = Writer(sys.stdout)
        # Hoisted out of the read loop: the tokenizer holds no state,
        # so a single instance serves every line.
        bt = UrduTokenizer()
        while 1:
            line = input.readline()
            if not line:
                break
            tokens = bt.tokenize(line)
            out.write(" ".join(tokens))
            # Emit the newline through the same UTF-8 writer instead of
            # a bare print interleaving with the codec stream.
            out.write("\n")

def printUsage():
    print "Usage : %s input.txt > output.txt" % sys.argv[0]
    
# Script entry: parse options (none are defined), pick the input stream,
# and run the tokenizer over it.
try:
    opts, pargs = getopt.getopt(sys.argv[1:], '')
except getopt.GetoptError:
    # Only option-parsing errors mean "bad usage"; a bare except here
    # would also swallow SystemExit/KeyboardInterrupt.
    printUsage()
    sys.exit()

if len(pargs) < 1:
    # No positional argument: read from standard input.
    fs = sys.stdin
else:
    # Use the parsed positional argument rather than sys.argv[1], so a
    # leading "--" separator is handled correctly.
    fs = open(pargs[0])

c = TokenizeUrdu()
c.processFile(fs)

