#!/usr/local/bin/python
#
# .ltf.xml generator from source text file
#
# Usage : txt2ltf.py -l (BEN|URD|THA|ENG) [options] file.txt > file.ltf.xml
# Options:   -l specifies the language : BEN, URD, THA or ENG
#            -s : one segment per line in the input text file
#
# This tool uses bengali-segmenter.pl, urdu-segmenter.pl,
# thai-segmenter.pl, thaiWordSegmenter.sh and td-sentbrk
#
# Author: Kazuaki Maeda <maeda@ldc.upenn.edu>

import sys, re, codecs, os, os.path, popen2, getopt

# codecs.lookup() returns (encoder, decoder, stream_reader, stream_writer)
# in Python 2; Reader and Writer are used below to wrap pipes and stdout
# for UTF-8 decoding and encoding
Encode, Decode, Reader, Writer = codecs.lookup("UTF-8")

toolDir = os.path.dirname(sys.argv[0])

def fixCharacterEntity(s):
    # escape bare ampersands, but leave the string alone if it already
    # appears to contain a character entity such as &amp;
    if not re.match(".*&[a-zA-Z]*(?=;)", s):
        s = re.sub("&", "&amp;", s)
    return s
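
# For example, fixCharacterEntity("AT&T") returns "AT&amp;T", while
# "&amp; is already escaped" is returned unchanged.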

class Segmenter:
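    # Base class: sentenceSegment(line) returns a list of sentence strings.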
    def __init__(self, *args):
        pass

    def sentenceSegment(self, line):
        pass

class UrduSegmenter(Segmenter):
    def sentenceSegment(self, line):

        if os.path.exists(toolDir+"/../Sentence_Segmenter/urdu-segmenter.pl"):
            segmenter = toolDir+"/../Sentence_Segmenter/urdu-segmenter.pl"
        else:
            segmenter = "urdu-segmenter.pl"
                
        # feed the line to the external segmenter through a pipe and
        # read back one sentence per line
        (segOut, segIn) = popen2.popen2(segmenter+" -s")

        segIn = Writer(segIn)
        segOut = Reader(segOut)
        
        segIn.write(line)
        segIn.close()
        segments = segOut.read().split("\n")
        segOut.close()

        return segments
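
# processFile() below instantiates BengaliSegmenter, but the class was
# missing from the original script. This is a minimal reconstruction that
# mirrors UrduSegmenter, assuming the bengali-segmenter.pl mentioned in
# the header comment accepts the same -s flag.
class BengaliSegmenter(Segmenter):
    def sentenceSegment(self, line):

        if os.path.exists(toolDir+"/../Sentence_Segmenter/bengali-segmenter.pl"):
            segmenter = toolDir+"/../Sentence_Segmenter/bengali-segmenter.pl"
        else:
            segmenter = "bengali-segmenter.pl"

        (segOut, segIn) = popen2.popen2(segmenter+" -s")

        segIn = Writer(segIn)
        segOut = Reader(segOut)

        segIn.write(line)
        segIn.close()
        segments = segOut.read().split("\n")
        segOut.close()

        return segments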

class EnglishSegmenter(Segmenter):
    def sentenceSegment(self, line):

        (segOut, segIn) = popen2.popen2("td-sentbrk")

        segIn = Writer(segIn)
        segOut = Reader(segOut)
        
        segIn.write(line)
        segIn.close()
        segments = segOut.read().split("\n")
        segOut.close()

        return segments    

class ThaiSegmenter(Segmenter):
    def sentenceSegment(self, line):

        if os.path.exists(toolDir+"/../Sentence_Segmenter/thai-segmenter.pl"):
            segmenter = toolDir+"/../Sentence_Segmenter/thai-segmenter.pl"
        else:
            segmenter = "thai-segmenter.pl"

        (segOut, segIn) = popen2.popen2(segmenter+" -s")            

        segIn = Writer(segIn)
        segOut = Reader(segOut)
        
        segIn.write(line)
        segIn.close()
        segments = segOut.read().split("\n")
        segOut.close()

        return segments    

class Tokenizer:
    # Base class: tokenize(segment) returns a list of token strings.
    def __init__(self, *args):
        pass

    def tokenize(self, segment):
        pass

class BengaliTokenizer(Tokenizer):
    def tokenize(self, segment):
        segment = re.sub(",", " , ", segment)
        segment = re.sub("\.", " . ", segment)
        segment = re.sub("\|", " | ", segment)
        segment = re.sub("-", " - ", segment)
        segment = re.sub("\?", " ? ", segment)
        segment = re.sub("!", " ! ", segment)
        segment = re.sub("\"", " \" ", segment)
        segment = re.sub("\'", " ' ", segment)
        segment = re.sub("\(", " ( ", segment)
        segment = re.sub("\)", " ) ", segment)
        segment = re.sub("\{", " { ", segment)
        segment = re.sub("\{", " } ", segment)
        segment = re.sub("\<", " < ", segment)
        segment = re.sub("\>", " > ", segment)
        segment = re.sub("\;", " ; ", segment)
        segment = re.sub("\:", " : ", segment)
        segment = re.sub(u"\u0964", " "+u"\u0964"+" ", segment)
        segment = re.sub(u"\u0965", " "+u"\u0965"+" ", segment)
        segment = re.sub(u"\u09F7", " "+u"\u09F7"+" ", segment)
        segment = re.sub(u"\u09FB", " "+u"\u09FB"+" ", segment)

        segment = re.sub("\s+", " ", segment)
        segment = re.sub("&amp ;", "&amp;", segment)
        segment = re.sub("&quot ;", "&quot;", segment)
        segment = re.sub("&quote ;", "&quote;", segment)                
        
        tokens = segment.split()
        return tokens
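
    # For example:
    #   BengaliTokenizer().tokenize(u"abc, def!") -> [u"abc", u",", u"def", u"!"]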

class UrduTokenizer(BengaliTokenizer):
    pass

class EnglishTokenizer(Tokenizer):
    def tokenize(self, segment):
        segment = re.sub(",", " , ", segment)
        segment = re.sub("\.", " . ", segment)
        segment = re.sub("\|", " | ", segment)
        segment = re.sub("-", " - ", segment)
        segment = re.sub("\?", " ? ", segment)
        segment = re.sub("!", " ! ", segment)
        segment = re.sub("\"", " \" ", segment)
        segment = re.sub("\'", " ' ", segment)
        segment = re.sub("\(", " ( ", segment)
        segment = re.sub("\)", " ) ", segment)
        segment = re.sub("\{", " { ", segment)
        segment = re.sub("\{", " } ", segment)
        segment = re.sub("\<", " < ", segment)
        segment = re.sub("\>", " > ", segment)
        segment = re.sub("\;", " ; ", segment)
        segment = re.sub("\:", " : ", segment)

        segment = re.sub("\s+", " ", segment)
        segment = re.sub("&amp ;", "&amp;", segment)
        segment = re.sub("&quot ;", "&quot;", segment)
        segment = re.sub("&quote ;", "&quote;", segment)
        
        tokens = segment.split()
        return tokens    

class ThaiTokenizer(Tokenizer):
    def tokenize(self, segment):

        if os.path.exists(toolDir+"/../Tokenizer/ldc/thaiWordSegmenter.sh"):
            tokenizer = toolDir+"/../Tokenizer/ldc/thaiWordSegmenter.sh"
        else:
            tokenizer = "thaiWordSegmenter.sh"
            
        (tokOut, tokIn) = popen2.popen2(tokenizer)

        tokIn = Writer(tokIn)
        tokOut = Reader(tokOut)
        
        tokIn.write(segment)
        tokIn.close()
        tokenString = tokOut.read()

        tokenString = re.sub(",", " , ", tokenString)
        tokenString = re.sub("\.", " . ", tokenString)
        tokenString = re.sub("\|", " | ", tokenString)
        tokenString = re.sub("-", " - ", tokenString)
        tokenString = re.sub("\?", " ? ", tokenString)
        tokenString = re.sub("!", " ! ", tokenString)
        tokenString = re.sub("\"", " \" ", tokenString)
        tokenString = re.sub("\'", " ' ", tokenString)
        tokenString = re.sub("\(", " ( ", tokenString)
        tokenString = re.sub("\)", " ) ", tokenString)
        tokenString = re.sub("\{", " { ", tokenString)
        tokenString = re.sub("\{", " } ", tokenString)
        tokenString = re.sub("\<", " < ", tokenString)
        tokenString = re.sub("\>", " > ", tokenString)
        tokenString = re.sub("\;", " ; ", tokenString)
        tokenString = re.sub("\:", " : ", tokenString)
        tokenString = re.sub("\s+", " ", tokenString)
        tokenString = re.sub("&amp ;", "&amp;", tokenString)
        tokenString = re.sub("&quot ;", "&quot;", tokenString)
        tokenString = re.sub("&quote ;", "&quote;", tokenString)        

        tokens = tokenString.split()
        tokOut.close()
        return tokens 

class CreateLCTLText:
    def __init__(self, *args):
        self.lang = ""
        self.file = ""
    
    def processFile(self, file, lang, oneSegPerLine=0):
        self.file = file
        self.lang = lang
        
        docId = file.split("/")[-1]
        docId = re.sub("\.txt", "", docId)
        input = Reader(open(file))
        out = Writer(sys.stdout)

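        # charIndex tracks the running character offset into the source
        # file; SEG and TOKEN start_char/end_char values derive from it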
        charIndex = 0
        lineEndIndex = 0        
        startChar = 0
        endChar = 0    
        segIdNum = 1

        #source_type = "broadcast_news"
        source_type = "web_news"        
        author = "LDC"

        dtd = "ltf.v1.2.dtd"

        print '<?xml version="1.0"?>'
        print '<!DOCTYPE LCTL_TEXT SYSTEM "%s">' % dtd
        print '<LCTL_TEXT lang="%s" source_file="%s" source_type="%s" author="%s" encoding="UTF-8">' % (lang, file, source_type, author)
        print '<DOC id="%s" lang="%s">' % (docId, lang)
        print '<TEXT>'

        while 1:
            line = input.readline()
            if not line: break

            #print "TEST", charIndex, lineEndIndex, len(line)            

            line = re.sub("\n", "", line)
            line = re.sub("\r", "", line)            

            if re.match("\s*$", line):
                charIndex += len(line) + 1
                continue

            #print "TEST", charIndex, lineEndIndex, len(line)            

            m = re.match("^(\s*<[^>]*>\s*)(.*)$", line)
            m2 = re.match("^(\s+)(.*)$", line)
                
            if m:
                #startChar = charIndex + len(m.group(1))
                charIndex += len(m.group(1))
                line = m.group(2)
            elif m2:
                #startChar = charIndex + len(m2.group(1))
                charIndex += len(m2.group(1))
                line = m2.group(2)                    
            else:
                #startChar = charIndex
                pass

            #print "TEST", charIndex, lineEndIndex, len(line)            

            m = re.match("^([^<]*)(\s*<[^>]*>\s*)$", line)
            m2 = re.match("^(.*)(\s+)$", line)

            if m:
                lineEndIndex = charIndex + len(line)
                line = m.group(1)
            elif m2:
                lineEndIndex = charIndex + len(line)
                line = m2.group(1)
            else:
                lineEndIndex = charIndex + len(line)

            #print "TEST", charIndex, lineEndIndex, len(line)

            if lang == "BEN":
                seg = BengaliSegmenter()
            elif lang == "URD":
                seg = UrduSegmenter()                
            elif lang == "THA":
                seg = ThaiSegmenter()
            elif lang == "ENG":
                seg = EnglishSegmenter()

            if oneSegPerLine:
                segments = line.split("\n")
            else:
                segments = seg.sentenceSegment(line)
        
            for segment in segments:
                if segment == "":
                    continue

                m = re.match("^(\s*<[^>]*>\s*)(.*)$", segment)
                m2 = re.match("^(\s+)(.*)$", segment)
                
                if m:
                    startChar = charIndex + len(m.group(1))
                    charIndex += len(m.group(1))
                    segment = m.group(2)
                elif m2:
                    startChar = charIndex + len(m2.group(1))
                    charIndex += len(m2.group(1))
                    segment = m2.group(2)                    
                else:
                    startChar = charIndex

                m = re.match("^([^<]*)(\s*<[^>]*>\s*)$", segment)
                m2 = re.match("^(.*)(\s+)$", segment)                
                if m:
                    endChar = charIndex + len(m.group(1)) - 1
                    charIndex += len(segment) + 1
                    segment = m.group(1)
                elif m2:
                    endChar = charIndex + len(m2.group(1)) - 1
                    charIndex += len(segment) + 1
                    segment = m2.group(1)                    
                else:
                    endChar = charIndex + len(segment) - 1
                    charIndex += len(segment) + 1

                if segment == "":
                    continue                    

                segId = docId+"-"+str(segIdNum)

                segment = fixCharacterEntity(segment)
                print '<SEG id="%s" start_char="%s" end_char="%s">'  %(segId, str(startChar), str(endChar))
                print '<ORIGINAL_TEXT>',
                out.write(segment)
                print '</ORIGINAL_TEXT>'

                # tokenization

                restSegment = segment
                tokenIdNum = 1

                if lang == "BEN":
                    bt = BengaliTokenizer()
                if lang == "URD":
                    bt = UrduTokenizer()                    
                elif lang == "THA":
                    bt = ThaiTokenizer()
                elif lang == "ENG":
                    bt = EnglishTokenizer()                    
                
                tokens = bt.tokenize(segment)
                tokenOffset = 0
                for token in tokens:

                    #tokenId = "T"+"-"+str(segIdNum)+"-"+str(tokenIdNum)
                    tokenId = segId+"-"+str(tokenIdNum)
                    tokenStartOffset = restSegment[tokenOffset:].find(token) + tokenOffset
                    tokenEndOffset = tokenStartOffset + len(token) - 1

                    if tokenStartOffset >= 0:
                        tokenStart = startChar + tokenStartOffset
                        tokenEnd = startChar + tokenEndOffset
                    else:
                        tokenStart = -1
                        tokenEnd = -1

                    tokenOffset = tokenEndOffset + 1

                    token = fixCharacterEntity(token)
                    
                    print '  <TOKEN id="%s" start_char="%s" end_char="%s">' % (tokenId, str(tokenStart), str(tokenEnd)),
                    out.write(token)
                    print '</TOKEN>'
                    tokenIdNum += 1
                    
                print "</SEG>"

                segIdNum += 1

            charIndex = lineEndIndex + 1 # add 1 for newline

        print "</TEXT>"
        print "</DOC>"        
        print "</LCTL_TEXT>"

def printUsage():
    print "Usage : %s -l (BEN|URD|THA|ENG) [options] file.txt > file.ltf.xml" % sys.argv[0]
    print "Options:   -l specifies the language : BEN, URD, THA or ENG"
    print "           -s : one segment per line in the input text file"

try:
    opts, pargs = getopt.getopt(sys.argv[1:], 'l:s')
except getopt.GetoptError:
    printUsage()
    sys.exit()
    
lang = ""
oneSegPerLine = 0

for opt in opts:
    if opt[0] == "-l":
        lang = opt[1]
    elif opt[0] == "-s":
        oneSegPerLine = 1 

if not lang:
    printUsage()
    sys.exit()
    
if len(pargs) < 1:
    printUsage()
    sys.exit()
    
c = CreateLCTLText()
c.processFile(pargs[0], lang, oneSegPerLine)
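
# Example invocations ("story.txt" and "presegmented.txt" are hypothetical
# input file names):
#
#   txt2ltf.py -l URD story.txt > story.ltf.xml
#   txt2ltf.py -l THA -s presegmented.txt > presegmented.ltf.xml
#
# The second form (-s) skips sentence segmentation and treats each input
# line as one segment.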
