#!/usr/local/bin/python
#
# This script adds POS tags to a file in LCTL TEXT XML format, using a POS
# tagger format file as the source of the tags.
#
# POS tagger format is:
# word tag
# word tag
# ...
# a "word tag" pair on each line, with a blank line between sentences.
#
# Usage: <this script> file.ltf.xml file.pos > file.ltf.pos.xml
# (the XML file is read from the first argument, the POS file from the second)
#
# Author: maeda@ldc.upenn.edu

import sys, re, codecs
from xml.dom import minidom

# UTF-8 codec entry points, fetched one by one from the codec registry.
UTF8Encode = codecs.getencoder('utf-8')
UTF8Decode = codecs.getdecoder('utf-8')
UTF8Reader = codecs.getreader('utf-8')
UTF8Writer = codecs.getwriter('utf-8')

# All XML output goes through this writer, which wraps standard output.
toWrite = UTF8Writer(sys.stdout)

def printUsage():
    """Write the command-line usage message to standard error.

    The argument order shown is the one main() actually reads: the XML file
    first, the POS file second.  The message goes to stderr because stdout
    carries the generated XML and is normally redirected to a file.
    """
    sys.stderr.write(
        "Usage : %s file.ltf.xml file.pos > file.ltf.pos.xml\n" % sys.argv[0])
    
# A line holding a complete <TOKEN ...>text</TOKEN> element; group 2 is the
# token text.
_TOKEN_LINE_RE = re.compile(r'(.*<TOKEN .*>)(.*)(</TOKEN>)')
# A line that already has a pos="..." attribute; group 2 is its current value.
_POS_ATTR_RE = re.compile(r'(.* pos=")([^"]*)(".*)$')
# Everything up to and including an id="..." attribute; the new pos attribute
# is inserted right after it.
_ID_ATTR_RE = re.compile(r'(.* id="[^"]*")(.*)$')


def _fail(message):
    """Report message on stderr and terminate with a non-zero exit status."""
    sys.stderr.write(message + "\n")
    sys.exit(1)


def _readLines(path):
    """Return the UTF-8 decoded lines of path; print usage and exit if it cannot be opened."""
    try:
        handle = open(path)
    except IOError:
        sys.stderr.write("ERROR: cannot open %s\n" % path)
        printUsage()
        sys.exit(1)
    try:
        return UTF8Reader(handle).readlines()
    finally:
        handle.close()


def _parseTokenPos(lines, path):
    """Turn "word tag" lines into a list of (token, pos) tuples, skipping blank lines."""
    pairs = []
    for lineNo, line in enumerate(lines):
        fields = line.split()
        if not fields:
            # Blank lines only separate sentences.
            continue
        if len(fields) != 2:
            _fail("ERROR: %s line %d is not a \"word tag\" pair"
                  % (path, lineNo + 1))
        pairs.append((fields[0], fields[1]))
    return pairs


def _setPos(line, pos):
    """Return line with its pos attribute set to pos, or None if it has neither pos nor id."""
    hasPos = _POS_ATTR_RE.match(line)
    if hasPos:
        # Replace the value of the existing attribute.
        return hasPos.group(1) + pos + hasPos.group(3) + "\n"
    hasId = _ID_ATTR_RE.match(line)
    if hasId:
        # No pos attribute yet: add one directly after the id attribute.
        return hasId.group(1) + " pos=\"" + pos + "\"" + hasId.group(2) + "\n"
    return None


def main():
    """Copy the XML file to stdout, adding a POS tag to every TOKEN line.

    sys.argv[1] is the XML file and sys.argv[2] is the POS tagger file.
    Lines that do not contain a <TOKEN ...>...</TOKEN> element are copied
    unchanged.  Each TOKEN line consumes the next (token, pos) pair of the
    POS file; the token text must be identical in both files.

    All diagnostics go to stderr (stdout carries the XML) and every failure
    exits with status 1: missing arguments, an unreadable file, a malformed
    POS line, too few POS entries, a token mismatch, or a TOKEN line with
    neither a pos nor an id attribute.
    """
    if len(sys.argv) < 3:
        printUsage()
        sys.exit(1)

    xmlLines = _readLines(sys.argv[1])
    tokenPos = _parseTokenPos(_readLines(sys.argv[2]), sys.argv[2])

    tokenIndex = 0
    for line in xmlLines:
        m = _TOKEN_LINE_RE.match(line)
        if not m:
            toWrite.write(line)
            continue

        if tokenIndex >= len(tokenPos):
            _fail("ERROR: POS file has only %d tokens but the XML has more"
                  % len(tokenPos))
        token, pos = tokenPos[tokenIndex]
        tokenIndex += 1

        if token != m.group(2):
            # %r keeps the message ASCII-safe for non-ASCII tokens.
            _fail("ERROR: tokens do not match (token %d: POS file has %r, "
                  "XML has %r)" % (tokenIndex, token, m.group(2)))

        newLine = _setPos(line, pos)
        if newLine is None:
            _fail("ERROR: TOKEN line has neither a pos nor an id attribute: %r"
                  % line)
        toWrite.write(newLine)

    if tokenIndex < len(tokenPos):
        # The output is complete, but the inputs were probably not a pair.
        sys.stderr.write("WARNING: %d POS entries were not used\n"
                         % (len(tokenPos) - tokenIndex))
            
# Run the tagger only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
    
