#!/usr/bin/python
#
# This script converts the ltf.xml format into the POS tagger format
#
# POS tagger format is:
#
# word tag
# word tag
# ...
# a "word tag" pair on each line, with a blank line between sentences.
#
# Usage : ./ltf2pos.py file.ltf.xml > file.pos
#
# Author: maeda@ldc.upenn.edu

import sys, re, codecs
from xml.dom import minidom

# Look up the UTF-8 codec once and expose its four entry points
# (encoder, decoder, stream reader class, stream writer class) by name.
_utf8Codec = codecs.lookup('utf-8')
UTF8Encode, UTF8Decode, UTF8Reader, UTF8Writer = _utf8Codec

# Output stream used for the "word tag" lines: text written through it is
# encoded as UTF-8 before being handed to sys.stdout.
toWrite = UTF8Writer(sys.stdout)

def _idNumber(node):
    """Return the number at the end of node's "id" attribute, or 0.

    The id must end in a hyphen followed by digits (for example
    "segment-12" gives 12).  Any other id, including an empty one,
    gives 0.
    """
    m = re.match(r".*-(\d+)$", node.getAttribute("id"))
    if m:
        return int(m.group(1))
    return 0

def cmpById(a, b):
    """Three-way comparison of two DOM elements by the number in their id.

    a, b -- objects providing getAttribute("id") (e.g. minidom elements).

    Returns -1, 0 or 1 when a's id number is respectively smaller than,
    equal to, or larger than b's.  Ids without a trailing "-<digits>"
    count as 0.
    """
    aIdNum = _idNumber(a)
    bIdNum = _idNumber(b)
    # Same -1/0/1 result as cmp(aIdNum, bIdNum), but without relying on
    # the cmp() builtin, which only exists in Python 2.
    return (aIdNum > bIdNum) - (aIdNum < bIdNum)
    
def getText(nodelist):
    """Return the concatenated data of the text nodes in nodelist.

    Only nodes whose nodeType is TEXT_NODE contribute; other nodes are
    skipped and not descended into.  A text node consisting of exactly
    one newline contributes a single space instead.
    """
    pieces = [node.data for node in nodelist
              if node.nodeType == node.TEXT_NODE]
    return "".join([" " if piece == "\n" else piece for piece in pieces])

def handleSeg(seg):
    """Write one "word tag" line per TOKEN in seg, then a blank line.

    seg -- a DOM element whose TOKEN descendants are the words of one
           sentence.

    Tokens are written in ascending order of the number at the end of
    their "id" attribute (see cmpById).  The tag is the token's "pos"
    attribute, or "none" when that attribute is empty or absent.
    All output goes through the module-level toWrite stream.
    """
    tokens = seg.getElementsByTagName("TOKEN")
    tokens.sort(cmpById)
    for token in tokens:
        pos = token.getAttribute("pos") or "none"
        toWrite.write("%s %s\n" % (getText(token.childNodes), pos))
    # Blank line separating sentences.  Written through toWrite, like the
    # token lines, so that all output takes the same path to stdout.
    toWrite.write("\n")

def handleDoc(doc):
    """Output every SEG found under the first TEXT element of doc.

    doc -- a DOM element for one DOC.

    Segments are processed in ascending order of the number at the end
    of their "id" attribute (see cmpById); each one is passed to
    handleSeg.  Raises IndexError if doc contains no TEXT element.
    """
    textElement = doc.getElementsByTagName("TEXT")[0]
    orderedSegs = sorted(textElement.getElementsByTagName("SEG"), cmpById)
    for seg in orderedSegs:
        handleSeg(seg)

def printUsage():
    """Write the command-line usage summary to standard error."""
    # stderr, not stdout: stdout carries the converted data and is normally
    # redirected into the .pos file, where a usage message would be mistaken
    # for output.
    sys.stderr.write("Usage : %s file.ltf.xml > file.pos\n" % sys.argv[0])
    
def main():
    """Convert the LTF XML input to POS tagger format on standard output.

    The input is the file named by the first command-line argument, or
    standard input when no argument is given.  If the named file cannot
    be opened, the usage message is printed and the script exits with
    status 1.  Every DOC element of the parsed document is passed to
    handleDoc.
    """
    if len(sys.argv) > 1:
        try:
            # Binary mode: the XML parser does the decoding itself.
            f = open(sys.argv[1], 'rb')
        except IOError:
            printUsage()
            sys.exit(1)
        try:
            filestring = f.read()
        finally:
            f.close()
    else:
        filestring = sys.stdin.read()

    lctl_text = minidom.parseString(filestring)

    for doc in lctl_text.getElementsByTagName("DOC"):
        handleDoc(doc)
            
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
    
