#!/usr/local/bin/python
# Simple morphology lookup tool for Urdu
# 
# Usage: ./morphlookup.py ../../Lexicon/Urdu_v0.6.llf.xml [wordlist]
#        (tokens are read from standard input when no wordlist is given)
#
# Author: Seth Erickson <rseth@ldc.upenn.edu>

import sys, re, codecs, string, xml.dom.minidom 

# UTF-8 codec pieces: the encode/decode functions plus the stream reader and
# stream writer classes used to wrap byte streams.
Encode, Decode, Reader, Writer = codecs.lookup("UTF-8")

# Line-level patterns for the lexicon XML.  Raw strings are used so that
# regex escapes such as \+ are not parsed as (invalid) string escapes.
re_word = re.compile(r"<WORD>(.*)</WORD>")    # captures the text inside <WORD>...</WORD>
re_endentry = re.compile(r"</ENTRY>")         # closing tag of an entry
re_pos = re.compile(r"<MORPH>(.*)</MORPH>")   # captures the text inside <MORPH>...</MORPH>
re_plus = re.compile(r"\+")                   # a literal "+" anywhere in the line

def _read_utf8_lines(path):
    """Return the UTF-8 decoded lines of the file at *path*, line ends kept."""
    # Open in binary mode: the codec stream reader decodes bytes itself, and
    # the with-block guarantees the handle is closed once the lines are read.
    with open(path, "rb") as handle:
        return Reader(handle).readlines()


def _build_morph_index(dict_lines):
    """Map each <WORD> form to the "+"-bearing <MORPH> values of its entry.

    For the first line on which a given word appears, the following lines are
    scanned up to (not including) the next </ENTRY> line; every line that has
    a <MORPH> element and contains a "+" contributes that element's text.
    Later occurrences of the same word are ignored, so a lookup returns the
    analyses of the first matching entry only.  An entry that reaches the end
    of the input without a closing </ENTRY> is treated as ending there.
    """
    index = {}
    total = len(dict_lines)
    for pos, line in enumerate(dict_lines):
        found = re_word.search(line)
        if found is None or found.group(1) in index:
            continue
        morphs = []
        # Walk by position rather than slicing so no copy of the tail is made.
        for nxt in range(pos + 1, total):
            entry_line = dict_lines[nxt]
            # The end-of-entry test comes first: a line carrying both a
            # <MORPH> and </ENTRY> closes the entry without being collected.
            if re_endentry.search(entry_line) is not None:
                break
            tag = re_pos.search(entry_line)
            if tag is not None and re_plus.search(entry_line) is not None:
                morphs.append(tag.group(1))
        index[found.group(1)] = morphs
    return index


def main(argv=None):
    """Print the morphological analyses found in a lexicon for each token.

    argv[1] is the path of the lexicon file.  argv[2], when present, is the
    path of a token file with one token per line; otherwise the tokens are
    read from standard input.  Both inputs are decoded as UTF-8.  For every
    token one line of the form ``token:(morph)(morph)...`` is written to
    standard output as UTF-8; a token with no analyses yields ``token:``.

    Args:
        argv: argument list including the program name; defaults to sys.argv.

    Returns:
        1 after printing the usage message when no lexicon path was given;
        otherwise None (which sys.exit treats as success).
    """
    if argv is None:
        argv = sys.argv

    # Validate the arguments before touching any stream or file.
    if len(argv) < 2:
        print("Usage: morphlookup.py <dictionary> [token file] ")
        return 1

    # Index the lexicon once instead of rescanning it for every token.
    morph_index = _build_morph_index(_read_utf8_lines(argv[1]))

    # The codec reader/writer operate on byte streams.  Python 3 exposes the
    # bytes layer of the standard streams as .buffer; where that attribute is
    # absent (Python 2) the stream object itself already carries bytes.
    if len(argv) > 2:
        tokens = _read_utf8_lines(argv[2])
    else:
        tokens = Reader(getattr(sys.stdin, "buffer", sys.stdin)).readlines()
    out = Writer(getattr(sys.stdout, "buffer", sys.stdout))

    for token in tokens:
        # Drop the line terminator; "\r" is included so that CRLF token
        # files still match now that the input is read in binary mode.
        token = token.rstrip("\r\n")
        analyses = "".join("(%s)" % morph for morph in morph_index.get(token, ()))
        out.write("%s:%s\n" % (token, analyses))
                

# Script entry point: run the lookup and pass main()'s return value to
# sys.exit so it becomes the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
