import argparse, sys, unicodedata, os, codecs, collections, time, math
import regex as re
import ycutils.tokenize, ycutils.tfidf, ycutils.bagofwords, ycutils.corpus, parse_sage

# Builds a tab-separated terms table from a SAGE file and writes it to stdout.
#
# Only effects whose name looks like '<prefix>-ideo-<label>' contribute: the
# union of their terms forms the rows, and each such effect becomes one column
# labelled with <label>. Optionally the rows are restricted to a manually
# pruned term list, and a 'background' column of log word counts is added.
# Progress messages go to stderr so stdout carries only the table.

parser = argparse.ArgumentParser(description='Creates terms file for model to use.')
# argparse.FileType reports an unreadable path as a usage error (instead of a
# raw IOError traceback) and additionally accepts '-' for stdin.
parser.add_argument('sage_file', type=argparse.FileType('r'), help='SAGE file.')
parser.add_argument('--with-background', type=argparse.FileType('r'), default=None, help='Include background effect using word count file.')
parser.add_argument('--pruned-list', type=argparse.FileType('r'), default=None, help='Use a manually pruned term list')
A = parser.parse_args()

# Group 1 is the label after '-ideo-'; the leading '.+' is greedy, so when the
# marker occurs more than once the label is whatever follows the last one.
ideo_pattern = re.compile(r'.+-ideo-(.+)')

sage = parse_sage.get_bows(A.sage_file)
all_terms = set()
sage_effects = {}  # effect key in `sage` -> column label
for eff, bow in sage.iteritems():
  ideo_match = ideo_pattern.match(eff)
  if not ideo_match: continue
  all_terms.update(bow.iterkeys())
  sage_effects[eff] = ideo_match.group(1)
#end for

if A.pruned_list:
  # One term per line; blank lines are ignored. Only terms present in both
  # the SAGE effects and the pruned list survive.
  stripped_lines = (line.strip() for line in A.pruned_list)
  pruned_terms = set(term for term in stripped_lines if term)
  all_terms &= pruned_terms
#end if

if A.with_background:
  # Every non-blank line must be exactly '<effects>\t<word counts>'; the
  # first field is not needed here, only the counts are accumulated.
  background = ycutils.bagofwords.BOW()
  for line in A.with_background:
    line = line.strip()
    if not line: continue

    _, wc_str = line.split('\t')
    background += ycutils.bagofwords.BOW(wc_string=wc_str)
  #end for

  # NOTE(review): math.log raises ValueError for a non-positive argument, so
  # this presumably relies on every selected term having a positive count in
  # the background file -- confirm against BOW's behaviour for missing keys.
  for w in all_terms: background[w] = math.log(background[w])
  sage['background'] = background
  sage_effects['background'] = 'background'
#end if

# Columns are ordered by effect key, not by the label shown in the header.
effects = sorted(sage_effects.keys())
labels = [sage_effects[eff] for eff in effects]
sys.stderr.write('Writing sage terms for {}\n'.format(labels))

sys.stdout.write('# terms\t' + '\t'.join(labels) + '\n')
for w in sorted(all_terms):
  sys.stdout.write(w + '\t')
  # A term missing from an effect is written as 0.0.
  sys.stdout.write('\t'.join([str(sage[key].get(w, 0.0)) for key in effects]))
  sys.stdout.write('\n')
#end for
sys.stderr.write('{} terms found.\n'.format(len(all_terms)))