"""
Wed Feb  4 11:31:05 EST 2009

This is the collection of utility tools for standalizing the distiled texts
for experiment using HBC.

Roughtly, the function goes more specific ones to general ones.

"""

import sys
import os 


####### General utility functions. ####################################

def make_hash( f_dir, file_stem_list ):
	f_list = os.listdir(f_dir)
	file_lkup = {}
	for f in f_list:
		for file_stem in file_stem_list:
			if file_stem in f: 
				file_lkup[file_stem] = f
	return file_lkup
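
## e.g. (hypothetical filenames): if f_dir holds 'doc01.body.txt', then
## make_hash(f_dir, ['doc01']) returns {'doc01': 'doc01.body.txt'};
## any file whose name contains the stem is matched.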

def split_crps( splt, file_stem_list ):
	train_range = int(splt*len(file_stem_list))	
	train = file_stem_list[:train_range]  ## split the file stems into train and test portions
	test = file_stem_list[train_range:]
	return train, test
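
## e.g. split_crps(0.8, stems) keeps the first 80% of the stems for
## training and leaves the remaining 20% for testing.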

def write_file_list(outfile, body_dir, body_lkup, train):
	_out = open(outfile, 'w')
	for f in train:
		file = body_dir + body_lkup[f]
		_out.write('%s\n' % (file, ))
	_out.close()


def write_file_list_ids(outfile, body_dir, body_lkup, train, cmnt_ids, \
	auth_dir, auth_lkup ):
	_out = open(outfile, 'w')
	for f in train:
		file = body_dir + body_lkup[f]
		_out.write('%s\n' % (file, ))
		file = auth_dir + auth_lkup[f]
		_out.write('%s\n' % (file, ))
		for id in cmnt_ids[f]:
			_out.write('%s ' % (id, ))
		_out.write('\n\n')
	_out.close()

def write_vocab_file(vocab_file, body_vocab):
	id = 1
	lkup = {}
	words = sorted(body_vocab.keys())
	_fh = open(vocab_file, 'w')
	for w in words:
		_fh.write('%d %s\n' % (id, w))
		lkup[w] = id
		id += 1
	_fh.close()
	return lkup
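
## write_vocab_file produces one "<id> <word>" pair per line, with ids
## assigned in sorted word order starting from 1, e.g. (made-up words):
##   1 apple
##   2 zebra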



######### For counting words in the distilled files #######################
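
## Format assumed by the parsing code below (ids and tokens are made up):
## each comment is a marker line followed by its content on the next line,
##   <COMMENT_DATA_SEC_0001>
##   word word word ...
## the author files use the same markers with a single user id on the line
## that follows.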

def count_words(file, dir, dict):
	count = 0
	_fh = open(dir + file, 'r')
	for line in _fh:
		for item in line.split():
			if item in dict: count += 1
	_fh.close()
	return count

def count_cmnt_words(file, dir, dict):
	word_c = 0
	cmnt_c = 0
	cmnt_ids = [] 
	_fh = open(dir + file, 'r')
	lines = _fh.readlines()
	_fh.close()
	for i in range(len(lines)):
		if lines[i].startswith('<COMMENT_DATA_SEC_'):
			items = lines[i+1].split()
			if len(items) == 0: continue
			count = 0
			for item in items:
				if item in dict: count += 1
			if count == 0: continue
			cmnt_c += 1
			cmnt_ids.append(lines[i].rstrip('\n'))
			word_c += count
	return cmnt_c, word_c, cmnt_ids


####### for building the vocab #####################################
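
## The count_vocab_words* helpers recount, over the training set, how often
## each vocab entry occurs; the *_cmnt and *_auth variants only look at
## comments whose marker id is listed in cmnt_ids[f]. The build_vocab*
## functions below construct the raw token -> frequency dicts in the first place.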

def count_vocab_words( vocab, dir, lkup, train):
	vocab_freq = {}
	for i in vocab.keys(): vocab_freq[i] = 0
	for f in train:
		_fh = open( dir + lkup[f], 'r')
		for line in _fh:
			if line.startswith('<COMMENT_DATA_SEC_'): continue
			for item in line.split():
				if item in vocab: vocab_freq[item] += 1
		_fh.close()
	return vocab_freq 


def count_vocab_words_cmnt(vocab, dir, lkup, train, cmnt_ids):
	vocab_freq = {}
	for i in vocab.keys(): vocab_freq[i] = 0

	for f in train:
		_fh = open( dir + lkup[f], 'r')
		lines = _fh.readlines()
		for i in range(len(lines)):
			if lines[i].startswith('<COMMENT_DATA_SEC_'):
				id = lines[i].rstrip('\n')
				if id in cmnt_ids[f]: 
					for item in lines[i+1].split():
						if item in vocab: vocab_freq[item] += 1
		_fh.close()
	return vocab_freq 


def count_vocab_words_auth(vocab, dir, lkup, train, cmnt_ids): 
	vocab_freq = {}
	for i in vocab.keys(): vocab_freq[i] = 0
	for f in train:
		_fh = open( dir + lkup[f], 'r')
		lines = _fh.readlines()
		for i in range(len(lines)):
			if lines[i].startswith('<COMMENT_DATA_SEC_'):
				id = lines[i].rstrip('\n')
				if id in cmnt_ids[f]: 
					usr_id = lines[i+1].rstrip('\n')
					usr_id = usr_id.rstrip(' ')
					if usr_id in vocab: vocab_freq[usr_id] += 1
		_fh.close()
	return vocab_freq 


def build_vocab( body_lkup, body_dir, train ):
	vocab = {}
	for file_stem in body_lkup.keys():
		if file_stem in train:
			_tmp = open( body_dir + body_lkup[file_stem], 'r' )
			for line in _tmp:
				## skipping marker lines matters only for comment files; harmless for body files
				if line.startswith('<COMMENT_DATA_SEC_'): continue
				items = line.split()
				for item in items:
					if item not in vocab: vocab[item] = 1
					else: vocab[item] += 1 
			_tmp.close()
	return vocab

def build_vocab_auth( auth_lkup, auth_dir, train ):
	vocab = {}
	for file_stem in auth_lkup.keys():
		if file_stem in train:
			_tmp = open( auth_dir + auth_lkup[file_stem], 'r' )
			for line in _tmp:
				if line.startswith('<COMMENT_DATA_SEC_'): continue
				if len(line.split()) == 0: continue
				line = line.rstrip('\n')
				line = line.rstrip(' ')
				if line not in vocab: vocab[line] = 1
				else: vocab[line] += 1 
			_tmp.close()
	return vocab

def trim_vocab( vocab, CUTOFF, stop_words):
	## remove stop words, meta tokens, and rare words (frequency <= CUTOFF)
	for i in sorted( vocab.keys()):
		if i in stop_words: vocab.pop(i)
		elif i.startswith('_'): vocab.pop(i)  ## also covers the '_meta' prefix
		elif '_meta_' in i: vocab.pop(i)
		elif vocab[i] <= CUTOFF: vocab.pop(i)
	return vocab


############# For output of the HBC formatted files ####################
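
## write_hbc_body emits one line per document: the document's tokens mapped
## to their vocab ids and joined by spaces, e.g. (made-up ids): "12 7 431 9".
## write_hbc_cmnt emits a parallel pair of files, one line of comment word
## ids and one line of author ids per document.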

def write_hbc_body(body_dir, body_lkup, train, out_file, vocab_id): 
	_out = open( out_file, 'w')
	for f in train:
		file = body_dir + body_lkup[f]
		_fh = open(file, 'r')
		doc = ''
		for line in _fh:
			if line == '\n': continue
			items = line.split()
			for i in items:
				if i in vocab_id: doc += '%d ' % (vocab_id[i],)
		if len(doc) == 0: print(file)  ## this should not happen for the training set.
		_out.write(doc.rstrip(' '))
		_out.write('\n')
		_fh.close()
	_out.close()


def write_hbc_cmnt(cmnt_dir, cmnt_lkup, train, cmnt_out_file, cmnt_vocab_id,\
	auth_dir, auth_lkup, auth_out_file, auth_vocab_id, train_cmnts):

	_out = open( cmnt_out_file, 'w')
	_out2 = open( auth_out_file, 'w')
	for f in train:
		_fh = open(cmnt_dir + cmnt_lkup[f], 'r')
		_fh_auth = open(auth_dir + auth_lkup[f], 'r')
		lines_cmnt = _fh.readlines()
		lines_auth = _fh_auth.readlines()

		cmnt_hash = {}
		wc_l = len(lines_cmnt)
		for i in range(wc_l):
			if '<COMMENT_DATA_SEC_' in lines_cmnt[i]:
				id = lines_cmnt[i].rstrip('\n')
				items = lines_cmnt[i+1].split()
				cmnt_hash[ id ] = items
		auth_hash = {}
		wc_l = len(lines_auth)
		for i in range(wc_l):
			if '<COMMENT_DATA_SEC_' in lines_auth[i]:
				id = lines_auth[i].rstrip('\n')
				item = lines_auth[i+1].rstrip('\n')
				item = item.rstrip(' ')
				auth_hash[ id ] = item

		### sanity check; if this fails, something is very wrong
		if len(cmnt_hash) != len(auth_hash):
			print('something is very wrong\n')
			print('%d %d' % (len(cmnt_hash), len(auth_hash)))
			sys.exit(1)
		doc_cmnt = ''
		doc_auth = ''
		for i in train_cmnts[f]:  ## each train_cmnts[f] is a list of comment ids.
			auth_id = auth_hash[i]
			for word in cmnt_hash[i]:
				if word in cmnt_vocab_id:
					doc_cmnt += '%d ' % (cmnt_vocab_id[word],)
					## the block below is a precaution for the test dataset;
					## for the training set, a KeyError does not happen.
					##doc_auth += '%d ' % (auth_vocab_id[auth_id],)
					if auth_id in auth_vocab_id:
						doc_auth += '%d ' % (auth_vocab_id[auth_id],)
					else:
						doc_auth += '000 '
		if len(doc_cmnt) == 0: print(f)  ## empty comment doc; unexpected for training
		if len(doc_auth) == 0: print(f)  ## empty author doc; unexpected for training
		_out.write(doc_cmnt.rstrip(' '))
		_out.write('\n')
		_out2.write(doc_auth.rstrip(' '))
		_out2.write('\n')
		_fh.close()
		_fh_auth.close()
	### end of the "for f in train" loop.
	_out.close()
	_out2.close()
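
## A minimal end-to-end sketch of how these helpers fit together, assuming
## hypothetical directories and file stems; the real driver script lives
## elsewhere and this block only illustrates the intended call order.
if __name__ == '__main__':
	body_dir = '/data/distilled/body/'            ## hypothetical path
	stems = ['doc01', 'doc02', 'doc03', 'doc04']  ## hypothetical stems
	body_lkup = make_hash(body_dir, stems)
	train, test = split_crps(0.8, stems)
	vocab = build_vocab(body_lkup, body_dir, train)
	vocab = trim_vocab(vocab, 5, set())           ## cutoff=5, no stop words
	vocab_id = write_vocab_file('body.vocab', vocab)
	write_hbc_body(body_dir, body_lkup, train, 'body.hbc', vocab_id)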
