## This script produces a few files in preparation for hbc:
## body_vocab.txt --- words appearing in the training bodies, ids starting from 1
## cmnt_vocab.txt --- words appearing in the training comments, ids starting from 1
## auth_vocab.txt --- user ids of the commenting authors, ids starting from 1
## *_train_file.txt, body_test_file.txt --- lists of the files used for training and testing
## plus the hbc training and test corpora under DEST_DIR
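##
## each line of the hbc corpus files is one document, written as the
## space-separated vocab ids of its in-vocab tokens, e.g. "12 7 431 7 98"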

import os
import sys
from operator import itemgetter


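## write the hbc training corpus for bodies: one output line per training
## document, holding the vocab ids of every in-vocab token; documents with
## no in-vocab tokens are reported on stdout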
def write_hbc_body(body_dir, body_lkup, train, out_file, vocab_id): 
	_out = open( out_file, 'w')
	for f in train:
		file = body_dir + body_lkup[f]
		_fh = open(file, 'r')
		doc = ''
		for line in _fh:
			if line == '\n': continue
			items = line.split()
			for i in items:
				if i in vocab_id: doc += '%d ' % (vocab_id[i],)
		if len(doc) == 0: print file
		_out.write(doc.rstrip(' '))
		_out.write('\n')
		_fh.close()
	_out.close()

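## write two parallel hbc files for the comments: one with the cmnt vocab ids
## of the comment tokens, and one with the author (uid) vocab id repeated once
## per in-vocab token of that author's comment section, so the two files line
## up token-for-token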
def write_hbc_cmnt(cmnt_dir, cmnt_lkup, train, cmnt_out_file, cmnt_vocab_id,\
	auth_dir, auth_lkup, auth_out_file, auth_vocab_id):

	_out = open( cmnt_out_file, 'w')
	_out2 = open( auth_out_file, 'w')
	for f in train:
		_fh = open(cmnt_dir + cmnt_lkup[f], 'r')
		doc = ''
		sect_size = {}
		current = ''
		for line in _fh:
			if line == '\n':
				continue
			line = line.rstrip('\n')
			if '<COMMENT_DATA_SEC_' in line:
				sect_size[line] = 0
				current = line
				continue
			items = line.split()
			for i in items:
				if i in cmnt_vocab_id:
					doc += '%d ' % (cmnt_vocab_id[i],)
					sect_size[current] += 1
		if len(doc) == 0: print cmnt_dir + cmnt_lkup[f]
		_out.write(doc.rstrip(' '))
		_out.write('\n')
		_fh.close()
		_fh = open(auth_dir + auth_lkup[f], 'r')
		doc = ''
		current = ''
		for line in _fh:
			if line == '\n': continue
			line = line.rstrip('\n')
			if '<COMMENT_DATA_SEC_' in line:
				current = line
			if line in auth_vocab_id:
				for i in range(sect_size[current]): 
					doc += '%d ' % (auth_vocab_id[line],)
		if len(doc) == 0: print auth_dir + auth_lkup[f]
		_out2.write(doc.rstrip(' '))
		_out2.write('\n')
		_fh.close()
	_out.close()
	_out2.close()


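## write the vocab as "id word" lines, with ids 1..N assigned over the words
## in sorted order; returns a word -> id lookup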
def write_vocab_file(vocab_file, vocab):
	word_id = 1
	lkup = {}
	words = sorted(vocab.keys())
	_fh = open(vocab_file, 'w')
	for w in words:
		_fh.write('%d %s\n' % (word_id, w))
		lkup[w] = word_id
		word_id += 1
	_fh.close()
	return lkup


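## write the path (dir + filename) of every training document, one per line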
def write_file_list(outfile, body_dir, body_lkup, train):
	_out = open(outfile, 'w')
	for f in train:
		file = body_dir + body_lkup[f]
		_out.write('%s\n' % (file, ))
	_out.close()


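## count comment sections and in-vocab words in a comment file; the section
## count is decremented (never below zero) for every non-empty line that
## contains no in-vocab word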
def count_cmnt_words(fname, dirname, vocab):
	word_c = 0; cmnt_c = 0
	_fh = open(dirname + fname, 'r')
	for line in _fh:
		if line.startswith('<COMMENT_DATA_SEC_'):
			cmnt_c += 1
			continue
		count = 0
		items = line.split()
		if len(items) == 0: continue
		for item in items:
			if item in vocab: count += 1
		if count == 0 and cmnt_c != 0:
			cmnt_c -= 1
		word_c += count
	_fh.close()
	return cmnt_c, word_c


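## count how many tokens of the file are in the given vocab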
def count_words(fname, dirname, vocab):
	count = 0
	_fh = open(dirname + fname, 'r')
	for line in _fh:
		for item in line.split():
			if item in vocab: count += 1
	_fh.close()
	return count


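## map each file stem to the directory-listing filename that contains it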
def make_hash( f_list, file_stem_list ):
	file_lkup = {}
	for f in f_list:
		for file_stem in file_stem_list:
			if file_stem in f: 
				file_lkup[file_stem] = f
	return file_lkup


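## split the file-stem list: the first splt fraction is training, the rest is test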
def split_crps( splt, file_stem_list ):
	train_range = int(splt*len(file_stem_list))	
	train = file_stem_list[:train_range]  ## this would split the files
	test = file_stem_list[train_range:]
	return train, test


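## build a token -> frequency dict over the training files (used for both
## bodies and comments), skipping the comment-section markers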
def	build_vocab( body_lkup, body_dir, train ):
	vocab = {}
	for file_stem in body_lkup.keys():
		if file_stem in train:
			_tmp = open( body_dir + body_lkup[file_stem], 'r' )
			for line in _tmp:
				## only needed for the cmnt files, but harmless for the bodies
				if line.startswith('<COMMENT_DATA_SEC_'): continue
				items = line.split()
				for item in items:
					if item not in vocab: vocab[item] = 1
					else: vocab[item] += 1 
			_tmp.close()
	return vocab

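## build an author id -> frequency dict over the training auth files; each
## non-empty, non-marker line is treated as one author id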
def	build_vocab_auth( auth_lkup, auth_dir, train ):
	vocab = {}
	for file_stem in auth_lkup.keys():
		if file_stem in train:
			_tmp = open( auth_dir + auth_lkup[file_stem], 'r' )
			for line in _tmp:
				## skip the comment-section markers
				if line.startswith('<COMMENT_DATA_SEC_'): continue
				line = line.rstrip('\n')
				wn = len(line.split())
				if wn > 0: 
					if line not in vocab: vocab[line] = 1
					else: vocab[line] += 1 
			_tmp.close()
	return vocab

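## drop stop words, tokens starting with '_' or containing '_meta_', and
## words with frequency <= CUTOFF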
def trim_vocab( vocab, CUTOFF, stop_words):
	for i in sorted( vocab.keys()):
		if i in stop_words: vocab.pop(i)
		elif i.startswith('_'): vocab.pop(i)
		elif '_meta_' in i: vocab.pop(i)
		elif vocab[i] <= CUTOFF: vocab.pop(i)
	return vocab


if __name__ == "__main__":

	### specification goes here
	CUTOFF = 2
	SIZE_MIN = 5
	CMNT_MIN = 1
	body_dir = '../../distilled/data/bodies_distiled/'
	cmnt_dir = '../../distilled/data/cmnts_distiled/'
	auth_dir = '../../distilled/data/auths_distiled/'
	body_list = os.listdir(body_dir)
	cmnt_list = os.listdir(cmnt_dir)
	auth_list = os.listdir(auth_dir)
	train_splt = 0.9
	BLOG_SITE = 'DK'
	DEST_DIR = './data/'
	hbc_body_train_stem = '_body_train.txt'
	hbc_body_test_stem = '_body_test.txt'
	hbc_cmnt_tkn_train_stem = '_cmnt_tkn_train.txt'
	hbc_cmnt_uid_train_stem = '_cmnt_uid_train.txt'

	file_stem_list = []
	_fh = open('./file_stem_list.txt','r')
	for i in _fh: file_stem_list.append(i.rstrip('\n'))
	_fh.close()

	_fh = open('stop_list.txt', 'r')
	line = _fh.read()
	_fh.close()
	stop_words = line.split()

	### sanity check
	if len(body_list) != len(cmnt_list) or len(auth_list) != len(cmnt_list):
		print "body, cmnt, and auth directories do not contain the same number of files."
		sys.exit(1)

	train,test = split_crps( train_splt, file_stem_list )

	#make hash file of file name to file stem:
	body_lkup = make_hash( body_list, file_stem_list ) 
	cmnt_lkup = make_hash( cmnt_list, file_stem_list ) 
	auth_lkup = make_hash( auth_list, file_stem_list ) 

	## 0.
	## create the base vocabulary
	body_vocab = build_vocab( body_lkup, body_dir, train )
	cmnt_vocab = build_vocab( cmnt_lkup, cmnt_dir, train )
	auth_vocab = build_vocab_auth( auth_lkup, auth_dir, train )
	#print len(auth_vocab)
	#print sorted( auth_vocab.items(), key=itemgetter(1))
	
	## 1.
	## remove stop words and low-frequency words from the vocab:
	## check the frequency of words appearing in the training set;
	## discard words in the stop list and words with freq <= CUTOFF

	print('size of body vocab, original: %d' % (len(body_vocab),) )
	body_vocab = trim_vocab(body_vocab, CUTOFF, stop_words)
	print('after trimming: %d' % (len(body_vocab),) )

	print('size of cmnt vocab, original: %d' % (len(cmnt_vocab),) )
	cmnt_vocab = trim_vocab(cmnt_vocab, CUTOFF, stop_words)
	print('after trimming: %d' % (len(cmnt_vocab),) )

	## 2.
	## discard posts whose body has too few in-vocab words
	## discard posts with too few comments or too few comment words
	## (the test set is only filtered for empty bodies, when the test corpus is written out)
	remove_this = []
	for f in train:
		if SIZE_MIN > count_words(body_lkup[f], body_dir, body_vocab):
			remove_this.append(f)
			continue
		cmnt_c, word_c = count_cmnt_words(cmnt_lkup[f], cmnt_dir, cmnt_vocab)
		if CMNT_MIN > cmnt_c: 
			remove_this.append(f)
			continue
		if SIZE_MIN > word_c: 
			remove_this.append(f)
			continue
	print('total number of files in training before cleanup: %d' % (len(train),) )
	for f in remove_this:
		#print f
		train.remove(f)
	print('total number of files in training after cleanup: %d' % (len(train),) )
		
	## 3. do one more round of sanity check on vocab:
	body_vocab_freq = {}
	for i in body_vocab.keys(): body_vocab_freq[i] = 0
	for f in train:
		_fh = open( body_dir +body_lkup[f], 'r')
		for line in _fh:
			for item in line.split():
				if item in body_vocab: body_vocab_freq[item] += 1
		_fh.close()
	for c in body_vocab_freq.keys():
		if body_vocab_freq[c] == 0:
			## drop words that no longer occur in training after the cleanup above
			body_vocab.pop(c)
			#print c
	## 3. do one more round of sanity check on vocab -- for comment:
	cmnt_vocab_freq = {}
	for i in cmnt_vocab.keys(): cmnt_vocab_freq[i] = 0
	for f in train:
		_fh = open( cmnt_dir +cmnt_lkup[f], 'r')
		for line in _fh:
			for item in line.split():
				if item in cmnt_vocab: cmnt_vocab_freq[item] += 1
		_fh.close()
	for c in cmnt_vocab_freq.keys():
		if cmnt_vocab_freq[c] == 0:
			## drop words that no longer occur in training after the cleanup above
			cmnt_vocab.pop(c)
			#print c
	## 3. do one more round of sanity check on vocab -- for authors:
	auth_vocab_freq = {}
	for i in auth_vocab.keys(): auth_vocab_freq[i] = 0
	for f in train:
		_fh = open( auth_dir + auth_lkup[f], 'r')
		for line in _fh:
			wn = len( line.split())
			if wn > 0: 
				line = line.rstrip('\n')
				if line in auth_vocab: auth_vocab_freq[line] += 1 
		_fh.close()
	for a in auth_vocab_freq.keys():
		if auth_vocab_freq[a] == 0:
			## drop authors that no longer occur in training after the cleanup above
			auth_vocab.pop(a)
			#print a

	## write out all of the output files...
	write_file_list('body_train_file.txt', body_dir, body_lkup, train)
	write_file_list('cmnt_train_file.txt', cmnt_dir, cmnt_lkup, train)
	write_file_list('auth_train_file.txt', auth_dir, auth_lkup, train)

	## write the vocab for each:
	body_vocab_id = write_vocab_file('body_vocab.txt', body_vocab)
	cmnt_vocab_id = write_vocab_file('cmnt_vocab.txt', cmnt_vocab)
	auth_vocab_id = write_vocab_file('auth_vocab.txt', auth_vocab)

	## write the training corpus:
	## (comment this section out if you only need the stats above)
	out_file = DEST_DIR +  BLOG_SITE + hbc_body_train_stem
	write_hbc_body(body_dir, body_lkup, train, out_file, body_vocab_id) 

	cmnt_out_file = DEST_DIR +  BLOG_SITE + hbc_cmnt_tkn_train_stem
	auth_out_file = DEST_DIR +  BLOG_SITE + hbc_cmnt_uid_train_stem
	write_hbc_cmnt(cmnt_dir, cmnt_lkup, train, cmnt_out_file, cmnt_vocab_id, \
		auth_dir, auth_lkup, auth_out_file, auth_vocab_id)

	## write the test file. 
	out_file = DEST_DIR +  BLOG_SITE + hbc_body_test_stem
	_out_b = open(out_file, 'w')
	_out_a = open(BLOG_SITE + '_cmnt_rank_alt.txt', 'w')
	_test_fh = open('body_test_file.txt', 'w')
	for f in test:
		bfile = body_dir + body_lkup[f]
		afile = auth_dir + auth_lkup[f]
		_fh_b = open(bfile, 'r')
		_fh_a = open(afile, 'r')

		doc_b = ''
		for line in _fh_b:
			if line == '\n': continue
			items = line.split()
			for i in items:
				if i in body_vocab_id: doc_b += '%d ' % (body_vocab_id[i],)
		_fh_b.close()

		if len(doc_b) == 0:
			print bfile
			continue
		_out_b.write(doc_b.rstrip(' '))
		_out_b.write('\n')
		_test_fh.write(bfile)
		_test_fh.write('\n')
		## count the uid
		doc_a = ''
		for line in _fh_a:
			if line == '\n':
				continue
			line = line.rstrip('\n')
			if '<COMMENT_DATA_SEC_' in line:
				continue
			if line in auth_vocab_id:
				doc_a += '%d ' % (auth_vocab_id[line],)
			else:
				doc_a += '000 '

		_out_a.write(doc_a.rstrip(' '))
		_out_a.write('\n')
	_out_b.close()
	_out_a.close()
	_test_fh.close()
