
## TOP

import os
import sys
from operator import itemgetter
from hbc_data_util import *
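## All of the helpers used below (make_hash, split_crps, build_vocab*, trim_vocab,
## count_*, write_*) come from hbc_data_util via the star import above.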

#### Change the settings below for each new round of runs.
BLOG_SITE = 'RS'
DEST_DIR = './data/'
AUX_DIR = './data/'

### trimming/pruning spec goes here
CUTOFF = 2   
SIZE_MIN = 5
CMNT_MIN = 1
Train_splt = 0.9

#### input filenames go here
Body_dir = '../../distilled/data/bodies_distiled/'
Cmnt_dir = '../../distilled/data/cmnts_distiled/'
Auth_dir = '../../distilled/data/auths_distiled/'
File_stems = './file_stem_list.txt'
Stop_list = './stop_list.txt'

#### output file stems go here.
Hbc_body_train_stem = '_body_train.txt'
Hbc_cmnt_tkn_train_stem = '_cmnt_tkn_train.txt'
Hbc_cmnt_uid_train_stem = '_cmnt_uid_train.txt'
Hbc_body_test_stem = '_body_test.txt'
Hbc_cmnt_tkn_test_stem = '_cmnt_tkn_test.txt'
Hbc_cmnt_uid_test_stem = '_cmnt_uid_test.txt'
GS_file = '_cmnt_rank.txt'


if __name__ == "__main__":

	## build a lookup table from file stem to file name for each directory:
	file_stem_list = []
	_fh = open(File_stems,'r')
	for i in _fh: file_stem_list.append(i.rstrip('\n'))
	_fh.close()
	body_lkup = make_hash( Body_dir, file_stem_list ) 
	cmnt_lkup = make_hash( Cmnt_dir, file_stem_list ) 
	auth_lkup = make_hash( Auth_dir, file_stem_list ) 
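	## Each lookup is presumably a dict keyed by file stem, with lkup[stem] giving
	## the matching file name in its directory for the read/count helpers below.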

	## read in the stoplist
	_fh = open(Stop_list, 'r')
	stop_words = _fh.read().split()
	_fh.close()

	### sanity check
	### the number of documents should match across the three directories.
	if len(body_lkup) != len(cmnt_lkup) or len(auth_lkup) != len(cmnt_lkup):
		print(" %d %d %d " % (len(body_lkup), len(cmnt_lkup), len(auth_lkup)))
		print("something is very wrong.")
		sys.exit()

	### sanity check
	### the number of comments in each file should match.

	### split the corpus into training and test sets.
	train,test = split_crps( Train_splt, file_stem_list )
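	## split_crps presumably partitions the file stems, returning roughly
	## Train_splt (here 90%) of them as the training list and the rest as the test list.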

	## 0. create the base vocabularies
	body_vocab = build_vocab( body_lkup, Body_dir, train )
	cmnt_vocab = build_vocab( cmnt_lkup, Cmnt_dir, train )
	auth_vocab = build_vocab_auth( auth_lkup, Auth_dir, train )
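	## Each *_vocab is presumably a dict mapping a token (or author id) to its
	## raw frequency over the training documents; trim_vocab and the
	## count_vocab_words* helpers below treat it that way.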
	
	## 1.
	## remove stop words and low-frequency words from the vocabularies:
	## count how often each word appears in the training set, then discard
	## words that are in the stoplist or whose freq < CUTOFF.
	print('size of body vocab, original: %d' % (len(body_vocab),))
	body_vocab = trim_vocab(body_vocab, CUTOFF, stop_words)
	print('after trimming: %d' % (len(body_vocab),))
	print('size of cmnt vocab, original: %d' % (len(cmnt_vocab),))
	cmnt_vocab = trim_vocab(cmnt_vocab, CUTOFF, stop_words)
	print('after trimming: %d' % (len(cmnt_vocab),))
	print('size of author vocab, original: %d' % (len(auth_vocab),))

	## 2.
	## discard posts whose bodies have too few words (according to the vocab)
	## discard posts with too few comments
	## do this for both training and test sets
	remove_this = []
	train_cmnts = {}
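	## train_cmnts maps each surviving file stem to the comment ids returned by
	## count_cmnt_words (presumably the comments that still have content under the
	## trimmed vocab), so the write_* helpers emit only those comments.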
	for f in train:
		if SIZE_MIN > count_words(body_lkup[f], Body_dir, body_vocab):
			remove_this.append(f)
			continue
		cmnt_c, word_c, cmnt_ids= \
			count_cmnt_words(cmnt_lkup[f], Cmnt_dir, cmnt_vocab)
		if CMNT_MIN > cmnt_c: 
			remove_this.append(f)
			continue
		if SIZE_MIN > word_c: 
			remove_this.append(f)
			continue
		train_cmnts[f] = cmnt_ids 
	print('total number of files in training before cleanup: %d' % (len(train),))
	for f in remove_this:
		train.remove(f)
		if f in train_cmnts: trash = train_cmnts.pop(f)
	print('total number of files in training after cleanup: %d' % (len(train),))
		

	## 3. one more round of vocab clean-up: drop entries whose training-set
	## frequency fell to zero because posts were removed in step 2.
	body_vocab_freq = count_vocab_words( body_vocab,Body_dir,body_lkup,train) 
	for c in body_vocab_freq.keys():
		if body_vocab_freq[c] == 0:
			trash = body_vocab.pop(c)
	cmnt_vocab_freq = \
		count_vocab_words_cmnt(cmnt_vocab,Cmnt_dir,cmnt_lkup,train, train_cmnts) 
	for c in cmnt_vocab_freq.keys():
		if cmnt_vocab_freq[c] == 0:
			trash = cmnt_vocab.pop(c)
	auth_vocab_freq = \
		count_vocab_words_auth(auth_vocab,Auth_dir,auth_lkup,train, train_cmnts) 
	for a in auth_vocab_freq.keys():
		if auth_vocab_freq[a] == 0:
			trash = auth_vocab.pop(a)

	## write out the training file lists...
	write_file_list(AUX_DIR +'body_train_file.txt', Body_dir, body_lkup, train)
	write_file_list(AUX_DIR +'cmnt_train_file.txt', Cmnt_dir, cmnt_lkup, train)
	write_file_list(AUX_DIR +'auth_train_file.txt', Auth_dir, auth_lkup, train)
	write_file_list_ids(AUX_DIR+'cmnt_train_id.txt',Cmnt_dir,cmnt_lkup,train, \
		train_cmnts, Auth_dir, auth_lkup)

	## write the vocab for each:
	body_vocab_id = write_vocab_file(DEST_DIR + BLOG_SITE + '_body_vocab.txt',\
		 body_vocab)
	cmnt_vocab_id = write_vocab_file(DEST_DIR + BLOG_SITE + '_cmnt_vocab.txt', \
		cmnt_vocab)
	auth_vocab_id = write_vocab_file(DEST_DIR + BLOG_SITE + '_auth_vocab.txt', \
		auth_vocab)
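	## write_vocab_file presumably writes one vocab entry per line and returns a
	## dict mapping each entry to its integer id, used when writing the corpora below.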

	## write the training corpus:
	out_file = DEST_DIR +  BLOG_SITE + Hbc_body_train_stem
	write_hbc_body(Body_dir, body_lkup, train, out_file, body_vocab_id) 
	cmnt_out_file = DEST_DIR +  BLOG_SITE + Hbc_cmnt_tkn_train_stem
	auth_out_file = DEST_DIR +  BLOG_SITE + Hbc_cmnt_uid_train_stem
	write_hbc_cmnt(Cmnt_dir, cmnt_lkup, train, cmnt_out_file, cmnt_vocab_id, \
		Auth_dir, auth_lkup, auth_out_file, auth_vocab_id, train_cmnts)
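	## write_hbc_cmnt presumably writes two parallel files: the comment tokens
	## (via cmnt_vocab_id) and the matching commenter/user ids (via auth_vocab_id),
	## restricted to the comment ids recorded in train_cmnts.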


	## Build the test part of the data.
	## Also write the GS -- the list of users who commented on each post.
	##
	## We DO remove files that end up with ZERO length under the
	## vocabulary built from the training data.
	## We considered padding with an 'unknown' word, but this way is clearer.
	## We do NOT remove unknown users, though: an unknown user becomes
	## 'NEW_USER' = '000', which is different from "UNRECOGNIZED_STRING".
	##
	remove_this = []
	test_cmnts = {}
	for f in test:
		if SIZE_MIN > count_words(body_lkup[f], Body_dir, body_vocab):
			remove_this.append(f)
			continue
		cmnt_c, word_c, cmnt_ids= \
			count_cmnt_words(cmnt_lkup[f], Cmnt_dir, cmnt_vocab)
		if CMNT_MIN > cmnt_c: 
			remove_this.append(f)
			continue
		if SIZE_MIN > word_c: 
			remove_this.append(f)
			continue
		test_cmnts[f] = cmnt_ids 
	print('total number of files in test before cleanup: %d' % (len(test),))
	for f in remove_this:
		test.remove(f)
		if f in test_cmnts: trash = test_cmnts.pop(f)
	print('total number of files in test after cleanup: %d' % (len(test),))
		
	## write out the test file lists...
	write_file_list(AUX_DIR+'body_test_file.txt', Body_dir, body_lkup, test)
	write_file_list(AUX_DIR+'cmnt_test_file.txt', Cmnt_dir, cmnt_lkup, test)
	write_file_list(AUX_DIR+'auth_test_file.txt', Auth_dir, auth_lkup, test)

	### write the test data	
	out_file = DEST_DIR +  BLOG_SITE + Hbc_body_test_stem
	write_hbc_body(Body_dir, body_lkup, test, out_file, body_vocab_id) 
	out_file_tkn = DEST_DIR +  BLOG_SITE + Hbc_cmnt_tkn_test_stem
	out_file_uid = DEST_DIR +  BLOG_SITE + Hbc_cmnt_uid_test_stem
	write_hbc_cmnt(Cmnt_dir, cmnt_lkup, test, out_file_tkn, cmnt_vocab_id, \
		Auth_dir, auth_lkup, out_file_uid, auth_vocab_id, test_cmnts)

	### write the GS files.
	### one user for each comment, including the unknown users.
	_out = open( AUX_DIR + BLOG_SITE + GS_file, 'w')
	_fh = open( out_file_uid, 'r')
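	## For each line of the uid file, drop consecutive repeats of the same uid
	## (a user who posted several comments in a row is kept once per run) and
	## write the remaining uids as that post's GS line.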
	for line in _fh:
		output_string = ''
		uid_list =  line.split()
		for i in range(len(uid_list)):
			if i == 0: 
				output_string += '%s ' % uid_list[i]
			elif uid_list[i] != uid_list[i-1]:
				output_string += '%s ' % uid_list[i]
			else:
				pass
		_out.write( output_string )
		_out.write( '\n' )
	_fh.close() 
	_out.close() 

