## This function inserts comment breaks into the comment-token (cmnt tkn) text,
## so that it can be run as 3-d data with HBC.

def make_ALT_files( uid_file_name, tkn_file_name, out_file_name):
	"""Write the token corpus with a tab inserted at every uid change.

	Line i of the uid file and line i of the token file are split on
	whitespace and paired position by position. Tokens whose uids form a run
	of equal consecutive values are joined with single spaces; successive
	runs are separated by a tab. One output line is written for every line
	of the uid file. Surplus lines in the token file, and surplus uids on a
	line, are ignored.

	Args:
		uid_file_name: path of the uid file (read).
		tkn_file_name: path of the token file (read).
		out_file_name: path of the output file (created or overwritten).

	Raises:
		IndexError: if the token file has fewer lines than the uid file, or
			if a token line has more items than the matching uid line.
	"""
	from itertools import groupby

	# The with-statement closes all three files even when a line raises.
	with open(uid_file_name, 'r') as uid_fh, \
			open(tkn_file_name, 'r') as tkn_fh, \
			open(out_file_name, 'w') as out_fh:
		for uid_line in uid_fh:
			# readline() returns '' only at end of file; a blank line is '\n'.
			tkn_line = tkn_fh.readline()
			if not tkn_line:
				raise IndexError('token file has fewer lines than uid file')

			uid_items = uid_line.split()
			tkn_items = tkn_line.split()
			if len(uid_items) < len(tkn_items):
				raise IndexError('token line has more items than its uid line')

			# zip stops at the token count, so surplus uids are ignored.
			pairs = zip(uid_items, tkn_items)
			# Each run of equal consecutive uids becomes one space-joined group.
			runs = groupby(pairs, key=lambda pair: pair[0])
			groups = [' '.join(tkn for _, tkn in run) for _, run in runs]
			out_fh.write('\t'.join(groups))
			out_fh.write('\n')


## This function makes a uid corpus that does not include the comment volume info:
## each uid is counted only once per comment.

def make_ALT_UID_files( uid_file_name, out_file_name): 
	"""Write the uid corpus with consecutive duplicate uids collapsed.

	Each input line is split on whitespace and every run of equal
	consecutive uids is reduced to a single uid. A uid that reappears after
	a different uid is kept again (``1 1 2 1`` becomes ``1 2 1``). The
	surviving uids are written space-separated, one output line per input
	line.

	Args:
		uid_file_name: path of the uid file (read).
		out_file_name: path of the output file (created or overwritten).
	"""
	from itertools import groupby

	# The with-statement closes both files even when a line raises.
	with open(uid_file_name, 'r') as uid_fh, open(out_file_name, 'w') as out_fh:
		for line in uid_fh:
			# groupby without a key yields one key per run of equal items.
			collapsed = [uid for uid, _ in groupby(line.split())]
			out_fh.write(' '.join(collapsed))
			out_fh.write('\n')


if __name__ == "__main__":
	# Both outputs are derived from the same training uid file.
	train_uid_path = 'data/RS_cmnt_uid_train.txt'
	train_tkn_path = 'data/RS_cmnt_tkn_train.txt'

	# Token corpus with comment breaks inserted.
	make_ALT_files(
		train_uid_path,
		train_tkn_path,
		'data/RS_cmnt_tkn_train_alt.txt',
	)

	# uid corpus with each uid listed once per comment.
	make_ALT_UID_files(
		train_uid_path,
		'data/RS_cmnt_uid_train_alt.txt',
	)
