
## This script makes a uid corpus that does not include comment-volume info:
## each uid is counted only once per post.
## Update: added grouping of tokens by user id -- easier that way.
## Needed in order to run the binary cmnt-LDA.

def make_ALT2(file1, file2, file3, file4, unknowns=0):
	"""Group each post's comment tokens by user id.

	Reads two parallel files: ``file1`` holds one whitespace-separated line
	of user ids per post, and ``file2`` holds the matching line of tokens
	(one token per uid entry -- the lines are assumed equal length).

	For every post, tokens are grouped by uid, with uids kept in order of
	first appearance.  Writes:
	  - ``file4``: one line per post, tab-separated fields, each field the
	    space-joined tokens of one uid;
	  - ``file3``: one line per post, the space-separated uid list.

	If ``unknowns == 1``, the placeholder uid ``'000'`` and its tokens are
	dropped from the output.
	"""
	# context managers guarantee the handles close even on error
	with open(file1, 'r') as f_uid:
		uid_lines = f_uid.readlines()
	with open(file2, 'r') as f_tkn:
		tkn_lines = f_tkn.readlines()

	with open(file3, 'w') as out_uid, open(file4, 'w') as out_tkn:
		# the two files are parallel: line i of each describes post i
		for uid_row, tkn_row in zip(uid_lines, tkn_lines):
			uid_items = uid_row.split()
			tkn_items = tkn_row.split()
			# group tokens by uid; 'order' preserves first-appearance order
			groups = {}
			order = []
			for uid, tkn in zip(uid_items, tkn_items):
				if uid not in groups:
					groups[uid] = []
					order.append(uid)
				groups[uid].append(tkn)
			if unknowns == 1:
				# drop the unknown-user placeholder and its tokens
				order = [uid for uid in order if uid != '000']
			# lists + join avoid the quadratic cost of repeated string +=
			out_tkn.write('\t'.join(' '.join(groups[uid]) for uid in order))
			out_tkn.write('\n')
			out_uid.write(' '.join(order))
			out_uid.write('\n')
	

if __name__ == "__main__":
	# Rewrite the training uid/token corpus into the grouped ALT2 layout.
	make_ALT2(
		'data/RS_cmnt_uid_train.txt',
		'data/RS_cmnt_tkn_train.txt',
		'data/RS_cmnt_uid_train_alt2.txt',
		'data/RS_cmnt_tkn_train_alt2.txt',
	)
