"""Update of the distill code I wrote 3 years ago.

	# replace special characters with white space... for now 
	prog_13 = re.compile('&\w+?;')
	result = prog_13.sub(' ', result) 
	prog_14 = re.compile('&#\w+?;')
	result = prog_14.sub(' ', result) 
	# replace $numbers into _META_DOLLOR_REF_
	prog_11 = re.compile('(\$\d+)')
	result = prog_11.sub('_META_DOLLOR_REF_', result) 
	# replace %number into _META_PERCENT_REF_
	prog_12 = re.compile('(\d+%)')
	result = prog_12.sub('_META_PERCENT_REF_', result) 
	# replace numbers into _META_NUMBER_REF_
	prog_2 = re.compile('(\d+)')
	result = prog_2.sub('_META_NUMBER_REF_', result) 
	# replace the end of sentence 
	prog_3 = re.compile('(\.\s+)')
	result = prog_3.sub(' _META_END_DOT_ ', result) 
	# replace the end of sentence 
	prog_4 = re.compile('(!\s+)')
	result = prog_4.sub(' _META_END_EXCLAMATION_ ', result) 
	# replace the end of sentence 
	prog_3 = re.compile('(\?\s+)')
	result = prog_3.sub(' _META_END_QUESTION_ ', result) 
	# replace the "beep" into _META_BEEP_REF_
	# this substitution should be the LAST ONE
	prog_4 = re.compile('([!@#$%^&*?+~<>:;]{2,})')
	result = prog_4.sub('_META_BEEP_REF_', result) 
	# tokenize the string, then put the tokens back together.
	# we do it this way so that we have better control
	# set unicode flag?
	# get the "run of alpha numeric" now
	#m = re.findall('\?',sentence)
	#for tolken in m:
	#	print tolken
	tolkens = re.split('\W+', result)  
	result = ""
	for tolken in tolkens:
		result = result + " " + tolken.lower()
		if tolken == '_META_END_DOT_':
			result = result + "\n\n"
		elif tolken == '_META_END_QUESTION_':
			result = result + "\n\n"
		elif tolken == '_META_END_EXCLAMATION_':
			result = result + "\n\n"
"""

# Module metadata. The "$Revision: ... $" / "$Date: ... $" values look like
# version-control keyword stamps -- presumably maintained by the VCS; confirm
# before editing them by hand.
__author__ = "Tae Yano"
__version__ = "$Revision: 1. $"
__date__ = "$Date: 2011/3/12 $"
__license__ = "Python"

import os
import sys 
import re
import glob
import html_clean

def cln_sentence(sentence):
	"""Return ``sentence`` after passing it through ``html_clean.strip_html``.

	All clean-up is delegated to the ``html_clean`` module. An earlier,
	now-retired version of this function removed XML tags and ``http://``
	URL references with regular expressions directly in this body.

	NOTE: if substitutions are ever reintroduced here, the order in which
	they are applied matters.

	Args:
		sentence: the raw text to clean.

	Returns:
		Whatever ``html_clean.strip_html`` returns for ``sentence``.
	"""
	return html_clean.strip_html(sentence)

### DONT TOUCH anything up from here...


def process_body(inputfile, outputfile):
	"""Clean the text of one body file and write it to ``outputfile``.

	Every line of ``inputfile`` that is not exactly a bare newline has its
	trailing newline removed and is appended, preceded by a single space, to
	one long string. That string is passed through ``cln_sentence`` and the
	result is written to ``outputfile`` followed by an empty line.

	Args:
		inputfile: path of the file to read.
		outputfile: path of the file to write; it is created or truncated.

	Raises:
		IOError/OSError: if either file cannot be opened, read or written.
	"""
	# Read and clean first, and only then open the output file, so that a
	# failure while reading or cleaning does not leave a truncated output
	# file behind. The ``with`` blocks close the handles even on error.
	with open(inputfile, 'r') as input_fh:
		# Only lines that are exactly "\n" count as empty and are skipped;
		# each kept line is prefixed with a space, so the joined text starts
		# with a space whenever at least one line was kept.
		pieces = [' ' + line.rstrip('\n') for line in input_fh if line != "\n"]
	the_sentence = ''.join(pieces)

	the_cln_sentence = cln_sentence(the_sentence)

	with open(outputfile, 'w') as output_fh:
		output_fh.write("%s\n\n" % (the_cln_sentence,))


if __name__ == "__main__":
	# Script entry point: clean every file found in body_dir and write the
	# result under body_distil_dir, printing each output path as it goes.
	body_dir = './bodies/'
	body_distil_dir = './bodies_cln/'
	# Make sure the output directory exists; otherwise opening the first
	# output file for writing would fail.
	if not os.path.isdir(body_distil_dir):
		os.makedirs(body_distil_dir)
	# Build the glob pattern from body_dir so the directory is named in
	# exactly one place (this yields './bodies/*').
	flist = glob.glob(os.path.join(body_dir, '*'))
	for body_file in flist:
		# Derive the output path: swap the file suffix, then the directory.
		# A file without the '.html_body.xml' suffix keeps its name.
		body_distil_file = body_file.replace('.html_body.xml','_body_cln.txt')
		body_distil_file = body_distil_file.replace(body_dir,body_distil_dir)
		# Parenthesised single-argument print behaves the same on
		# Python 2 and Python 3.
		print(body_distil_file)
		process_body(body_file, body_distil_file)
