saui_pr4.emailscraper

1 #!/usr/bin/env python 2 3 # Copyright (c) 2007 Carnegie Mellon University. 4 # 5 # You may modify and redistribute this file under the same terms as 6 # the CMU Sphinx system. See 7 # http://cmusphinx.sourceforge.net/html/LICENSE for more information. 8 # 9 # Briefly, don't remove the copyright. Otherwise, do what you like. 10 11 """ 12 Scrape an mbox-format mailbox for e-mail body text and output data 13 suitable for training a language model. 14 15 This is part of project 4 in 05-631 Software Architecture for User 16 Interfaces, Fall 2007. 17 """ 18 19 __author__ = "David Huggins-Daines <dhuggins@cs.cmu.edu>" 20 21 import mailbox 22 import email 23 import quopri 24 import sys 25 import re 26

def usage(stream=None):
    """Show usage of this module.

    stream -- optional writable text stream that receives the usage
              line.  When omitted (the historical behaviour) the
              message goes to sys.stderr, which is looked up at call
              time so that a redirected stderr is honoured.
    """
    if stream is None:
        stream = sys.stderr
    stream.write("Usage: emailscraper.py MBOX [MBOX ...]\n")

# When run as a script without any mailbox arguments, print the usage
# line and stop.  The __name__ test keeps this from firing (and exiting
# the interpreter) when the file is merely imported as a module.
if __name__ == '__main__' and len(sys.argv) == 1:
    usage()
    sys.exit(0)

def normalize_subject(msg):
    """Print the subject header of an e-mail message on its own line.

    msg -- a message object supporting header lookup by name
           (msg['Subject']).

    Nothing is printed when the message has no Subject header, so the
    literal text "None" never ends up in the output.
    """
    # We can generally leave these untouched, because Flite is
    # going to do most of the normalization for us.
    subject = msg['Subject']
    if subject is None:
        return
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(subject)

40

def normalize_addrs(msg):
    """Print the display names found in the To and Cc headers.

    msg -- a message object providing get_all(header_name).

    Each non-empty real name is printed on its own line.  Addresses
    without a real name are skipped, and a name wrapped in single
    quotes ('Like This') is printed without the quotes.
    """
    # Imported explicitly: a bare "import email" does not guarantee that
    # the email.utils submodule has been loaded.
    from email.utils import getaddresses

    for field in ('to', 'cc'):
        addrs = msg.get_all(field)
        if not addrs:
            continue
        for realname, _address in getaddresses(addrs):
            # Strip one pair of surrounding single quotes; the length
            # check keeps a lone "'" from being treated as a pair.
            if (len(realname) >= 2
                    and realname[0] == "'" and realname[-1] == "'"):
                realname = realname[1:-1]
            if realname == '':
                continue
            print(realname)

# Patterns applied to message bodies by normalize_body().

# Leading quoted-text markers ("> ", "> > ", ...).  Only spaces and tabs
# are allowed around the markers: with re.MULTILINE, \s would also match
# newlines and swallow the blank lines that separate paragraphs.
quotre = re.compile(r'^([ \t]*>)+[ \t]*', re.MULTILINE)
# An "Original Message" banner and everything after it.
origre = re.compile(r'^\s*-+\s*Original\s+Message\s*-+.*',
                    re.MULTILINE | re.DOTALL)
# A CVS commit notice ("Update of /path...") and everything after it.
cvsre = re.compile(r'^Update of /.*', re.MULTILINE | re.DOTALL)
# A shell archive/script, from its "#!/" line to the end of the text.
sharre = re.compile(r'^#!/.*$.*', re.MULTILINE | re.DOTALL)
# An inlined MS Word or HTML MIME part, from its Content-Type line on.
msword = re.compile(r'^Content-Type: (application/msword|text/html).*',
                    re.MULTILINE | re.DOTALL)
# Runs of five or more separator punctuation characters.
sepre = re.compile(r'[-.#_*=~^]{5,}\s*', re.DOTALL)
# List bullets at the start of a line.  As with quotre, only horizontal
# whitespace is matched so that a blank line before a bullet survives.
listre = re.compile(r'^[ \t]*[-+*]+[ \t]+', re.MULTILINE)

def normalize_body(msg):
    """Print the cleaned-up body text of one message part.

    msg -- a non-multipart message part (the caller passes text/plain
           parts) providing get(), get_payload() and
           get_content_charset().

    The payload is stripped of quoting markers, shell archives, inlined
    Word/HTML parts, CVS commit notices, "Original Message" trailers,
    list bullets and separator lines.  What remains is split on blank
    lines; each non-empty paragraph is printed as a single line followed
    by an empty line.  Base64-encoded and multipart parts are skipped
    and produce no output.
    """
    if msg.is_multipart():
        # The payload is a list of sub-parts, not text.
        return
    # Transfer-encoding names are case-insensitive, so compare lowered.
    encoding = str(msg.get('Content-Transfer-Encoding', '')).strip().lower()
    if encoding == 'base64':
        return
    if encoding == 'quoted-printable':
        # Let the email package undo the quoted-printable encoding.
        txt = msg.get_payload(decode=True)
    else:
        txt = msg.get_payload()
    if txt is None:
        return
    if isinstance(txt, bytes) and not isinstance(txt, str):
        # Python 3 returns bytes for a decoded payload; the regexes below
        # are text patterns.  (On Python 2 bytes is str: nothing to do.)
        charset = msg.get_content_charset() or 'utf-8'
        try:
            txt = txt.decode(charset, 'replace')
        except LookupError:
            # Unknown charset label in the message.
            txt = txt.decode('utf-8', 'replace')
    # Remove quoted-text markers
    txt = quotre.sub("", txt)
    # Remove shell archives
    txt = sharre.sub("", txt)
    # Remove MS Word documents that mistakenly got quoted by the
    # sending e-mail program
    txt = msword.sub("", txt)
    # Remove obvious CVS commit messages
    txt = cvsre.sub("", txt)
    # Remove blocks of "Original Message" headers
    txt = origre.sub("\n", txt)
    # Remove list markers
    txt = listre.sub("", txt)
    # Remove long lines of punctuation
    txt = sepre.sub("", txt)
    # Remove remnants of quoted-printable
    txt = txt.replace("=20", " ")
    # Now separate it into "paragraphs"
    for par in txt.split("\n\n"):
        par = par.replace("\n", " ").strip()
        if par == "":
            continue
        print(par)
        # print("") emits one empty line on both Python 2 and Python 3.
        print("")

93

def scrape_mbox(filename):
    """Scrape usable text out of a mailbox.

    filename -- path of an existing mbox-format file.

    For every message the subject, the To/Cc display names and the
    normalized text/plain body parts are printed, with an empty line
    after the subject and after the names.

    Raises mailbox.NoSuchMailboxError when filename does not exist.
    """
    # create=False: the default would silently create an empty mailbox
    # file for a mistyped path.
    mbox = mailbox.mbox(filename, create=False)
    try:
        for msg in mbox:
            normalize_subject(msg)
            # print("") emits one empty line on both Python 2 and 3.
            print("")
            normalize_addrs(msg)
            print("")
            for part in msg.walk():
                if part.get_content_type() == 'text/plain':
                    normalize_body(part)
    finally:
        # Release the underlying file handle even if scraping fails.
        mbox.close()

# Script entry point: every command-line argument names an mbox file,
# and the files are scraped in the order given.
if __name__ == '__main__':
    for mbox_path in sys.argv[1:]:
        scrape_mbox(mbox_path)

Source Code for Module saui_pr4.emailscraper