Package saui_pr4 :: Module emailscraper
[hide private]
[frames] | no frames]

Source Code for Module saui_pr4.emailscraper

  1  #!/usr/bin/env python 
  2   
  3  # Copyright (c) 2007 Carnegie Mellon University. 
  4  # 
  5  # You may modify and redistribute this file under the same terms as 
  6  # the CMU Sphinx system.  See 
  7  # http://cmusphinx.sourceforge.net/html/LICENSE for more information. 
  8  # 
  9  # Briefly, don't remove the copyright.  Otherwise, do what you like. 
 10   
 11  """ 
 12  Scrape an mbox-format mailbox for e-mail body text and output data 
 13  suitable for training a language model. 
 14   
 15  This is part of project 4 in 05-631 Software Architecture for User 
 16  Interfaces, Fall 2007. 
 17  """ 
 18   
 19  __author__ = "David Huggins-Daines <dhuggins@cs.cmu.edu>" 
 20   
 21  import mailbox 
 22  import email 
 23  import quopri 
 24  import sys 
 25  import re 
 26   
def usage():
    """Write the one-line command synopsis to standard error."""
    synopsis = "Usage: emailscraper.py MBOX [MBOX ...]\n"
    sys.stderr.write(synopsis)
# Show usage and stop when the script is run without any mailbox arguments.
#
# The check is restricted to script execution: without the __name__ test,
# merely importing this module from a process whose sys.argv has a single
# element would print the usage text and terminate the importing program.
# The exit status stays 0, as before, so existing wrappers see no change.
if __name__ == '__main__' and len(sys.argv) == 1:
    usage()
    sys.exit(0)
def normalize_subject(msg):
    """Print the subject header of an e-mail message on standard output.

    msg -- a message object that supports header lookup by name
           (``msg['Subject']``), such as ``email.message.Message``.

    Nothing is printed when the message has no Subject header.
    """
    subject = msg['Subject']
    if subject is None:
        # Header lookup yields None for a missing field; printing that would
        # put the literal word "None" into the language-model training text.
        return
    # We can generally leave these unmolested, because Flite is
    # going to do most of the normalization for us.
    # The single-argument print(...) form behaves the same whether it is
    # parsed as a Python 2 statement or a Python 3 function call.
    print(subject)
40
def normalize_addrs(msg):
    """Print the display names of a message's To and Cc recipients.

    msg -- a message object providing ``get_all(field)``, such as
           ``email.message.Message``.

    One name is printed per line.  Recipients that have no display name
    (a bare address) are skipped, and a name wrapped in single quotes is
    printed without the quotes.
    """
    # Imported here explicitly: a plain ``import email`` does not by itself
    # load the ``email.utils`` submodule, so the attribute access
    # ``email.utils`` only works if some other import happened to load it.
    from email.utils import getaddresses

    for field in ('to', 'cc'):
        addrs = msg.get_all(field)
        if not addrs:
            continue
        for realname, _address in getaddresses(addrs):
            # Drop one layer of surrounding single quotes, e.g. 'Jane Doe'.
            if realname[:1] == "'" and realname[-1:] == "'":
                realname = realname[1:-1]
            # Skip empty names, including ones that were nothing but quotes;
            # printing them would emit a stray blank line.
            if realname == '':
                continue
            print(realname)
# Patterns applied by normalize_body() to strip non-prose text from a body.

# Quote markers ("> ", "> > ") at the start of a line.  Only spaces and tabs
# are allowed around the markers: with \s the pattern also consumed the
# newlines of neighbouring blank lines, gluing separate paragraphs together.
quotre = re.compile(r'^([ \t]*>)+[ \t]*', re.MULTILINE)
# An "-----Original Message-----" banner and everything after it.
origre = re.compile(r'^\s*-+\s*Original\s+Message\s*-+.*',
                    re.MULTILINE | re.DOTALL)
# A CVS commit notice ("Update of /path") and everything after it.
cvsre = re.compile(r'^Update of /.*', re.MULTILINE | re.DOTALL)
# A "#!/..." interpreter line (shell archive) and everything after it.
sharre = re.compile(r'^#!/.*$.*', re.MULTILINE | re.DOTALL)
# An embedded MS Word or HTML MIME part header and everything after it.
msword = re.compile(r'^Content-Type: (application/msword|text/html).*',
                    re.MULTILINE | re.DOTALL)
# Runs of five or more separator punctuation characters.
sepre = re.compile(r'[-.#_*=~^]{5,}\s*', re.DOTALL)
# List bullets ("- ", "* ", "+ ") at the start of a line, or a line made up
# of bullet characters only.  Restricted to spaces and tabs for the same
# reason as quotre: blank lines around a list must survive so paragraph
# boundaries are kept.
listre = re.compile(r'^[ \t]*[-+*]+(?:[ \t]+|$)', re.MULTILINE)
def normalize_body(msg):
    """Print the prose paragraphs of one message part on standard output.

    msg -- a single (non-multipart) message part, such as an
           ``email.message.Message`` with content type text/plain.

    Quoted-printable bodies are decoded; base64 bodies and parts that have
    no string payload (multipart containers) are skipped without output.
    Quote markers, shell archives, embedded Word/HTML parts, CVS commit
    notices, "Original Message" blocks, list bullets and separator lines
    are removed, then each remaining paragraph is printed on one line.
    """
    # Header values are case-insensitive ("Quoted-Printable", "BASE64", ...),
    # so compare a normalized copy instead of a few fixed spellings.
    encoding = str(msg['Content-Transfer-Encoding'] or '').strip().lower()
    if encoding == 'base64':
        return
    if encoding == 'quoted-printable':
        # Let the email package undo the transfer encoding.
        txt = msg.get_payload(decode=True)
    else:
        txt = msg.get_payload()
    if txt is None or isinstance(txt, list):
        # Multipart container: there is no body text at this level.
        return
    if isinstance(txt, bytes) and not isinstance(txt, str):
        # On Python 3 a decoded payload is bytes; turn it back into text
        # using the part's declared charset when that charset is known.
        charset = msg.get_content_charset() or 'ascii'
        try:
            txt = txt.decode(charset, 'replace')
        except LookupError:
            txt = txt.decode('ascii', 'replace')
    # Remove quoted-text markers
    txt = quotre.sub("", txt)
    # Remove shell archives
    txt = sharre.sub("", txt)
    # Remove MS Word documents that mistakenly got quoted by a stupid
    # e-mail program
    txt = msword.sub("", txt)
    # Remove obvious CVS commit messages
    txt = cvsre.sub("", txt)
    # Remove blocks of "Original Message" headers
    txt = origre.sub("\n", txt)
    # Remove list markers
    txt = listre.sub("", txt)
    # Remove long lines of punctuation
    txt = sepre.sub("", txt)
    # Remove remnants of quoted-printable
    txt = txt.replace("=20", " ")
    # Now separate it into "paragraphs"
    for par in txt.split("\n\n"):
        par = par.replace("\n", " ").strip()
        if par == "":
            continue
        # Single-argument print(...) is valid on both Python 2 and 3.
        print(par)
        # NOTE(review): the listing this was recovered from lost its
        # indentation; the blank separator line is assumed to follow every
        # paragraph rather than only the last one -- confirm.
        print("")
93
def scrape_mbox(filename):
    """Scrape usable text out of a mailbox.

    filename -- path of an existing mbox-format file.

    For every message the subject, the To/Cc display names and each
    text/plain part are printed on standard output, with a blank line
    after the subject and after the names.

    Raises mailbox.NoSuchMailboxError if filename does not exist.
    """
    # create=False: by default mailbox.mbox() creates an empty file when the
    # path is missing, so a mistyped name would be silently accepted and
    # leave a stray file behind.
    mbox = mailbox.mbox(filename, create=False)
    try:
        for msg in mbox:
            normalize_subject(msg)
            # print("") emits a bare newline on both Python 2 and 3.
            print("")
            normalize_addrs(msg)
            print("")
            for part in msg.walk():
                if part.get_content_type() == 'text/plain':
                    normalize_body(part)
    finally:
        # Release the underlying file even if a message fails to process.
        mbox.close()
if __name__ == '__main__':
    # Every command-line argument names one mbox file; scrape them in order.
    mbox_paths = sys.argv[1:]
    for mbox_path in mbox_paths:
        scrape_mbox(mbox_path)