1
2
3
4
5
6
7
8
9
10
11 """
12 Scrape an mbox-format mailbox for e-mail body text and output data
13 suitable for training a language model.
14
15 This is part of project 4 in 05-631 Software Architecture for User
16 Interfaces, Fall 2007.
17 """
18
19 __author__ = "David Huggins-Daines <dhuggins@cs.cmu.edu>"
20
21 import mailbox
22 import email
23 import quopri
24 import sys
25 import re
26
def usage():
    """Show usage of this module.

    Writes a one-line command synopsis to standard error and returns
    None.  It does not terminate the process; the caller decides whether
    to exit afterwards.
    """
    sys.stderr.write("""Usage: emailscraper.py MBOX [MBOX ...]\n""")
30
# Print the synopsis and stop when the script is started without any
# mailbox argument.  The __name__ test restricts this to script execution,
# so that merely importing the module never terminates the importing
# process.  The exit status stays 0, as before.
if __name__ == '__main__' and len(sys.argv) < 2:
    usage()
    sys.exit(0)
34
def normalize_subject(msg):
    """Normalize the subject header of an e-mail message.

    Prints the value of the Subject header of *msg* on a line of its own.
    Nothing is printed when the message has no Subject header.

    msg -- a message object supporting header lookup by name
           (e.g. an email.message.Message).
    """
    subject = msg['Subject']
    if subject is None:
        # A missing header would otherwise be emitted as the text "None".
        return
    # The single-argument parenthesised form prints the same thing whether
    # print is a statement (Python 2) or a function (Python 3).
    print(subject)
40
def normalize_addrs(msg):
    """Normalize address headers of an e-mail message.

    Prints the display name of each recipient found in the To and Cc
    headers of *msg*, one name per line.  Recipients that have no display
    name are skipped.  A name wrapped in a pair of single quotes is
    printed without them, and a name that is empty once the quotes are
    removed is skipped as well.

    msg -- a message object providing get_all()
           (e.g. an email.message.Message).
    """
    # Imported here so the function does not rely on "import email"
    # happening to expose the utils submodule as an attribute.
    from email.utils import getaddresses

    for field in ('to', 'cc'):
        # get_all() yields every occurrence of the header; the default
        # keeps a missing header from needing a separate check.
        for realname, _address in getaddresses(msg.get_all(field, [])):
            # Require two characters so a lone "'" is not treated as an
            # opening and closing quote at once.
            if len(realname) >= 2 and realname[0] == "'" and realname[-1] == "'":
                realname = realname[1:-1]
            if realname:
                print(realname)
50
# Clean-up patterns for message body text.  The patterns compiled with
# DOTALL and ending in ".*" match from their marker through the end of
# the text, so substituting them drops everything after the marker.
#
# NOTE(review): "\s" also matches newlines, so the leading "\s*" in
# quotre and listre can swallow blank lines that precede a quoted line or
# a bullet -- confirm that merging those paragraphs is intended.

# Reply-quote markers ("> ", ">> ", "> > ") at the start of a line.
quotre = re.compile(
    r'^(\s*>)+\s*',
    flags=re.MULTILINE,
)

# An "-----Original Message-----" banner line and all text after it.
origre = re.compile(
    r'^\s*-+\s*Original\s+Message\s*-+.*',
    flags=re.MULTILINE | re.DOTALL,
)

# A line beginning "Update of /" (presumably a CVS commit notice) and all
# text after it.
cvsre = re.compile(
    r'^Update of /.*',
    flags=re.MULTILINE | re.DOTALL,
)

# A "#!/" interpreter line and all text after it.
sharre = re.compile(
    r'^#!/.*$.*',
    flags=re.MULTILINE | re.DOTALL,
)

# An inline Content-Type header for MS Word or HTML data and all text
# after it.
msword = re.compile(
    r'^Content-Type: (application/msword|text/html).*',
    flags=re.MULTILINE | re.DOTALL,
)

# A run of five or more separator characters plus trailing whitespace.
sepre = re.compile(
    r'[-.#_*=~^]{5,}\s*',
    flags=re.DOTALL,
)

# A list bullet ("-", "+", "*") at the start of a line.
listre = re.compile(
    r'^\s*[-+*]+\s+',
    flags=re.MULTILINE,
)
60
def normalize_body(msg):
    """Normalize the body of an e-mail message.

    Takes the payload of *msg*, strips reply-quote markers, trailing
    material introduced by a "#!/" line, an inline Word/HTML Content-Type
    line, an "Update of /" line or an "Original Message" banner, list
    bullets and separator rules, and then prints each remaining paragraph
    on a single line followed by an empty line.

    Parts whose Content-Transfer-Encoding is base64 are skipped, as are
    multipart containers and parts without a payload.  Quoted-printable
    parts are decoded first.  Returns None.

    msg -- an email.message.Message (normally a text/plain part).
    """
    if msg.is_multipart():
        # The payload of a container is a list of sub-parts, not text.
        return

    # The encoding token is compared case-insensitively so that spellings
    # such as "Quoted-Printable" or "Base64" are recognised too.
    encoding = str(msg['Content-Transfer-Encoding'] or '').strip().lower()
    if encoding == 'base64':
        return
    if encoding == 'quoted-printable':
        txt = msg.get_payload(decode=True)
    else:
        txt = msg.get_payload()
    if not txt:
        return

    if isinstance(txt, bytes) and not isinstance(txt, str):
        # Python 3 returns decoded payloads as bytes; the patterns below
        # are text patterns, so convert using the part's declared charset.
        charset = msg.get_content_charset() or 'us-ascii'
        try:
            txt = txt.decode(charset, 'replace')
        except LookupError:
            # Unknown charset label: latin-1 maps every byte to a character.
            txt = txt.decode('latin-1')

    # Order matters: quote markers go first so that markers of quoted
    # material are recognised at the start of a line by later patterns.
    cleanups = (
        (quotre, ""),    # reply-quote markers
        (sharre, ""),    # "#!/" line through end of text
        (msword, ""),    # inline Word/HTML content through end of text
        (cvsre, ""),     # "Update of /" line through end of text
        (origre, "\n"),  # "Original Message" banner through end of text
        (listre, ""),    # list bullets
        (sepre, ""),     # separator rules
    )
    for pattern, replacement in cleanups:
        txt = pattern.sub(replacement, txt)

    # Left-over quoted-printable encoded spaces in text that was not
    # labelled (and therefore not decoded) as quoted-printable.
    txt = txt.replace("=20", " ")

    # Blank lines delimit paragraphs; each paragraph is joined onto one line.
    for par in txt.split("\n\n"):
        par = par.replace("\n", " ").strip()
        if not par:
            continue
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(par)
        print("")
93
95 """Scrape usable text out of a mailbox."""
96 mbox = mailbox.mbox(filename)
97 for msg in mbox:
98 normalize_subject(msg)
99 print
100 normalize_addrs(msg)
101 print
102 for part in msg.walk():
103 if part.get_content_type() == 'text/plain':
104 normalize_body(part)
105
if __name__ == '__main__':
    # Scrape each mailbox named on the command line, in the order given.
    for mbox_path in sys.argv[1:]:
        scrape_mbox(mbox_path)
109