Scrape an mbox-format mailbox for e-mail body text and output data
suitable for training a language model.
This is part of project 4 in 05-631 Software Architecture for User
Interfaces, Fall 2007.
|
|
# Matches a run of one or more '>' markers at the beginning of a line (each
# marker may be preceded by whitespace), together with any whitespace that
# follows the run.  Presumably used to strip e-mail quote prefixes -- confirm
# against the call site.  NOTE: \s also matches newlines, so the trailing
# \s* can swallow line breaks that directly follow the markers.
quotre = re.compile(
    r'^(\s*>)+\s*',
    flags=re.M,
)
|
|
|
origre = re.compile(r'^\s*-+\s*Original\s+Message\s*-+.*', re....
|
|
|
# Matches from a line that begins with "Update of /" through the END of the
# text: MULTILINE lets ^ anchor at any line start, and DOTALL lets .* run
# across newlines.  Presumably targets CVS commit notifications -- confirm
# against the call site.
cvsre = re.compile(
    r'^Update of /.*',
    flags=re.M | re.S,
)
|
|
|
# Matches from a line that begins with "#!/" (a shebang-style line) through
# the END of the text: with DOTALL the greedy .* crosses newlines, so the
# whole remainder is consumed.  Presumably targets embedded shell archives
# or scripts -- confirm against the call site.
sharre = re.compile(
    r'^#!/.*$.*',
    flags=re.M | re.S,
)
|
|
|
msword = re.compile(r'^Content-Type: (application/msword|text/...
|
|
|
# Matches a run of five or more characters drawn from - . # _ * = ~ ^
# anywhere in the text, plus any whitespace (including newlines) after it.
# Presumably targets separator / horizontal-rule lines -- confirm against
# the call site.
# NOTE(review): DOTALL has no effect on this pattern (the only '.' is inside
# a character class, where it is literal); the flag is kept so the compiled
# object is unchanged.
sepre = re.compile(
    r'[-.#_*=~^]{5,}\s*',
    flags=re.S,
)
|
|
|
# Matches, at the start of a line, optional leading whitespace followed by
# one or more of '-', '+' or '*' and then at least one whitespace character.
# Presumably targets list-bullet prefixes -- confirm against the call site.
listre = re.compile(
    r'^\s*[-+*]+\s+',
    flags=re.M,
)
|