Package mekano :: Module textual

Source Code for Module mekano.textual

"""mekano.textual

Textual services like tokenization, stemming, and stop word removal.

Tokenizers: Convert a string into a stream of tokens. All tokenizers
            lower() the string and return a generator.
            Currently available:
            - BasicTokenizer
            - WordRegexTokenizer         [this is mostly what you want]
            - WordNumberRegexTokenizer

"""

from mekano import AtomVector
15   
import re

# Runs of non-word characters, used by BasicTokenizer to split.
wordsplitter_rex = re.compile(r"\W+")

# Words of three or more letters.
word_regex = re.compile(r"\b[a-z]{3,}\b")
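
# A quick, illustrative check of the two patterns above; the sample
# string is a made-up example, not taken from the module's tests.
# >>> wordsplitter_rex.split("it's 2-for-1")
# ['it', 's', '2', 'for', '1']
# >>> word_regex.findall("it's 2-for-1")
# ['for']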
22   
# TODO: broken: $700 is not parsed.
#word_number_regex = re.compile(r"\b[a-z][a-z0-9]{3,}\b|\$?[0-9]*\.?[0-9]+")
#word_number_regex = re.compile(r"\b[a-z][a-z0-9]{3,}\b|\$?[0-9]*([0-9]+[0-9x]+[0-9]+)?(\.[0-9]+)?")
# TODO: "12," shouldn't get parsed. Basically, a comma or dot can't end a number.

# Words of four or more characters starting with a letter, or numbers
# with optional comma grouping, dollar sign, and decimal part.
word_number_regex = re.compile(r"\b[a-z][a-z0-9]{3,}\b|(\$|\b)[0-9]+(,[0-9]{3})*(\.[0-9]+)?\b")
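
# A hedged sanity sketch of what word_number_regex accepts; the input
# below is a made-up example. finditer/.group() is used because findall
# would return the capture-group tuples rather than the full matches.
# >>> [m.group() for m in word_number_regex.finditer("paid $1,200.50 for 3 cats")]
# ['paid', '$1,200.50', '3', 'cats']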
29   
def BasicTokenizer(s, minlen=1):
    """Split on any non-word letter.

    Words need not start with [a-z].
    """
    for token in wordsplitter_rex.split(s.lower()):
        if len(token) >= minlen:
            yield token

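# A minimal usage sketch (the input strings are made up):
# >>> list(BasicTokenizer("Hello, world! A test."))
# ['hello', 'world', 'a', 'test']
# >>> list(BasicTokenizer("Hello, world! A test.", minlen=2))
# ['hello', 'world', 'test']
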
def WordRegexTokenizer(s):
    """Find 3 or more letter words.

    Words must start with [a-z].
    """
    for match in word_regex.finditer(s.lower()):
        yield match.group()

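# e.g., on a made-up sentence (note that "a" and "2" are dropped):
# >>> list(WordRegexTokenizer("A quick fox ate 2 figs"))
# ['quick', 'fox', 'ate', 'figs']
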
def WordNumberRegexTokenizer(s):
    """Find 4 or more letter words or numbers/currencies.

    Words must start with [a-z].
    """
    for match in word_number_regex.finditer(s.lower()):
        yield match.group()

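# e.g., on a made-up sentence mixing words, counts, and currency
# (note that "at" is dropped as a too-short word):
# >>> list(WordNumberRegexTokenizer("Sold 1,500 units at $9.99 each"))
# ['sold', '1,500', 'units', '$9.99', 'each']
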
def Vectorize(s, af, tokenizer=WordRegexTokenizer):
    """Create an AtomVector from a string.

    Tokenizes string 's' using 'tokenizer', creating
    atoms using AtomFactory 'af'.
    """
    av = AtomVector()
    for word in tokenizer(s):
        atom = af[word]
        av[atom] += 1
    return av
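
# An end-to-end sketch. It assumes mekano exposes an AtomFactory whose
# indexing operator interns a token into an atom ('af[word]' above);
# that class is used here but is not defined in this module.
# >>> from mekano import AtomFactory   # assumed to exist alongside AtomVector
# >>> af = AtomFactory()               # hypothetical construction
# >>> av = Vectorize("the cat sat on the mat", af)
# >>> av[af["the"]]
# 2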