1 from mekano import AtomVector
2
3 """mekano.Textual
4
5 Textual services like tokenization, stemming, stop word removal
6
7 Tokenizers: Convert a string into a stream of tokens. All tokenizers
8 lower() the string and return a generator.
9 Currently available:
10 - BasicTokenizer
11 - WordRegexTokenizer [this is mostly what you want]
12 - WordNumberRegexTokenizer
13
14 """
15
16 import re
17 from itertools import ifilter
18
# Splits on runs of non-word characters.  Raw string is required: "\W" is
# not a valid string escape (SyntaxWarning since Python 3.12, a future
# SyntaxError); the other two patterns below already use r"...".
wordsplitter_rex = re.compile(r"\W+")

# Whole words of three or more ASCII letters.  The tokenizers lowercase
# their input first, so [a-z] is sufficient.
word_regex = re.compile(r"\b[a-z]{3,}\b")

# Words of 4+ characters starting with a letter, or numbers / currency
# amounts such as "1,234.56" and "$100".
word_number_regex = re.compile(r"\b[a-z][a-z0-9]{3,}\b|(\$|\b)[0-9]+(,[0-9]{3})*(\.[0-9]+)?\b")

def BasicTokenizer(s, minlen=1):
    """Split on any non-word letter.

    Words need not start with [a-z]

    Lowercases ``s``, splits it on runs of non-word characters
    (``wordsplitter_rex``) and yields every token of length at least
    ``minlen``.

    NOTE(review): the ``def`` header line was missing from this file;
    the signature is reconstructed from the body's free variables
    (``s`` and ``minlen``).  The default ``minlen=1`` is an assumption
    -- confirm against callers.
    """
    for token in wordsplitter_rex.split(s.lower()):
        if len(token) >= minlen:
            yield token
38

def WordRegexTokenizer(s):
    """Find 3 or more letter words.

    Words must start with [a-z]

    Lowercases ``s`` and yields each match of ``word_regex`` in order.

    NOTE(review): the ``def`` header line was missing from this file;
    the signature is reconstructed from the body, whose only free
    parameter is ``s``.
    """
    for match in word_regex.finditer(s.lower()):
        yield match.group()
47

def WordNumberRegexTokenizer(s):
    """Find 4 or more letter words or numbers/currencies.

    Words must start with [a-z]

    Lowercases ``s`` and yields each match of ``word_number_regex``
    in order.

    NOTE(review): the ``def`` header line was missing from this file;
    the signature is reconstructed from the body, whose only free
    parameter is ``s``.
    """
    for match in word_number_regex.finditer(s.lower()):
        yield match.group()
56

def CreateAVFromText(s, af, tokenizer=WordRegexTokenizer):
    """Create an AtomVector from a string.

    Tokenizes string 's' using tokenizer, creating
    atoms using AtomFactory 'af'.

    Returns an ``AtomVector`` of token counts (bag-of-words).

    NOTE(review): the ``def`` header line was missing from this file;
    the signature is reconstructed from the body's free variables
    (``s``, ``af``, ``tokenizer``).  The default tokenizer is assumed
    to be ``WordRegexTokenizer`` -- the module docstring calls it
    "mostly what you want" -- confirm against callers.
    """
    av = AtomVector()
    for word in tokenizer(s):
        atom = af[word]   # af maps a token string to an atom; exact semantics live in AtomFactory
        av[atom] += 1     # accumulate a count per atom
    return av
69