1
2
3
4
5
6
7
8
9
10
11 """
12 Take a text file and extract words from it using Flite's text
13 normalization, yielding a transcription suitable for language model
14 training and a pronunciation dictionary.
15
16 This is part of project 4 in 05-631 Software Architecture for User
17 Interfaces, Fall 2007.
18 """
19
20 __author__ = "David Huggins-Daines <dhuggins@cs.cmu.edu>"
21
22 import flite
23 import getopt
24 import sys
25
def usage():
    """Write a one-line command-line synopsis to standard error."""
    # End the message with a newline so whatever is printed next (typically
    # the shell prompt) starts on its own line.
    sys.stderr.write("Usage: text2words.py [-o DICTIONARY] < TEXT > WORDS\n")
29
class TextNorm(object):
    """Turn text into word transcriptions and a pronunciation dictionary.

    Text is pushed through a Flite voice; for every utterance handed to
    get_words, one ``<s> ... </s>`` line is written to standard output and
    the pronunciations of its words are recorded in ``self.dic``.
    """

    # Flite segment names that are rewritten rather than merely upper-cased.
    # NOTE(review): presumably these match the phone set of the downstream
    # recognizer -- confirm before changing.
    _PHONE_MAP = {'ax': 'AH', 'pau': 'SIL'}

    def __init__(self):
        """Load a Flite voice and register get_words as its post-synth callback."""
        self.vox = flite.load_voice()
        self.vox.set_post_synth_callback(self.get_words)
        # Maps each word to the first pronunciation seen for it
        # (a space-separated string of phone names).
        self.dic = {}

    def normalize(self, par):
        """Pass the text ``par`` to the voice's text_to_speech with output "none".

        NOTE(review): the words are presumably delivered through the
        get_words callback registered in __init__ -- confirm against the
        Flite bindings.
        """
        self.vox.text_to_speech(par, "none")

    def get_words(self, utt):
        """Print the words of ``utt`` and record their pronunciations.

        For each item in ``utt['Word']`` the segments found under its
        'SylStructure' relation are collected: 'ax' becomes 'AH', 'pau'
        becomes 'SIL', and every other segment name is upper-cased.  Words
        without any segment are skipped.  The remaining word names are
        written to standard output as a single ``<s> w1 w2 ... </s>`` line,
        and a word's pronunciation is stored in ``self.dic`` only the first
        time that word is seen.
        """
        words = []
        for w in utt['Word']:
            segs = []
            for syl in w.as_rel('SylStructure'):
                segs.extend(self._PHONE_MAP.get(seg.name, seg.name.upper())
                            for seg in syl)
            if not segs:
                continue
            words.append(w.name)
            # setdefault keeps an already-recorded pronunciation untouched.
            self.dic.setdefault(w.name, " ".join(segs))
        # sys.stdout.write emits the same bytes as the former Python 2
        # print statement while also being valid Python 3.
        sys.stdout.write("<s> %s </s>\n" % " ".join(words))
56
def main(argv=None, infile=None, normalizer=None):
    """Normalize blank-line-separated paragraphs and optionally dump a dictionary.

    Lines are read from ``infile``, stripped, and grouped into paragraphs
    separated by blank lines.  Each non-empty paragraph is joined with single
    spaces and handed to ``normalizer.normalize``.  When ``-o FILE`` or
    ``--output FILE`` is given, the entries of ``normalizer.dic`` are written
    to FILE in sorted word order, one ``word pronunciation`` pair per line.

    Parameters (all optional, defaults reproduce the command-line behavior):
      argv       -- option list to parse; defaults to ``sys.argv[1:]``.
      infile     -- iterable of text lines; defaults to ``sys.stdin``.
      normalizer -- object with ``normalize(text)`` and a ``dic`` mapping;
                    defaults to a new ``TextNorm()``.

    On an option parsing error the usage message is shown and the process
    exits with status 2.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        opts, _args = getopt.getopt(argv, 'o:', ['output='])
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    outfile = None
    for opt, value in opts:
        if opt in ('-o', '--output'):
            outfile = value

    if infile is None:
        infile = sys.stdin
    if normalizer is None:
        normalizer = TextNorm()

    par = []
    for line in infile:
        line = line.strip()
        if line:
            par.append(line)
        elif par:
            # A blank line ends the current paragraph; runs of blank lines
            # must not produce empty paragraphs.
            normalizer.normalize(" ".join(par))
            par = []
    if par:
        # Input that does not end with a blank line still has a pending
        # paragraph that would otherwise be lost.
        normalizer.normalize(" ".join(par))

    if outfile is not None:
        # The context manager guarantees the dictionary is flushed and closed.
        with open(outfile, "w") as outfh:
            for word in sorted(normalizer.dic):
                outfh.write("%-30s %s\n" % (word, normalizer.dic[word]))


if __name__ == "__main__":
    main()
86