#!/usr/local/bin/perl
#
# A program for graphically handling sphinx arguments.
# Thomas K Harris
# December 2002
#
# Builds a Tk option editor from the table below, lets the user edit the
# values, and then launches the decoder executable named in the entry field
# with one "-name value" pair for every option that has a non-empty value.
#
##############################################################

use strict;
use warnings;
use Tk::Getopt;
use Tk;

# Option table handed to Tk::Getopt.  Plain strings start a new group of
# options; each array reference is: option name, type spec, default value,
# followed by key/value attributes (label, choices, longhelp).
# NOTE: the help strings are single-quoted literals, so every line break
# inside them is part of the text and is kept exactly as found.
my @opttable = (
    'Input Model Databases',
    ['lmfn', '=s', undef,
     label => 'LM file:',
     longhelp => 'Optional DARPA format bigram/trigram backoff LM file with the empty string as its name.'],
    ['lmctlfn', '=s', undef,
     label => 'LM control file',
     longhelp => 'Optional LM control file with a list of LM files and associated names (one line per entry). This is how multiple LMs can be loaded during initialization.'],
    ['kbdumpdir', '=s', undef,
     label => 'dump file directory:',
     longhelp => 'Building LM Dump Files LM files are usually ASCII files. If they are large, it is time consuming to read them into the decoder. A binary "dump" file is much faster to read and more compact. LM dump files can be created by either a standalone program examples/lm3g2dmp.c or the decoder. The standalone version can be compiled from the examples directory. The program takes two arguments, the LM source file and a directory in which the dump file is to be created. It reads the header from the original LM file to determine the size of the LM. It then forms the binary dump file name by appending a .DMP extension to the LM file name. This file is written to the second (directory) argument. (NOTE: The dump file must not already exist!!) Any version of the decoder can also automatically create binary "dump" files similar to the standalone version described above. It first looks for the dump file in the directory given by the -kbdumpdir argument. If the dump file is present it reads it and ignores the rest of the original LM file. Otherwise, it reads the LM file and creates a dump file in the -kbdumpdir directory so that it can be used in subsequent decoder runs. 
The decoder does not create dump files for small LMs that have fewer than an internally defined number of bigrams and trigrams.'],
    ['dictfn', '=s', undef,
     label => 'Main pronunciation dictionary file:'],
    ['oovdictfn', '=s', undef,
     label => '(OOV) pronunciation dictionary:',
     longhelp => 'Optional out-of-vocabulary (OOV) pronunciation dictionary. These are added to the unnamed LM (read from -lmfn file) with unigram probability given by -oovugprob. '],
    ['ndictfn', '=s', undef,
     label => '"noise" words pronunciation dictionary:',
     longhelp => 'Optional "noise" words pronunciation dictionary. Noise words are not part of any LM and, like silence, can be inserted transparently anywhere in the utterance.'],
    ['phnfn', '=s', undef,
     label => 'phone file:',
     longhelp => 'Phone files with senone mapping information for the given dictionary and acoustic model.'],
    ['mapfn', '=s', undef,
     label => 'map file:',
     longhelp => 'Map file with senone mapping information for the given dictionary and acoustic model.'],
    ['hmmdir', '=s', undef,
     label => 'HMM direcory',
     longhelp => 'Directory with Sphinx-II semi-continuous HMM acoustic models.'],
    #['hmmdirlist', '=s', undef],
    ['cbdir', '=s', undef,
     label => 'codebook directory:',
     longhelp => 'Directory with Sphinx-II semi-continuous HMM codebooks.'],
    ['sendumpfn', '=s', undef,
     label => 'senome model file:',
     longhelp => 'Building 8-Bit Senone Dump Files The Sphinx-II senonic acoustic model files contain 32-bit data. (These are in the directory specified by the -hmmdir argument.) However, they can be clustered down to 8-bits for memory efficiency, without loss of recognition accuracy. The clustering is carried out by an offline process as follows: Create a temporary 32-bit senone dump file by running the decoder with the -sendumpfn flag set to the temporary file name, the -8bsen flag set to FALSE, and omitting the -lmfn argument. 
The decoder can be killed after it creates the 32-bit senone dump file, which happens during the initialization and is announced in the log output. Run: /afs/cs/project/plus-2/s2/Sphinx2/bin/alpha/pdf32to8b 32bit-file 8bit-file to create the 8-bit senone dump file. That is, the first argument to pdf32to8b is the temporary 32-bit dump file created above, and the second argument is the 8-bit output file. Delete the temporary 32-bit file. '],
    ['-8bsen', '=s', undef,
     choices => ['TRUE', 'FALSE'],
     label => '8 bit senome',
     longhelp => 'should be TRUE if the 8-bit senones are used'],

    'Decoder Configuration',
    ['ctlfn', '=s', undef,
     label => 'control file:',
     longhelp => 'Batch-mode control file listing utterance files (without their file-extension) to decode. Must not be specified for live-mode or application-driven operation'],
    ['ctloffset', '=i', undef,
     label => 'control offset',
     longhelp => 'The (optional) number of initial utterances in the file to be skipped'],
    ['ctlcount', '=i', undef,
     label => 'control count',
     longhelp => 'the number to be processed (after the skip, if any). 
defaults to ALL'],
    ['datadir', '=s', undef,
     label => 'data directory:',
     longhelp => 'If the control file entries are relative pathnames, an optional directory prefix for them may be specified using this argument.'],
    ['allphone', '=s', 'FALSE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'Should be TRUE to configure the recognition engine for allphone mode operation.'],
    ['tactlfn', '=s', undef,
     label => 'input transcript file',
     longhelp => 'Input transcript file, parallel to the control file (-ctlfn) in forced alignment mode.'],
    ['adcin', '=s', 'FALSE',
     choices => ['TRUE', 'FALSE'],
     label => 'A/D cepstrum input',
     longhelp => 'In batch mode, \'A/D cepstrum input\' selects A/D (TRUE) or cepstrum input data (FALSE).'],
    ['adcext', '=s', 'raw',
     label => 'A/D cepstrum extention:',
     longhelp => 'If TRUE, \'A/D cepstrum extention\'is the file extension to be appended to names listed in the -ctlfn argument file'],
    ['adchdr', '=i', 0,
     label => 'A/D cepstrum header',
     longhelp => 'the number of bytes of header in each input file'],
    ['adcendian', '=i', 1,
     choices => [0, 1],
     label => 'A/D cepstrum endian:',
     longhelp => 'byte ordering: 0 for big-endian, 1 for little-endian'],
    ['normmean', '=s', 'TRUE',
     choices => ['TRUE', 'FALSE'],
     label => 'cepstrum mean normalization'],
    ['nmprior', '=s', 'FALSE',
     choices => ['TRUE', 'FALSE'],
     label => 'mean normalization prior:',
     longhelp => 'If mean normalization prior is FALSE, CMN computed on current utterance only (usually batch mode), otherwise based on past history (live mode).'],
    ['compress', '=s', 'FALSE',
     choices => ['TRUE', 'FALSE'],
     label => 'silence deletion:',
     longhelp => 'Continuous Listening and Silence Filtering As mentioned earlier, Sphinx2 can only decode utterances that are limited to less than about 30 sec at a time. However, one often wants to leave the audio recording running continuously and automatically determine utterance boundaries based on pauses in the input speech. 
The continuous listening module in Sphinx2 provides the mechanisms for this purpose. The silence filtering module is interposed between the raw audio input source and the application. The application calls the function cont_ad_read instead of directly reading the raw A/D input source (e.g., via the ad_read function described above). cont_ad_read returns only those segments of input audio that it determines to be non-silence. Additional timestamp information is provided to inform the application about silence regions that have been dropped. The complete continuous listening API is defined in include/cont_ad.h and is summarized below: cont_ad_init: Associates a new continuous listening module instance with a specified raw A/D handle and a corresponding read function pointer. E.g., these may be the handle returned by ad_open and function ad_read described above. cont_ad_calib: Calibrates the background silence level by reading the raw audio for a few seconds. It should be done once immediately after cont_ad_init, and after any environmental change. cont_ad_read: Reads and returns the next available block of non-silence data in a given buffer. (Uses the read function and handle supplied to cont_ad_init to obtain the raw A/D data.) More details are provided below. cont_ad_reset: Flushes any data buffered inside the module. Useful for discarding accumulated, but unprocessed speech. cont_ad_set_thresh: Useful for adjusting the silence and speech thresholds. cont_ad_detach: Detaches the specified continuous listening module from the associated audio device. cont_ad_attach: Attaches the specified continuous listening module to the specified audio device. (Similar to cont_ad_init, but without the need to calibrate the audio device.) cont_ad_close: Closes the continuous listening module. 
Some more details on the cont_ad_read function: Operationally, every call to cont_ad_read causes the module to read the associated raw A/D source (as much data as possible and available), scan it for speech (non-silence) segments and enqueue them internally. It returns the first available segment of speech data, if any. In addition to returning non-silence data, the function also updates a couple of parameters that may be of interest to the application: The signal level for the most recently read data. This is the siglvl member variable of the cont_ad_t structure returned by cont_ad_init(). A timestamp value indicating the total number of raw audio samples that have been consumed at the end of the most recent cont_ad_read() call. This is in the read_ts member variable of the cont_ad_t structure. So, for example, if on two successive calls to cont_ad_read, the timestamp is 100000 and 116000, respectively, the application can determine that 1 sec (16000 samples) of silence have been gobbled up between the two calls. Silence regions aren\'t chopped off completely. About 50-100ms worth of silence is preserved at either end of a speech segment and passed on to the application. Finally, the continuous listener won\'t concatenate speech segments separated by silence. That is, the data returned by a single call to cont_ad_read will not span raw audio separated by silence that has been gobbled up. cont_ad_read must be called frequently enough to avoid loss of input data owing to buffer overflow. The application is responsible for turning actual recording on and off, if applicable. In particular, it must ensure that recording is on during calibration and normal operation. See examples/cont_adseg.c for an example that uses the continuous listening module to segment live audio input into separate utterances. Similarly, examples/cont_fileseg.c segments a given pre-recorded file containing audio data into utterances. 
The implementation of continuous listening is in src/libfe/cont_ad.c. Applications that use this module are required to link with libfe and libcommon (and libad if necessary). '],
    ['compressprior', '=s', 'FALSE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'If compressprior is FALSE, based on current utterance statistics (batch mode) otherwise based on past history (live mode). -compress should be FALSE if continuous listening is used.'],
    ['agcmax', '=s', 'FALSE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'Automatic gain control (AGC) option. In batch mode only -agcmax should be TRUE, and in live mode only -agcemax.'],
    ['agcemax', '=s', 'FALSE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'Automatic gain control (AGC) option. In batch mode only -agcmax should be TRUE, and in live mode only -agcemax.'],
    ['live', '=s', 'FALSE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'Forces some live mode flags: -nmprior -compressprior and -agcemax to TRUE if any AGC is on.'],
    ['samp', '=i', 16000,
     choices => [8000, 16000],
     label => 'Sampling rate:'],
    ['fwdflat', '=s', 'TRUE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'Run flat-lexical Viterbi search after tree-structured pass (for better accuracy). Usually FALSE in live mode.'],
    ['bestpath', '=s', 'FALSE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'Run global best path search over Viterbi search word lattice output (for better accuracy).'],
    ['compallsen', '=s', 'FALSE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'Compute all senones, whether active or inactive, in each frame.'],
    ['latsize', '=i', 50000,
     longhelp => 'Word lattice entries to be allocated. Longer sentences need larger lattices.'],

    'Beam Widths',
    ['top', '=i', 4,
     longhelp => 'Number of codewords computed per frame. Usually, narrowed to 1 in live mode.'],
    ['beam', '=f', 1e-6,
     longhelp => 'Main pruning thresholds for tree search. Usually narrowed down to 2e-6 in live mode.'],
    ['npbeam', '=f', 1e-6,
     longhelp => 'Main pruning thresholds for tree search. 
Usually narrowed down to 2e-6 in live mode.'],
    ['lpbeam', '=f', 1e-5,
     longhelp => 'Additional pruning threshold for transitions to leaf nodes of lexical tree. Usually narrowed down to 2e-5 in live mode.'],
    ['lponlybeam', '=f', 3e-4,
     longhelp => 'Yet more pruning thresholds for leaf nodes and exits from lexical tree. Usually narrowed down to 5e-4 in live mode.'],
    ['nwbeam', '=f', 3e-4,
     longhelp => 'Yet more pruning thresholds for leaf nodes and exits from lexical tree. Usually narrowed down to 5e-4 in live mode.'],
    ['fwdflatbeam', '=f', 1e-8,
     longhelp => 'Main and word-exit pruning thresholds for the optional, flat lexical Viterbi search.'],
    ['fwdflatnwbeam', '=f', 3e-4,
     longhelp => 'Main and word-exit pruning thresholds for the optional, flat lexical Viterbi search.'],
    ['topsenfrm', '=i', 1,
     longhelp => 'No. of lookahead frames for predicting active base phones. (If <=1, all base phones assumed to be active every frame.)'],
    ['topsenthresh', '=i', -60000,
     longhelp => 'topsenthresh is log(pruning threshold) applied to raw senone scores to determine active phones in each frame.'],

    'Language Weights/Penalties',
    ['langwt', '=f', 6.5,
     label => 'lexical tree Viterbi search language weight:'],
    ['fwdflatlw', '=f', 8.5,
     label => 'flat structured Viterbi search language weight:'],
    ['rescorelw', '=f', 9.5,
     label => 'global word lattice search language weight'],
    ['ugwt', '=f', 1.0,
     label => 'unigram weight:',
     longhelp => 'Unigram weight for interpolating unigram probabilities with uniform distribution. 
Typically in the range 0.5-0.8.'],
    ['inspen', '=f', 0.65,
     label => 'insertion penalty:',
     longhelp => 'Word insertion penalty or probability (for words in the LM).'],
    ['silpen', '=f', 0.005,
     label => 'silence penalty:',
     longhelp => 'insertion penalty for the silence word'],
    ['fillpen', '=f', 1e-8,
     label => 'fill penalty:',
     longhelp => 'insertion penalty for noise words (from -ndictfn file) if any.'],
    ['oovugprob', '=f', -4.5,
     longhelp => 'Unigram probability (logprob) for OOV words from -oovdictfn file, if any.'],

    'Output Specifications',
    ['matchfn', '=s', undef,
     label => 'match filename:',
     longhelp => 'Filename to which final recognition string for each utterance written'],
    ['matchsegfn', '=s', undef,
     label => 'match segmentation filename:',
     longhelp => 'Like above, but contains word segmentation info: startframe #frames word...'],
    ['reportpron', '=s', 'FALSE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'Causes word pronunciation to be included in output files.'],
    ['rawlogdir', '=s', undef,
     label => 'Raw log directory',
     longhelp => 'If specified, logs raw A/D input samples for each utterance to the indicated directory. (One file per utterance, named .raw.)'],
    ['mfclogdir', '=s', undef,
     longhelp => 'If specified, logs cepstrum data for each utterance to the indicated directory. (One file per utterance, named .mfc.)'],
    ['dumplatdir', '=s', undef,
     label => 'word lattice directory',
     longhelp => 'If specified, dumps word lattice for each utterance to a file in this directory.'],
    ['logfn', '=s', undef,
     label => 'log filename:',
     longhelp => 'Filename to which decoder logging information is written.'],
    ['backtrace', '=s', 'TRUE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'Includes detailed word backtrace information in log file.'],
    ['nbest', '=i', 0,
     longhelp => 'No. of N-best hypotheses to be produced. Currently, this flag is only useful in batch mode. But an application can always directly invoke search_get_alt to obtain them. 
Also, the current implementation is lacking in some details (e.g., in returning detailed scores).\ '],
    ['nbestdir', '=s', '.',
     label => 'nbest direstory',
     longhelp => 'Directory to which N-best files written (one/utterance).'],
    ['taword', '=s', 'TRUE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'Whether word output should be produced when running in forced alignment mode.'],
    ['taphone', '=s', 'TRUE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'Whether phone output should be produced when running in forced alignment mode.'],
    ['tastate', '=s', 'FALSE',
     choices => ['TRUE', 'FALSE'],
     longhelp => 'Whether state alignment output should be produced when running in forced alignment mode.'],
);

# Current option values, filled in by Tk::Getopt and read by run().
my %options = ();

# An unset HOME yields "/.sphinx.options", exactly as plain interpolation of
# an undefined value would, but without an "uninitialized" warning.
my $home = defined $ENV{HOME} ? $ENV{HOME} : '';

my $opt = Tk::Getopt->new(-opttable => \@opttable,
                          -options  => \%options,
                          -filename => "$home/.sphinx.options");
$opt->set_defaults;
$opt->load_options;
$opt->get_options;
$opt->process_options;

# Main window: a title, an entry holding the decoder executable name, and a
# row of buttons.
my $top = MainWindow->new;
$top->Label(-text => 'SphinxGUI')->pack;

my $sphinxexe = $top->Entry;
$sphinxexe->insert(0, 'sphinx2-batch');
$sphinxexe->pack;

my $buttons = $top->Frame->pack;
$buttons->Button(-text => 'Run', -command => sub { run() })
    ->pack(-side => 'right');
$buttons->Button(-text => 'Quit', -command => [$top => 'destroy'])
    ->pack(-side => 'right');
$buttons->Button(-text => 'Conigure', -command => sub { get_options() })
    ->pack(-side => 'right');

MainLoop;

# Open the Tk::Getopt option editor on the main window, then run
# get_options on the option object again, as the script has always done.
sub get_options {
    $opt->option_editor($top);
    $opt->get_options;
}

# Turn an option hash into a flat decoder argument list.
#   $opts - reference to a hash of option name => value
# Returns a list of "-name", value pairs for every option whose value is
# defined and not the empty string, ordered by option name so that repeated
# runs produce the same command line.
sub sphinx_args {
    my ($opts) = @_;
    my @args;
    for my $key (sort keys %{$opts}) {
        my $val = $opts->{$key};
        next unless defined $val && $val ne '';
        # The table declares one option as '-8bsen'; collapse any leading
        # dashes to exactly one so such a key cannot become '--8bsen'.
        (my $flag = $key) =~ s/\A-*/-/;
        push @args, $flag, $val;
    }
    return @args;
}

# Launch the executable named in the entry field with the current options.
# The argument list is echoed with warn before the call.  The program is
# started with the list form of system, so no shell is involved.
# Returns the value of system(), or nothing when no executable name is set.
sub run {
    my @sphinxargs = sphinx_args(\%options);
    warn join(' ', @sphinxargs);

    my $exe = $sphinxexe->get;
    if (!defined $exe || $exe eq '') {
        warn "No decoder executable given\n";
        return;
    }

    my $status = system $exe, @sphinxargs;
    if ($status == -1) {
        warn "Could not run '$exe': $!\n";
    }
    elsif ($status != 0) {
        warn "'$exe' finished with wait status $status\n";
    }
    return $status;
}