#!/usr/local/bin/perl
#
# A program for graphically handling Sphinx arguments.
# Thomas K Harris <tkharris@cs.cmu.edu>
# December 2002
#
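# Presents the Sphinx-II decoder flags in a Tk option editor, saves the
# chosen values to ~/.sphinx.options, and runs the executable named in the
# entry box (sphinx2-batch by default) with those flags.
#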
##############################################################

use strict;
use Tk::Getopt;
use Tk;

my @opttable = ('Input Model Databases',
		['lmfn', '=s', undef, label => 'LM file:', longhelp => 
'Optional DARPA-format bigram/trigram backoff LM file, loaded as the LM whose 
name is the empty string.'],
		['lmctlfn', '=s', undef, label => 'LM control file', longhelp => 
'Optional LM control file with a list of LM files and associated names (one 
line per entry). This is how multiple LMs can be loaded during 
initialization.'],
		['kbdumpdir', '=s', undef, label => 'dump file directory:', longhelp => 
'Building LM Dump Files
LM files are usually ASCII files. If they are large, it is time consuming to 
read them into the decoder. A binary "dump" file is much faster to read and 
more compact. 
LM dump files can be created by either a standalone program 
examples/lm3g2dmp.c or the decoder. The standalone version can be compiled 
from the examples directory. The program takes two arguments, the LM source 
file and a directory in which the dump file is to be created. It reads the 
header from the original LM file to determine the size of the LM. It then 
forms the binary dump file name by appending a .DMP extension to the LM file 
name. This file is written to the second (directory) argument. (NOTE: The dump
file must not already exist!!) 

Any version of the decoder can also automatically create binary "dump" files 
similar to the standalone version described above. It first looks for the dump
file in the directory given by the -kbdumpdir argument. If the dump file is 
present it reads it and ignores the rest of the original LM file. Otherwise, 
it reads the LM file and creates a dump file in the -kbdumpdir directory so 
that it can be used in subsequent decoder runs. 

The decoder does not create dump files for small LMs that have fewer than an 
internally defined number of bigrams and trigrams.'],
		['dictfn', '=s', undef, label => 'Main pronunciation dictionary file:'],
		['oovdictfn', '=s', undef, label => '(OOV) pronunciation dictionary:', longhelp => 
'Optional out-of-vocabulary (OOV) pronunciation dictionary. These are added to
the unnamed LM (read from -lmfn file) with unigram probability given by 
-oovugprob. '],
		['ndictfn', '=s', undef, label => '"noise" words pronunciation dictionary:', longhelp => 
'Optional "noise" words pronunciation dictionary. Noise words are not part of 
any LM and, like silence, can be inserted transparently anywhere in the 
utterance.'],
		['phnfn', '=s', undef, label => 'phone file:', longhelp => 
'Phone file with senone mapping information for the given dictionary and 
acoustic model.'],
		['mapfn', '=s', undef, label => 'map file:', longhelp => 
'Map file with senone mapping information for the given dictionary and 
acoustic model.'],
		['hmmdir', '=s', undef, label => 'HMM directory:', longhelp => 
'Directory with Sphinx-II semi-continuous HMM acoustic models.'],
		#['hmmdirlist', '=s', undef],
		['cbdir', '=s', undef, label => 'codebook directory:', longhelp => 
'Directory with Sphinx-II semi-continuous HMM codebooks.'],
		['sendumpfn', '=s', undef, label => 'senone model file:', longhelp => 

'Building 8-Bit Senone Dump Files

The Sphinx-II senonic acoustic model files contain 32-bit data. (These
are in the directory specified by the -hmmdir argument.) However, they
can be clustered down to 8-bits for memory efficiency, without loss of
recognition accuracy. The clustering is carried out by an offline
process as follows:

Create a temporary 32-bit senone dump file by running the decoder with
the -sendumpfn flag set to the temporary file name, the -8bsen flag
set to FALSE, and omitting the -lmfn argument. The decoder can be
killed after it creates the 32-bit senone dump file, which happens
during the initialization and is announced in the log output.

Run: /afs/cs/project/plus-2/s2/Sphinx2/bin/alpha/pdf32to8b 32bit-file
8bit-file to create the 8-bit senone dump file. That is, the first
argument to pdf32to8b is the temporary 32-bit dump file created above,
and the second argument is the 8-bit output file.

Delete the temporary 32-bit file. 

'],
		['8bsen', '=s', undef, choices => ['TRUE', 'FALSE'], label => '8-bit senone', longhelp => 'Should be TRUE if the 8-bit senones are used.'],

		'Decoder Configuration',
		['ctlfn', '=s', undef, label => 'control file:', longhelp => 
'Batch-mode control file listing the utterance files (without their file 
extension) to decode. Must not be specified for live-mode or application-driven 
operation.'],
		['ctloffset', '=i', undef, label => 'control offset:', longhelp => 'The (optional) number of initial utterances in the control file to be skipped.'],
		['ctlcount', '=i', undef, label => 'control count:', longhelp => 
'The number of utterances to be processed (after the skip, if any); defaults to ALL.'],
		['datadir', '=s', undef, label => 'data directory:', longhelp => 'If the control file entries are relative pathnames, an optional directory 
prefix for them may be specified using this argument.'],
		['allphone', '=s', 'FALSE', choices => ['TRUE', 'FALSE'], longhelp =>
'Should be TRUE to configure the recognition engine for allphone mode 
operation.'],
		['tactlfn', '=s', undef, label => 'input transcript file', longhelp => 
'Input transcript file, parallel to the control file (-ctlfn) in forced 
alignment mode.'],
		['adcin', '=s', 'FALSE', choices => ['TRUE', 'FALSE'], label => 'A/D cepstrum input', longhelp => 
'In batch mode, \'A/D cepstrum input\' selects A/D (TRUE) or cepstrum input 
data (FALSE).'],
		['adcext', '=s', 'raw', label => 'A/D cepstrum extension:', longhelp => 
'If -adcin is TRUE, this is the file extension appended to the names listed in 
the -ctlfn argument file.'],
		['adchdr', '=i', 0, label => 'A/D cepstrum header:', longhelp => 'The number of bytes of header in each input file.'],
		['adcendian', '=i', 1, choices => [0, 1], label => 'A/D cepstrum endian:', longhelp => 'Byte ordering: 0 for big-endian, 1 for little-endian.'],
		['normmean', '=s', 'TRUE', choices => ['TRUE', 'FALSE'], label => 'cepstrum mean normalization'],
		['nmprior', '=s', 'FALSE', choices => ['TRUE', 'FALSE'], label => 'mean normalization prior:', longhelp => 
'If mean normalization prior is FALSE, CMN is computed on the current utterance 
only (usually batch mode); otherwise it is based on past history (live mode).'],
		['compress', '=s', 'FALSE', choices => ['TRUE', 'FALSE'], label => 'silence deletion:', longhelp => 

'Continuous Listening and Silence Filtering

Sphinx2 can only decode utterances limited to less than about 30 sec
at a time. However, one often wants
to leave the audio recording running continuously and automatically
determine utterance boundaries based on pauses in the input
speech. The continuous listening module in Sphinx2 provides the
mechanisms for this purpose.

The silence filtering module is interposed between the raw audio input
source and the application. The application calls the function
cont_ad_read instead of directly reading the raw A/D input source
(e.g., via the ad_read function described above). cont_ad_read returns
only those segments of input audio that it determines to be
non-silence. Additional timestamp information is provided to inform
the application about silence regions that have been dropped.

The complete continuous listening API is defined in include/cont_ad.h
and is summarized below:

cont_ad_init: 

Associates a new continuous listening module instance with a specified
raw A/D handle and a corresponding read function pointer. E.g., these
may be the handle returned by ad_open and function ad_read described
above.

cont_ad_calib: 

Calibrates the background silence level by reading the raw audio for
a few seconds. It should be done once immediately after cont_ad_init,
and after any environmental change.

cont_ad_read: 

Reads and returns the next available block of non-silence data in a
given buffer. (Uses the read function and handle supplied to
cont_ad_init to obtain the raw A/D data.) More details are provided
below.

cont_ad_reset: 

Flushes any data buffered inside the module. Useful for discarding
accumulated, but unprocessed speech.

cont_ad_set_thresh: 

Useful for adjusting the silence and speech thresholds.  

cont_ad_detach: 

Detaches the specified continuous listening module from the associated
audio device.

cont_ad_attach: 

Attaches the specified continuous listening module to the specified
audio device. (Similar to cont_ad_init, but without the need to
calibrate the audio device.)

cont_ad_close: 

Closes the continuous listening module.  

Some more details on the cont_ad_read function: Operationally, every
call to cont_ad_read causes the module to read the associated raw A/D
source (as much data as possible and available), scan it for speech
(non-silence) segments and enqueue them internally. It returns the
first available segment of speech data, if any. In addition to
returning non-silence data, the function also updates a couple of
parameters that may be of interest to the application:

The signal level for the most recently read data. This is the siglvl
member variable of the cont_ad_t structure returned by cont_ad_init().

A timestamp value indicating the total number of raw audio samples
that have been consumed at the end of the most recent cont_ad_read()
call. This is in the read_ts member variable of the cont_ad_t
structure.

So, for example, if on two successive calls to cont_ad_read, the
timestamp is 100000 and 116000, respectively, the application can
determine that 1 sec (16000 samples) of silence has been gobbled up
between the two calls.

Silence regions aren\'t chopped off completely. About 50-100ms worth
of silence is preserved at either end of a speech segment and passed
on to the application.

Finally, the continuous listener won\'t concatenate speech segments
separated by silence. That is, the data returned by a single call to
cont_ad_read will not span raw audio separated by silence that has
been gobbled up.

cont_ad_read must be called frequently enough to avoid loss of input
data owing to buffer overflow. The application is responsible for
turning actual recording on and off, if applicable. In particular, it
must ensure that recording is on during calibration and normal
operation.

See examples/cont_adseg.c for an example that uses the continuous
listening module to segment live audio input into separate
utterances. Similarly, examples/cont_fileseg.c segments a given
pre-recorded file containing audio data into utterances.

The implementation of continuous listening is in
src/libfe/cont_ad.c. Applications that use this module are required to
link with libfe and libcommon (and libad if necessary).

'],
		['compressprior', '=s', 'FALSE', choices => ['TRUE', 'FALSE'], longhelp => 
'If compressprior is FALSE, silence deletion is based on current-utterance 
statistics (batch mode); otherwise it is based on past history (live mode). 
-compress should be FALSE if continuous listening is used.'],
		['agcmax', '=s', 'FALSE', choices => ['TRUE', 'FALSE'], longhelp => 'Automatic gain control (AGC) option. In batch mode only -agcmax should be
TRUE, and in live mode only -agcemax.'],
                ['agcemax', '=s', 'FALSE', choices => ['TRUE', 'FALSE'], longhelp => 'Automatic gain control (AGC) option. In batch mode only -agcmax should be
TRUE, and in live mode only -agcemax.'],
		['live', '=s', 'FALSE', choices => ['TRUE', 'FALSE'], longhelp => 
'Forces some live-mode flags to TRUE: -nmprior, -compressprior, and -agcemax if 
any AGC is on.'],
		['samp', '=i', 16000, choices => [8000, 16000], label => 'Sampling rate:'],
		['fwdflat', '=s', 'TRUE', choices => ['TRUE', 'FALSE'], longhelp => 'Run flat-lexical Viterbi search after tree-structured pass (for better 
accuracy). Usually FALSE in live mode.'],
		['bestpath', '=s', 'FALSE', choices => ['TRUE', 'FALSE'], longhelp => 'Run global best path search over Viterbi search word lattice output 
(for better accuracy).'],
		['compallsen', '=s', 'FALSE', choices => ['TRUE', 'FALSE'], longhelp => 'Compute all senones, whether active or inactive, in each frame.'],
		['latsize', '=i', 50000, longhelp => 'Word lattice entries to be allocated. Longer sentences need larger lattices.'],
		'Beam Widths',
		['top', '=i', 4, longhelp => 'Number of codewords computed per frame. Usually, narrowed to 1 in live mode.'],
		['beam', '=f', 1e-6, longhelp => 'Main pruning thresholds for tree search. Usually narrowed down to 2e-6 in live 
mode.'],
		['npbeam', '=f', 1e-6, longhelp => 'Main pruning thresholds for tree search. Usually narrowed down to 2e-6 in live 
mode.'],
		['lpbeam', '=f', 1e-5, longhelp => 'Additional pruning threshold for transitions to leaf nodes of lexical tree. 
Usually narrowed down to 2e-5 in live mode.'],
		['lponlybeam', '=f', 3e-4, longhelp => 'Yet more pruning thresholds for leaf nodes and exits from lexical tree. Usually 
narrowed down to 5e-4 in live mode.'],
		['nwbeam', '=f', 3e-4, longhelp => 'Yet more pruning thresholds for leaf nodes and exits from lexical tree. Usually 
narrowed down to 5e-4 in live mode.'],
		['fwdflatbeam', '=f', 1e-8, longhelp => 'Main and word-exit pruning thresholds for the optional, flat lexical Viterbi 
search.'],
		['fwdflatnwbeam', '=f', 3e-4, longhelp => 'Main and word-exit pruning thresholds for the optional, flat lexical Viterbi 
search.'],
		['topsenfrm', '=i', 1, longhelp => 'No. of lookahead frames for predicting active base phones. (If <=1, all base 
phones assumed to be active every frame.)'],
		['topsenthresh', '=i', -60000, longhelp => 'topsenthresh is log(pruning threshold) applied to raw senone scores to 
determine active phones in each frame.'],
		'Language Weights/Penalties',
		['langwt', '=f', 6.5, label => 'lexical tree Viterbi search language weight:'],
		['fwdflatlw', '=f', 8.5, label => 'flat structured Viterbi search language weight:'],
		['rescorelw', '=f', 9.5, label => 'global word lattice search language weight'],
		['ugwt', '=f', 1.0, label => 'unigram weight:', longhelp => 'Unigram weight for interpolating unigram probabilities with uniform 
distribution. Typically in the range 0.5-0.8.'],
		['inspen', '=f', 0.65, label => 'insertion penalty:', longhelp => 'Word insertion penalty or probability (for words in the LM).'],
		['silpen', '=f', 0.005, label => 'silence penalty:', longhelp => 'insertion penalty for the silence word'],
		['fillpen', '=f', 1e-8, label => 'fill penalty:', longhelp => 'insertion penalty for noise words (from -ndictfn file) if any.'],
		['oovugprob', '=f', -4.5, longhelp => 'Unigram probability (logprob) for OOV words from -oovdictfn file, if any.'],
		'Output Specifications',
		['matchfn', '=s', undef, label => 'match filename:', longhelp => 'Filename to which the final recognition string for each utterance is written.'],
		['matchsegfn', '=s', undef, label => 'match segmentation filename:', longhelp => 'Like above, but contains word segmentation info: startframe #frames word...'],
		['reportpron', '=s', 'FALSE', choices => ['TRUE', 'FALSE'], longhelp => 'Causes word pronunciation to be included in output files.'],
		['rawlogdir', '=s', undef, label => 'Raw log directory', longhelp => 'If specified, logs raw A/D input samples for each utterance to the indicated directory. (One file per utterance, named <uttid>.raw.)'],
		['mfclogdir', '=s', undef, longhelp => 'If specified, logs cepstrum data for each utterance to the indicated directory. (One file per utterance, named <uttid>.mfc.)'],
		['dumplatdir', '=s', undef, label => 'word lattice directory', longhelp => 'If specified, dumps word lattice for each utterance to a file in this 
directory.'],
		['logfn', '=s', undef, label => 'log filename:', longhelp => 'Filename to which decoder logging information is written.'],
		['backtrace', '=s', 'TRUE', choices => ['TRUE', 'FALSE'], longhelp => 'Includes detailed word backtrace information in log file.'],
		['nbest', '=i', 0, longhelp => 
'No. of N-best hypotheses to be produced. Currently, this flag is only
useful in batch mode. But an application can always directly invoke
search_get_alt to obtain them. Also, the current implementation is
lacking in some details (e.g., in returning detailed scores).
'],
		['nbestdir', '=s', '.', label => 'N-best directory:', longhelp => 'Directory to which N-best files are written (one per utterance).'],
		['taword', '=s', 'TRUE', choices => ['TRUE', 'FALSE'], longhelp => 'Whether word output should be produced when running in forced alignment mode.'],
		['taphone', '=s', 'TRUE', choices => ['TRUE', 'FALSE'], longhelp => 'Whether phone output should be produced when running in forced alignment mode.'],
		['tastate', '=s', 'FALSE', choices => ['TRUE', 'FALSE'], longhelp => 'Whether state alignment output should be produced when running in forced alignment 
mode.']);
my %options = ();
my $opt = Tk::Getopt->new(-opttable => \@opttable,
			-options => \%options,
			-filename => "$ENV{HOME}/.sphinx.options");
$opt->set_defaults;
$opt->load_options;
$opt->get_options;
$opt->process_options;

my $top = MainWindow->new;
$top->Label(-text => 'SphinxGUI')->pack;
my $sphinxexe = $top->Entry;
$sphinxexe->insert(0, 'sphinx2-batch');
$sphinxexe->pack;
my $buttons = $top->Frame->pack;
$buttons->Button(-text => 'Run', -command => sub{&run})
    ->pack(-side => 'right');
$buttons->Button(-text => 'Quit', -command => [$top => 'destroy'])
    ->pack(-side => 'right');
$buttons->Button(-text => 'Configure', -command => sub{&get_options})
    ->pack(-side => 'right');
MainLoop;

sub get_options {
    $opt->option_editor($top);
    $opt->get_options;
}
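
# The -kbdumpdir help above describes how LM dump files can be built offline
# with the standalone examples/lm3g2dmp program.  The sub below is only an
# illustrative sketch of that step: it is never called by this GUI, and it
# assumes a compiled lm3g2dmp binary is on the PATH.
sub make_lm_dump {
    my ($lmfile, $dumpdir) = @_;
    # lm3g2dmp takes the LM source file and a directory; it appends .DMP to
    # the LM file name and writes the result into the directory.  The dump
    # file must not already exist.
    system('lm3g2dmp', $lmfile, $dumpdir) == 0
	or warn "lm3g2dmp failed: $?\n";
}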

sub run {
    # Build the decoder command line from every option that has a value,
    # echo it to stderr, then run the executable named in the entry box.
    my @sphinxargs;
    while (my ($key, $val) = each %options) {
	push @sphinxargs, "-$key", $val if defined $val && $val ne '';
    }
    warn join(' ', @sphinxargs);
    system $sphinxexe->get, @sphinxargs;
}
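
# The -sendumpfn help above describes clustering the 32-bit senone files down
# to 8 bits offline.  The sub below sketches those steps; it is never called
# by this GUI, the decoder and pdf32to8b binaries are assumed to be on the
# PATH, and the remaining decoder flags are left to the caller.
sub make_8bit_sendump {
    my ($tmp32, $out8, @decoder_flags) = @_;
    # 1. Have the decoder write a temporary 32-bit senone dump file
    #    (-8bsen FALSE, no -lmfn); it can be killed once the file is written.
    system('sphinx2-batch', @decoder_flags, '-sendumpfn', $tmp32,
	   '-8bsen', 'FALSE');
    # 2. Cluster the 32-bit file down to 8 bits: pdf32to8b 32bit-file 8bit-file
    system('pdf32to8b', $tmp32, $out8) == 0
	or warn "pdf32to8b failed: $?\n";
    # 3. Delete the temporary 32-bit file.
    unlink $tmp32;
}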
