00001 /*===================================================================== 00002 ======= COPYRIGHT NOTICE ======= 00003 Copyright (C) 1996, Carnegie Mellon University, Cambridge University, 00004 Ronald Rosenfeld and Philip Clarkson. 00005 00006 All rights reserved. 00007 00008 This software is made available for research purposes only. It may be 00009 redistributed freely for this purpose, in full or in part, provided 00010 that this entire copyright notice is included on any copies of this 00011 software and applications and derivations thereof. 00012 00013 This software is provided on an "as is" basis, without warranty of any 00014 kind, either expressed or implied, as to any matter including, but not 00015 limited to warranty of fitness of purpose, or merchantability, or 00016 results obtained from use of this software. 00017 ======================================================================*/ 00018 00019 00020 /* Type and function definitions for general n_gram models */ 00021 00022 #ifndef _NGRAM_H_ 00023 #define _NGRAM_H_ 00024 00025 #include "rr_libs/sih.h" 00026 #include "toolkit.h" 00027 00028 #define DEFAULT_COUNT_TABLE_SIZE 65535 00029 #define DEFAULT_OOV_FRACTION 0.5 00030 #define DEFAULT_DISC_RANGE_1 1 00031 #define DEFAULT_DISC_RANGE_REST 7 00032 #define DEFAULT_MIN_ALPHA -3.2 00033 #define DEFAULT_MAX_ALPHA 2.5 00034 #define DEFAULT_OUT_OF_RANGE_ALPHAS 10000 00035 00036 #define GOOD_TURING 1 00037 #define ABSOLUTE 2 00038 #define LINEAR 3 00039 #define WITTEN_BELL 4 00040 00041 #define SPECIFIED 1 00042 #define BUFFER 2 00043 #define TWO_PASSES 3 00044 00045 #define KEY 65000 00046 00047 #define CLOSED_VOCAB 0 00048 #define OPEN_VOCAB_1 1 00049 #define OPEN_VOCAB_2 2 00050 00051 typedef unsigned short id__t; /* Double underscore, since id_t is 00052 already defined on some platforms */ 00053 typedef int count_t; /* The count as read in, rather than its index 00054 in the count table. */ 00055 typedef unsigned short count_ind_t; /* The count's index in the count 00056 table. */ 00057 typedef unsigned short bo_weight_t; 00058 typedef unsigned short cutoff_t; 00059 typedef int table_size_t; 00060 typedef unsigned short index__t; 00061 typedef double disc_val_t; 00062 typedef double uni_probs_t; 00063 typedef int ptr_tab_t; 00064 typedef float four_byte_t; 00065 00071 typedef struct { 00072 unsigned short n; 00073 id__t *id_array; 00074 count_t count; 00075 } ngram; 00076 00080 typedef struct { 00081 unsigned short count_table_size; 00082 int *counts_array; 00083 } count_table_t; 00084 00085 00090 typedef struct { 00091 00092 /* Language model type */ 00093 00094 unsigned short n; 00095 int version; 00096 00097 /* Vocabulary stuff */ 00098 00099 sih_t *vocab_ht; 00100 unsigned short vocab_size; 00101 char **vocab; 00102 unsigned short no_of_ccs; 00104 /* Tree */ 00105 00106 table_size_t *table_sizes; 00107 id__t **word_id; 00108 count_ind_t **count; 00110 count_ind_t *marg_counts; 00114 int **count4; 00116 int *marg_counts4; 00117 bo_weight_t **bo_weight; 00118 four_byte_t **bo_weight4; 00121 index__t **ind; 00124 /* Two-byte alpha stuff */ 00125 00126 double min_alpha; 00127 double max_alpha; 00128 unsigned short out_of_range_alphas; 00130 double *alpha_array; 00131 unsigned short size_of_alpha_array; 00132 00133 /* Count table */ 00134 00135 count_ind_t count_table_size; 00136 count_t **count_table; 00138 /* Index lookup tables */ 00139 00140 ptr_tab_t **ptr_table; 00142 unsigned short *ptr_table_size; 00144 /* Discounting and cutoffs - note: some of these may not used, 00145 depending on the discounting techinque used. */ 00146 00147 unsigned short discounting_method; 00149 cutoff_t *cutoffs; 00150 int **freq_of_freq; 00152 unsigned short *fof_size; 00153 unsigned short *disc_range; 00157 disc_val_t **gt_disc_ratio; 00159 disc_val_t *lin_disc_ratio; 00160 double *abs_disc_const; 00163 /* Unigram statistics */ 00164 00165 uni_probs_t *uni_probs; 00166 uni_probs_t *uni_log_probs; 00167 flag *context_cue; 00169 int n_unigrams; 00171 int min_unicount; 00173 /* Input files */ 00174 00175 char *id_gram_filename; 00176 FILE *id_gram_fp; 00177 char *vocab_filename; 00178 char *context_cues_filename; 00180 FILE *context_cues_fp; 00183 /* Output files */ 00184 00185 flag write_arpa; 00187 char *arpa_filename; 00188 FILE *arpa_fp; 00189 flag write_bin; 00191 char *bin_filename; 00192 FILE *bin_fp; 00194 /* Misc */ 00195 00196 int *num_kgrams; 00200 unsigned short vocab_type; 00202 unsigned short first_id; 00205 /* Once the tree has been constructed, the tables are indexed from 0 00206 to (num_kgrams[i]-1). */ 00207 00208 /* 1-gram tables are indexed from 0 to ng.vocab_size. */ 00209 00210 double zeroton_fraction; 00212 double oov_fraction; 00213 flag four_byte_alphas; 00214 flag four_byte_counts; 00215 00216 } ng_t; 00217 00218 #endif 00219 00220