ngram.h Source File

00001 /*=====================================================================
00002                 =======   COPYRIGHT NOTICE   =======
00003 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00004 Ronald Rosenfeld and Philip Clarkson.
00005 
00006 All rights reserved.
00007 
00008 This software is made available for research purposes only.  It may be
00009 redistributed freely for this purpose, in full or in part, provided
00010 that this entire copyright notice is included on any copies of this
00011 software and applications and derivations thereof.
00012 
00013 This software is provided on an "as is" basis, without warranty of any
00014 kind, either expressed or implied, as to any matter including, but not
00015 limited to warranty of fitness of purpose, or merchantability, or
00016 results obtained from use of this software.
00017 ======================================================================*/
00018 
00019 
00020 /* Type and constant definitions for general n-gram models */
00021 
00022 #ifndef _NGRAM_H_
00023 #define _NGRAM_H_
00024 
00025 #include "rr_libs/sih.h"
00026 #include "toolkit.h"
00027 /* Defaults. Several names mirror ng_t fields (oov_fraction, min_alpha, max_alpha, out_of_range_alphas, disc_range); presumably they seed those fields -- TODO confirm against the code that fills in ng_t. */
00028 #define DEFAULT_COUNT_TABLE_SIZE 65535   /* 2^16 - 1 */
00029 #define DEFAULT_OOV_FRACTION 0.5
00030 #define DEFAULT_DISC_RANGE_1 1           /* presumably the range used for 1-grams -- confirm */
00031 #define DEFAULT_DISC_RANGE_REST 7        /* presumably the range used for all higher orders -- confirm */
00032 #define DEFAULT_MIN_ALPHA -3.2           /* NOTE(review): negative value is not parenthesized */
00033 #define DEFAULT_MAX_ALPHA 2.5
00034 #define DEFAULT_OUT_OF_RANGE_ALPHAS 10000
00035 /* Discounting-method codes; presumably the values of ng_t.discounting_method -- TODO confirm. */
00036 #define GOOD_TURING 1
00037 #define ABSOLUTE 2
00038 #define LINEAR 3
00039 #define WITTEN_BELL 4
00040 /* SPECIFIED / BUFFER / TWO_PASSES: mode codes whose consumer is not visible in this header. */
00041 #define SPECIFIED 1
00042 #define BUFFER 2
00043 #define TWO_PASSES 3
00044 
00045 #define KEY 65000   /* purpose is not visible in this header */
00046 /* Vocabulary-type codes; presumably the values of ng_t.vocab_type -- TODO confirm. */
00047 #define CLOSED_VOCAB 0
00048 #define OPEN_VOCAB_1 1
00049 #define OPEN_VOCAB_2 2
00050 
00051 typedef unsigned short id__t; /* Double underscore, since id_t is
00052                                  already defined on some platforms */
00053 typedef int count_t;   /* The count as read in, rather than its index 
00054                           in the count table. */
00055 typedef unsigned short count_ind_t; /* The count's index in the count 
00056                                        table. */
00057 typedef unsigned short bo_weight_t;  /* Element type of ng_t.bo_weight. */
00058 typedef unsigned short cutoff_t;     /* Element type of ng_t.cutoffs. */
00059 typedef int table_size_t;            /* Element type of ng_t.table_sizes. */
00060 typedef unsigned short index__t;     /* Element type of ng_t.ind. */
00061 typedef double disc_val_t;           /* Element type of ng_t.gt_disc_ratio and ng_t.lin_disc_ratio. */
00062 typedef double uni_probs_t;          /* Element type of ng_t.uni_probs and ng_t.uni_log_probs. */
00063 typedef int ptr_tab_t;               /* Element type of ng_t.ptr_table. */
00064 typedef float four_byte_t;           /* Element type of ng_t.bo_weight4; the name assumes float is 4 bytes -- confirm per platform. */
00064 typedef float four_byte_t;
00065 
00071 typedef struct {   /* ngram: an array of word ids together with a count. */
00072   unsigned short n;            /* presumably the number of ids in id_array -- TODO confirm */
00073   id__t          *id_array;    /* word ids (id__t) */
00074   count_t        count;        /* count_t: the count as read in, not a count-table index */
00075 } ngram;
00076 
00080 typedef struct {   /* count_table_t: an int array plus a size field. */
00081   unsigned short count_table_size;   /* presumably the number of entries in counts_array -- TODO confirm */
00082   int            *counts_array;
00083 } count_table_t;
00084 
00085 
00090 typedef struct {   /* ng_t: one n-gram language model (vocabulary, tree tables,
                            discounting data, unigram statistics) plus its input and
                            output file handles.
                            NOTE(review): the listing numbers skip (e.g. 00102 -> 00104),
                            which suggests per-field doc comments were stripped when this
                            listing was generated; consult the original header. */
00091 
00092   /* Language model type */
00093 
00094   unsigned short n;                /* presumably the model order (the "n" of n-gram) -- confirm */
00095   int            version;
00096 
00097   /* Vocabulary stuff */
00098 
00099   sih_t          *vocab_ht;      /* sih_t is not declared here; presumably from rr_libs/sih.h */
00100   unsigned short vocab_size;     
00101   char           **vocab;        
00102   unsigned short no_of_ccs;      /* presumably the number of context cues (see context_cue) -- confirm */
00104   /* Tree */
00105 
00106   table_size_t   *table_sizes;   
00107   id__t          **word_id;      
00108   count_ind_t    **count;        /* count_ind_t entries: indices into the count table, not raw counts */
00110   count_ind_t    *marg_counts;   
00114   int            **count4;       /* int-valued counterpart of count; presumably used when four_byte_counts is set -- confirm */
00116   int            *marg_counts4;  /* int-valued counterpart of marg_counts */
00117   bo_weight_t    **bo_weight;    /* unsigned short entries; presumably encoded via the two-byte alpha fields below -- confirm */
00118   four_byte_t    **bo_weight4;   /* float entries; presumably used when four_byte_alphas is set -- confirm */
00121   index__t       **ind;          
00124   /* Two-byte alpha stuff */
00125 
00126   double         min_alpha;      
00127   double         max_alpha;      
00128   unsigned short out_of_range_alphas;  
00130   double         *alpha_array;
00131   unsigned short size_of_alpha_array;
00132 
00133   /* Count table */
00134 
00135   count_ind_t    count_table_size; 
00136   count_t        **count_table;    /* raw count_t values; presumably looked up through the count_ind_t indices in count -- confirm */
00138   /* Index lookup tables */
00139 
00140   ptr_tab_t      **ptr_table;     
00142   unsigned short *ptr_table_size; 
00144   /* Discounting and cutoffs - note: some of these may not be used,
00145      depending on the discounting technique used. */
00146 
00147   unsigned short discounting_method;     /* presumably one of GOOD_TURING, ABSOLUTE, LINEAR, WITTEN_BELL -- confirm */
00149   cutoff_t       *cutoffs;               
00150   int            **freq_of_freq;         
00152   unsigned short *fof_size;              
00153   unsigned short *disc_range;            
00157   disc_val_t     **gt_disc_ratio;        /* "gt" presumably stands for Good-Turing -- confirm */
00159   disc_val_t     *lin_disc_ratio;        
00160   double         *abs_disc_const;        
00163   /* Unigram statistics */
00164 
00165   uni_probs_t    *uni_probs;             
00166   uni_probs_t    *uni_log_probs;         
00167   flag           *context_cue;           /* flag is not declared here; presumably from toolkit.h */
00169   int            n_unigrams;             
00171   int            min_unicount;           
00173   /* Input files */
00174 
00175   char           *id_gram_filename;  
00176   FILE           *id_gram_fp;        
00177   char           *vocab_filename;    
00178   char           *context_cues_filename; 
00180   FILE           *context_cues_fp;       
00183   /* Output files */
00184 
00185   flag           write_arpa;      /* presumably enables output via arpa_filename / arpa_fp -- confirm */
00187   char           *arpa_filename;  
00188   FILE           *arpa_fp;        
00189   flag           write_bin;       /* presumably enables output via bin_filename / bin_fp -- confirm */
00191   char           *bin_filename;   
00192   FILE           *bin_fp;         
00194   /* Misc */
00195 
00196   int            *num_kgrams;     /* num_kgrams[i] bounds the valid indices of table i (see the indexing note below) */
00200   unsigned short vocab_type;      /* presumably one of CLOSED_VOCAB, OPEN_VOCAB_1, OPEN_VOCAB_2 -- confirm */
00202   unsigned short first_id;        
00205   /* Once the tree has been constructed, the tables are indexed from 0
00206      to (num_kgrams[i]-1). */
00207 
00208   /* 1-gram tables are indexed from 0 to ng.vocab_size. */
00209 
00210   double         zeroton_fraction; 
00212   double         oov_fraction;
00213   flag           four_byte_alphas;
00214   flag           four_byte_counts;
00215 
00216 } ng_t;
00217 
00218 #endif
00219 
00220