Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

UnigramLM.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.cs.cmu.edu/~lemur/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 #ifndef _UNIGRAMLM_HPP
00014 #define _UNIGRAMLM_HPP
00015 
00016 #include "Counter.hpp"
00017 #include "Exception.hpp"
00018 #include <cstring>
00020 
/// Abstract interface for a unigram language model.
///
/// A model answers point queries through prob() and can also be walked
/// entry by entry with startIteration() / hasMore() / nextWordProb().
class UnigramLM {
public:
  /// Virtual so that a concrete model can be safely destroyed through a
  /// UnigramLM pointer; without it such a delete is undefined behaviour.
  virtual ~UnigramLM() {}

  /// Returns the probability the model assigns to the word \a wordIndex.
  virtual double prob(int wordIndex) = 0;

  /// Returns the lexicon identifier string associated with this model.
  /// NOTE(review): presumably names the vocabulary that word indices refer
  /// to -- confirm against callers.
  virtual const char * lexiconID() = 0;

  /// Begins (or restarts) an iteration over the model's entries.
  virtual void startIteration() = 0;

  /// Returns true while the current iteration has entries left.
  virtual bool hasMore() = 0;

  /// Fetches the next entry of the iteration.
  /// \param wordIndex receives the index of the word.
  /// \param prob      receives that word's probability.
  virtual void nextWordProb(int &wordIndex, double &prob) = 0;
};
00037 
00038 
00040 
00041 class SmoothedMLEstimator : public UnigramLM {
00042 public:
00043   SmoothedMLEstimator(Counter &counter, const char *lexiconID) : ct(counter), lexID(lexiconID) {}
00044   virtual ~SmoothedMLEstimator() {}
00045 
00046   virtual double prob(int wordIndex) {
00047     return (probEstimate(wordIndex, ct.count(wordIndex),ct.sum()));
00048   }
00049 
00050   virtual void startIteration() {
00051     ct.startIteration();
00052   }
00053 
00054   virtual bool hasMore() {
00055     return ct.hasMore();
00056   }
00057 
00058   virtual void nextWordProb(int &wordIndex, double &prob) {
00059     double count;
00060     ct.nextCount(wordIndex, count);
00061     prob = probEstimate(wordIndex, count, ct.sum());
00062   }
00063   
00064   virtual const char * lexiconID() { return lexID;}
00065 
00067   virtual double probEstimate(int wordIndex, double wdCount, double sumCount) =0;
00068 
00069 protected:
00070   Counter &ct;
00071   const char *lexID;
00072 };
00073   
00075 
00076 class MLUnigramLM : public SmoothedMLEstimator { 
00077 public:
00078   MLUnigramLM(Counter & counter, const char *lexiconID) : SmoothedMLEstimator(counter, lexiconID) {};
00079   virtual ~MLUnigramLM() {}
00080   
00081   virtual double probEstimate(int wordIndex, double count, double sum) {
00082     return (count/sum);
00083   }
00084 };
00085 
00087 class LaplaceUnigramLM : public SmoothedMLEstimator { 
00088 public:
00089   LaplaceUnigramLM(Counter & counter, const char *lexiconID, double vocabSize) : SmoothedMLEstimator(counter, lexiconID), vocSz(vocabSize) {};
00090   virtual ~LaplaceUnigramLM() {}
00091   
00092   virtual double probEstimate(int wordIndex, double count, double sum) {
00093     return ((count+1)/(sum+vocSz));
00094   }
00095 private:
00096   double vocSz;
00097 };
00098 
00099 
00101 
00102 class DirichletUnigramLM : public SmoothedMLEstimator { 
00103 public:
00104   DirichletUnigramLM(Counter & counter, const char *lexiconID, 
00105                      UnigramLM &refLM, double priorSampleSize) 
00106     : SmoothedMLEstimator(counter, lexiconID), ref(&refLM), 
00107     s(priorSampleSize) {}
00108 
00109   virtual ~DirichletUnigramLM() {}
00110   
00111   virtual double probEstimate(int wordIndex, double count, double sum) {
00112     return ((count+s*ref->prob(wordIndex))/(sum+s));
00113   }
00114 
00115 private:
00116   UnigramLM *ref;
00118   double s;  
00119 };
00120 
00121 
00122 
00123 
00124 
00125 
00127 
00128 class InterpUnigramLM : public SmoothedMLEstimator { 
00129 public:
00130   InterpUnigramLM(Counter & counter, const char *lexiconID, 
00131                      UnigramLM &refLM, double refCoeff) 
00132     : SmoothedMLEstimator(counter, lexiconID), ref(&refLM), 
00133     refC(refCoeff) {}
00134 
00135   virtual ~InterpUnigramLM() {}
00136   
00137   virtual double probEstimate(int wordIndex, double count, double sum) {
00138     return ((1-refC)*count/sum + refC*ref->prob(wordIndex));
00139   }
00140 
00141 private:
00142   UnigramLM *ref;
00144   double refC;  
00145 };
00146 
00147 
00148 
00149 
00150 
00151 #endif /* _UNIGRAMLM_HPP */

Generated at Fri Jul 26 18:22:28 2002 for LEMUR by doxygen 1.2.4, written by Dimitri van Heesch, © 1997-2000