00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _UNIGRAMLM_HPP
00014 #define _UNIGRAMLM_HPP
00015
00016 #include "Counter.hpp"
00017 #include "Exception.hpp"
00018 #include <cstring>
00020
/// Abstract interface for a unigram language model: per-word probability
/// lookup, the ID of the lexicon the word indices refer to, and an
/// iteration protocol over (word index, probability) pairs.
class UnigramLM {
public:
  /// Virtual destructor so that deleting a concrete model through a
  /// UnigramLM pointer runs the derived destructor (without it, such a
  /// delete is undefined behavior).
  virtual ~UnigramLM() {}

  /// Returns the probability the model assigns to word @p wordIndex.
  virtual double prob(int wordIndex) = 0;

  /// Returns the lexicon identifier string associated with this model.
  virtual const char * lexiconID() = 0;

  /// Starts (or restarts) an iteration over the model's word probabilities.
  virtual void startIteration() = 0;

  /// Returns true while the current iteration has entries left to fetch.
  virtual bool hasMore() = 0;

  /// Fetches the next entry of the current iteration.
  /// @param wordIndex  output: index of the word for this entry
  /// @param prob       output: probability of that word
  virtual void nextWordProb(int &wordIndex, double &prob) = 0;
};
00037
00038
00040
00041 class SmoothedMLEstimator : public UnigramLM {
00042 public:
00043 SmoothedMLEstimator(Counter &counter, const char *lexiconID) : ct(counter), lexID(lexiconID) {}
00044 virtual ~SmoothedMLEstimator() {}
00045
00046 virtual double prob(int wordIndex) {
00047 return (probEstimate(wordIndex, ct.count(wordIndex),ct.sum()));
00048 }
00049
00050 virtual void startIteration() {
00051 ct.startIteration();
00052 }
00053
00054 virtual bool hasMore() {
00055 return ct.hasMore();
00056 }
00057
00058 virtual void nextWordProb(int &wordIndex, double &prob) {
00059 double count;
00060 ct.nextCount(wordIndex, count);
00061 prob = probEstimate(wordIndex, count, ct.sum());
00062 }
00063
00064 virtual const char * lexiconID() { return lexID;}
00065
00067 virtual double probEstimate(int wordIndex, double wdCount, double sumCount) =0;
00068
00069 protected:
00070 Counter &ct;
00071 const char *lexID;
00072 };
00073
00075
00076 class MLUnigramLM : public SmoothedMLEstimator {
00077 public:
00078 MLUnigramLM(Counter & counter, const char *lexiconID) : SmoothedMLEstimator(counter, lexiconID) {};
00079 virtual ~MLUnigramLM() {}
00080
00081 virtual double probEstimate(int wordIndex, double count, double sum) {
00082 return (count/sum);
00083 }
00084 };
00085
00087 class LaplaceUnigramLM : public SmoothedMLEstimator {
00088 public:
00089 LaplaceUnigramLM(Counter & counter, const char *lexiconID, double vocabSize) : SmoothedMLEstimator(counter, lexiconID), vocSz(vocabSize) {};
00090 virtual ~LaplaceUnigramLM() {}
00091
00092 virtual double probEstimate(int wordIndex, double count, double sum) {
00093 return ((count+1)/(sum+vocSz));
00094 }
00095 private:
00096 double vocSz;
00097 };
00098
00099
00101
00102 class DirichletUnigramLM : public SmoothedMLEstimator {
00103 public:
00104 DirichletUnigramLM(Counter & counter, const char *lexiconID,
00105 UnigramLM &refLM, double priorSampleSize)
00106 : SmoothedMLEstimator(counter, lexiconID), ref(&refLM),
00107 s(priorSampleSize) {}
00108
00109 virtual ~DirichletUnigramLM() {}
00110
00111 virtual double probEstimate(int wordIndex, double count, double sum) {
00112 return ((count+s*ref->prob(wordIndex))/(sum+s));
00113 }
00114
00115 private:
00116 UnigramLM *ref;
00118 double s;
00119 };
00120
00121
00122
00123
00124
00125
00127
00128 class InterpUnigramLM : public SmoothedMLEstimator {
00129 public:
00130 InterpUnigramLM(Counter & counter, const char *lexiconID,
00131 UnigramLM &refLM, double refCoeff)
00132 : SmoothedMLEstimator(counter, lexiconID), ref(&refLM),
00133 refC(refCoeff) {}
00134
00135 virtual ~InterpUnigramLM() {}
00136
00137 virtual double probEstimate(int wordIndex, double count, double sum) {
00138 return ((1-refC)*count/sum + refC*ref->prob(wordIndex));
00139 }
00140
00141 private:
00142 UnigramLM *ref;
00144 double refC;
00145 };
00146
00147
00148
00149
00150
00151 #endif