00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 00015 00016 00017 // ====================================================================== 00018 // Terms.hpp 00019 // Adam Berger 00020 // Stores a hash table of recognized term spellings 00021 // ====================================================================== 00022 00023 #ifndef _TERMSH_ 00024 #define _TERMSH_ 00025 00026 #include "common_headers.hpp" 00027 #include "String.hpp" 00028 #include "ISet.hpp" 00029 #include "Array.hpp" 00030 00031 static const char *OOV_SPELLING = "[OOV]"; 00032 static const char *NULL_SPELLING = "[NULL]"; 00033 static const int MAX_SETS_OPEN = 128; 00034 00035 00036 class Terms { 00037 public: 00038 static Terms *construct(const char *path_); 00039 const char *operator[](int idx) const { return terms[idx]; } 00040 int operator[](const char *t) const 00041 { int k=terms[t]; if (k==-1) return getIndexOfOOV(); else return k;} 00042 int size() const { return terms.size(); } 00043 int getIndexOfOOV() const { return terms[OOV_SPELLING]; } 00044 int getIndexOfNullTerm() const { return terms[NULL_SPELLING]; } 00045 const char *getPath() const { return path; } 00046 static const char * getOOVSpelling() { return OOV_SPELLING; } 00047 int tokenize(const char *buff, Array<int> &tokenized) const; 00048 00049 // for back-compatibility with 089 code 00050 int tokenize(const char *buff, Array<unsigned short> &tokenized) const; 00051 00052 public: 00053 void open(const char *path_); 00054 00055 private: 00056 ISet<String> terms; 00057 String path; 00058 00059 private: 00060 static Terms* setOfSets; 00061 static int nSetsOpen; 00062 static Terms *SetOfSets() { 00063 if (setOfSets==NULL) setOfSets = new Terms [MAX_SETS_OPEN]; 00064 return setOfSets; 00065 } 00066 static int &NSetsOpen() { return nSetsOpen; } 00067 }; 00068 00069 class Source : public Terms { 00070 }; 00071 00072 class Target : public Terms { 00073 }; 00074 00075 00076 #endif