00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 #ifndef _QRYBASEDSAMPLER_HPP 00013 #define _QRYBASEDSAMPLER_HPP 00014 00015 00016 00017 #include "FreqCounter.hpp" 00018 #include "DBManager.hpp" 00019 00021 typedef stringset docidset; 00022 00024 #define T_NDOCS 1 00025 00026 #define T_NWORDS 2 00027 00028 #define T_NQRYS 4 00029 00033 class QryBasedSampler { 00034 public: 00035 QryBasedSampler(); 00036 ~QryBasedSampler(); 00037 00039 bool probe(char * initQuery); 00040 00042 void setDBManager(DBManager * database); 00043 00045 DBManager * getDBManager(); 00046 00047 00050 void setFreqCounter(FreqCounter * counter); 00051 00053 FreqCounter * getFreqCounter(); 00054 00055 00059 void setOutputPrefix(char * prefix); 00060 00062 char * getOutputPrefix(); 00063 00065 void setNumDocs(int n); 00066 00068 int getNumDocs(); 00069 00070 00072 void setNumWords(int n); 00073 00075 int getNumWords(); 00076 00077 00079 void setNumQueries(int n); 00080 00082 int getNumQueries(); 00083 00084 00091 void setTermMode(int m); 00092 00094 int getTermMode(); 00095 00096 00098 void setDocsPerQuery(int n); 00099 00101 int getDocsPerQuery(); 00102 00103 00104 private: 00105 00106 /* for querying a db */ 00107 DBManager * db; 00108 00109 00110 /* for building a description of a db */ 00111 FreqCounter * freqCounter; 00112 00113 00114 /* output prefix for filenames */ 00115 char * outputPrefix; 00116 00117 00118 /* termination mode of the probe - 00119 * either T_NDOCS or T_NWORDS */ 00120 int termMode; 00121 00122 /* number unique docs to retrieve - only used if 00123 * termMode == T_NDOCS */ 00124 int numDocs; 00125 00126 /* number unique words to retrieve - only used if 00127 * termMode == T_NWORDS */ 00128 int numWords; 00129 00130 /* number of queries to run - only used if 00131 * termMode == T_NQRYS */ 00132 int numQueries; 00133 00134 /* documents per query to use */ 00135 int docsPerQuery; 00136 00137 /* stores the ids of the document already retrieved 00138 * from the system. used to prevent parsing 00139 * a document multiple times */ 00140 docidset seenDocs; 00141 }; 00142 00143 #endif