00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 /* type definitions for objects we will use */ 00014 #ifndef _INVFPTYPES_H 00015 #define _INVFPTYPES_H 00016 00018 #define IND_VERSION "1.9" 00019 00020 #include "common_headers.hpp" 00021 00022 typedef int FILEID_T; 00023 typedef int TERMID_T; 00024 typedef TERMID_T LOC_T; // for simplifying implementation of memory management, it's helpful for termid and location to be of the same type. 00025 typedef TERMID_T DOCID_T; 00026 typedef float SCORE_T; 00027 typedef char* TERM_T; 00028 typedef char* EXDOCID_T; 00029 00030 // suffixes for filenames 00031 #define INVINDEX ".invf" 00032 #define INVFPINDEX ".invfp" 00033 #define INVLOOKUP ".invlookup" 00034 #define DTINDEX ".dt" 00035 #define DTLOOKUP ".dtlookup" 00036 #define TERMIDMAP ".tid" 00037 #define DOCIDMAP ".did" 00038 #define MAINTOC ".inv" 00039 #define INVFPTOC ".ifp" 00040 #define DOCMGRMAP ".dm" 00041 00042 // what to call out of vocabulary ids 00043 #define INVALID_STR "[OOV]" 00044 00045 // name for parameters 00046 #define VERSION_PAR "VERSION" 00047 #define NUMDOCS_PAR "NUM_DOCS" 00048 #define NUMTERMS_PAR "NUM_TERMS" 00049 #define NUMUTERMS_PAR "NUM_UNIQUE_TERMS" 00050 #define AVEDOCLEN_PAR "AVE_DOCLEN" 00051 #define INVINDEX_PAR "INV_INDEX" 00052 #define INVLOOKUP_PAR "INV_LOOKUP" 00053 #define DTINDEX_PAR "DT_INDEX" 00054 #define DTLOOKUP_PAR "DT_LOOKUP" 00055 #define TERMIDMAP_PAR "TERMIDS" 00056 #define DOCIDMAP_PAR "DOCIDS" 00057 #define NUMDT_PAR "NUM_DTFILES" 00058 #define NUMINV_PAR "NUM_INVFILES" 00059 #define DOCMGR_PAR "DOCMGR_IDS" 00060 00061 struct LocatedTerm { // pair of term and its location 00062 TERMID_T term; 00063 LOC_T loc; 00064 }; 00065 00066 struct LLTerm { // pair of term and list of locations 00067 TERMID_T term; 00068 vector<LOC_T> loc; 00069 }; 00070 00071 struct dt_entry { // an entry in the lookup table for docterm lists index 00072 FILEID_T fileid; // which file the word is in 00073 long offset; // what the offset into the file is 00074 int length; // the length of the inverted list 00075 int docmgr; // the docmgr id of manager for this doc 00076 }; 00077 00078 struct inv_entry { // an entry in the lookup table for docterm lists index 00079 FILEID_T fileid; // which file the word is in 00080 long offset; // what the offset into the file is 00081 int ctf; // collection term freq 00082 int df; // doc freq 00083 }; 00084 00085 struct ltstr 00086 { 00087 bool operator()(char* s1, char* s2) const{ 00088 return strcmp(s1, s2) < 0; 00089 } 00090 }; 00091 00092 #endif