00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 /* type definitions for objects we will use */ 00014 #ifndef _INVFPTYPES_H 00015 #define _INVFPTYPES_H 00016 00017 #define IND_VERSION "2.1" 00018 00019 #include "common_headers.hpp" 00020 00021 typedef int FILEID_T; 00022 typedef int TERMID_T; 00023 typedef TERMID_T LOC_T; // for simplifying implementation of memory management, it's helpful for termid and location to be of the same type. 00024 typedef TERMID_T DOCID_T; 00025 typedef float SCORE_T; 00026 typedef char* TERM_T; 00027 typedef char* EXDOCID_T; 00028 00029 // suffixes for filenames 00030 #define INVINDEX ".invf" 00031 #define INVFPINDEX ".invfp" 00032 #define INVLOOKUP ".invlookup" 00033 #define DTINDEX ".dt" 00034 #define DTLOOKUP ".dtlookup" 00035 #define TERMIDMAP ".tid" 00036 #define DOCIDMAP ".did" 00037 #define MAINTOC ".inv" 00038 #define INVFPTOC ".ifp" 00039 #define DOCMGRMAP ".dm" 00040 00041 // what to call out of vocabulary ids 00042 #define INVALID_STR "[OOV]" 00043 00044 // name for parameters 00045 #define VERSION_PAR "VERSION" 00046 #define NUMDOCS_PAR "NUM_DOCS" 00047 #define NUMTERMS_PAR "NUM_TERMS" 00048 #define NUMUTERMS_PAR "NUM_UNIQUE_TERMS" 00049 #define AVEDOCLEN_PAR "AVE_DOCLEN" 00050 #define INVINDEX_PAR "INV_INDEX" 00051 #define INVLOOKUP_PAR "INV_LOOKUP" 00052 #define DTINDEX_PAR "DT_INDEX" 00053 #define DTLOOKUP_PAR "DT_LOOKUP" 00054 #define TERMIDMAP_PAR "TERMIDS" 00055 #define DOCIDMAP_PAR "DOCIDS" 00056 #define NUMDT_PAR "NUM_DTFILES" 00057 #define NUMINV_PAR "NUM_INVFILES" 00058 #define DOCMGR_PAR "DOCMGR_IDS" 00059 00060 struct LocatedTerm { // pair of term and its location 00061 TERMID_T term; 00062 LOC_T loc; 00063 }; 00064 00065 struct LLTerm { // pair of term and list of locations 00066 TERMID_T term; 00067 vector<LOC_T> loc; 00068 }; 00069 00070 struct dt_entry { // an entry in the lookup table for docterm lists index 00071 FILEID_T fileid; // which file the word is in 00072 long offset; // what the offset into the file is 00073 int length; // the length of the inverted list 00074 int docmgr; // the docmgr id of manager for this doc 00075 }; 00076 00077 struct inv_entry { // an entry in the lookup table for docterm lists index 00078 FILEID_T fileid; // which file the word is in 00079 long offset; // what the offset into the file is 00080 int ctf; // collection term freq 00081 int df; // doc freq 00082 }; 00083 00084 struct ltstr 00085 { 00086 bool operator()(char* s1, char* s2) const{ 00087 return strcmp(s1, s2) < 0; 00088 } 00089 }; 00090 00091 #endif