00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 /* type definitions for objects we will use */ 00014 #ifndef _INVFPTYPES_H 00015 #define _INVFPTYPES_H 00016 00017 #define IND_VERSION "2.2" 00018 00019 #include "common_headers.hpp" 00020 00021 typedef int FILEID_T; 00022 typedef int TERMID_T; 00023 typedef TERMID_T LOC_T; // for simplifying implementation of memory management, it's helpful for termid and location to be of the same type. 00024 typedef TERMID_T DOCID_T; 00025 typedef float SCORE_T; 00026 typedef char* TERM_T; 00027 typedef char* EXDOCID_T; 00028 00029 // suffixes for filenames 00030 #define INVINDEX ".invf" 00031 #define INVFPINDEX ".invfp" 00032 #define INVLOOKUP ".invlookup" 00033 #define DTINDEX ".dt" 00034 #define DTLOOKUP ".dtlookup" 00035 #define TERMIDMAP ".tid" 00036 #define TERMIDSTRMAP ".tidstr" 00037 #define DOCIDMAP ".did" 00038 #define DOCIDSTRMAP ".didstr" 00039 #define MAINTOC ".inv" 00040 #define INVFPTOC ".ifp" 00041 #define DOCMGRMAP ".dm" 00042 00043 // what to call out of vocabulary ids 00044 #define INVALID_STR "[OOV]" 00045 00046 // name for parameters 00047 #define VERSION_PAR "VERSION" 00048 #define NUMDOCS_PAR "NUM_DOCS" 00049 #define NUMTERMS_PAR "NUM_TERMS" 00050 #define NUMUTERMS_PAR "NUM_UNIQUE_TERMS" 00051 #define AVEDOCLEN_PAR "AVE_DOCLEN" 00052 #define INVINDEX_PAR "INV_INDEX" 00053 #define INVLOOKUP_PAR "INV_LOOKUP" 00054 #define DTINDEX_PAR "DT_INDEX" 00055 #define DTLOOKUP_PAR "DT_LOOKUP" 00056 #define TERMIDMAP_PAR "TERMIDS" 00057 #define TERMIDSTRMAP_PAR "TERMIDSTRS" 00058 #define DOCIDMAP_PAR "DOCIDS" 00059 #define DOCIDSTRMAP_PAR "DOCIDSTRS" 00060 #define NUMDT_PAR "NUM_DTFILES" 00061 #define NUMINV_PAR "NUM_INVFILES" 00062 #define DOCMGR_PAR "DOCMGR_IDS" 00063 00064 struct LocatedTerm { // pair of term and its location 00065 TERMID_T term; 00066 LOC_T loc; 00067 }; 00068 00069 struct LLTerm { // pair of term and list of locations 00070 TERMID_T term; 00071 vector<LOC_T> loc; 00072 }; 00073 00074 struct dt_entry { // an entry in the lookup table for docterm lists index 00075 FILEID_T fileid; // which file the word is in 00076 long offset; // what the offset into the file is 00077 int length; // the length of the inverted list 00078 int docmgr; // the docmgr id of manager for this doc 00079 }; 00080 00081 struct inv_entry { // an entry in the lookup table for docterm lists index 00082 FILEID_T fileid; // which file the word is in 00083 long offset; // what the offset into the file is 00084 int ctf; // collection term freq 00085 int df; // doc freq 00086 }; 00087 00088 struct ltstr 00089 { 00090 bool operator()(char* s1, char* s2) const{ 00091 return strcmp(s1, s2) < 0; 00092 } 00093 }; 00094 00095 #endif