Passage Level Retrieval

Contents

  1. Overview
  2. testScorePassage.cpp

1. Overview

Lemur provides passage level retrieval for all TextQueryRetMethods via the PassageRep class. PassageRep supports iteration over passages of a fixed window size, with consecutive windows overlapping by K terms. It encapsulates the DocumentRep for the whole document, modifying its docLength attribute, and delegates calls to termWeight and scoreConstant to the encapsulated DocumentRep. TFIDFRetMethod with BM25 tf weighting and OkapiRetMethod will not compute exactly correct passage scores, because their formulas use the average document length from the collection where they should use the passage size. The difference should be small and have no noticeable effect on the scoring.

All TextQueryRetMethods now provide the method scoreDocPassages, with an example of its use below.

2. testScorePassage.cpp

This application will perform retrieval on a set of queries according to the specified parameters. For each query, the retrieved documents are rescored using a passage window of 10 terms with an overlap of 5 (the last two arguments of the scoreDocPassages call). The passage scores are then printed to the standard output.

#include "common_headers.hpp"
#include "BasicDocStream.hpp"
#include "IndexManager.hpp"
#include "RetMethodManager.hpp"

// Parameter hook for this application: delegates to RetrievalParameter::get().
// AppMain subsequently reads RetrievalParameter::databaseIndex,
// RetrievalParameter::textQuerySet and RetrievalParameter::retModel.
void GetAppParam() {
  RetrievalParameter::get();
}

int AppMain(int argc, char *argv[]) {
  
  Index  *ind;

  try {
    ind  = IndexManager::openIndex(RetrievalParameter::databaseIndex);
  } 
  catch (Exception &ex) {
    ex.writeMessage();
    throw Exception("testScorePassage", 
		    "Can't open index, check parameter index");
  }

  
  DocStream *qryStream;
  try {
    qryStream = new BasicDocStream(RetrievalParameter::textQuerySet);
  } 
  catch (Exception &ex) {
    ex.writeMessage(cerr);
    throw Exception("testScorePassage", 
		    "Can't open query file, check parameter textQuery");
  }

  ArrayAccumulator accumulator(ind->docCount());
  IndexedRealVector results(ind->docCount());
  RetrievalMethod *model;
  model = RetMethodManager::createModel(ind, &accumulator, 
					RetrievalParameter::retModel);


  qryStream->startDocIteration();
  TextQuery *q;
  
  IndexedRealVector workSetRes;
  
  while (qryStream->hasMore()) {
    Document *d = qryStream->nextDoc();
    q = new TextQuery(*d);
    cout << "query : "<< q->id() << endl;
    QueryRep * qr = model->computeQueryRep(*q);
    model->scoreCollection(*qr, results);
    results.Sort();
    IndexedRealVector::iterator j;
    PassageScoreVector passRes;
    for (j = results.begin();j != results.end(); j++) {
      cout << ind->document((*j).ind) << " " << (*j).val << " ";
      int id = (*j).ind;
      TextQueryRetMethod *tqmodel = (TextQueryRetMethod *)model;
      double pScore = tqmodel->scoreDocPassages(*q, id, passRes, 10, 5);
      cout << pScore << endl;
      passRes.sortScores();
      for (PassageScoreVector::iterator k = passRes.begin();
	   k != passRes.end(); k++) {
	cout << (*k).id << " (" << (*k).start << "," << (*k).end << "):" 
	     << (*k).score << endl;
      }
      passRes.clear();
      cout << endl;
    }
    delete qr;
    delete q;
  }
  delete model;
  delete qryStream;
  delete ind;
  return 0;
}


The Lemur Project
Last modified: Jul 07 09:52:47 EDT 2004