OfflineCluster.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2002 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 // David Fisher
00013 // init: 02/03/2003
00014 #ifndef _OFFLINECLUSTER_HPP
00015 #define _OFFLINECLUSTER_HPP
00016 #include <set>
00017 #include "common_headers.hpp"
00018 #include "Index.hpp"
00019 #include "ClusterParam.hpp"
00020 #include "ClusterFactory.hpp"
00021 #include "SimFactory.hpp"
00022 
00024 class OfflineCluster
00025 {
      /// Interface for clustering documents of an Index into Cluster objects via
      /// kMeans() and bisecting_kMeans(). Only declarations appear here; the
      /// member definitions live in a separate translation unit.
      ///
      /// NOTE(review): `vector` is used unqualified, so one of the included
      /// headers presumably brings it into scope (using-declaration) -- confirm.
      /// NOTE(review): a destructor is declared and raw pointers are held, but no
      /// copy constructor is declared; if the destructor frees `sim`/`factory`,
      /// copying an OfflineCluster would be unsafe -- confirm in the implementation.
00026 public:
      /// Construct a clusterer over the given index.
      /// @param ind          index to cluster over (taken by const reference).
      /// @param simType      similarity method selector; defaults to ClusterParam::COS.
      /// @param clusterType  cluster representation selector; defaults to
      ///                     ClusterParam::CENTROID.
      /// @param docMode      document mode selector; defaults to ClusterParam::DMAX.
      /// NOTE(review): presumably `ind` is bound to the `index` reference member,
      /// in which case it must outlive this object -- confirm.
00028   OfflineCluster(const Index &ind, 
00029                  enum ClusterParam::simTypes simType = ClusterParam::COS,
00030                  enum ClusterParam::clusterTypes clusterType = ClusterParam::CENTROID,
00031                 enum ClusterParam::docModes docMode = ClusterParam::DMAX);
00032 
      /// Destructor. What it releases is not visible in this header.
00034   ~OfflineCluster();
00035 
      /// Cluster the given documents with k-means.
      /// @param docIds    ids of the documents to cluster; passed by value, so the
      ///                  vector is copied on every call.
      /// @param numParts  number of partitions requested; defaults to 2.
      /// @param maxIters  defaults to 100; presumably an upper bound on the number
      ///                  of iterations -- confirm.
      /// @return pointer to a vector of Cluster pointers.
      /// NOTE(review): ownership of the returned vector and its Clusters is not
      /// stated here; presumably the caller must free them -- confirm.
00038   vector<Cluster*> *kMeans(vector<DOCID_T> docIds, int numParts = 2, 
00039                            int maxIters = 100);
00040 
      /// Overload of kMeans() that takes an existing Cluster instead of a list of
      /// document ids; same defaults (numParts = 2, maxIters = 100) and same
      /// return type. Presumably it partitions the members of `cluster` -- confirm.
00042   vector<Cluster*> *kMeans(Cluster *cluster, int numParts = 2, 
00043                            int maxIters = 100);
00044 
      /// Cluster the given documents with bisecting k-means.
      /// @param docIds    ids of the documents to cluster; passed by value (copied).
      /// @param numParts  number of partitions requested; defaults to 2.
      /// @param numIters  defaults to 5; exact meaning is not visible here
      ///                  (presumably the number of trial splits per bisection
      ///                  step) -- confirm.
      /// @param maxIters  defaults to 100; presumably the iteration cap for each
      ///                  underlying k-means run -- confirm.
      /// @return pointer to a vector of Cluster pointers (ownership: see kMeans note).
00047   vector<Cluster*> *bisecting_kMeans(vector<DOCID_T> docIds, int numParts = 2, 
00048                                      int numIters = 5, int maxIters = 100);
00049 
00050 private:
      /// Similarity method (pointer to const); how it is chosen is not visible here.
00052   const SimilarityMethod *sim;
      /// Cluster factory; presumably used to create Cluster objects -- confirm.
00054   ClusterFactory *factory;
      /// Index being clustered. Held as a reference, so the referenced Index must
      /// outlive this object.
00056   const Index &index;
      /// Compare two arrays of Cluster pointers, each with `n` entries per the
      /// signature. The comparison criterion and the meaning of the boolean result
      /// are not visible here.
00058   bool compareClusterSets(Cluster **, Cluster **, int n);
      /// Return a vector of document ids derived from `docIds`; presumably `num`
      /// seeds chosen as initial cluster members -- confirm selection strategy.
00060   vector <DOCID_T> selectSeeds(vector<DOCID_T> docIds, int num);
      /// Return one Cluster from `working`; presumably the cluster that
      /// bisecting_kMeans() will split next -- confirm.
00062   Cluster *chooseSplit(vector<Cluster *> *working);  
      /// Compute a numeric score for the set of clusters in `working`; the scoring
      /// formula is not visible here.
00064   double scoreSet(vector<Cluster *> *working);
00065 };
00066 #endif