AnchorTextWriter.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // AnchorTextWriter
00015 //
00016 // 20 May 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_ANCHORTEXTWRITER_HPP
00020 #define INDRI_ANCHORTEXTWRITER_HPP
00021 
#include <algorithm>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "indri/Path.hpp"
#include "lemur-compat.hpp"
00026 
00027 class AnchorTextWriter : public ObjectHandler<ParsedDocument> {
00028 private:
00029   std::ofstream _out;
00030 
00031 public:
00032   AnchorTextWriter( const std::string& outputPath ) {
00033     std::string directory = Path::directory( outputPath );
00034     Path::make( directory );
00035     _out.open( outputPath.c_str(), std::ios::out );
00036   }
00037 
00038   ~AnchorTextWriter() {
00039     _out.close();
00040   }
00041 
00042   void handle( ParsedDocument* document ) {
00043     greedy_vector<MetadataPair>::iterator iter;
00044 
00045     iter = std::find_if( document->metadata.begin(),
00046                          document->metadata.end(),
00047                          MetadataPair::key_equal( "DOCNO" ) );
00048 
00049     const char* docno = (char*)iter->value;
00050 
00051     iter = std::find_if( document->metadata.begin(),
00052                          document->metadata.end(),
00053                          MetadataPair::key_equal( "URL" ) );
00054 
00055     const char* page = (char*)iter->value;
00056     const char* url = 0;
00057     int count = 0;
00058     int urlEnd = -1;
00059 
00060     // find the third slash, which should occur
00061     // right after the domain name
00062     char* slash = 0;
00063     if(page)  slash = strchr( page, '/' );
00064     if(slash) slash = strchr( slash+1, '/' );
00065     if(slash) slash = strchr( slash+1, '/' );
00066 
00067     int domainLength;
00068     if( slash )
00069       domainLength = slash - page;
00070     else
00071       domainLength = strlen(page);
00072 
00073     // count links
00074     for( unsigned int i=0; i<document->tags.size(); i++ ) {
00075       TagExtent& extent = document->tags[i];
00076 
00077       // we only extract absolute urls
00078       if( !strcmp( extent.name, "absolute-url" ) ) {
00079         url = document->terms[ extent.begin ];
00080         urlEnd = extent.end;
00081 
00082         // if it has the same domain, throw it out
00083         if( url && page && !lemur_compat::strncasecmp( url, page, domainLength ) ) {
00084           url = 0;
00085           urlEnd = -1;
00086         }
00087       } else if( !strcmp( extent.name, "a" ) &&  // this is anchor text
00088                  url &&                          // we've seen a url
00089                  urlEnd == extent.begin &&       // this text is associated with an absolute-url
00090                  extent.end - extent.begin > 0 ) // there is some text here
00091       {
00092         count++;
00093         url = 0;
00094       }
00095     }
00096 
00097     // print output
00098     _out << "DOCNO=" << docno << std::endl;
00099     _out << "DOCURL=" << page << std::endl;
00100     _out << "LINKS=" << count << std::endl;
00101     url = 0;
00102     urlEnd = -1;
00103 
00104     for( unsigned int i=0; i<document->tags.size(); i++ ) {
00105       TagExtent& extent = document->tags[i];
00106 
00107       if( !strcmp( extent.name, "absolute-url" ) ) {  // this is an absolute url
00108         url = document->terms[ extent.begin ];
00109         urlEnd = extent.end;
00110 
00111         // if it has the same domain, throw it out
00112         if( url && page && !lemur_compat::strncasecmp( url, page, domainLength ) ) {
00113           url = 0;
00114           urlEnd = -1;
00115         }
00116       } else if( !strcmp( extent.name, "a" ) &&  // this is anchor text
00117                  url &&                          // we've seen a url
00118                  urlEnd == extent.begin &&       // this text is associated with an absolute-url
00119                  extent.end - extent.begin > 0 ) // there is some text here
00120       {
00121         int textLength = 0;
00122 
00123         _out << "LINKURL=" << url << std::endl;
00124         _out << "TEXT=\"";
00125         for( unsigned int j=extent.begin; j < extent.end && textLength < 60000; j++ ) {
00126           if( !document->terms[j] )
00127             continue;
00128 
00129           textLength += strlen(document->terms[j])+1;
00130           _out << document->terms[j] << " ";
00131         }
00132         _out << "\"" << std::endl;
00133 
00134         // only do the same link once
00135         url = 0;
00136       }
00137     }
00138   }
00139 };
00140 
00141 #endif // INDRI_ANCHORTEXTWRITER_HPP
00142