/*
 * WITExample1.java
 *
 * Created on Feb 10, 2006, 9:46 PM
 *
 * @author Sophie Wang
 */

import edu.cmu.cs.readweb.text.DocumentItem;
import edu.cmu.cs.readweb.text.DocumentSet;
import edu.cmu.cs.readweb.util.GWS;
import java.io.IOException;
import java.util.Enumeration;
import java.util.Vector;

/**
 * Example program for the readweb text utilities.
 *
 * <p>It asks {@link GWS} for a list of page URLs matching a fixed query,
 * builds a {@link DocumentSet} from those URLs, writes the parsed documents
 * into the {@code docs/} directory, and finally prints the parsed text and
 * the extracted links of every {@link DocumentItem} to standard output.
 */
public class WITExample1 {

    /**
     * Runs the example end to end.
     *
     * <p>Everything after the query definition runs inside a single
     * {@code try} block; any {@link Exception} raised there is caught and its
     * stack trace is printed, so the method then returns normally.
     *
     * @param args command-line arguments; not read by this example
     * @throws IOException declared in the signature, although exceptions
     *         raised by the body are caught and reported inside the method
     */
    static public void main(String[] args) throws IOException {

        // The search phrase handed to the URL lookup (e.g. a name, or a
        // pattern such as "was born in").
        final String query = "tom mitchell";

        try {
            // Step 1: obtain the list of URLs for the query.
            // NOTE(review): the meaning of the numeric/boolean arguments and of
            // "WEB" is defined by GWS and is not visible here -- confirm there.
            // An earlier variant of this call was:
            //     GWS.getPageURLs(query, 300, 0, false, true)
            Vector pageUrls = GWS.getPageURLsByTkey(query, 1000, 0, false, true, "WEB");

            // Step 2: crawl each URL and collect the pages into a document set.
            DocumentSet crawled = new DocumentSet(pageUrls);

            // Step 3a: dump the parsed documents into the "docs/" directory.
            crawled.writeDocsIntoDir("docs/");

            // Step 3b: alternatively, work with the DocumentItem objects directly.
            Vector documents = crawled.getDocuments();
            Enumeration remaining = documents.elements();
            while (remaining.hasMoreElements()) {
                DocumentItem item = (DocumentItem) remaining.nextElement();

                // Parsed text of the web page.
                System.out.println(item.getDocStream());

                // Hyperlinks and anchor text extracted from the page.
                System.out.println(item.getDocLinks());
            }
        } catch (Exception failure) {
            // Best-effort demo: report the problem and fall through to exit.
            failure.printStackTrace();
        }
    }
}