/*
 *  Team 1
 *  CS575: Software Design
 *  Project Phase I
 *  Pipe-and-Filter style
 *  PNFNoise.java
 */

import java.io.*;
import java.util.Vector;

/**
 *  <p>Noise class for the Pipe and Filter KWIC system.</p>
 *
 *  This class represents a filter that takes in a plain SentenceCollection
 *  and a list of the files that contain the noise words to be used to filter
 *  the collection.  The words are read in from the file(s) and the SentenceCollection
 *  is notified of these noise words.  
 * 
 *  <p><b>PRE:</b> expects to receive the name(s) of the noise words file(s) and the
 *  SentenceCollection data object.
 *  <br><b>POST:</b> after execution, all the files will have been parsed and
 *  the SentenceCollection data object will know that it is to filter the data with the
 *  provided noise words.</p>
 *
 *  @author Diana Tetelman
 */

public class PNFNoise
{
    /** Character that separates tokens in a noise-word file. */
    private char tokenTermChar;

    /** Character that terminates a line in a noise-word file. */
    private char lineTermChar;

    /** Tokenizer over the noise-word file that is currently being parsed. */
    private StreamTokenizer in;

    /**
     * <p> <b>Test driver</b></p>
     * <p> Builds a small SentenceCollection, prints every row of it, passes it
     * through {@link #getNoiseWords} using the noise-word file named
     * <code>noiseWords</code>, and then prints every row of the collection that
     * was returned.</p>
     *
     * @param      args            command line arguments (not used)
     * @exception  KWICException   thrown if the noise-word file cannot be read
     */
    public static void main(String[] args) throws KWICException
    {
        Vector files = new Vector();
        files.add("noiseWords");

        SentenceCollection data = new SentenceCollection();
        data.addKWICRow(new String[] { "Sentence", "number", "one"});
        data.addKWICRow(new String[] { "This", "sentence", "should", "be", "filtered"});
        data.addKWICRow(new String[] { "Sentence", "number", "two"});
        data.addKWICRow(new String[] { "Sentence", "number", "three"});
        data.addKWICRow(new String[] { "The", "second", "sentence", "that", "should", "be", "filtered"});
        data.addKWICRow(new String[] { "Is", "this", "one", "also", "filtered"});

        // print the data collection
        printCollection(data);

        PNFNoise noise = new PNFNoise(' ', '\n');
        SentenceCollection outData = noise.getNoiseWords( files, data);

        // print the filtered data collection
        printCollection(outData);
    }

    /**
     * <p> Prints a blank line followed by every row of the collection, one row
     * per output line, with a single space after each element.
     *
     * @param      collection      SentenceCollection whose rows are printed
     * @exception  KWICException   declared so that any error raised while
     *                             iterating the collection reaches the caller
     */
    private static void printCollection( SentenceCollection collection )
        throws KWICException
    {
        System.out.println();
        collection.startKWICRowIterator();
        while( collection.hasNextKWICRow() )
        {
            Object[] row = collection.getNextKWICRow();
            for( int i = 0; i < row.length; i++ )
            {
                System.out.print(row[i].toString() + " ");
            }
            System.out.println();
        }
    }

    /**
     * <p> Constructor
     *
     * @param      tokenTerm       Character which is used to separate tokens (for example ' ' means space)
     * @param      lineTerm        Character which is used to terminate a line in the file (for example '\n' for newline char)
     */
    public PNFNoise(char tokenTerm, char lineTerm)
    {
        tokenTermChar = tokenTerm;
        lineTermChar = lineTerm;
    }

    /**
     * <p> This method will read in noise words from the specified files and return a SentenceCollection
     * data object that will now use those words to filter its data.
     *
     * <p> The files are processed in order.  After each file has been read,
     * <code>setNoiseWords</code> is called on the collection with the words found
     * in that file.  Tokens that carry no text (end-of-line markers, numbers and
     * single ordinary characters) are skipped, as is a word that consists of
     * nothing but the line termination character.
     *
     * @param      filenames       Vector of String which should contain the name(s) of the files to be parsed
     * @param      dataCollection  SentenceCollection data object
     * @return                     the same SentenceCollection data object that was passed in
     * @exception  KWICException   thrown if a file cannot be found or cannot be parsed
     */
    public SentenceCollection getNoiseWords( Vector filenames, SentenceCollection dataCollection )
        throws KWICException
    {
        String lineTerm = String.valueOf(lineTermChar);

        for( int fIndex = 0; fIndex < filenames.size(); fIndex++ )
        {
            String filename = (String)filenames.get(fIndex);
            Reader source = null;
            try
            {
                source = setupTokenizer(filename);

                Vector words = new Vector();
                int tokenType = in.nextToken();
                while( tokenType != StreamTokenizer.TT_EOF )
                {
                    // sval is only set for word and quoted-string tokens; it is
                    // null for numbers and ordinary characters, so test it
                    // before use instead of dereferencing it blindly.
                    if( tokenType != StreamTokenizer.TT_EOL
                        && in.sval != null
                        && !in.sval.equals(lineTerm) )
                    {
                        words.add(in.sval);
                    }
                    tokenType = in.nextToken();
                }
                dataCollection.setNoiseWords(words.toArray());
            }
            catch( IOException e )
            {
                throw new KWICException("Problem parsing file into tokens!");
            }
            finally
            {
                // always release the file handle, even when parsing failed
                closeQuietly(source);
            }
        }
        return dataCollection;
    }

    /**
     * <p> Opens the given file and points the tokenizer <code>in</code> at it,
     * configured with this filter's token and line termination characters.
     *
     * @param      filename        name of the noise-word file to open
     * @return                     the reader that was opened; the caller is responsible for closing it
     * @exception  KWICException   thrown if the file does not exist
     */
    private Reader setupTokenizer(String filename) throws KWICException
    {
        Reader source;
        try
        {
            source = new BufferedReader(new FileReader(filename));
        }
        catch( FileNotFoundException e )
        {
            // The KWICException below carries only a message, so the stack
            // trace is printed here to keep the underlying cause visible.
            e.printStackTrace();
            throw new KWICException(filename + " not found!");
        }

        in = new StreamTokenizer(source);

        // if the newline is not the sentence terminator, treat it as whitespace
        in.eolIsSignificant(lineTermChar == '\n');

        // StreamTokenizer already treats '\u0000' through '\u0020' as white
        // space, so only a separator above that range has to be registered.
        // The character's own code is used here: Character.getNumericValue()
        // yields the digit value a character represents (or -1), not its code.
        if( tokenTermChar > '\u0020' )
        {
            in.whitespaceChars(tokenTermChar, tokenTermChar);
        }
        return source;
    }

    /**
     * <p> Closes the reader if there is one, ignoring any failure to close.
     *
     * @param      source          reader to close; may be null
     */
    private static void closeQuietly(Reader source)
    {
        if( source == null )
        {
            return;
        }
        try
        {
            source.close();
        }
        catch( IOException ignored )
        {
            // the file was only read from, so a failed close loses nothing
        }
    }

}