/* * Team 1 * CS575: Software Design * Project Phase I * Pipe-and-Filter style * PNFNoise.java */ import java.io.*; import java.util.Vector; /** *

Noise class for the Pipe and Filter KWIC system.

* * This class represents a filter that takes in a plain SentenceCollection * and a list of the files that contain the noise words to be used to filter * the collection. The words are read in from the file(s) and the SentenceCollection * is notified of these noise words. * *

PRE: expects to receive the name(s) of the noise words file(s) and the * SentenceCollection data object *
POST: after execution, all the files will have been parsed and * the SentenceCollection data object will know that it is to filter the data with the * provided noise words * * @author Diana Tetelman */ public class PNFNoise { private char tokenTermChar; private char lineTermChar; private StreamTokenizer in; /** *

Test driver

The correct output of it should look like this...

* *
Setting the token termination char to " " *
Adding the word "this" *
Adding the word "is" *
Adding the word "the" */ public static void main(String[] args) throws KWICException { Vector files = new Vector(); files.add(new String("noiseWords")); SentenceCollection data = new SentenceCollection(); data.addKWICRow(new String[] { "Sentence", "number", "one"}); data.addKWICRow(new String[] { "This", "sentence", "should", "be", "filtered"}); data.addKWICRow(new String[] { "Sentence", "number", "two"}); data.addKWICRow(new String[] { "Sentence", "number", "three"}); data.addKWICRow(new String[] { "The", "second", "sentence", "that", "should", "be", "filtered"}); data.addKWICRow(new String[] { "Is", "this", "one", "also", "filtered"}); // print the data collection System.out.println(); data.startKWICRowIterator(); while( data.hasNextKWICRow() == true ) { Object[] temp = data.getNextKWICRow(); int count = temp.length; for( int i=0; i Constructor * * @param tokenTerm Character which is used to separate tokens (for example ' ' means space) * lineTerm Character which is used to terminate a line in the file (for example '\n' for newline char) */ public PNFNoise(char tokenTerm, char lineTerm) { tokenTermChar = tokenTerm; lineTermChar = lineTerm; } /** *

This method will read in noise words from the specified files and return a SentenceCollection * data object that will now use those words to filter its data. * * @params filenames String[] which should contain the name(s) of the files to be parsed * dataCollection SentenceCollection data object * @return SentenceCollection data object which will filter data * @exception KWICException thrown if any errors occur */ public SentenceCollection getNoiseWords( Vector filenames, SentenceCollection dataCollection ) throws KWICException { for( int fIndex = 0; fIndex < filenames.size(); fIndex++ ) { try{ setupTokenizer((String)filenames.get(fIndex)); Vector words = new Vector(); int tokenType = in.nextToken(); while( tokenType != in.TT_EOF ) { if( tokenType != in.TT_EOL && !in.sval.toCharArray().equals(new Character(lineTermChar)) ) { //System.out.println("Adding the word \"" + in.sval + "\""); words.add(in.sval); } tokenType = in.nextToken(); } //System.out.println(words.toString()); dataCollection.setNoiseWords(words.toArray()); } catch( FileNotFoundException e ) { e.printStackTrace(); throw new KWICException(filenames.get(fIndex) + " not found!"); } catch( IOException e ) { throw new KWICException("Problem parsing file into tokens!"); } } return dataCollection; } private void setupTokenizer(String filename) throws KWICException { try{ in = new StreamTokenizer( new BufferedReader( new FileReader(filename))); // if the newline is not the sentence terminator, treat it as whitespace if( lineTermChar == '\n' ) in.eolIsSignificant(true); else in.eolIsSignificant(false); // set up the termination character if it is out of the current range. if( ! (Character.getNumericValue(tokenTermChar) >= '\u0000' && Character.getNumericValue(tokenTermChar) <= '\u0020') ) { //System.out.println("Setting the token termination char to \"" + // tokenTermChar + "\""); in.whitespaceChars(Character.getNumericValue(tokenTermChar), Character.getNumericValue(tokenTermChar)); } } catch( FileNotFoundException e ) { e.printStackTrace(); throw new KWICException(filename + " not found!"); } } }