/* * Team 1 * CS575: Software Design * Project Phase I * Pipe-and-Filter style * PNFInput.java */ import java.io.*; import java.util.Vector; /** *

Input class for the Pipe and Filter KWIC system.

* *

This class represents a filter that reads in data from specified files. * It reads the data into a SentenceCollection that is given to it. It then * returns that SentenceCollection back, now filled with the data from the files, if any. * This filter does not make any assumptions about the contents of the incoming collection. * It does however assume the file format to be as follows: a sentence per line with * the words seperated by white space. * *

PRE: expects to receive the name(s) of the input file(s) and the * SentenceCollection data object *
POST: after execution, all the files will have been parsed and * data inserted into the SentenceCollection data object, the data files will not be changed * * @author Diana Tetelman */ public class PNFInput { private char tokenTermChar; private char lineTermChar; private StreamTokenizer in; /** *

Test driver

The correct output of it should look like this...

* *
Setting the token termination char to " " *
Adding the word "The" *
Adding the word "cat" *
Adding the word "is" *
Adding the word "in" *
Adding the word "the" *
Adding the word "hat" *
Adding the word "This" *
Adding the word "is" *
Adding the word "an" *
Adding the word "input" *
Adding the word "file" *
*
The cat is in the hat *
This is an input file * */ public static void main(String[] args) throws KWICException { Vector files = new Vector(); files.add(new String("KWICInput.txt")); PNFInput input = new PNFInput(' ', '\n'); SentenceCollection data = new SentenceCollection(); SentenceCollection outData = input.getData(files, data); // print the data collection System.out.println(); outData.startKWICRowIterator(); while( outData.hasNextKWICRow() == true ) { Object[] temp = outData.getNextKWICRow(); int count = temp.length; for( int i=0; i Constructor * * @param tokenTerm Character which is used to separate tokens (for example ' ' means space) * lineTerm Character which is used to terminate a line in the file (for example '\n' for newline char) */ public PNFInput(char tokenTerm, char lineTerm) { tokenTermChar = tokenTerm; lineTermChar = lineTerm; } /** *

This method will read in data from the specified files and return a SentenceCollection * data object with all of the new data in it. * * @params filenames String[] which should contain the name(s) of the files to be parsed * dataCollection SentenceCollection data object * @return SentenceCollection data object filled with read data (if any found) * @exception KWICException thrown if any errors occur */ public SentenceCollection getData( Vector filenames, SentenceCollection dataCollection ) throws KWICException { for( int fIndex = 0; fIndex < filenames.size(); fIndex++ ) { try{ setupTokenizer((String)filenames.get(fIndex)); int tokenType = in.nextToken(); while( tokenType != in.TT_EOF ) { Vector words = new Vector(); while( tokenType != in.TT_EOF && tokenType != in.TT_EOL && ! in.sval.toCharArray().equals(new Character(lineTermChar)) ) { //System.out.println("Adding the word \"" + in.sval + "\""); words.add(in.sval); tokenType = in.nextToken(); } dataCollection.addKWICRow(words.toArray()); tokenType = in.nextToken(); } } catch( FileNotFoundException e ) { e.printStackTrace(); throw new KWICException(filenames.get(fIndex) + " not found!"); } catch( IOException e ) { throw new KWICException("Problem parsing file into tokens!"); } } return dataCollection; } private void setupTokenizer(String filename) throws KWICException { try{ in = new StreamTokenizer( new BufferedReader( new FileReader(filename))); // if the newline is not the sentence terminator, treat it as whitespace if( lineTermChar == '\n' ) in.eolIsSignificant(true); else in.eolIsSignificant(false); // set up the termination character if it is out of the current range. if( ! (Character.getNumericValue(tokenTermChar) >= '\u0000' && Character.getNumericValue(tokenTermChar) <= '\u0020') ) { //System.out.println("Setting the token termination char to \"" + // tokenTermChar + "\""); in.whitespaceChars(Character.getNumericValue(tokenTermChar), Character.getNumericValue(tokenTermChar)); } } catch( FileNotFoundException e ) { e.printStackTrace(); throw new KWICException(filename + " not found!"); } } }