/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.process;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.process.PTBEscapingProcessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.process.WordToTaggedWordProcessor;
import edu.stanford.nlp.util.Function;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.web.HTMLParser;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * Turns plain text, XML or HTML read from a file name, URL or {@link Reader}
 * into either a flat list of words or a list of sentences, where each
 * sentence is itself a list of words.
 *
 * <p>Tokenization is delegated to the configured {@link TokenizerFactory}
 * (a {@link PTBTokenizer} factory unless another one is supplied). When the
 * factory is a {@code WhitespaceTokenizer.WhitespaceTokenizerFactory} the
 * input is treated as already tokenized, which is the only mode in which
 * per-token tags can be read.
 *
 * <p>The {@code String fileOrURL} overloads open the resource themselves and
 * close it again before returning. The {@code Reader} overloads never close
 * the reader they are handed; that remains the caller's responsibility.
 */
public class DocumentPreprocessor {
    private static final boolean DEBUG = false;

    /** Matches strings that start with {@code http://}, {@code https://}, {@code ftp://} or {@code ftps://}. */
    private static final Pattern urlPattern = Pattern.compile("(?:ht|f)tps?://.*?");

    /** Input kinds understood by {@link #main(String[])}. */
    private static final int PLAIN = 0;
    private static final int XML = 1;
    private static final int HTML = 2;

    /** Value of {@code sentenceDelimiter} that {@code getSentencesFromXML} treats specially. */
    private static final String ONE_PER_ELEMENT = "onePerElement";

    /**
     * Delimiter substituted for {@link #ONE_PER_ELEMENT}. Presumably chosen so that it never
     * occurs as a token, which leaves each XML element unsplit -- TODO confirm intent.
     */
    private static final String ONE_PER_ELEMENT_DELIMITER = ".$.onePerElement.$.";

    private static final String USAGE = "usage: DocumentPreprocessor -file filename [-xml tag|-html] [-noSplitSentence]";

    private TokenizerFactory tokenizerFactory;
    /** Charset name used when reading local files; {@code null} means the platform default. */
    private String encoding;
    /** Optional custom set of sentence-final tokens; {@code null} means the splitter's defaults. */
    private String[] sentenceFinalPuncWords;

    /**
     * Creates a preprocessor that tokenizes with the given factory.
     *
     * @param tokenizerFactory factory used to build a tokenizer for each input
     */
    public DocumentPreprocessor(TokenizerFactory tokenizerFactory) {
        this.tokenizerFactory = tokenizerFactory;
    }

    /** Creates a preprocessor that uses the default {@link PTBTokenizer} factory. */
    public DocumentPreprocessor() {
        this.tokenizerFactory = PTBTokenizer.factory();
    }

    /**
     * Creates a preprocessor backed by a {@link PTBTokenizer} factory.
     *
     * @param suppressEscaping forwarded as the third argument of
     *        {@code PTBTokenizer.factory(false, false, suppressEscaping)}
     */
    public DocumentPreprocessor(boolean suppressEscaping) {
        this.tokenizerFactory = PTBTokenizer.factory(false, false, suppressEscaping);
    }

    /**
     * Sets the charset name used when opening local files.
     *
     * @param encoding charset name, or {@code null} for the platform default
     */
    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /**
     * Sets the tokens handed to the sentence splitter as sentence-final punctuation.
     *
     * @param sentenceFinalPuncWords tokens to use, or {@code null} for the splitter's defaults
     */
    public void setSentenceFinalPuncWords(String[] sentenceFinalPuncWords) {
        this.sentenceFinalPuncWords = sentenceFinalPuncWords;
    }

    /**
     * Replaces the tokenizer factory.
     *
     * @param newTokenizerFactory factory to use from now on
     */
    public void setTokenizerFactory(TokenizerFactory newTokenizerFactory) {
        this.tokenizerFactory = newTokenizerFactory;
    }

    /** Switches to the default {@link PTBTokenizer} factory. */
    public void usePTBTokenizer() {
        this.tokenizerFactory = PTBTokenizer.factory();
    }

    /** Switches to the default {@link WhitespaceTokenizer} factory. */
    public void useWhitespaceTokenizer() {
        this.tokenizerFactory = WhitespaceTokenizer.factory();
    }

    /**
     * Tokenizes the text found at a file path or URL.
     *
     * @param fileOrURL local file name, or an http(s)/ftp(s) URL
     * @return the tokens produced by the configured tokenizer
     * @throws IOException if the resource cannot be opened or closed
     */
    public List<Word> getWordsFromText(String fileOrURL) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getWordsFromText(reader);
        } finally {
            reader.close();
        }
    }

    /**
     * Tokenizes everything readable from {@code input}.
     *
     * @param input source text; it is wrapped in a {@link BufferedReader} and not closed
     * @return the tokens produced by the configured tokenizer
     */
    @SuppressWarnings("unchecked") // the factory is used raw, so tokenize() yields a raw List
    public List<Word> getWordsFromText(Reader input) {
        Tokenizer tokenizer = this.tokenizerFactory.getTokenizer(new BufferedReader(input));
        return tokenizer.tokenize();
    }

    /**
     * Splits the text found at a file path or URL into sentences, without escaping,
     * without an explicit sentence delimiter and without tags.
     *
     * @param fileOrURL local file name, or an http(s)/ftp(s) URL
     * @return one list of words per sentence
     * @throws IOException if the resource cannot be opened or closed
     */
    public List<List<? extends HasWord>> getSentencesFromText(String fileOrURL) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getSentencesFromText(reader);
        } finally {
            reader.close();
        }
    }

    /**
     * Splits the text found at a file path or URL into sentences.
     *
     * @param fileOrURL local file name, or an http(s)/ftp(s) URL
     * @param doPTBEscaping {@code true} to run a {@link PTBEscapingProcessor} over the tokens
     * @param sentenceDelimiter token that separates sentences, or {@code null} to use the sentence splitter
     * @param tagDelimiter character (as an int) separating word and tag, or a negative value for none
     * @return one list of words per sentence
     * @throws IOException if the resource cannot be opened or closed
     * @throws RuntimeException if tags are requested while the tokenizer is not a whitespace tokenizer
     */
    public List<List<? extends HasWord>> getSentencesFromText(String fileOrURL, boolean doPTBEscaping, String sentenceDelimiter, int tagDelimiter) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getSentencesFromText(reader, doPTBEscaping, sentenceDelimiter, tagDelimiter);
        } finally {
            reader.close();
        }
    }

    /**
     * Splits the text readable from {@code input} into sentences, without escaping,
     * without an explicit sentence delimiter and without tags.
     *
     * @param input source text; not closed by this method
     * @return one list of words per sentence
     */
    public List<List<? extends HasWord>> getSentencesFromText(Reader input) {
        return this.getSentencesFromText(input, false, null, -1);
    }

    /**
     * Splits the text found at a file path or URL into sentences, applying a custom escaper.
     *
     * @param input local file name, or an http(s)/ftp(s) URL (despite the parameter name, not raw text)
     * @param escaper transformation applied to token lists, or {@code null} for none
     * @param sentenceDelimiter token that separates sentences, or {@code null} to use the sentence splitter
     * @param tagDelimiter character (as an int) separating word and tag, or a negative value for none
     * @return one list of words per sentence
     * @throws IOException if the resource cannot be opened or closed
     * @throws RuntimeException if tags are requested while the tokenizer is not a whitespace tokenizer
     */
    public List<List<? extends HasWord>> getSentencesFromText(String input, Function<List<HasWord>, List<HasWord>> escaper, String sentenceDelimiter, int tagDelimiter) throws IOException {
        Reader reader = this.fileOrURLToReader(input);
        try {
            return this.getSentencesFromText(reader, escaper, sentenceDelimiter, tagDelimiter);
        } finally {
            reader.close();
        }
    }

    /**
     * Splits the text readable from {@code input} into sentences at an explicit delimiter token,
     * without escaping and without tags.
     *
     * @param input source text; not closed by this method
     * @param sentenceDelimiter token that separates sentences, or {@code null} to use the sentence splitter
     * @return one list of words per sentence
     */
    public List<List<? extends HasWord>> getSentencesFromText(Reader input, String sentenceDelimiter) {
        return this.getSentencesFromText(input, null, sentenceDelimiter, -1);
    }

    /**
     * Core sentence reader that every other {@code getSentencesFromText} overload ends up in.
     *
     * <p>Behaviour depends on the configured tokenizer factory:
     * <ul>
     * <li>Whitespace tokenizer factory: the input is read as pre-tokenized text, optionally
     *     carrying {@code word<tagDelimiter>tag} tokens.</li>
     * <li>Any other factory with no {@code sentenceDelimiter}: tokenize, escape, then let a
     *     {@link WordToSentenceProcessor} find the sentence boundaries.</li>
     * <li>Any other factory with a {@code sentenceDelimiter}: split the whitespace-separated
     *     input at the delimiter first, then tokenize each sentence on its own.</li>
     * </ul>
     *
     * @param input source text; not closed by this method
     * @param escaper transformation applied to token lists, or {@code null} for none
     * @param sentenceDelimiter token that separates sentences, or {@code null} to use the sentence splitter
     * @param tagDelimiter character (as an int) separating word and tag, or a negative value for none
     * @return one list of words per sentence
     * @throws RuntimeException if {@code tagDelimiter >= 0} while the tokenizer is not a whitespace tokenizer
     */
    @SuppressWarnings("unchecked") // project tokenizers/processors are used raw here
    public List<List<? extends HasWord>> getSentencesFromText(Reader input, Function<List<HasWord>, List<HasWord>> escaper, String sentenceDelimiter, int tagDelimiter) {
        if (escaper == null) {
            escaper = new NullEscaper();
        }
        if (this.tokenizerFactory instanceof WhitespaceTokenizer.WhitespaceTokenizerFactory) {
            return this.sentencesFromWhitespaceTokens(input, escaper, sentenceDelimiter, tagDelimiter);
        }
        if (tagDelimiter >= 0) {
            throw new RuntimeException("Can't read tags from untokenized document.");
        }
        if (sentenceDelimiter == null) {
            Tokenizer tokenizer = this.tokenizerFactory.getTokenizer(new BufferedReader(input));
            List words = tokenizer.tokenize();
            words = escaper.apply(words);
            return this.newSentenceProcessor().process(words);
        }
        // Sentence boundaries are given explicitly: cut the raw whitespace-separated text at the
        // delimiter first, then run the real tokenizer over each sentence separately.
        // NOTE(review): the second constructor argument presumably keeps line ends as tokens -- confirm.
        WhitespaceTokenizer splitter = new WhitespaceTokenizer(input, true);
        List rawTokens = splitter.tokenize();
        rawTokens = escaper.apply(rawTokens);
        List<String> sentenceStrings = glueSentences(splitListsOnToken(rawTokens, sentenceDelimiter));
        return this.tokenizeSentences(sentenceStrings);
    }

    /**
     * Handles the whitespace-tokenizer mode of
     * {@link #getSentencesFromText(Reader, Function, String, int)}.
     */
    @SuppressWarnings("unchecked") // project tokenizers/processors are used raw here
    private List<List<? extends HasWord>> sentencesFromWhitespaceTokens(Reader input, Function<List<HasWord>, List<HasWord>> escaper, String sentenceDelimiter, int tagDelimiter) {
        if (sentenceDelimiter == null) {
            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(input, false);
            List words = tokenizer.tokenize();
            if (tagDelimiter >= 0) {
                words = new WordToTaggedWordProcessor((char) tagDelimiter).process(words);
            }
            words = escaper.apply(words);
            return this.newSentenceProcessor().process(words);
        }
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(input, "\n".equals(sentenceDelimiter));
        List tokens = tokenizer.tokenize();
        List<List<HasWord>> sentences = splitListsOnToken(tokens, sentenceDelimiter);
        if (tagDelimiter >= 0) {
            sentences = tagSplitSentences(sentences, tagDelimiter);
        }
        sentences = new ListEscaper(escaper).apply(sentences);
        // Shallow copy: a type-safe way to widen List<List<HasWord>> to the declared return type.
        return new ArrayList<List<? extends HasWord>>(sentences);
    }

    /**
     * Builds the sentence splitter, honouring {@link #setSentenceFinalPuncWords(String[])}
     * when a custom set of tokens was supplied.
     */
    private WordToSentenceProcessor newSentenceProcessor() {
        if (this.sentenceFinalPuncWords == null) {
            return new WordToSentenceProcessor();
        }
        return new WordToSentenceProcessor(new HashSet<String>(Arrays.asList(this.sentenceFinalPuncWords)));
    }

    /** Returns a {@link PTBEscapingProcessor} when escaping is requested, otherwise a no-op escaper. */
    @SuppressWarnings("unchecked") // PTBEscapingProcessor is instantiated raw
    private static Function<List<HasWord>, List<HasWord>> escaperFor(boolean doPTBEscaping) {
        Function escaper;
        if (doPTBEscaping) {
            escaper = new PTBEscapingProcessor();
        } else {
            escaper = new NullEscaper();
        }
        return escaper;
    }

    /**
     * Tokenizes a string held in memory.
     *
     * @param input the text itself (not a file name)
     * @return the tokens produced by the configured tokenizer
     */
    @SuppressWarnings("unchecked") // the factory is used raw, so tokenize() yields a raw List
    public List<Word> getWordsFromString(String input) {
        Tokenizer tokenizer = this.tokenizerFactory.getTokenizer(new BufferedReader(new StringReader(input)));
        return tokenizer.tokenize();
    }

    /**
     * Reads sentences from the elements named {@code splitOnTag}, with PTB escaping enabled.
     *
     * @param fileOrURL local file name, or an http(s)/ftp(s) URL
     * @param splitOnTag element name passed to {@link XMLBeginEndIterator}
     * @return one list of words per sentence
     * @throws IOException if the resource cannot be opened or closed
     */
    public List<List<? extends HasWord>> getSentencesFromXML(String fileOrURL, String splitOnTag) throws IOException {
        return this.getSentencesFromXML(fileOrURL, splitOnTag, null, true);
    }

    /**
     * Reads sentences from the elements named {@code splitOnTag}.
     *
     * @param fileOrURL local file name, or an http(s)/ftp(s) URL
     * @param splitOnTag element name passed to {@link XMLBeginEndIterator}
     * @param doPTBEscaping {@code true} to run a {@link PTBEscapingProcessor} over the tokens
     * @return one list of words per sentence
     * @throws IOException if the resource cannot be opened or closed
     */
    public List<List<? extends HasWord>> getSentencesFromXML(String fileOrURL, String splitOnTag, boolean doPTBEscaping) throws IOException {
        return this.getSentencesFromXML(fileOrURL, splitOnTag, null, doPTBEscaping);
    }

    /**
     * Reads sentences from the elements named {@code splitOnTag}.
     *
     * @param fileOrURL local file name, or an http(s)/ftp(s) URL
     * @param splitOnTag element name passed to {@link XMLBeginEndIterator}
     * @param sentenceDelimiter sentence-separating token, {@code "onePerElement"}, or {@code null}
     * @param doPTBEscaping {@code true} to run a {@link PTBEscapingProcessor} over the tokens
     * @return one list of words per sentence
     * @throws IOException if the resource cannot be opened or closed
     */
    public List<List<? extends HasWord>> getSentencesFromXML(String fileOrURL, String splitOnTag, String sentenceDelimiter, boolean doPTBEscaping) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getSentencesFromXML(reader, splitOnTag, sentenceDelimiter, doPTBEscaping);
        } finally {
            reader.close();
        }
    }

    /**
     * Reads sentences from the elements named {@code splitOnTag}.
     *
     * @param input XML source; not closed by this method
     * @param splitOnTag element name passed to {@link XMLBeginEndIterator}
     * @param sentenceDelimiter sentence-separating token, {@code "onePerElement"}, or {@code null}
     * @param doPTBEscaping {@code true} to run a {@link PTBEscapingProcessor} over the tokens
     * @return one list of words per sentence
     */
    public List<List<? extends HasWord>> getSentencesFromXML(Reader input, String splitOnTag, String sentenceDelimiter, boolean doPTBEscaping) {
        return this.getSentencesFromXML(input, escaperFor(doPTBEscaping), splitOnTag, sentenceDelimiter);
    }

    /**
     * Reads sentences from the elements named {@code splitOnTag}, using a custom escaper and
     * no explicit sentence delimiter.
     *
     * @param fileOrURL local file name, or an http(s)/ftp(s) URL
     * @param escaper transformation applied to token lists, or {@code null} for none
     * @param splitOnTag element name passed to {@link XMLBeginEndIterator}
     * @return one list of words per sentence
     * @throws IOException if the resource cannot be opened or closed
     */
    public List<List<? extends HasWord>> getSentencesFromXML(String fileOrURL, Function<List<HasWord>, List<HasWord>> escaper, String splitOnTag) throws IOException {
        return this.getSentencesFromXML(fileOrURL, escaper, splitOnTag, null);
    }

    /**
     * Reads sentences from the elements named {@code splitOnTag}, using a custom escaper.
     *
     * @param fileOrURL local file name, or an http(s)/ftp(s) URL
     * @param escaper transformation applied to token lists, or {@code null} for none
     * @param splitOnTag element name passed to {@link XMLBeginEndIterator}
     * @param sentenceDelimiter sentence-separating token, {@code "onePerElement"}, or {@code null}
     * @return one list of words per sentence
     * @throws IOException if the resource cannot be opened or closed
     */
    public List<List<? extends HasWord>> getSentencesFromXML(String fileOrURL, Function<List<HasWord>, List<HasWord>> escaper, String splitOnTag, String sentenceDelimiter) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getSentencesFromXML(reader, escaper, splitOnTag, sentenceDelimiter);
        } finally {
            reader.close();
        }
    }

    /**
     * Core XML reader: iterates over the pieces yielded by an {@link XMLBeginEndIterator} for
     * {@code splitOnTag} and runs each piece through
     * {@link #getSentencesFromText(Reader, Function, String, int)} with tags disabled.
     *
     * @param input XML source; not closed by this method
     * @param escaper transformation applied to token lists, or {@code null} for none
     * @param splitOnTag element name passed to {@link XMLBeginEndIterator}
     * @param sentenceDelimiter sentence-separating token, or {@code null}; the value
     *        {@code "onePerElement"} is replaced by an internal placeholder delimiter
     * @return the sentences of all pieces, in document order
     */
    public List<List<? extends HasWord>> getSentencesFromXML(Reader input, Function<List<HasWord>, List<HasWord>> escaper, String splitOnTag, String sentenceDelimiter) {
        String delimiter = ONE_PER_ELEMENT.equals(sentenceDelimiter) ? ONE_PER_ELEMENT_DELIMITER : sentenceDelimiter;
        List<List<? extends HasWord>> sentences = new ArrayList<List<? extends HasWord>>();
        XMLBeginEndIterator xmlIter = new XMLBeginEndIterator(input, splitOnTag);
        while (xmlIter.hasNext()) {
            String element = (String) xmlIter.next();
            Reader elementReader = new BufferedReader(new StringReader(element));
            sentences.addAll(this.getSentencesFromText(elementReader, escaper, delimiter, -1));
        }
        return sentences;
    }

    /**
     * Extracts the text of an HTML document and tokenizes it.
     *
     * @param fileOrURL local file name, or an http(s)/ftp(s) URL
     * @return the tokens, or {@code null} if the HTML parser failed with an {@link IOException}
     * @throws IOException if the resource cannot be opened or closed
     */
    public List<Word> getWordsFromHTML(String fileOrURL) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getWordsFromHTML(reader);
        } finally {
            reader.close();
        }
    }

    /**
     * Extracts the text of an HTML document with {@link HTMLParser} and tokenizes it.
     *
     * @param input HTML source; not closed by this method
     * @return the tokens, or {@code null} if the HTML parser failed with an {@link IOException}
     *         (the failure is reported on {@code System.err}); callers must check for {@code null}
     */
    public List<Word> getWordsFromHTML(Reader input) {
        HTMLParser parser = new HTMLParser();
        try {
            String text = parser.parse(input);
            return this.getWordsFromText(new StringReader(text));
        } catch (IOException e) {
            // Kept as best-effort for backward compatibility: report and signal failure with null.
            System.err.println("IOException" + e.getMessage());
            return null;
        }
    }

    /**
     * Extracts the text of an HTML document and splits it into sentences.
     *
     * @param fileOrURL local file name, or an http(s)/ftp(s) URL
     * @return one list of words per sentence, or {@code null} if the HTML parser failed
     * @throws IOException if the resource cannot be opened or closed
     */
    public List<List<? extends HasWord>> getSentencesFromHTML(String fileOrURL) throws IOException {
        Reader reader = this.fileOrURLToReader(fileOrURL);
        try {
            return this.getSentencesFromHTML(reader);
        } finally {
            reader.close();
        }
    }

    /**
     * Extracts the text of an HTML document with {@link HTMLParser} and splits it into sentences.
     *
     * @param input HTML source; not closed by this method
     * @return one list of words per sentence, or {@code null} if the HTML parser failed with an
     *         {@link IOException} (the failure is reported on {@code System.err})
     */
    public List<List<? extends HasWord>> getSentencesFromHTML(Reader input) {
        HTMLParser parser = new HTMLParser();
        try {
            String text = parser.parse(input);
            return this.getSentencesFromText(new StringReader(text));
        } catch (IOException e) {
            // Kept as best-effort for backward compatibility: report and signal failure with null.
            System.err.println("IOException" + e.getMessage());
            return null;
        }
    }

    /** Boolean-flag convenience form of {@link #getSentencesFromText(Reader, Function, String, int)}. */
    private List<List<? extends HasWord>> getSentencesFromText(Reader fileOrURL, boolean doPTBEscaping, String sentenceDelimiter, int tagDelimiter) {
        return this.getSentencesFromText(fileOrURL, escaperFor(doPTBEscaping), sentenceDelimiter, tagDelimiter);
    }

    /**
     * Cuts a token list into sentences at every token equal to {@code sentenceDelimiter}.
     * The delimiter tokens themselves are dropped. A delimiter always closes the current
     * sentence, even an empty one; only a trailing empty sentence is omitted.
     */
    private static List<List<HasWord>> splitListsOnToken(List<HasWord> tokens, String sentenceDelimiter) {
        List<List<HasWord>> result = new ArrayList<List<HasWord>>();
        List<HasWord> sentence = new ArrayList<HasWord>();
        for (HasWord word : tokens) {
            // Delimiter on the left so a token whose word() is null cannot trigger an NPE.
            if (sentenceDelimiter.equals(word.word())) {
                result.add(sentence);
                sentence = new ArrayList<HasWord>();
            } else {
                sentence.add(word);
            }
        }
        if (!sentence.isEmpty()) {
            result.add(sentence);
        }
        return result;
    }

    /** Joins every sentence back into a single space-separated string. */
    private static List<String> glueSentences(List<List<HasWord>> sentences) {
        List<String> result = new ArrayList<String>(sentences.size());
        for (List<HasWord> sentence : sentences) {
            result.add(glueSentence(sentence));
        }
        return result;
    }

    /** Joins the words of one sentence with single spaces; an empty sentence yields {@code ""}. */
    private static String glueSentence(List<HasWord> sentence) {
        StringBuilder result = new StringBuilder();
        String separator = "";
        for (HasWord word : sentence) {
            result.append(separator).append(word.word());
            separator = " ";
        }
        return result.toString();
    }

    /** Runs the configured tokenizer over each sentence string independently. */
    @SuppressWarnings("unchecked") // the factory is used raw, so tokenize() yields a raw List
    private List<List<? extends HasWord>> tokenizeSentences(List<String> sentences) {
        List<List<? extends HasWord>> result = new ArrayList<List<? extends HasWord>>(sentences.size());
        for (String sentence : sentences) {
            Tokenizer tok = this.tokenizerFactory.getTokenizer(new StringReader(sentence));
            List tokens = tok.tokenize();
            result.add(tokens);
        }
        return result;
    }

    /** Runs a {@link WordToTaggedWordProcessor} built from {@code tagDelimiter} over each sentence. */
    @SuppressWarnings("unchecked") // WordToTaggedWordProcessor is used raw
    private static List<List<HasWord>> tagSplitSentences(List<List<HasWord>> sentences, int tagDelimiter) {
        List<List<HasWord>> result = new ArrayList<List<HasWord>>(sentences.size());
        WordToTaggedWordProcessor wttwp = new WordToTaggedWordProcessor((char) tagDelimiter);
        for (List<HasWord> sentence : sentences) {
            List tagged = wttwp.process(sentence);
            result.add(tagged);
        }
        return result;
    }

    /**
     * Opens {@code fileOrURL} for reading and echoes the name to {@code System.err}.
     * URLs (see {@link #urlPattern}) are fetched in full via {@code StringUtils.slurpURL};
     * anything else is opened as a local file, using {@link #encoding} when one is set.
     *
     * @return a reader that the caller must close
     * @throws IOException if the resource cannot be opened or the encoding is unsupported
     */
    private Reader fileOrURLToReader(String fileOrURL) throws IOException {
        System.err.println(fileOrURL);
        Matcher m = urlPattern.matcher(fileOrURL);
        if (m.matches()) {
            URL url = new URL(fileOrURL);
            return new BufferedReader(new StringReader(StringUtils.slurpURL(url)));
        }
        if (this.encoding == null) {
            return new FileReader(fileOrURL);
        }
        FileInputStream stream = new FileInputStream(fileOrURL);
        boolean wrapped = false;
        try {
            Reader reader = new BufferedReader(new InputStreamReader(stream, this.encoding));
            wrapped = true;
            return reader;
        } finally {
            // An unsupported encoding name must not leave the file handle open.
            if (!wrapped) {
                stream.close();
            }
        }
    }

    /**
     * Command-line entry point. Reads the document named by {@code -file} and prints each
     * resulting list to {@code System.out}, with diagnostics on {@code System.err}.
     *
     * <p>Recognised options: {@code -file name}, {@code -xml tag}, {@code -html},
     * {@code -noSplitSentence}, {@code -suppressEscaping}, {@code -noTokenization} and
     * {@code -plainOutput}. Unrecognised arguments are ignored.
     *
     * @param args command-line arguments
     * @throws IOException if the input cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 0) {
            System.err.println(USAGE);
            return;
        }
        boolean splitSentences = true;
        boolean suppressEscaping = false;
        boolean noTokenization = false;
        boolean plainOutput = false;
        String xmlTag = null;
        int fileType = PLAIN;
        String file = null;
        for (int i = 0; i < args.length; ++i) {
            String arg = args[i];
            if (arg.equals("-file") || arg.equals("-xml")) {
                if (i + 1 >= args.length) {
                    System.err.println("Missing value after " + arg);
                    System.err.println(USAGE);
                    return;
                }
                String value = args[++i];
                if (arg.equals("-file")) {
                    file = value;
                } else {
                    fileType = XML;
                    xmlTag = value;
                }
            } else if (arg.equals("-html")) {
                fileType = HTML;
            } else if (arg.equals("-noSplitSentence")) {
                splitSentences = false;
            } else if (arg.equals("-suppressEscaping")) {
                suppressEscaping = true;
            } else if (arg.equals("-noTokenization")) {
                noTokenization = true;
            } else if (arg.equals("-plainOutput")) {
                plainOutput = true;
            }
        }
        if (file == null) {
            System.err.println("No input specified; use -file filename");
            System.err.println(USAGE);
            return;
        }
        DocumentPreprocessor docPreprocessor = noTokenization ? new DocumentPreprocessor(WhitespaceTokenizer.factory(true)) : new DocumentPreprocessor(suppressEscaping);
        System.err.println("Tokenizer: " + docPreprocessor.tokenizerFactory.getClass());
        List<List<?>> docs = new ArrayList<List<?>>();
        switch (fileType) {
            case XML: {
                docs.addAll(docPreprocessor.getSentencesFromXML(file, xmlTag, !suppressEscaping));
                break;
            }
            case HTML: {
                // The HTML readers return null when parsing fails; treat that as "nothing read".
                if (splitSentences) {
                    List<List<? extends HasWord>> sentences = docPreprocessor.getSentencesFromHTML(file);
                    if (sentences != null) {
                        docs.addAll(sentences);
                    }
                } else {
                    List<Word> words = docPreprocessor.getWordsFromHTML(file);
                    if (words != null) {
                        docs.add(words);
                    }
                }
                break;
            }
            default: {
                if (splitSentences) {
                    docs.addAll(docPreprocessor.getSentencesFromText(file));
                } else {
                    docs.add(docPreprocessor.getWordsFromText(file));
                }
                break;
            }
        }
        System.err.println("Read in " + docs.size() + " sentences.");
        for (List<?> doc : docs) {
            System.err.println("Length: " + doc.size());
            if (!plainOutput) {
                System.out.println(doc);
                continue;
            }
            for (int i = 0; i < doc.size(); ++i) {
                if (i > 0) {
                    System.out.print(" ");
                }
                System.out.print(doc.get(i));
            }
            System.out.println();
        }
    }

    /** Lifts a per-sentence escaper so that it can be applied to a whole list of sentences. */
    private static class ListEscaper
    implements Function<List<List<HasWord>>, List<List<HasWord>>> {
        private final Function<List<HasWord>, List<HasWord>> f;

        public ListEscaper(Function<List<HasWord>, List<HasWord>> f) {
            this.f = f;
        }

        /** Applies the wrapped escaper to every sentence and returns the results in a new list. */
        @Override
        public List<List<HasWord>> apply(List<List<HasWord>> lists) {
            List<List<HasWord>> result = new ArrayList<List<HasWord>>(lists.size());
            for (List<HasWord> l : lists) {
                result.add(this.f.apply(l));
            }
            return result;
        }
    }

    /** Escaper that performs no escaping: it returns its argument unchanged. */
    private static class NullEscaper
    implements Function<List<HasWord>, List<HasWord>> {
        private NullEscaper() {
        }

        @Override
        public List<HasWord> apply(List<HasWord> hasWords) {
            return hasWords;
        }
    }
}

