package edu.cmu.minorthird.text;

import cern.colt.matrix.impl.AbstractFormatter;
import edu.cmu.minorthird.classify.ExampleSchema;
import edu.cmu.minorthird.util.ProgressCounter;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.ObjectOutputStream;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;

/* loaded from: input_file:edu/cmu/minorthird/text/TextBaseLoader.class */
public class TextBaseLoader {
    // NONE, DIRECTORY_NAME, FILE_NAME and IN_FILE are not referenced anywhere else in
    // this class; presumably legacy document-ID source selectors — TODO confirm.
    public static final int NONE = 0;
    public static final int DIRECTORY_NAME = 1;
    public static final int FILE_NAME = 2;
    public static final int IN_FILE = 3;
    /** Document style: every line of a loaded file becomes its own document. */
    public static final int DOC_PER_LINE = 0;
    /** Document style: the whole content of a loaded file becomes one document. */
    public static final int DOC_PER_FILE = 1;
    /** Markup flag value: tags embedded in the text are parsed into labelled spans. */
    public static final boolean USE_XML = true;
    /** Markup flag value: the text is loaded verbatim, tags are not interpreted. */
    public static final boolean IGNORE_XML = false;
    // One of DOC_PER_LINE / DOC_PER_FILE; compared against 0 and 1 in loadFile.
    private int documentStyle;
    // When true, loadFile passes every line through labelLine.
    private boolean use_markup;
    // When true, loadDirectory descends into sub-directories.
    private boolean recurseDirectories;
    // Class logger, assigned in the static initialiser at the bottom of the class.
    private static Logger log;
    // Passed to TextLabelsLoader.closeLabels after each document; every constructor sets it to 1.
    private int closurePolicy;
    // Labels collected while loading; created lazily by the load methods.
    private MutableTextLabels labels;
    // Text base that receives the loaded documents; created lazily by load().
    private TextBase textBase;
    // Optional tokenizer; when null, documents are loaded through the two-argument loadDocument.
    private Tokenizer tokenizer;
    // ID assigned to the next document handed to addDocument.
    private String curDocID;
    // Regex matching an opening or closing tag; group 1 is the tag name.
    private Pattern markupPattern;
    // Open tags (StackEntry objects) seen by labelLine; created lazily and never cleared
    // in this class, so unmatched open tags survive across lines and files.
    private ArrayList stack;
    // CharSpan objects waiting to be attached to the next document; reset by addDocument.
    private List spanList;
    // Per-token property strings collected by loadWordPerLineFile; null until that method runs.
    private List tokenPropList;
    // Synthetic cache for the class literal, filled by the static initialiser.
    static Class class$edu$cmu$minorthird$text$TextBaseLoader;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:edu/cmu/minorthird/text/TextBaseLoader$CharSpan.class */
    /**
     * A labelled range of character offsets inside one document. {@code lo} is the
     * first character of the range and {@code hi} is one past the last character
     * (the consumer in {@code addDocument} scans {@code lo <= i < hi}).
     */
    public class CharSpan {
        /** Inclusive start offset. */
        public int lo;
        /** Exclusive end offset. */
        public int hi;
        /** Label (tag name) attached to the range. */
        String type;
        /** ID of the document the offsets refer to. */
        String docID;
        // Explicit enclosing-instance reference left by the decompiler; never read.
        private final TextBaseLoader this$0;

        /**
         * @param textBaseLoader the enclosing loader
         * @param i              inclusive start offset
         * @param i2             exclusive end offset
         * @param str            label of the span
         * @param str2           ID of the owning document
         */
        public CharSpan(TextBaseLoader textBaseLoader, int i, int i2, String str, String str2) {
            this$0 = textBaseLoader;
            lo = i;
            hi = i2;
            type = str;
            docID = str2;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:edu/cmu/minorthird/text/TextBaseLoader$StackEntry.class */
    /**
     * Records an opening tag that has not been closed yet: the tag name and the
     * offset in the tag-free output buffer at which the tag was encountered.
     */
    public class StackEntry {
        /** Length of the output buffer when the opening tag was seen. */
        public int index;
        /** Name of the opening tag. */
        public String markupTag;
        // Explicit enclosing-instance reference left by the decompiler; never read.
        private final TextBaseLoader this$0;

        /**
         * @param textBaseLoader the enclosing loader
         * @param i              output-buffer offset of the opening tag
         * @param str            tag name
         */
        public StackEntry(TextBaseLoader textBaseLoader, int i, String str) {
            this$0 = textBaseLoader;
            index = i;
            markupTag = str;
        }
    }

    /**
     * Creates a loader with the default configuration: one document per file,
     * embedded markup interpreted, no directory recursion, closure policy 1 and
     * no explicit tokenizer.
     */
    public TextBaseLoader() {
        documentStyle = DOC_PER_FILE;
        use_markup = USE_XML;
        recurseDirectories = false;
        closurePolicy = 1;
        tokenizer = null;
        tokenPropList = null;
        // Matches "<tag>", "</tag>" and "<tag attrs>"; group 1 captures the tag name.
        markupPattern = Pattern.compile("</?([^ ><]+)( [^<>]+)?>");
    }

    /**
     * Creates a loader with the default configuration and the given document style.
     *
     * @param i document style, {@link #DOC_PER_LINE} or {@link #DOC_PER_FILE}
     */
    public TextBaseLoader(int i) {
        this();
        documentStyle = i;
    }

    /**
     * Creates a loader with the given document style and markup handling; all
     * other settings keep their defaults.
     *
     * @param i document style, {@link #DOC_PER_LINE} or {@link #DOC_PER_FILE}
     * @param z {@link #USE_XML} to interpret embedded tags, {@link #IGNORE_XML} otherwise
     */
    public TextBaseLoader(int i, boolean z) {
        this(i);
        use_markup = z;
    }

    /**
     * Creates a loader with the given document style, markup handling and
     * directory-recursion setting.
     *
     * @param i  document style, {@link #DOC_PER_LINE} or {@link #DOC_PER_FILE}
     * @param z  {@link #USE_XML} to interpret embedded tags, {@link #IGNORE_XML} otherwise
     * @param z2 whether sub-directories are loaded recursively
     */
    public TextBaseLoader(int i, boolean z, boolean z2) {
        this(i, z);
        recurseDirectories = z2;
    }

    /**
     * Creates a loader with the given document style. The second argument is
     * accepted but not used by this implementation.
     *
     * @param i  document style, {@link #DOC_PER_LINE} or {@link #DOC_PER_FILE}
     * @param i2 ignored
     */
    public TextBaseLoader(int i, int i2) {
        this(i);
    }

    /**
     * Creates a loader with the given document style and markup handling. The
     * second argument is accepted but not used by this implementation.
     *
     * @param i  document style, {@link #DOC_PER_LINE} or {@link #DOC_PER_FILE}
     * @param i2 ignored
     * @param z  {@link #USE_XML} to interpret embedded tags, {@link #IGNORE_XML} otherwise
     */
    public TextBaseLoader(int i, int i2, boolean z) {
        this(i, z);
    }

    /**
     * Creates a loader with the given document style. The remaining three
     * arguments are accepted but not used by this implementation.
     *
     * @param i  document style, {@link #DOC_PER_LINE} or {@link #DOC_PER_FILE}
     * @param i2 ignored
     * @param i3 ignored
     * @param i4 ignored
     */
    public TextBaseLoader(int i, int i2, int i3, int i4) {
        this(i);
    }

    /**
     * Creates a loader with the given document style, markup handling and
     * directory-recursion setting. The three middle integer arguments are
     * accepted but not used by this implementation.
     *
     * @param i  document style, {@link #DOC_PER_LINE} or {@link #DOC_PER_FILE}
     * @param i2 ignored
     * @param i3 ignored
     * @param i4 ignored
     * @param z  {@link #USE_XML} to interpret embedded tags, {@link #IGNORE_XML} otherwise
     * @param z2 whether sub-directories are loaded recursively
     */
    public TextBaseLoader(int i, int i2, int i3, int i4, boolean z, boolean z2) {
        this(i, z, z2);
    }

    /**
     * Loads a file, or every file of a directory, into this loader's text base
     * without an explicit tokenizer. The text base and the labels are created on
     * first use and reused by later calls.
     *
     * @param file file or directory to load
     * @return the text base holding the loaded documents
     * @throws IOException    if a file cannot be read
     * @throws ParseException if embedded markup is malformed
     */
    public TextBase load(File file) throws IOException, ParseException {
        if (null == textBase) {
            textBase = new BasicTextBase();
        }
        if (null == labels) {
            labels = new BasicTextLabels(textBase);
        }
        clear();
        // Any tokenizer left over from an earlier call is discarded.
        tokenizer = null;
        if (!file.isDirectory()) {
            loadFile(file);
        } else {
            loadDirectory(file);
        }
        return textBase;
    }

    /**
     * Loads a file, or every file of a directory, into this loader's text base,
     * tokenizing each document with the supplied tokenizer. The text base and the
     * labels are created on first use and reused by later calls.
     *
     * @param file      file or directory to load
     * @param tokenizer tokenizer applied to every loaded document; when
     *                  {@code null} documents are loaded without an explicit tokenizer
     * @return the text base holding the loaded documents
     * @throws IOException    if a file cannot be read
     * @throws ParseException if embedded markup is malformed
     */
    public TextBase load(File file, Tokenizer tokenizer) throws IOException, ParseException {
        if (this.textBase == null) {
            this.textBase = new BasicTextBase();
        }
        if (this.labels == null) {
            this.labels = new BasicTextLabels(this.textBase);
        }
        // The former "curDocID + 2" assignment was dead: clear() resets the ID right away.
        clear();
        this.tokenizer = tokenizer;
        if (file.isDirectory()) {
            loadDirectory(file);
        } else {
            loadFile(file);
        }
        return this.textBase;
    }

    /**
     * Switches interpretation of embedded markup on or off for subsequent loads.
     *
     * @param z {@code true} to parse tags into labelled spans, {@code false} to load text verbatim
     */
    public void setLabelsInFile(boolean z) {
        use_markup = z;
    }

    /**
     * Re-tokenizes the given text base and makes the result this loader's text base.
     *
     * @param textBase  text base to re-tokenize; must be a {@code BasicTextBase}
     * @param tokenizer tokenizer to apply
     * @return the re-tokenized text base
     * @throws ClassCastException if {@code textBase} is not a {@code BasicTextBase}
     */
    public TextBase retokenize(TextBase textBase, Tokenizer tokenizer) throws IOException, ParseException {
        // NOTE(review): labels are created against the loader's *previous* text base
        // (possibly null), not the re-tokenized one assigned below — confirm intent.
        if (labels == null) {
            labels = new BasicTextLabels(this.textBase);
        }
        BasicTextBase source = (BasicTextBase) textBase;
        this.textBase = source.retokenize(tokenizer);
        return this.textBase;
    }

    /**
     * Writes the given text base to a file using Java object serialization.
     *
     * @param textBase text base to serialize
     * @param file     destination file; overwritten if it exists
     * @throws IOException if the file cannot be opened or written
     */
    public void writeSerialized(TextBase textBase, File file) throws IOException {
        FileOutputStream fileOut = new FileOutputStream(file);
        try {
            ObjectOutputStream objectOut = new ObjectOutputStream(new BufferedOutputStream(fileOut));
            objectOut.writeObject(textBase);
            objectOut.flush();
            objectOut.close();
        } finally {
            // Releases the file handle even when serialization fails; a second
            // close after the successful path above is a no-op.
            fileOut.close();
        }
    }

    /**
     * Returns the labels collected so far.
     *
     * @return the loader's labels, or {@code null} if nothing has been loaded yet
     */
    public MutableTextLabels getLabels() {
        return labels;
    }

    /**
     * Convenience method: loads a file or directory with one document per file
     * and embedded markup interpreted, and returns the resulting labels.
     *
     * @param file file or directory to load
     * @return the labels produced while loading
     * @throws ParseException if embedded markup is malformed
     * @throws IOException    if a file cannot be read
     */
    public static MutableTextLabels loadDirOfTaggedFiles(File file) throws ParseException, IOException {
        TextBaseLoader loader = new TextBaseLoader(DOC_PER_FILE, USE_XML);
        loader.load(file);
        MutableTextLabels loaded = loader.getLabels();
        return loaded;
    }

    /**
     * Loads tagged files with a fresh one-document-per-file, markup-aware loader.
     * Failures are logged and otherwise ignored (best effort).
     *
     * NOTE(review): {@code textBase} is never used and the documents and labels
     * produced by the temporary loader are discarded, so this method has no
     * visible effect on this instance — confirm whether callers rely on it.
     *
     * @param textBase unused
     * @param file     file or directory to load
     */
    public void loadTaggedFiles(TextBase textBase, File file) throws IOException, FileNotFoundException {
        try {
            new TextBaseLoader(DOC_PER_FILE, USE_XML).load(file);
        } catch (Exception e) {
            // Keep the original best-effort contract, but report through the class
            // logger (with context) instead of dumping a bare stack trace to stderr.
            log.error("failed to load tagged files from " + file, e);
        }
    }

    /**
     * Convenience method: loads a file treating every line as a separate document.
     *
     * @param file file or directory to load
     * @param z    accepted for compatibility; not used by this implementation
     * @return the loaded text base, or {@code null} if loading failed (the
     *         failure is logged rather than propagated)
     */
    public static TextBase loadDocPerLine(File file, boolean z) throws ParseException, IOException {
        try {
            return new TextBaseLoader(DOC_PER_LINE).load(file);
        } catch (Exception e) {
            // Callers may depend on the null-on-failure contract, so keep it; report
            // through the class logger instead of printing a stack trace to stderr.
            log.error("failed to load one-document-per-line file " + file, e);
            return null;
        }
    }

    /**
     * Loads every regular file in a directory, skipping entries named "CVS" and
     * descending into sub-directories only when recursion is enabled. Progress is
     * reported once per processed (non-CVS) entry.
     *
     * @param file directory to load
     * @throws IllegalArgumentException if the directory cannot be listed
     * @throws IOException              if a file cannot be read
     * @throws ParseException           if embedded markup is malformed
     */
    private void loadDirectory(File file) throws IOException, ParseException {
        File[] entries = file.listFiles();
        if (entries == null) {
            throw new IllegalArgumentException("can't list directory " + file.getName());
        }
        ProgressCounter counter = new ProgressCounter("loading directory " + file.getName(), "file", entries.length);
        for (int idx = 0; idx < entries.length; idx++) {
            File entry = entries[idx];
            if ("CVS".equals(entry.getName())) {
                // Version-control bookkeeping directory: neither loaded nor counted.
                continue;
            }
            if (entry.isDirectory() && recurseDirectories) {
                loadDirectory(entry);
            }
            if (entry.isFile()) {
                loadFile(entry);
            }
            counter.progress();
        }
        counter.finished();
    }

    /**
     * Loads a single file according to the configured document style.
     * <ul>
     * <li>{@link #DOC_PER_LINE}: every line becomes a document whose ID is
     *     {@code <file name>@line:<line number>}.</li>
     * <li>{@link #DOC_PER_FILE}: all lines are accumulated and added as one
     *     document whose ID is the file name.</li>
     * </ul>
     * When markup is enabled each line is passed through {@code labelLine}, which
     * strips the tags, appends the clean text to the buffer and records the
     * tagged spans.
     *
     * @param file file to load
     * @throws IOException    if the file cannot be read
     * @throws ParseException if embedded markup is malformed
     */
    private void loadFile(File file) throws IOException, ParseException {
        log.debug("loadFile: " + file.getName());
        boolean docPerLine = documentStyle == DOC_PER_LINE;
        // A LineNumberReader is a BufferedReader, so it serves both styles and
        // removes the need for a downcast when the line number is wanted.
        LineNumberReader reader = new LineNumberReader(new FileReader(file));
        try {
            curDocID = file.getName();
            spanList = new ArrayList();
            StringBuffer text = new StringBuffer();
            String line;
            // readLine() returning null is the reliable end-of-file signal;
            // ready() only says whether a read would block.
            while ((line = reader.readLine()) != null) {
                if (use_markup) {
                    // spanList is re-read from the field on every pass because
                    // addDocument replaces it with a fresh list.
                    line = labelLine(line, text, spanList);
                }
                if (docPerLine) {
                    curDocID = file.getName() + "@line:" + reader.getLineNumber();
                    addDocument(line);
                    text = new StringBuffer();
                } else if (!use_markup) {
                    // With markup enabled labelLine has already appended the line.
                    text.append(line);
                    text.append(AbstractFormatter.DEFAULT_ROW_SEPARATOR);
                }
            }
            if (documentStyle == DOC_PER_FILE) {
                addDocument(text.toString());
            }
        } finally {
            // Release the file handle even if parsing or loading fails.
            reader.close();
        }
    }

    /**
     * Loads a word-per-line file into the given text base. Each data line is
     * split on single whitespace characters; column 0 is the word, column 1 is
     * stored as a per-token property and column 3 is the span label ("O" means
     * no label). A line whose first column is {@code -DOCSTART-} ends the current
     * document and starts the next one. Document IDs are
     * {@code <file name>-<n>}, where {@code n} starts at 1 and is incremented at
     * every {@code -DOCSTART-} line. Lines with fewer than four columns
     * (including blank lines) are ignored.
     *
     * @param textBase text base that receives the documents; also becomes this
     *                 loader's text base
     * @param file     file to read
     * @throws IOException if the file cannot be read
     */
    public void loadWordPerLineFile(TextBase textBase, File file) throws IOException, FileNotFoundException {
        tokenizer = new Tokenizer(1, AbstractFormatter.DEFAULT_COLUMN_SEPARATOR);
        if (labels == null) {
            labels = new BasicTextLabels(textBase);
        }
        String baseName = file.getName();
        LineNumberReader reader = new LineNumberReader(new FileReader(file));
        try {
            this.textBase = textBase;
            StringBuffer text = new StringBuffer("");
            int docNumber = 1;
            curDocID = baseName + "-" + docNumber;
            spanList = new ArrayList();
            tokenPropList = new ArrayList();
            String line;
            while ((line = reader.readLine()) != null) {
                String[] columns = line.split("\\s");
                // A whitespace-only line splits into an empty array, so the
                // length must be checked before column 0 is touched.
                if (columns.length > 0 && columns[0].equals("-DOCSTART-")) {
                    tokenizer = new Tokenizer(1, AbstractFormatter.DEFAULT_COLUMN_SEPARATOR);
                    addDocument(text.toString());
                    spanList = new ArrayList();
                    tokenPropList = new ArrayList();
                    text = new StringBuffer("");
                    docNumber++;
                    curDocID = baseName + "-" + docNumber;
                } else if (columns.length > 3) {
                    // Column 3 is read below, so at least four columns are required.
                    int start = text.length();
                    text.append(columns[0]).append(AbstractFormatter.DEFAULT_COLUMN_SEPARATOR);
                    // Exclude the separator that was just appended after the word.
                    int end = text.length() - 1;
                    tokenPropList.add(columns[1]);
                    if (!columns[3].equals("O")) {
                        spanList.add(new CharSpan(this, start, end, columns[3], curDocID));
                    }
                }
            }
            // The words after the last -DOCSTART- line form a document too; it
            // used to be dropped silently at end of file.
            if (text.length() > 0) {
                addDocument(text.toString());
                tokenPropList = new ArrayList();
            }
        } finally {
            // Release the file handle even if loading fails part-way.
            reader.close();
        }
    }

    /**
     * Adds one document to the text base under the current document ID, then
     * attaches the pending character spans and per-token properties to it.
     * <p>
     * Each pending character span is converted to a token span through
     * {@code charIndexSubSpan}; a span that covers only spaces and newlines is
     * replaced by its left boundary. After the document is processed the labels
     * are closed with the configured closure policy and the pending span list is
     * reset. Empty text is rejected with a warning and nothing else happens.
     *
     * @param str text of the document
     */
    private void addDocument(String str) {
        if (str.length() == 0) {
            log.warn("Text for document " + curDocID + " is length zero or all white space, it will not be added to the text base.");
            return;
        }
        if (log.isDebugEnabled()) {
            log.debug("add document " + curDocID);
        }
        if (tokenizer == null) {
            textBase.loadDocument(curDocID, str);
        } else {
            textBase.loadDocument(curDocID, str, tokenizer);
        }

        // spanList is a raw List, so its elements must be cast explicitly.
        for (Iterator spans = spanList.iterator(); spans.hasNext();) {
            CharSpan charSpan = (CharSpan) spans.next();
            // Does the span contain anything other than spaces and newlines?
            boolean hasContent = false;
            for (int i = charSpan.lo; i < charSpan.hi && !hasContent; i++) {
                char c = str.charAt(i);
                hasContent = c != ' ' && c != '\n';
            }
            Span tokenSpan = hasContent
                    ? textBase.documentSpan(curDocID).charIndexSubSpan(charSpan.lo, charSpan.hi)
                    : textBase.documentSpan(curDocID).charIndexSubSpan(charSpan.lo, charSpan.hi).getLeftBoundary();
            if (log.isDebugEnabled()) {
                int shownEnd = Math.min(charSpan.hi, str.length());
                log.debug("approximating " + charSpan.type + " span '" + str.substring(charSpan.lo, shownEnd) + "' with token span '" + tokenSpan);
            }
            labels.addToType(tokenSpan, charSpan.type);
        }

        if (tokenPropList != null && tokenPropList.size() > 0) {
            TextToken[] tokens = tokenizer.splitIntoTokens(textBase.getDocument(curDocID), str);
            if (tokens.length != tokenPropList.size()) {
                log.warn("document " + curDocID + " has " + tokens.length + " tokens but " + tokenPropList.size() + " token properties");
            }
            Iterator props = tokenPropList.iterator();
            // Stop at whichever runs out first so a count mismatch cannot throw
            // NoSuchElementException.
            for (int t = 0; t < tokens.length && props.hasNext(); t++) {
                String property = (String) props.next();
                if (property != null && tokens[t] != null) {
                    labels.setProperty(tokens[t], ExampleSchema.POS_CLASS_NAME, property);
                }
            }
        }

        new TextLabelsLoader().closeLabels(labels, closurePolicy);
        spanList = new ArrayList();
    }

    /**
     * Strips markup tags from one line, appends the tag-free text plus a row
     * separator to {@code stringBuffer}, and records a {@code CharSpan} in
     * {@code list} for every closing tag that matches an open tag. Span offsets
     * are positions in {@code stringBuffer}. Open tags are kept on a stack that
     * persists between calls, so a tag may be opened on one line and closed on a
     * later one.
     *
     * @param str          the raw line, possibly containing tags
     * @param stringBuffer buffer that receives the tag-free text
     * @param list         receives one {@code CharSpan} per closed tag
     * @return the entire content of {@code stringBuffer} after this line was appended
     * @throws ParseException if a closing tag has no matching open tag
     */
    protected String labelLine(String str, StringBuffer stringBuffer, List list) throws ParseException {
        if (stack == null) {
            stack = new ArrayList();
        }
        // Position in str up to which text has already been copied.
        int consumed = 0;
        Matcher tags = markupPattern.matcher(str);
        while (tags.find()) {
            String tagName = tags.group(1);
            boolean isOpenTag = !tags.group().startsWith("</");
            if (log.isDebugEnabled()) {
                log.debug("matcher.group='" + tags.group() + "'");
                log.debug("found '" + tagName + "' tag ,open=" + isOpenTag + ", at " + tags.start() + " in:\n" + str);
            }
            // Copy the plain text between the previous tag and this one.
            stringBuffer.append(str.substring(consumed, tags.start()));
            consumed = tags.end();

            if (isOpenTag) {
                stack.add(new StackEntry(this, stringBuffer.length(), tagName));
                continue;
            }

            // Closing tag: search from the most recent open tag downwards. If no
            // entry matches, opener ends up as the bottom-most entry (or null
            // when the stack is empty), which selects the error reported below.
            StackEntry opener = null;
            for (int depth = stack.size() - 1; depth >= 0; depth--) {
                opener = (StackEntry) stack.get(depth);
                if (tagName.equals(opener.markupTag)) {
                    stack.remove(depth);
                    break;
                }
            }
            if (opener == null) {
                throw new ParseException("close '" + tagName + "' tag with no open in " + curDocID, 0);
            }
            if (!tagName.equals(opener.markupTag)) {
                throw new ParseException("close '" + tagName + "' tag paired with open '" + opener.markupTag + "'", opener.index);
            }
            if (log.isDebugEnabled()) {
                log.debug("adding a " + tagName + " span from " + opener.index + " to " + stringBuffer.length() + ": '" + stringBuffer.substring(opener.index) + "'");
            }
            list.add(new CharSpan(this, opener.index, stringBuffer.length(), tagName, curDocID));
        }
        // Copy whatever follows the last tag, then terminate the line.
        stringBuffer.append(str.substring(consumed));
        stringBuffer.append(AbstractFormatter.DEFAULT_ROW_SEPARATOR);
        return stringBuffer.toString();
    }

    /** Forgets the current document ID before a new load starts. */
    private void clear() {
        curDocID = null;
    }

    /**
     * Compiler-style helper that resolves a class literal by name.
     *
     * @param str fully qualified class name
     * @return the class with that name
     * @throws NoClassDefFoundError if the class cannot be found; the original
     *                              {@code ClassNotFoundException} is attached as its cause
     */
    static Class class$(String str) {
        try {
            return Class.forName(str);
        } catch (ClassNotFoundException e) {
            // initCause() is declared to return Throwable, so its result cannot be
            // thrown directly from a method without a throws clause.
            NoClassDefFoundError error = new NoClassDefFoundError();
            error.initCause(e);
            throw error;
        }
    }

    // Creates the class logger. The class token is resolved through the
    // synthetic cache field, which is filled on first use.
    static {
        if (class$edu$cmu$minorthird$text$TextBaseLoader == null) {
            class$edu$cmu$minorthird$text$TextBaseLoader = class$("edu.cmu.minorthird.text.TextBaseLoader");
        }
        log = Logger.getLogger(class$edu$cmu$minorthird$text$TextBaseLoader);
    }
}
