/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.trees.international.arabic;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.util.Function;
import edu.stanford.nlp.util.StringUtils;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Normalizes Arabic tokens written in IBM MT-style conventions.
 *
 * <p>For every word the escaper, in this order:
 * <ol>
 *   <li>warns (once per instance) when the word contains Arabic presentation
 *       forms or extended Arabic characters, which are left unmapped;</li>
 *   <li>replaces the alef variants U+0622, U+0623 and U+0625 with bare alef
 *       U+0627;</li>
 *   <li>replaces U+0649 with U+064A;</li>
 *   <li>replaces the sequence U+064A U+0621 with U+0626;</li>
 *   <li>deletes the marks U+064B..U+0655 and U+0670;</li>
 *   <li>maps Arabic punctuation, digits, dashes and curly quotes to ASCII;</li>
 *   <li>for words longer than one character: unwraps entities of the form
 *       {@code $name_(content)}, or else strips a leading {@code +}, or else
 *       strips a trailing {@code #}; then deletes tatweel (U+0640);</li>
 *   <li>for shorter words: rewrites {@code (}, {@code )} and {@code +} as
 *       {@code -LRB-}, {@code -RRB-} and {@code -PLUS-}.</li>
 * </ol>
 *
 * <p>Each kind of note or warning is printed to {@code System.err} at most
 * once per instance; the "already warned" state is kept in instance fields.
 */
public class IBMArabicEscaper
implements Function<List<HasWord>, List<HasWord>> {
    private static final Pattern pEnt = Pattern.compile("\\$[a-z]+_\\((.*?)\\)");
    private static final Pattern presForms = Pattern.compile("[\ufb50-\ufdff\ufe70-\ufefe]");
    private static final Pattern extendedArabic = Pattern.compile("[\u063b-\u063f\u0671-\u06ff\u0750-\u077f]");
    private static final Pattern alefVariants = Pattern.compile("[\u0622\u0623\u0625]");
    private static final Pattern pAM = Pattern.compile("\u0649");
    private static final Pattern pDel = Pattern.compile("[\u064b-\u0655\u0670]");
    private static final Pattern pTatweel = Pattern.compile("\u0640");
    private static final Pattern pYaaHamza = Pattern.compile("\u064a\u0621");

    /*
     * Character translation table handed to StringUtils.tr. TR_FROM and TR_TO
     * are kept in parallel groups so that the character at index i of TR_FROM
     * lines up with its replacement at index i of TR_TO. Both must have the
     * same length (38).
     */
    private static final String TR_FROM =
            "\u060c\u061b\u061f\u066a\u066b\u066c\u066d\u06d4"                // Arabic punctuation
            + "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669"  // digits U+0660..U+0669
            + "\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f"  // digits U+0966..U+096F
            + "\u2013\u2014"                                                  // en dash, em dash
            + "\u0091\u0092\u2018\u2019"                                      // single quotes
            + "\u0093\u0094\u201c\u201d";                                     // double quotes
    private static final String TR_TO =
            ",;?%.,*."   // the '?' for U+061F was previously missing, shifting every later mapping by one
            + "0123456789"
            + "0123456789"
            + "--"
            + "''''"
            + "\"\"\"\"";

    private boolean warnedPresentationForms;
    private boolean warnedExtendedArabic;
    private boolean warnedEntityEscaping;
    private boolean warnedNormalization;
    private boolean warnedDeletion;
    private boolean warnedProcliticEnclitic;

    /**
     * Applies all normalization steps to a single word.
     *
     * @param w the word to normalize; must not be {@code null}
     * @return the normalized word
     */
    private String escapeString(String w) {
        if (!this.warnedPresentationForms && presForms.matcher(w).find()) {
            System.err.println("IBMArabicEscaper Warning: encountering Arabic presentation form characters which are NOT mapped but just treated as unknown characters: " + w);
            this.warnedPresentationForms = true;
        }
        if (!this.warnedExtendedArabic && extendedArabic.matcher(w).find()) {
            System.err.println("IBMArabicEscaper Warning: encountering extended Arabic characters which are NOT mapped but just treated as unknown characters: " + w);
            this.warnedExtendedArabic = true;
        }

        // Order matters: U+0649 is rewritten to U+064A before the
        // U+064A U+0621 sequence is collapsed, so U+0649 U+0621 collapses too.
        w = this.replaceWithNormalizationNote(alefVariants, "\u0627", w);
        w = this.replaceWithNormalizationNote(pAM, "\u064a", w);
        w = this.replaceWithNormalizationNote(pYaaHamza, "\u0626", w);
        w = this.deleteWithNote(pDel, w);
        w = StringUtils.tr(w, TR_FROM, TR_TO);

        int wLen = w.length();
        if (wLen > 1) {
            Matcher mEnt = pEnt.matcher(w);
            if (mEnt.matches()) {
                if (!this.warnedEntityEscaping) {
                    System.err.println("IBMArabicEscaper Note: escaping IBM MT-style entities: " + mEnt.group(0) + " --> " + mEnt.group(1));
                    this.warnedEntityEscaping = true;
                }
                w = mEnt.replaceAll("$1");
            } else if (w.charAt(0) == '+') {
                this.noteProcliticEnclitic(w);
                w = w.substring(1);
            } else if (w.charAt(wLen - 1) == '#') {
                this.noteProcliticEnclitic(w);
                w = w.substring(0, wLen - 1);
            }
            // Tatweel is only removed from multi-character words.
            w = this.deleteWithNote(pTatweel, w);
        } else if (w.equals("(")) {
            w = "-LRB-";
        } else if (w.equals(")")) {
            w = "-RRB-";
        } else if (w.equals("+")) {
            w = "-PLUS-";
        }
        return w;
    }

    /** Replaces every match of {@code pattern} in {@code w}, printing the one-time normalization note on the first hit. */
    private String replaceWithNormalizationNote(Pattern pattern, String replacement, String w) {
        Matcher m = pattern.matcher(w);
        if (!m.find()) {
            return w;
        }
        if (!this.warnedNormalization) {
            System.err.println("IBMArabicEscaper Note: equivalence classing certain characters, such as Alef with madda/hamza, e.g., in: " + w);
            this.warnedNormalization = true;
        }
        return m.replaceAll(replacement);
    }

    /** Deletes every match of {@code pattern} from {@code w}, printing the one-time deletion note on the first hit. */
    private String deleteWithNote(Pattern pattern, String w) {
        Matcher m = pattern.matcher(w);
        if (!m.find()) {
            return w;
        }
        if (!this.warnedDeletion) {
            System.err.println("IBMArabicEscaper Note: deleting certain characters, such as tatweel, fatHa, kasra, damma, e.g., in: " + w);
            this.warnedDeletion = true;
        }
        return m.replaceAll("");
    }

    /** Prints the one-time note about stripping a leading {@code +} or trailing {@code #} marker. */
    private void noteProcliticEnclitic(String w) {
        if (!this.warnedProcliticEnclitic) {
            this.warnedProcliticEnclitic = true;
            System.err.println("IBMArabicEscaper Note: removing IBM MT-style proclitic/enclitic indicators, e.g., on " + w);
        }
    }

    /**
     * Normalizes the word of every element of {@code arg}.
     *
     * <p>The returned list is a new {@link ArrayList}, but it holds the same
     * {@link HasWord} objects as {@code arg}; each of them is modified in
     * place through {@code setWord}.
     *
     * @param arg the words to normalize
     * @return a new list containing the (mutated) elements of {@code arg}
     */
    @Override
    public List<HasWord> apply(List<HasWord> arg) {
        List<HasWord> ans = new ArrayList<HasWord>(arg);
        for (HasWord wd : ans) {
            wd.setWord(this.escapeString(wd.word()));
        }
        return ans;
    }

    /**
     * Normalizes each file named on the command line. Every input file is
     * read as UTF-8, each line is split on whitespace, each token is
     * normalized, and the tokens are written space-separated to
     * {@code <input>.sent} as UTF-8.
     *
     * @param args paths of the files to process
     * @throws IOException if a file cannot be opened or read, or if writing
     *         the output fails
     */
    public static void main(String[] args) throws IOException {
        IBMArabicEscaper escaper = new IBMArabicEscaper();
        for (String arg : args) {
            String outFile = arg + ".sent";
            BufferedReader br = new BufferedReader(new InputStreamReader((InputStream)new FileInputStream(arg), "UTF-8"));
            try {
                PrintWriter pw = new PrintWriter(new BufferedWriter(new OutputStreamWriter((OutputStream)new FileOutputStream(outFile), "UTF-8")));
                try {
                    String line;
                    while ((line = br.readLine()) != null) {
                        String[] words = line.split("\\s+");
                        for (int i = 0; i < words.length; ++i) {
                            if (i > 0) {
                                pw.print(" ");
                            }
                            pw.print(escaper.escapeString(words[i]));
                        }
                        pw.println();
                    }
                    // PrintWriter never throws on write; surface failures explicitly.
                    if (pw.checkError()) {
                        throw new IOException("Error writing " + outFile);
                    }
                } finally {
                    pw.close();
                }
            } finally {
                br.close();
            }
        }
    }
}

