package edu.cmu.cs.lti.avenue.navigation.tools;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;

/**
 * Command-line wrapper that replaces every whitespace-delimited token made up
 * solely of non-ASCII characters with the transliteration produced by the
 * external script <code>transliterator.perl</code>.
 * <p>
 * The distinct words needing transliteration are written one per line to a
 * temporary UTF-8 file, the script is run on that file, and the n-th line of
 * the script's standard output is taken as the transliteration of the n-th
 * word written.
 */
public class TransliteratorWrapper {

	/** Highest code point that belongs to the ASCII range (DEL). */
	private static final char MAX_ASCII = 0x7F;

	/**
	 * Entry point.
	 *
	 * @param args exactly two elements: the UTF-8 input file and the output
	 *            file to write (also as UTF-8)
	 * @throws Exception on any I/O failure, or an IllegalStateException when
	 *             the transliterator does not return exactly one line per
	 *             word
	 */
	public static void main(String[] args) throws Exception {
		if (args.length != 2) {
			System.err.println("Usage: program <in_file> <out_file>");
			System.err.println("All tokens in <in_file> composed entirely of non-ASCII characters will be substituted with a transliteration. Expected encoding is UTF-8.");
			System.exit(1);
		}

		// Pass 1: tokenize the whole input, remembering every line and
		// collecting the distinct words that need transliteration.
		int nTokens = 0;
		ArrayList<String[]> lines = new ArrayList<String[]>();
		HashSet<String> urduWordSet = new HashSet<String>();
		BufferedReader in =
				new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF-8"));
		try {
			String line;
			while ((line = in.readLine()) != null) {
				String[] tokens = tokenize(line);
				for (final String token : tokens) {
					if (isAllNonAscii(token)) {
						urduWordSet.add(token);
						nTokens++;
					}
				}
				lines.add(tokens);
			}
		} finally {
			in.close();
		}

		// Freeze the set into a list: the list order is the contract between
		// the file handed to the script and the lines read back from it.
		ArrayList<String> urduWordList = new ArrayList<String>(urduWordSet);

		File tempFile = File.createTempFile("transliteration", ".tmp");
		tempFile.deleteOnExit();
		PrintWriter transliteratorInput = new PrintWriter(tempFile, "UTF-8");
		try {
			for (final String urduWord : urduWordList) {
				transliteratorInput.println(urduWord);
			}
		} finally {
			transliteratorInput.close();
		}

		System.out.println("Found " + nTokens + " tokens and " + urduWordList.size()
				+ " types needing transliteration.");

		HashMap<String, String> transliterations = runTransliterator(tempFile, urduWordList);
		System.out.println("Transliteration complete, doing replacements...");

		// Pass 2: substitute and write the result.
		PrintWriter out = new PrintWriter(args[1], "UTF-8");
		try {
			for (final String[] tokens : lines) {
				// replace urdu tokens with transliterations
				for (int i = 0; i < tokens.length; i++) {
					if (isAllNonAscii(tokens[i])) {
						String transliteration = transliterations.get(tokens[i]);
						assert transliteration != null : "expected transliteration, but didn't find one";
						tokens[i] = transliteration;
					}
				}
				out.println(untokenize(tokens));
			}
		} finally {
			out.close();
		}

		System.out.println("Done. Wrote file: " + args[1]);
	}

	/**
	 * Runs <code>perl transliterator.perl &lt;wordFile&gt;</code> and pairs
	 * each line of its standard output with the word at the same position in
	 * <code>words</code>. Every output line is also echoed to standard out.
	 *
	 * @param wordFile file containing the words, one per line
	 * @param words the words in the same order as they appear in the file
	 * @return map from each word to its transliteration
	 * @throws IllegalStateException if the number of output lines differs
	 *             from the number of words
	 */
	private static HashMap<String, String> runTransliterator(File wordFile,
			ArrayList<String> words) throws Exception {
		String command = "perl transliterator.perl " + wordFile.getAbsolutePath();
		System.out.println("Running: " + command);

		// An explicit argument list keeps a temp path containing spaces as a
		// single argument; Runtime.exec(String) would split it on whitespace.
		Process proc =
				new ProcessBuilder("perl", "transliterator.perl", wordFile.getAbsolutePath()).start();

		// Drain stderr concurrently so the child cannot block on a full pipe
		// while we are busy reading its stdout.
		Thread stderrPump = forwardToStderr(proc.getErrorStream());

		HashMap<String, String> transliterations = new HashMap<String, String>();
		int nLines = 0;
		int exitCode;
		// NOTE(review): assumes the script emits UTF-8 (plain ASCII output is
		// a subset), matching the encoding used for every other file here.
		BufferedReader transliteratorIn =
				new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));
		try {
			String line;
			while ((line = transliteratorIn.readLine()) != null) {
				if (nLines < words.size()) {
					transliterations.put(words.get(nLines), line);
				}
				System.out.println(line);
				nLines++;
			}
			exitCode = proc.waitFor();
			stderrPump.join();
		} finally {
			transliteratorIn.close();
		}

		if (exitCode != 0) {
			System.err.println("Warning: transliterator exited with status " + exitCode);
		}
		if (nLines != words.size()) {
			throw new IllegalStateException("Transliterator returned " + nLines
					+ " lines for " + words.size() + " words (exit status " + exitCode + ")");
		}
		return transliterations;
	}

	/**
	 * Starts a daemon thread that copies everything readable from
	 * <code>stream</code> to <code>System.err</code> until end of stream.
	 *
	 * @return the started thread, so the caller can join it
	 */
	private static Thread forwardToStderr(final java.io.InputStream stream) {
		Thread pump = new Thread(new Runnable() {
			public void run() {
				byte[] buffer = new byte[4096];
				try {
					int n;
					while ((n = stream.read(buffer)) != -1) {
						System.err.write(buffer, 0, n);
					}
					System.err.flush();
				} catch (java.io.IOException ignored) {
					// Best effort: diagnostics from the child are optional.
				}
			}
		});
		pump.setDaemon(true);
		pump.start();
		return pump;
	}

	/**
	 * Tells whether a string contains no ASCII characters at all.
	 *
	 * @param str the string to inspect
	 * @return <code>false</code> if any char is in the ASCII range
	 *         U+0000..U+007F (DEL included); <code>true</code> otherwise,
	 *         which includes the empty string
	 */
	public static boolean isAllNonAscii(String str) {
		for (int i = 0; i < str.length(); i++) {
			if (str.charAt(i) <= MAX_ASCII) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Splits a string into tokens using the default delimiters of
	 * {@link StringTokenizer}; no empty tokens are produced.
	 * <p>
	 * Copied from info.jonclark.util.StringUtils
	 *
	 * @param str the string to split
	 * @return the tokens in order of appearance; empty array if there are none
	 */
	public static String[] tokenize(String str) {
		StringTokenizer tok = new StringTokenizer(str);
		String[] result = new String[tok.countTokens()];
		for (int i = 0; i < result.length; i++) {
			result[i] = tok.nextToken();
		}
		return result;
	}

	/**
	 * Joins tokens with single spaces and trims the result.
	 * <p>
	 * Copied from info.jonclark.util.StringUtils
	 *
	 * @param tokens the tokens to join
	 * @return the joined string; empty string for an empty array
	 */
	public static String untokenize(final String[] tokens) {
		final StringBuilder builder = new StringBuilder();
		for (final String token : tokens) {
			builder.append(token).append(' ');
		}
		return builder.toString().trim();
	}
}
