package edu.cmu.cs.lti.avenue.morphanalyzer;

import info.jonclark.properties.PropertyUtils;
import info.jonclark.properties.SmartProperties;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import edu.cmu.cs.lti.avenue.corpus.Corpus;
import edu.cmu.cs.lti.avenue.corpus.SentencePair;
import edu.cmu.cs.lti.avenue.corpus.Serializer;
import edu.cmu.cs.lti.avenue.morphology.ParaMor;
import edu.cmu.cs.lti.avenue.navigation.tools.UtfUtils;

/**
 * Command-line driver that induces a ParaMor morphology from an elicitation
 * corpus plus a separate induction corpus and saves the resulting
 * segmentation model.
 * <p>
 * All settings are read from the properties file given as the single
 * command-line argument. The keys consulted are {@code global.encoding},
 * {@code paths.mergedCorpus}, {@code morphology.inductionCorpus} and
 * {@code morphAnalyzer.segmenterSaveFile}.
 */
public class ParaMorTrainer {

	/**
	 * Runs the training pipeline: load both corpora, induce the morphology,
	 * write the segmenter data.
	 * 
	 * @param args
	 *            exactly one element: the path of the properties file
	 * @throws Exception
	 *             if the properties or either corpus cannot be read, or if
	 *             induction or saving fails
	 */
	public static void main(String[] args) throws Exception {
		if (args.length != 1) {
			System.err.println("Usage: program <properties_file>");
			System.exit(1);
		}

		Properties props = PropertyUtils.getProperties(new File(args[0]));
		SmartProperties smartProps = new SmartProperties(props);

		ParaMor paraMor = new ParaMor(props);

		System.out.println("Loading EC tokens & morphology induction corpus...");

		String encoding = smartProps.getPropertyString("global.encoding");
		File elicitedCorpusFile = smartProps.getPropertyFile("paths.mergedCorpus");
		File morphInductionCorpus = smartProps.getPropertyFile("morphology.inductionCorpus");

		Corpus elicitationCorpus = Serializer.loadSentencePairs(elicitedCorpusFile, true, encoding);

		// Ensure tokens from the elicitation corpus are at the beginning of
		// the induction tokens since only the first N types are used for
		// induction.
		//
		// The unicode replacement ("kill unicode chars") is applied while the
		// tokens are collected rather than by writing into the array returned
		// by getNormalizedTargetTokens(): that way the replacement takes
		// effect even if the accessor hands out a copy of its tokens.
		ArrayList<String> trainingTokens = new ArrayList<String>(1000000);
		for (final SentencePair pair : elicitationCorpus.getSentences()) {
			for (String tok : pair.getNormalizedTargetTokens()) {
				trainingTokens.add(UtfUtils.replaceUnicodeCharsWith(tok, 'x'));
			}
		}

		trainingTokens.addAll(loadTrainingData(morphInductionCorpus, encoding));

		System.out.println("Inducing morphology...");
		paraMor.induceMorphology(trainingTokens);
		System.out.println("Found " + paraMor.getAllParadigms().size() + " paradigms.");

		System.out.println("Writing segmenter data...");
		File segmenterData = smartProps.getPropertyFile("morphAnalyzer.segmenterSaveFile");
		paraMor.saveSegmentationModel(segmenterData);
		System.out.println("Wrote segmenter data to : " + segmenterData.getAbsolutePath());
	}

	/**
	 * Reads a space-delimited token file into a list, one entry per
	 * non-empty token, in file order.
	 * 
	 * @param file
	 *            the text file to read
	 * @param encoding
	 *            name of the charset used to decode the file
	 * @return the tokens of every line; empty strings produced by blank lines
	 *         or by leading/repeated spaces are skipped
	 * @throws IOException
	 *             if the file cannot be opened or read, or the charset name
	 *             is not supported
	 */
	private static List<String> loadTrainingData(File file, String encoding) throws IOException {

		ArrayList<String> tokens = new ArrayList<String>();

		// Open the raw stream first so it is released even when the charset
		// name is rejected or a read fails part-way through.
		FileInputStream stream = new FileInputStream(file);
		try {
			BufferedReader in = new BufferedReader(new InputStreamReader(stream, encoding));
			String line;
			while ((line = in.readLine()) != null) {
				for (String tok : line.split(" ")) {
					// split(" ") yields "" for blank lines and for leading or
					// consecutive spaces; those are not real tokens
					if (tok.length() > 0) {
						tokens.add(tok);
					}
				}
			}
		} finally {
			stream.close();
		}

		return tokens;
	}
}
