package edu.cmu.cs.lti.avenue.morphology;

import info.jonclark.stat.SecondTimer;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import cmonson.morphologyInduction.Corpus;
import cmonson.morphologyInduction.morphemes.Affix;
import cmonson.morphologyInduction.morphemes.Context;
import cmonson.morphologyInduction.morphemes.SetOfMorphemes;
import cmonson.morphologyInduction.networks.PartialOrderNetwork;
import cmonson.morphologyInduction.networks.VirtualPartialOrderNetwork;
import cmonson.morphologyInduction.networks.PartialOrderNetwork.Identifier;
import cmonson.morphologyInduction.networks.PartialOrderNetwork.MorphemicAnalysis;
import cmonson.morphologyInduction.searchAndProcessing.BottomUpSearchResultCluster;
import cmonson.morphologyInduction.searchAndProcessing.BottomUpSearchResultClustering;
import cmonson.morphologyInduction.searchAndProcessing.MorphemeBoundaryTooFarLeft_Filter;
import cmonson.morphologyInduction.searchAndProcessing.MorphemeBoundaryTooFarRight_Filter;
import cmonson.morphologyInduction.searchAndProcessing.SearchBatch;
import cmonson.morphologyInduction.searchAndProcessing.BottomUpSearch.BottomUpParameters;
import cmonson.morphologyInduction.searchAndProcessing.SearchBatch.SearchStepSequenceInstantiation;
import cmonson.morphologyInduction.segmentation.Segmentation;
import cmonson.morphologyInduction.segmentation.SegmentedWord;
import cmonson.morphologyInduction.segmentation.SimpleSuffixSegmentationExplanation;

public class ParaMor implements Segmenter, Serializable {

	private static final long serialVersionUID = -5817477261846383351L;

	private transient static final boolean DEBUG = false;
	private transient static final boolean ALLOW_EMPTY_STEMS = false;
	private Properties props;

	private boolean inductionDone = false;
	private List<String> inductionTokens;
	private List<Paradigm> allParadigms = new ArrayList<Paradigm>();
	private Corpus inductionCorpus;
	private SearchBatch searchBatch;

	private boolean evidenceDone = false;
	private transient Corpus evidenceCorpus;
	private transient PartialOrderNetwork evidenceNetwork;

	private int nextId = 0;
	private final HashMap<BottomUpSearchResultCluster, Integer> paradigmIds =
			new HashMap<BottomUpSearchResultCluster, Integer>();

	public ParaMor(Properties props) throws IOException {
		this.props = props;
	}

	public void induceMorphology(List<String> tokens) throws SegmenterException {
		try {

			// allow this abstract interface by dumping everything to a file
			// TODO: make this more efficient by using a RAM disk or rewriting
			// the Corpus class

			this.inductionTokens = tokens;
			this.inductionCorpus = writeDummyCorpus(tokens);

			SecondTimer timer2 = new SecondTimer();
			timer2.go();

			// 0) "Build" a partial order network
			Identifier networkIdentifier =
					new PartialOrderNetwork.Identifier(VirtualPartialOrderNetwork.class,
							this.inductionCorpus, MorphemicAnalysis.SUFFIX, ALLOW_EMPTY_STEMS);
			networkIdentifier.setTheNetworkClass(VirtualPartialOrderNetwork.class);
			PartialOrderNetwork inductionNetwork = PartialOrderNetwork.factory(networkIdentifier);
			searchBatch = new SearchBatch(inductionNetwork, inductionCorpus.getLanguage());

			// 1) Perform the bottom up search
			BottomUpParameters bottomUpParameters = new BottomUpParameters();
			searchBatch.performSearchStep(bottomUpParameters);

			// 2) Do clustering
			BottomUpSearchResultClustering.Parameters clusteringParameters =
					new BottomUpSearchResultClustering.Parameters();
			ArrayList<Integer> childTypeCoveredCutoffs = new ArrayList<Integer>(1);
			final int correctValue = 37;
			childTypeCoveredCutoffs.add(correctValue);
			clusteringParameters.setChildTypesCoveredCutoffs(childTypeCoveredCutoffs);
			searchBatch.cluster(clusteringParameters);

			// 3) Filter
			MorphemeBoundaryTooFarLeft_Filter.Parameters filterParameters =
					new MorphemeBoundaryTooFarLeft_Filter.Parameters();
			boolean filteringSucceeded =
					searchBatch.doMorphemeBoundaryTooFarLeftFilterOnClusters(filterParameters);
			if (!filteringSucceeded)
				throw new SegmenterException("Error doing MorphemeBoundaryTooFarLeft filtering.");

			MorphemeBoundaryTooFarRight_Filter.Parameters filterParameters2 =
					new MorphemeBoundaryTooFarRight_Filter.Parameters();
			filteringSucceeded =
					searchBatch.doMorphemeBoundaryTooFarRightFilterOnClusters(filterParameters2);
			if (!filteringSucceeded)
				throw new SegmenterException("Error doing MorphemeBoundaryTooFarRight filtering.");

			// now assign ID's to all paradigms (BottomUpSearchResultCluster)
			for (BottomUpSearchResultCluster paradigm : searchBatch.getAllClusters()) {
				Integer id = paradigmIds.get(paradigm);
				if (id == null) {
					id = nextId;
					paradigmIds.put(paradigm, id);
					nextId++;
				}
				paradigm.setParadigmId(id);

				Paradigm par = makeParadigm(paradigm);
				allParadigms.add(par);
			}

			System.out.println("Found " + searchBatch.getAllClusters().size()
					+ " clusters of which " + nextId + " are unique.");

			System.out.println("Trained corpus in: " + timer2.getSecondsFormatted());

			inductionDone = true;

		} catch (IOException e) {
			throw new SegmenterException(e);
		}
	}

	private Corpus writeDummyCorpus(List<String>... tokenLists) throws IOException,
			FileNotFoundException {

		SecondTimer timer1 = new SecondTimer();
		timer1.go();

		Corpus corpus = new Corpus();
		corpus.setTypesToRead(Integer.parseInt(props.getProperty("ParaMor.maxTypes")));
		corpus.setCaseSensitivity(Boolean.parseBoolean(props.getProperty("ParaMor.preserveCase")));
		corpus.setLanguage(props.getProperty("ParaMor.languageName"));
		corpus.setThrowOutNumbers(Boolean.parseBoolean(props.getProperty("ParaMor.throwOutDigits")));

		File trainingTempFile = File.createTempFile("paraMorTemp", ".txt");
		trainingTempFile.deleteOnExit();
		PrintWriter out = new PrintWriter(trainingTempFile);
		for (List<String> list : tokenLists) {
			for (String tok : list) {
				out.println(tok);
			}
			out.close();
		}

		corpus.setPathToCorpus(trainingTempFile.getAbsolutePath());
		corpus.setIgnoreSGMLTags(false);
		
		// XXX: HACK
		if (DEBUG)
			corpus.setTypesToRead(5000);
		
		corpus.collateTypes(null);

		System.out.println("Read corpus in: " + timer1.getSecondsFormatted());

		return corpus;
	}

	/**
	 * Must be called AFTER induceMorphology(). This method will AUTOMATICACLLY
	 * include the induction corpus, so it need not be included in this token
	 * list.
	 */
	public void setEvidenceCorpus(List<String> evidenceTokens) throws SegmenterException {

		if (inductionDone == false)
			throw new SegmenterException("induceMorphology() must be called first.");

		try {
			this.evidenceCorpus = writeDummyCorpus(inductionTokens, evidenceTokens);
			Identifier evidenceNetworkIdentifier =
					new PartialOrderNetwork.Identifier(VirtualPartialOrderNetwork.class,
							this.evidenceCorpus, MorphemicAnalysis.SUFFIX, ALLOW_EMPTY_STEMS);
			evidenceNetworkIdentifier.setTheNetworkClass(VirtualPartialOrderNetwork.class);

			// get the set of affixes that were learned during induction
			SearchStepSequenceInstantiation searchStepKey =
					searchBatch.getIteratorOverCurrentSearchStepSequenceInstantiations().next();
			SetOfMorphemes<Affix> affixesInParadigms =
					searchBatch.getAllCoveredAffixes(searchStepKey);

			evidenceNetwork =
					new VirtualPartialOrderNetwork(evidenceNetworkIdentifier, affixesInParadigms);

			evidenceDone = true;
		} catch (FileNotFoundException e) {
			throw new SegmenterException(e);
		} catch (IOException e) {
			throw new SegmenterException(e);
		}
	}

	public String[] getCombinedSegmentation(String word) throws SegmenterException {

		if (inductionDone == false)
			throw new SegmenterException("induceMorphology() must be called first.");
		if (evidenceDone == false)
			throw new SegmenterException("setEvidenceCorpus() must be called first.");

		SearchStepSequenceInstantiation searchStepSequenceInstantiation =
				searchBatch.getIteratorOverCurrentSearchStepSequenceInstantiations().next();
		SegmentedWord segmentedWord =
				searchBatch.segmentOneWord(word, searchStepSequenceInstantiation, evidenceNetwork);
		return segmentedWord.getCombinedSegmentation();
	}

	public List<Paradigm> getAllParadigms() throws SegmenterException {
		return allParadigms;
	}

	// TODO: Make sure a UNIQUE list of paradigms is returned
	public List<Paradigm> getParadigms(String word) throws SegmenterException {

		if (inductionDone == false)
			throw new SegmenterException("induceMorphology() must be called first.");
		if (evidenceDone == false)
			throw new SegmenterException("setEvidenceCorpus() must be called first.");

		ArrayList<Paradigm> paradigms = new ArrayList<Paradigm>();

		SearchStepSequenceInstantiation searchStepSequenceInstantiation =
				searchBatch.getIteratorOverCurrentSearchStepSequenceInstantiations().next();
		SegmentedWord segmentedWord =
				searchBatch.segmentOneWord(word, searchStepSequenceInstantiation, evidenceNetwork);

		for (Segmentation seg : segmentedWord.getSegmentations()) {
			for (SimpleSuffixSegmentationExplanation exp : seg.getExplanations()) {

				BottomUpSearchResultCluster cluster = exp.getCluster();
				Paradigm paradigm = makeParadigm(cluster);
				paradigms.add(paradigm);
			}
		}
		return paradigms;
	}

	private Paradigm makeParadigm(BottomUpSearchResultCluster cluster) {
		Set<Affix> affixes = cluster.getCoveredAffixes();
		Set<Context> stems = cluster.getCoveredStems();

		String[] stemArray = new String[stems.size()];
		int i = 0;
		for (Context stem : stems) {
			stemArray[i] = stem.getInitialString();
			i++;
		}

		String[] affixArray = new String[affixes.size()];
		i = 0;
		for (Affix affix : affixes) {
			affixArray[i] = affix.toString();
			i++;
		}

		Paradigm paradigm = new Paradigm(stemArray, affixArray, cluster.getParadigmId());
		return paradigm;
	}

	public void saveSegmentationModel(File path) throws SegmenterException, IOException {

		if (inductionDone == false)
			throw new SegmenterException("induceMorphology() must be called first.");

		GZIPOutputStream gzOut =
				new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(path)));

		ObjectOutputStream objectOutputStream = new ObjectOutputStream(gzOut);
		objectOutputStream.writeObject(searchBatch);
		objectOutputStream.flush();

		PrintWriter pw = new PrintWriter(gzOut);
		for (String tok : inductionTokens) {
			pw.println(tok);
		}

		pw.flush();
		gzOut.close();
	}

	public void loadSegmentationModel(File path) throws SegmenterException, IOException {

		GZIPInputStream gzIn =
				new GZIPInputStream(new BufferedInputStream(new FileInputStream(path)));

		ObjectInputStream objectInputStream = new ObjectInputStream(gzIn);
		try {
			searchBatch = (SearchBatch) objectInputStream.readObject();

			BufferedReader br = new BufferedReader(new InputStreamReader(gzIn));
			this.inductionTokens = new ArrayList<String>();
			String line;
			while ((line = br.readLine()) != null) {
				this.inductionTokens.add(line);
			}
			inductionDone = true;

			// we still need to rebuild the evidence network... unless we want
			// to do this for the user
			evidenceDone = false;

		} catch (ClassNotFoundException e) {
			// this should never happen
			throw new Error(e);
		}
		gzIn.close();

	}
}
