/**
 * The AVENUE Project
 * Language Technologies Institute
 * School of Computer Science
 * (c) 2007 Carnegie Mellon University
 * 
 * Corpus Navigator
 * Written by Jonathan Clark
 */
package edu.cmu.cs.lti.avenue.navigation.wals;

import info.jonclark.util.ArrayUtils;
import info.jonclark.util.FileUtils;
import info.jonclark.util.StringUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import edu.cmu.cs.lti.avenue.navigation.featuredetection.deductive.FeatureManager;

/**
 * In-memory view of a WALS data set (presumably the World Atlas of Language
 * Structures -- confirm against the data source): a table of integer feature
 * codes indexed by language and feature, plus the names, categories and
 * allowable string values needed to interpret those codes.
 * <p>
 * A code of <code>0</code> means "no data"; a positive code <code>k</code>
 * refers to the <code>k</code>-th (1-based) allowable value of the feature.
 * <p>
 * The getters return the internal arrays and maps without copying, so
 * modifications made by callers are visible to this object.
 */
public class WALS {

	/**
	 * Minimum number of rows in the table returned by {@link #getWalsArray()}.
	 * Rows for which the data file supplied no line are <code>null</code>.
	 */
	public static final int NUM_LANGUAGES = 2150;
	public static final int NUM_FEATURES = 139;

	// wals[languageIndex][featureIndex] -> feature code (0 = no data)
	private final int[][] wals;
	private final String[] languageNames;
	private final String[] featureNames;
	// feature name -> feature category
	private final HashMap<String, String> featureCategories = new HashMap<String, String>();
	// feature name -> allowable values; code k maps to element k - 1
	private final HashMap<String, String[]> featureValues = new HashMap<String, String[]>();

	/**
	 * Create a new WALS object using the default data file names
	 * (<code>feature_values.txt</code>, <code>language_names.txt</code> and
	 * <code>wals.txt</code>) inside the given directory.
	 * 
	 * @param walsDataDirectory
	 *            directory containing the three data files
	 * @throws IOException
	 *             if a file cannot be read or contains a malformed entry
	 */
	public WALS(File walsDataDirectory) throws IOException {
		this(new File(walsDataDirectory, "feature_values.txt"), new File(walsDataDirectory,
				"language_names.txt"), new File(walsDataDirectory, "wals.txt"));
	}

	/**
	 * Create a new WALS object from explicitly named data files.
	 * 
	 * @param featureValuesFile
	 *            one feature per line, formatted as
	 *            <code>category|||name|||value1:::value2:::...</code>
	 * @param languageNamesFile
	 *            one language per line with <code>|||</code>-separated fields;
	 *            the third field is used as the language name
	 * @param walsArrayFile
	 *            one language per line, each a comma-separated list of integer
	 *            feature codes
	 * @throws IOException
	 *             if a file cannot be read or contains a malformed entry
	 */
	public WALS(File featureValuesFile, File languageNamesFile, File walsArrayFile)
			throws IOException {

		// Same loading order as before: names, then features, then the table.
		this.languageNames = readLanguageNames(languageNamesFile);
		this.featureNames =
				readFeatures(featureValuesFile, this.featureCategories, this.featureValues);
		this.wals = readWalsArray(walsArrayFile, this.featureNames, this.languageNames);
	}

	/**
	 * Reads the language names file and keeps the third "|||"-separated field
	 * of every entry.
	 */
	private static String[] readLanguageNames(File languageNamesFile) throws IOException {
		String[] entries =
				StringUtils.tokenize(FileUtils.getFileAsString(languageNamesFile), "\n");
		for (int i = 0; i < entries.length; i++) {
			// grab the part of the language name info we want,
			// that is, the 3rd element of the 4 tokens
			String[] fields = StringUtils.split(entries[i], "|||", 4);
			if (fields.length < 3) {
				throw new IOException("Malformed language name entry " + i + " in "
						+ languageNamesFile + ": " + entries[i]);
			}
			entries[i] = fields[2];
		}
		return entries;
	}

	/**
	 * Reads the feature file, filling the category and value maps, and returns
	 * the feature names in file order.
	 */
	private static String[] readFeatures(File featureValuesFile,
			HashMap<String, String> categories, HashMap<String, String[]> allowableValues)
			throws IOException {

		ArrayList<String> names = new ArrayList<String>();

		BufferedReader featuresIn = new BufferedReader(new FileReader(featureValuesFile));
		try {
			String fLine;
			while ((fLine = featuresIn.readLine()) != null) {

				String[] tokens = StringUtils.split(fLine, "|||", 3);
				// Checked explicitly (not with assert) so that a malformed line is
				// reported clearly even when assertions are disabled.
				if (tokens.length != 3) {
					throw new IOException("3 tokens expected: " + fLine);
				}

				String featureCategory = tokens[0];
				String featureName = tokens[1];
				String[] values = StringUtils.split(tokens[2], ":::", Integer.MAX_VALUE);

				names.add(featureName);
				categories.put(featureName, featureCategory);
				allowableValues.put(featureName, values);
			}
		} finally {
			// release the file handle even if reading or parsing failed
			featuresIn.close();
		}

		return names.toArray(new String[names.size()]);
	}

	/**
	 * Reads the language-by-feature code table. The result has one row per
	 * line of the file but never fewer than {@link #NUM_LANGUAGES} rows; rows
	 * beyond the end of the file are left <code>null</code>.
	 */
	private static int[][] readWalsArray(File walsArrayFile, String[] featureNames,
			String[] languageNames) throws IOException {

		ArrayList<int[]> rows = new ArrayList<int[]>();

		BufferedReader arrayIn = new BufferedReader(new FileReader(walsArrayFile));
		try {
			String aLine;
			while ((aLine = arrayIn.readLine()) != null) {
				int[] row = StringUtils.toIntArray(StringUtils.tokenize(aLine, ","));
				int i = rows.size();
				// Kept as an assertion so that data which loads today with
				// assertions disabled keeps loading. The language name lookup is
				// guarded so that building the message cannot itself fail.
				assert row.length == featureNames.length : "Got " + featureNames.length
						+ " features and " + row.length + " corresponding values at index " + i
						+ " which is "
						+ (i < languageNames.length ? languageNames[i] : "<unknown language>");
				rows.add(row);
			}
		} finally {
			// release the file handle even if reading or parsing failed
			arrayIn.close();
		}

		// Keep at least NUM_LANGUAGES rows for callers that loop up to that
		// constant, but grow when the file holds more languages than expected.
		int[][] table = new int[Math.max(NUM_LANGUAGES, rows.size())][];
		for (int i = 0; i < rows.size(); i++) {
			table[i] = rows.get(i);
		}
		return table;
	}

	/**
	 * @return the language names, in the order they appear in the language
	 *         names file (the internal array, not a copy)
	 */
	public String[] getLanguageNames() {
		return languageNames;
	}

	/**
	 * @return the feature names, in the order they appear in the feature
	 *         values file (the internal array, not a copy)
	 */
	public String[] getFeatureNames() {
		return featureNames;
	}

	/**
	 * Despite its name, this returns the allowable <em>values</em> of every
	 * feature, keyed by feature name.
	 * 
	 * @return map from feature name to that feature's allowable values (the
	 *         internal map, not a copy)
	 */
	public HashMap<String, String[]> getAllowableFeatureNames() {
		return featureValues;
	}

	/**
	 * Looks up a language by name with a linear search.
	 * 
	 * @param languageName
	 *            the language name to search for
	 * @return the result of <code>ArrayUtils.findInUnsortedArray</code> on the
	 *         language names (presumably the index, or a negative value when
	 *         absent -- confirm against ArrayUtils)
	 */
	public int getLanguageIndex(String languageName) {
		// TODO: Create a hashmap of ints to speed this up
		return ArrayUtils.findInUnsortedArray(this.languageNames, languageName);
	}

	/**
	 * Looks up a feature by name with a linear search.
	 * 
	 * @param featureName
	 *            the feature name to search for
	 * @return the result of <code>ArrayUtils.findInUnsortedArray</code> on the
	 *         feature names (presumably the index, or a negative value when
	 *         absent -- confirm against ArrayUtils)
	 */
	public int getFeatureIndex(String featureName) {
		// TODO: Create a hashmap of ints to speed this up
		return ArrayUtils.findInUnsortedArray(this.featureNames, featureName);
	}

	/**
	 * @param featureName
	 *            a feature name as listed in the feature values file
	 * @return the category of the feature, or <code>null</code> if the name is
	 *         unknown
	 */
	public String getFeatureCategory(String featureName) {
		return featureCategories.get(featureName);
	}

	/**
	 * @param featureName
	 *            a feature name as listed in the feature values file
	 * @return the allowable values of the feature (the internal array), or
	 *         <code>null</code> if the name is unknown
	 */
	public String[] getFeatureValues(String featureName) {
		return featureValues.get(featureName);
	}

	/**
	 * Translates the code stored for a language/feature pair into its string
	 * value.
	 * 
	 * @param languageIndex
	 *            row index into the table
	 * @param featureIndex
	 *            column index into the table
	 * @return the feature value, or {@link FeatureManager#NO_DATA} when the
	 *         stored code is 0 or no row was loaded for the language
	 */
	public String getFeatureValueForLanguage(int languageIndex, int featureIndex) {
		String featureName = featureNames[featureIndex];

		int[] row = wals[languageIndex];
		if (row == null) {
			// the data file had no line for this language
			return FeatureManager.NO_DATA;
		}

		int nFeatureValue = row[featureIndex];
		if (nFeatureValue == 0) {
			return FeatureManager.NO_DATA;
		}

		// codes are 1-based; 0 is reserved for "no data"
		String[] values = featureValues.get(featureName);
		return values[nFeatureValue - 1];
	}

	/**
	 * Finds the value of a feature that is attested by the largest number of
	 * languages. Languages without data for the feature are ignored; ties are
	 * resolved in favor of the value listed first.
	 * 
	 * @param featureIndex
	 *            column index into the table
	 * @return the most common value; the first allowable value if no language
	 *         has data; {@link FeatureManager#NO_DATA} if the feature has no
	 *         allowable values at all
	 */
	public String getMostCommonFeatureValue(int featureIndex) {
		String featureName = featureNames[featureIndex];
		String[] values = featureValues.get(featureName);

		if (values.length == 0) {
			// nothing to choose from
			return FeatureManager.NO_DATA;
		}

		int[] languagesForValue = new int[values.length];
		for (int languageIndex = 0; languageIndex < wals.length; languageIndex++) {
			int[] row = wals[languageIndex];
			if (row == null) {
				// no line in the data file for this language
				continue;
			}

			int nFeatureValue = row[featureIndex];

			// ignore NO_DATA cases
			if (nFeatureValue > 0) {
				languagesForValue[nFeatureValue - 1]++;
			}
		}

		// strict comparison keeps the earliest value on ties
		int nMostCommonFeatureValue = 0;
		for (int i = 1; i < languagesForValue.length; i++) {
			if (languagesForValue[i] > languagesForValue[nMostCommonFeatureValue]) {
				nMostCommonFeatureValue = i;
			}
		}

		return values[nMostCommonFeatureValue];
	}

	/**
	 * @return the raw code table indexed as
	 *         <code>[languageIndex][featureIndex]</code> (the internal array,
	 *         not a copy); it has at least {@link #NUM_LANGUAGES} rows, and
	 *         rows for which the data file had no line are <code>null</code>
	 */
	public int[][] getWalsArray() {
		return wals;
	}
}
