package net.sf.gluebooster.demos.pojo.languages.chinese;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.collections4.MultiValuedMap;
import org.apache.commons.collections4.multimap.HashSetValuedHashMap;
import org.apache.commons.lang3.tuple.Pair;

import net.sf.gluebooster.java.booster.basic.gui.swing.SwingBoostUtils;
import net.sf.gluebooster.java.booster.essentials.utils.Check;

/**
 * Analyzes files from <a href='https://tatoeba.org'>tatoeba</a>.
 * 
 * Collects the sentences tagged as Mandarin or Literary Chinese and attaches translations to them: German sentences that are linked directly, German
 * sentences that are reachable via one intermediate sentence, and - only for sentences that got no German translation - directly linked English
 * sentences.
 * 
 * Not yet fully implemented.
 * 
 * @author cbauer
 *
 */
public class TatoebaAnalyzer {

	/** Language code of Mandarin sentences. */
	private static final String MANDARIN = "cmn";

	/** Language code of Literary Chinese sentences. */
	private static final String LITERARY_CHINESE = "lzh";

	/** Language code of German sentences. */
	private static final String GERMAN = "deu";

	/** Language code of English sentences. */
	private static final String ENGLISH = "eng";

	/** Column separator of the input files. */
	private static final String COLUMN_SEPARATOR = "\t";

	/**
	 * Asks the user for the sentence file, the link file and the result file and runs the extraction on them.
	 * 
	 * All three files are processed as UTF-8.
	 * 
	 * @throws Exception
	 *             if a file can not be opened, read, parsed or written
	 */
	public void extractChineseSentences() throws Exception {
		File sentences = SwingBoostUtils.chooseFile("sentences_detailed.csv", true);
		File links = SwingBoostUtils.chooseFile("links.csv", true);
		File result = SwingBoostUtils.chooseFile("result", true);

		// The result must be written with an explicit charset: a FileWriter would use the platform default and could garble the chinese characters.
		try (Reader sentenceInput = new InputStreamReader(new FileInputStream(sentences), StandardCharsets.UTF_8);
				Reader linkInput = new InputStreamReader(new FileInputStream(links), StandardCharsets.UTF_8);
				Writer output = new OutputStreamWriter(new FileOutputStream(result), StandardCharsets.UTF_8)) {
			extractChineseSentences(sentenceInput, linkInput, output);
		}
	}

	/**
	 * Extracts the chinese sentences with their translations and writes them to the result.
	 * 
	 * Only sentences that have at least one translation are written. Each of them becomes one line consisting of the chinese text (blanks removed), the
	 * translations, a bracketed link to the sentence and bracketed links to the authors. The list of all distinct author links is appended after the
	 * sentences under the heading "Authors".
	 * 
	 * Both readers and the writer are closed by this method, also if the extraction fails.
	 * 
	 * @param sentences_detailed_csv
	 *            tab separated lines; the first four columns are read as sentence id, language code, text and author. Blank lines are ignored.
	 * @param links_csv
	 *            tab separated lines; the first two columns are read as the ids of the linked sentences (from, to). Blank lines are ignored.
	 * @param result
	 *            receives the output
	 * @throws Exception
	 *             if reading or writing fails or a line can not be parsed (too few columns, id is not a number)
	 */
	public void extractChineseSentences(Reader sentences_detailed_csv, Reader links_csv, Writer result) throws Exception {

		try (BufferedReader sentenceReader = new BufferedReader(sentences_detailed_csv);
				BufferedReader linkReader = new BufferedReader(links_csv);
				Writer output = result) {

			Map<Long, ChineseVocabularyEntry> chinese = new HashMap<Long, ChineseVocabularyEntry>();
			Map<Long, ChineseVocabularyEntry> german = new HashMap<Long, ChineseVocabularyEntry>();
			Map<Long, ChineseVocabularyEntry> english = new HashMap<Long, ChineseVocabularyEntry>();

			readSentences(sentenceReader, chinese, german, english);

			System.out.println("found chinese: " + chinese.size() + " german: " + german.size());
			// Ids of the chinese sentences that have no direct german translation yet. A copy, because it will be modified.
			Set<Long> chineseIds = new HashSet<>(chinese.keySet());
			Set<Long> germanIds = german.keySet();
			Set<Long> englishIds = english.keySet();

			// chinese id -> ids of all non-german sentences it is linked to (candidates for an intermediate sentence)
			MultiValuedMap<Long, Long> fromChinese = new HashSetValuedHashMap<Long, Long>();
			// id of a sentence that is not a still untranslated chinese one -> id of a german sentence it is linked to
			Map<Long, Long> toGerman = new HashMap<Long, Long>();
			// chinese id -> ids of the english sentences it is linked to
			MultiValuedMap<Long, Long> chineseEnglish = new HashSetValuedHashMap<Long, Long>();

			String line;
			while ((line = linkReader.readLine()) != null) {
				if (line.trim().isEmpty()) {
					continue; // e.g. a trailing empty line
				}
				String[] parts = line.split(COLUMN_SEPARATOR);
				Long from = Long.valueOf(parts[0]);
				Long to = Long.valueOf(parts[1]);

				if (chineseIds.contains(from)) {
					if (germanIds.contains(to)) {
						// direct translation found, the sentence needs no further lookup
						ChineseVocabularyEntry germanEntry = german.get(to);
						chinese.get(from).addTranslations(germanEntry.getPinyinTranslations(), germanEntry.getAuthors());
						chineseIds.remove(from);
					} else {
						fromChinese.put(from, to);
						if (englishIds.contains(to)) {
							chineseEnglish.put(from, to);
						}
					}
				} else if (germanIds.contains(to)) {
					toGerman.put(from, to);
				}
			}

			// via intermediate language
			Set<Long> missingChineseIds = new HashSet<>(chineseIds);// because it will be modified
			for (Long missingChinese : chineseIds) {
				for (Long intermediate : fromChinese.get(missingChinese)) {
					if (toGerman.containsKey(intermediate)) {
						ChineseVocabularyEntry germanEntry = german.get(toGerman.get(intermediate));
						chinese.get(missingChinese).addTranslations(germanEntry.getPinyinTranslations(), germanEntry.getAuthors());
						missingChineseIds.remove(missingChinese);
					}
				}
			}

			// add english for the missing sentences
			for (Long missingChinese : missingChineseIds) {
				if (chineseEnglish.containsKey(missingChinese)) {
					ChineseVocabularyEntry chineseEntry = chinese.get(missingChinese);
					for (Long englishId : chineseEnglish.get(missingChinese)) {
						ChineseVocabularyEntry englishEntry = english.get(englishId);
						chineseEntry.addTranslations(englishEntry.getPinyinTranslations(), englishEntry.getAuthors());
					}
				}
			}

			writeResult(chinese, output);
		}
	}

	/**
	 * Reads the sentence file and sorts the chinese, german and english sentences into the given maps (key = sentence id). Sentences of other
	 * languages are dropped.
	 * 
	 * @param sentenceReader
	 *            the tab separated sentence lines
	 * @param chinese
	 *            receives the Mandarin and Literary Chinese sentences
	 * @param german
	 *            receives the german sentences (stored as translation of an otherwise empty entry)
	 * @param english
	 *            receives the english sentences (stored as translation of an otherwise empty entry)
	 * @throws IOException
	 *             if reading fails
	 */
	private static void readSentences(BufferedReader sentenceReader, Map<Long, ChineseVocabularyEntry> chinese, Map<Long, ChineseVocabularyEntry> german,
			Map<Long, ChineseVocabularyEntry> english) throws IOException {
		String line;
		while ((line = sentenceReader.readLine()) != null) {
			if (line.trim().isEmpty()) {
				continue; // e.g. a trailing empty line
			}
			String[] parts = line.split(COLUMN_SEPARATOR);
			Long id = Long.valueOf(parts[0]);
			String language = parts[1];
			String text = parts[2];
			String author = parts[3];

			if (MANDARIN.equals(language) || LITERARY_CHINESE.equals(language)) {
				chinese.put(id, new ChineseVocabularyEntry(text, author));
			} else if (GERMAN.equals(language)) {
				german.put(id, translationOnly(text, author));
			} else if (ENGLISH.equals(language)) {
				english.put(id, translationOnly(text, author));
			}
		}
	}

	/**
	 * Creates an entry that only consists of one translation.
	 * 
	 * @param text
	 *            the translation
	 * @param author
	 *            the author of the translation
	 * @return the new entry
	 */
	private static ChineseVocabularyEntry translationOnly(String text, String author) {
		ChineseVocabularyEntry entry = new ChineseVocabularyEntry();
		entry.addTranslation(text, author);
		return entry;
	}

	/**
	 * Writes all chinese sentences that have translations, followed by the list of all distinct author links.
	 * 
	 * @param chinese
	 *            the chinese sentences (key = sentence id)
	 * @param output
	 *            receives the text; it is not closed here
	 * @throws IOException
	 *             if writing fails
	 */
	private static void writeResult(Map<Long, ChineseVocabularyEntry> chinese, Writer output) throws IOException {
		Set<String> allAuthors = new HashSet<String>();

		for (Entry<Long, ChineseVocabularyEntry> idAndEntry : chinese.entrySet()) {
			ChineseVocabularyEntry entry = idAndEntry.getValue();
			if (entry.hasPinyinTranslations()) {
				StringBuilder resultLine = new StringBuilder();
				resultLine.append(entry.getSimplified().replace(" ", ""));
				resultLine.append(" ");
				for (Pair<String, List<String>> pinyinTranslation : entry.getPinyinTranslations()) {
					for (String translation : pinyinTranslation.getRight()) {
						resultLine.append(translation).append(" ");
					}
				}
				resultLine.append(" ");
				resultLine.append("[https://tatoeba.org/eng/sentences/show/").append(idAndEntry.getKey()).append(" Tatoeba] ");
				for (String author : entry.getAuthors()) {
					String authorline = "[https://tatoeba.org/eng/user/profile/" + author + " " + author + "] ";
					allAuthors.add(authorline);
					resultLine.append(authorline);
				}
				resultLine.append("\r\n");
				output.write(resultLine.toString());
			}
		}

		output.write("\r\n");
		output.write("\r\n");
		output.write("\r\n");
		output.write("Authors\r\n");
		for (String author : allAuthors) {
			output.write(author);
			output.write("\r\n");
		}
	}

	/**
	 * Starts the interactive extraction.
	 * 
	 * @param ignored
	 *            not used
	 * @throws Exception
	 *             if the extraction fails
	 */
	public static void main(String[] ignored) throws Exception {
		new TatoebaAnalyzer().extractChineseSentences();
	}
}
