package net.sf.gluebooster.demos.pojo.languages.chinese;

import java.io.BufferedReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;

/**
 * A parser for the CeDict format.
 * <p>
 * Each dictionary line is expected to look like
 * {@code traditional simplified [pinyin] /translation 1/translation 2/}.
 * All methods are static; the class holds no state.
 * 
 * @see <a href="https://en.wikipedia.org/wiki/CEDICT">Wikipedia</a>
 * @author CBauer
 *
 */
public class CeDictParser {

	/**
	 * Simplification used by {@link #isChinese(String)}: every UTF-16 code
	 * unit with at least this value is assumed to possibly be chinese.
	 */
	private static final int FIRST_POSSIBLY_CHINESE_CHAR = 13000;

	/**
	 * Parses one line of the cedict-file.
	 * <p>
	 * The line is split into the traditional word, the simplified word, the
	 * pinyin in square brackets and the slash separated translations. A
	 * translation that starts with a single word ending in a tone number
	 * (1-5) followed by a colon (for example {@code wa3: Dachziegel}) starts a
	 * new pinyin group; that translation and all following ones are stored
	 * under the new pinyin.
	 * 
	 * @param rawText
	 *            one line of a file
	 * 
	 * @return the parse result
	 * @throws IllegalStateException
	 *             if the line does not have the expected structure
	 */
	public static ChineseVocabularyEntry parseEntry(String rawText) {
		ChineseVocabularyEntry result = new ChineseVocabularyEntry();

		String line = rawText.trim();
		String remaining = line;

		// first field: traditional characters
		int index = remaining.indexOf(" ");
		if (index < 0) {
			throw new IllegalStateException(
					"did not find the separator after the traditional characters in: "
							+ line);
		}
		result.setTraditional(remaining.substring(0, index));
		remaining = remaining.substring(index).trim();

		// second field: simplified characters
		index = remaining.indexOf(" ");
		if (index < 0) {
			throw new IllegalStateException(
					"did not find the separator after the simplified characters in: "
							+ line);
		}
		result.setSimplified(remaining.substring(0, index));
		remaining = remaining.substring(index).trim();

		// third field: [pinyin]
		if (!remaining.startsWith("[")) {
			throw new IllegalStateException("did not find pinyin in: "
					+ remaining);
		}
		index = remaining.indexOf("]");
		if (index < 0) {
			throw new IllegalStateException(
					"did not find the end of the pinyin in: " + line);
		}
		result.setPinyinTranslations(new ArrayList<Pair<String, List<String>>>());

		String pinyin = remaining.substring(1, index);
		remaining = remaining.substring(index + 1).trim();
		List<String> translations = new ArrayList<String>();
		result.getPinyinTranslations().add(
				new ImmutablePair<String, List<String>>(pinyin, translations));

		// rest: /translation/translation/...
		if (!remaining.startsWith("/")) {
			throw new IllegalStateException("did not find translations");
		}
		// "remaining" always starts with the slash that opens the next
		// translation; a lone trailing slash ends the loop.
		while (remaining.length() > 1) {
			index = remaining.indexOf("/", 1);
			String translation;
			if (index > 0) {
				translation = remaining.substring(1, index).trim();
				remaining = remaining.substring(index).trim();
			} else {
				// the last translation is not terminated by a slash
				translation = remaining.substring(1).trim();
				remaining = "";
			}

			// split other pinyins
			// Example
			// 瓦 瓦 [wa4] / decken; Bsp.: 瓦刀 瓦刀 -- Kittmesser; Bsp.: 瓦瓦 瓦瓦 -- ein
			// Dach mit Ziegeln decken / wa3: Dachziegel (S)/ Ziegel/ xyz4: test
			int colon = translation.indexOf(":");
			// the text before the colon must be one word ending with a tone
			// number to be treated as pinyin
			if (colon > 0 && isToneNumber(translation.charAt(colon - 1))
					&& !translation.substring(0, colon).contains(" ")) {
				pinyin = translation.substring(0, colon);
				translation = translation.substring(colon + 1).trim();
				translations = new ArrayList<String>();
				result.getPinyinTranslations().add(
						new ImmutablePair<String, List<String>>(pinyin,
								translations));
			}

			translations.add(mergeTraditionalAndSimplified(translation));
		}

		return result;
	}

	/**
	 * Checks whether a character is one of the tone numbers 1 to 5.
	 * 
	 * @param character
	 *            will be inspected
	 * @return true if the character is '1', '2', '3', '4' or '5'
	 */
	private static boolean isToneNumber(char character) {
		return character >= '1' && character <= '5';
	}

	/**
	 * Rewrites "traditional simplified text" to "(traditional) simplified text"
	 * and removes the duplicate when the traditional is the simplified text.
	 * <p>
	 * Example: {@code 一刀紙 一刀纸 Bsp.: 一刀纸 一刀纸} becomes
	 * {@code (一刀紙) 一刀纸 Bsp.: 一刀纸}.
	 * <p>
	 * Two neighbouring words are treated as such a pair if they have the same
	 * length and both pass {@link #isChinese(String)}.
	 * 
	 * @param translation
	 *            the (trimmed) text of one translation
	 * @return the rewritten translation
	 */
	private static String mergeTraditionalAndSimplified(String translation) {
		String[] words = translation.split(" ");
		if (words.length == 0) {
			// only possible for a text consisting solely of spaces
			return "";
		}

		StringBuilder newTranslation = new StringBuilder();
		for (int i = 0; i < words.length - 1; i++) {
			String word = words[i];
			String wordAfter = words[i + 1];

			boolean chinesePair = (wordAfter.length() == word.length())
					&& isChinese(word) && isChinese(wordAfter);
			if (!chinesePair) {
				newTranslation.append(" ").append(word);
			} else if (!wordAfter.equals(word)) {
				// hopefully these are the traditional and simplified
				// versions; word is the traditional one, the simplified one
				// is handled in the next step
				newTranslation.append(" (").append(word).append(")");
			}
			// else: duplicate word, it is written once in the next step
		}

		// the last word has no successor and is always kept
		newTranslation.append(" ").append(words[words.length - 1]);

		return newTranslation.toString().trim();
	}

	/**
	 * Checks whether all characters of the word may be chinese. TODO: Maybe
	 * better algorithm: return true if at least one character is chinese.
	 * 
	 * @param word
	 *            will be inspected
	 * @return false if one non-chinese character is found, true otherwise
	 *         (also for the empty word)
	 */
	private static boolean isChinese(String word) {
		for (int i = word.length() - 1; i > -1; i--) {
			if (word.charAt(i) < FIRST_POSSIBLY_CHINESE_CHAR) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Checks whether the text consists of exactly one character. Unicode code
	 * points are counted, so a character that needs a surrogate pair is still
	 * one character.
	 * 
	 * @param text
	 *            will be inspected, may be null
	 * @return true if the text is exactly one character long
	 */
	private static boolean isSingleCharacter(String text) {
		return text != null && text.codePointCount(0, text.length()) == 1;
	}

	/**
	 * Parses one cedict-text.
	 * 
	 * 
	 * @param rawText
	 *            the whole cedict-file
	 * @param onlyOneCharacterWords
	 *            if true only entries are returned whose simplified or
	 *            traditional word is exactly one character long
	 * @return the entries of the file.
	 * @throws Exception
	 *             if the text could not be read or parsed
	 */
	public static Collection<ChineseVocabularyEntry> parseDictionary(
			String rawText, boolean onlyOneCharacterWords)
			throws Exception {
		return parseDictionary(new StringReader(rawText), onlyOneCharacterWords);
	}

	/**
	 * Parses one cedict-text.
	 * <p>
	 * Empty lines and lines starting with '#' are skipped. Entries with the
	 * same simplified and traditional word are merged into the entry that was
	 * read first. The reader is wrapped into a {@link BufferedReader} and is
	 * not closed by this method.
	 * 
	 * @param rawText
	 *            supplies the whole cedict-file
	 * @param onlyOneCharacterWords
	 *            if true only entries are returned whose simplified or
	 *            traditional word is exactly one character long
	 * @return the entries of the file.
	 * @throws Exception
	 *             if the text could not be read or parsed
	 */
	public static Collection<ChineseVocabularyEntry> parseDictionary(Reader rawText, boolean onlyOneCharacterWords) throws Exception {
		// (simplified, traditional) -> entry
		Map<Pair<String, String>, ChineseVocabularyEntry> result = new HashMap<Pair<String, String>, ChineseVocabularyEntry>();
		BufferedReader reader = new BufferedReader(rawText);

		for (String line = reader.readLine(); line != null; line = reader.readLine()) {
			line = line.trim();
			if (line.isEmpty() || line.startsWith("#")) {
				// nothing to parse in empty lines and comments
				continue;
			}

			ChineseVocabularyEntry entry = parseEntry(line);
			if (onlyOneCharacterWords && !isSingleCharacter(entry.getSimplified())
					&& !isSingleCharacter(entry.getTraditional())) {
				continue;
			}

			Pair<String, String> simplifiedTraditional = new ImmutablePair<String, String>(entry.getSimplified(), entry.getTraditional());
			ChineseVocabularyEntry existingEntry = result.get(simplifiedTraditional);
			if (existingEntry == null) {
				result.put(simplifiedTraditional, entry);
			} else {
				// same word again: collect its pinyins and translations in
				// the entry that is already known
				existingEntry.getPinyinTranslations().addAll(entry.getPinyinTranslations());
			}
		}

		return result.values();
	}

	/**
	 * Adds a new entry to a dictionary.
	 * <p>
	 * Appends a line break followed by
	 * {@code traditional simplified [pinyin] /translation} to the text. Note
	 * that the traditional word is written first although the simplified word
	 * is the first word parameter, and that no closing slash is appended.
	 * 
	 * @param ceDictText
	 *            the dictionary
	 * @param simplified
	 *            the simplified chinese characters
	 * @param tradional
	 *            the traditional chinese characters
	 * @param pinyin
	 *            the pronunciation
	 * @param translation
	 *            the translation of the word
	 */
	public static void addEntry(StringBuilder ceDictText, String simplified,
			String tradional, String pinyin, String translation) {
		ceDictText.append("\n").append(tradional).append(" ")
				.append(simplified)
				.append(" [").append(pinyin).append("] /").append(translation);
	}
}
