package edu.emory.clir.clearnlp.experiment;

import edu.emory.clir.clearnlp.collection.ngram.Unigram;
import edu.emory.clir.clearnlp.collection.pair.ObjectIntPair;
import edu.emory.clir.clearnlp.constituent.CTLib;
import edu.emory.clir.clearnlp.constituent.CTNode;
import edu.emory.clir.clearnlp.constituent.CTReader;
import edu.emory.clir.clearnlp.constituent.CTTagEn;
import edu.emory.clir.clearnlp.constituent.CTTree;
import edu.emory.clir.clearnlp.pos.POSTagEn;
import edu.emory.clir.clearnlp.util.DSUtils;
import edu.emory.clir.clearnlp.util.IOUtils;
import edu.emory.clir.clearnlp.util.StringUtils;
import edu.emory.clir.clearnlp.util.constant.StringConst;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.List;

/* loaded from: input_file:edu/emory/clir/clearnlp/experiment/MWEExtractor.class */
/**
 * Collects the word sequences that surround numeric tokens inside QP constituents of a
 * constituent treebank and writes their counts to two files next to the input:
 * {@code <input>.qp_pre} (words before the first CD/$ token of a QP) and
 * {@code <input>.qp_post} (words after the last CD token of a QP).
 */
public class MWEExtractor {
    /**
     * Reads every tree from the given treebank file, gathers QP prefix/suffix counts, and
     * writes them to {@code treeFile + ".qp_pre"} and {@code treeFile + ".qp_post"}.
     *
     * @param treeFile path of the constituent treebank file to read.
     */
    public void extract(String treeFile) {
        // NOTE(review): the reader is never closed here; CTReader's API is not visible from
        // this file, so confirm whether it exposes a close() that should be called.
        CTReader reader = new CTReader(IOUtils.createFileInputStream(treeFile));
        Unigram<String> prefixCounts = new Unigram<>();
        Unigram<String> suffixCounts = new Unigram<>();

        // nextTree() returning null marks the end of the input.
        for (CTTree tree = reader.nextTree(); tree != null; tree = reader.nextTree()) {
            extract(tree.getRoot(), prefixCounts, suffixCounts);
        }

        printMap(prefixCounts, treeFile + ".qp_pre");
        printMap(suffixCounts, treeFile + ".qp_post");
    }

    /**
     * Writes one {@code "<key> <count>"} line per entry of {@code counts} to {@code outputFile},
     * after passing the entries through {@code DSUtils.sortReverseOrder}.
     *
     * @param counts     the counter to dump.
     * @param outputFile path of the file to write.
     */
    private void printMap(Unigram<String> counts, String outputFile) {
        // try-with-resources guarantees the stream is closed even if sorting or printing throws.
        try (PrintStream out = IOUtils.createBufferedPrintStream(outputFile)) {
            // presumably 0 is a minimum-count cutoff, i.e. keep every entry; verify against Unigram.toList
            List<ObjectIntPair<String>> entries = counts.toList(0);
            DSUtils.sortReverseOrder(entries);

            for (ObjectIntPair<String> entry : entries) {
                out.println(entry.o + " " + entry.i);
            }
        }
    }

    /**
     * Walks the subtree rooted at {@code node}. A QP node is handed to the QP extraction and
     * is not descended into any further; any other node has each of its children visited.
     *
     * @param node         root of the subtree to scan.
     * @param prefixCounts counter receiving the words preceding the first CD/$ token of a QP.
     * @param suffixCounts counter receiving the words following the last CD token of a QP.
     */
    public void extract(CTNode node, Unigram<String> prefixCounts, Unigram<String> suffixCounts) {
        if (node.isConstituentTag(CTTagEn.C_QP)) {
            extractQP(node, prefixCounts, suffixCounts);
            return;
        }

        for (CTNode child : node.getChildrenList()) {
            extract(child, prefixCounts, suffixCounts);
        }
    }

    /**
     * Records the lower-cased, space-joined forms of the tokens before the first CD/$ token
     * and of the tokens after the last CD token of the given QP node. Nothing is recorded
     * for a side that has no such tokens (or when no matching token exists).
     */
    private void extractQP(CTNode node, Unigram<String> prefixCounts, Unigram<String> suffixCounts) {
        List<CTNode> tokens = node.getTokenList();
        int size = tokens.size();

        // Prefix: stop at the FIRST CD/$ token. Without the break the scan would never advance
        // past that token and the loop would spin forever.
        for (int i = 0; i < size; i++) {
            if (tokens.get(i).isConstituentTagAny(POSTagEn.POS_CD, POSTagEn.POS_DOLLAR)) {
                if (i > 0) {
                    prefixCounts.add(StringUtils.toLowerCase(CTLib.toForms(tokens, 0, i, StringConst.SPACE)));
                }
                break;
            }
        }

        // Suffix: scan backwards and stop at the LAST CD token (only CD is checked on this side).
        for (int i = size - 1; i >= 0; i--) {
            if (tokens.get(i).isConstituentTag(POSTagEn.POS_CD)) {
                if (i + 1 < size) {
                    suffixCounts.add(StringUtils.toLowerCase(CTLib.toForms(tokens, i + 1, size, StringConst.SPACE)));
                }
                return;
            }
        }
    }

    /** Runs the extraction on a fixed, developer-local OntoNotes parse file. */
    public static void main(String[] strArr) {
        new MWEExtractor().extract("/Users/jdchoi/Documents/Data/ontonotes/data/english/onto.parse");
    }
}
