package edu.emory.clir.clearnlp.component.mode.pos;

import edu.emory.clir.clearnlp.collection.map.ObjectIntHashMap;
import edu.emory.clir.clearnlp.collection.ngram.Bigram;
import edu.emory.clir.clearnlp.collection.pair.ObjectDoublePair;
import edu.emory.clir.clearnlp.dependency.DEPNode;
import edu.emory.clir.clearnlp.util.DSUtils;
import edu.emory.clir.clearnlp.util.Joiner;
import edu.emory.clir.clearnlp.util.StringUtils;
import edu.emory.clir.clearnlp.util.constant.StringConst;
import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/* loaded from: input_file:edu/emory/clir/clearnlp/component/mode/pos/POSLexicon.class */
/**
 * Lexicon collected while reading part-of-speech annotated trees.
 *
 * <p>Two kinds of statistics are gathered by {@link #collect(POSState)}:
 * <ul>
 *   <li>(simplified word form, POS tag) pairs, stored in {@code ambiguity_classes};</li>
 *   <li>the set of lower-cased simplified word forms seen in the current "document",
 *       where a document is a run of {@code document_size} consecutive trees. Each time a
 *       document is closed its forms are added to {@code document_frequencies}.</li>
 * </ul>
 *
 * <p>{@link #finalizeCollect()} then builds, for every form that passes
 * {@link #includeForm(String)}, a single feature string stored in
 * {@code ambiguity_class_features} and retrievable through
 * {@link #getAmbiguityClassFeature(String)}.
 *
 * <p>The class performs no synchronization of its own.
 */
public class POSLexicon implements Serializable {
    private static final long serialVersionUID = 8363531867786160098L;

    /** Per-form counts, updated with the forms of each closed document. */
    private final ObjectIntHashMap<String> document_frequencies = new ObjectIntHashMap<>();
    /** Maps a simplified word form to its joined ambiguity-class feature string. */
    private final Map<String, String> ambiguity_class_features = new HashMap<>();
    /** (simplified word form, POS tag) observations. */
    private final Bigram<String, String> ambiguity_classes = new Bigram<>();
    private String[] word_vector_paths;
    /** Lower-cased forms seen in the document currently being collected. */
    private Set<String> document;
    private double ambiguity_class_threshold;
    private int document_frequency_cutoff;
    /** Number of trees that make up one document. */
    private int document_size;
    /** Number of trees collected into the current document so far. */
    private int tree_count;

    /**
     * Creates an empty lexicon and copies the ambiguity-class threshold, the
     * document-frequency cutoff and the document size from the configuration.
     *
     * @param pOSConfiguration source of the three settings above.
     */
    public POSLexicon(POSConfiguration pOSConfiguration) {
        initDocument();
        setAmbiguityClassThreshold(pOSConfiguration.getAmbiguityClassThreshold());
        setDocumentFrequencyCutoff(pOSConfiguration.getDocumentFrequencyCutoff());
        setDocumentSize(pOSConfiguration.getDocumentSize());
    }

    /**
     * Records every node of the state's tree: its (simplified word form, POS tag) pair is
     * added to the ambiguity classes and its lower-cased form to the current document.
     *
     * <p>After the tree is processed the tree counter is incremented; when it becomes equal
     * to the configured document size the current document is closed and a new one started.
     * Because the comparison is an equality test, a document size of zero or less never
     * triggers this automatic roll-over; the pending document is then only flushed by
     * {@link #finalizeCollect()}.
     *
     * @param pOSState state whose tree is to be collected.
     */
    public void collect(POSState pOSState) {
        Iterator<DEPNode> nodes = pOSState.getTree().iterator();
        while (nodes.hasNext()) {
            DEPNode node = nodes.next();
            String form = node.getSimplifiedWordForm();
            this.ambiguity_classes.add(form, node.getPOSTag());
            this.document.add(StringUtils.toLowerCase(form));
        }
        if (++this.tree_count == this.document_size) {
            initDocument();
        }
    }

    /**
     * Closes the current document (if any) by adding its forms to the document
     * frequencies, then starts a fresh empty document and resets the tree counter.
     */
    private void initDocument() {
        // document is null only on the very first call, made from the constructor.
        if (this.document != null) {
            this.document_frequencies.addAll(this.document);
        }
        this.document = new HashSet<>();
        this.tree_count = 0;
    }

    /**
     * Flushes the pending document and builds the ambiguity-class feature of every
     * collected form that passes {@link #includeForm(String)}.
     */
    public void finalizeCollect() {
        // NOTE(review): getBigramSet() presumably returns the first elements (word forms)
        // of the collected pairs -- confirm against Bigram.
        finalizeCollect(this.ambiguity_classes.getBigramSet());
    }

    /**
     * Builds the ambiguity-class features for the given forms.
     *
     * <p>For each form whose lower-cased variant passes {@link #includeForm(String)}, the
     * entries returned by {@code ambiguity_classes.toList(form, threshold)} are sorted with
     * {@code DSUtils.sortReverseOrder} and joined with {@code StringConst.UNDERSCORE}; the
     * result is stored under the form exactly as given (not lower-cased). Forms for which
     * the list is empty get no feature.
     *
     * @param set forms to build features for.
     */
    private void finalizeCollect(Set<String> set) {
        // Make sure the forms of the last, possibly partial, document are counted.
        initDocument();
        for (String form : set) {
            if (!includeForm(StringUtils.toLowerCase(form))) {
                continue;
            }
            List<ObjectDoublePair<String>> tags =
                    this.ambiguity_classes.toList(form, this.ambiguity_class_threshold);
            if (tags.isEmpty()) {
                continue;
            }
            DSUtils.sortReverseOrder(tags);
            this.ambiguity_class_features.put(form, Joiner.joinObject(tags, StringConst.UNDERSCORE));
        }
    }

    /**
     * @param str simplified word form, in the casing used during collection.
     * @return the ambiguity-class feature stored for the form, or {@code null} if none was
     *         stored.
     */
    public String getAmbiguityClassFeature(String str) {
        return this.ambiguity_class_features.get(str);
    }

    /** @return the word-vector paths last set, or {@code null} if never set. */
    public String[] getWordVectorPaths() {
        return this.word_vector_paths;
    }

    /**
     * @param str form to look up in the document frequencies; callers in this class pass
     *            the lower-cased form.
     * @return {@code true} if the value stored for the form is strictly greater than the
     *         document-frequency cutoff.
     */
    public boolean includeForm(String str) {
        return this.document_frequencies.get(str) > this.document_frequency_cutoff;
    }

    /** @param d threshold passed to {@code Bigram.toList} when building features. */
    public void setAmbiguityClassThreshold(double d) {
        this.ambiguity_class_threshold = d;
    }

    /** @param i value a form's document frequency must exceed to be included. */
    public void setDocumentFrequencyCutoff(int i) {
        this.document_frequency_cutoff = i;
    }

    /** @param i number of collected trees that make up one document. */
    public void setDocumentSize(int i) {
        this.document_size = i;
    }

    /** @param strArr word-vector paths; the array is stored as-is, not copied. */
    public void setWordVectorPaths(String[] strArr) {
        this.word_vector_paths = strArr;
    }
}
