package net.sf.okapi.connectors.moses;

import com.ibm.icu.text.RuleBasedBreakIterator;
import java.util.regex.Pattern;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.StreamUtil;
import net.sf.okapi.common.Util;

/* loaded from: input_file:net/sf/okapi/connectors/moses/SimpleTokenizer.class */
/**
 * Splits text into space-separated tokens using an ICU rule-based break iterator
 * whose rules are read from the {@code /word_break_rules.txt} classpath resource.
 *
 * <p>Each instance owns a single iterator that {@link #tokenize(String)} re-targets on
 * every call without any synchronization, so one instance should not be used by several
 * threads at once unless the callers serialize access themselves.
 */
public class SimpleTokenizer {
    /** Classpath location of the break rules handed to the iterator. */
    private static final String RULES_RESOURCE = "/word_break_rules.txt";

    /** Character encoding used to decode the rules resource. */
    private static final String RULES_ENCODING = "UTF-8";

    /** Matches a run of whitespace; compiled once instead of on every {@code tokenize} call. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /** Iterator that reports the token boundaries; its text is replaced on each call. */
    private final RuleBasedBreakIterator wordIterator;

    /**
     * Creates a tokenizer by loading the break rules from the classpath.
     *
     * @param localeId currently unused: the same rules resource is loaded for every locale
     */
    public SimpleTokenizer(LocaleId localeId) {
        // NOTE(review): getResourceAsStream returns null when the resource is missing; how
        // StreamUtil.streamAsString reacts to that (and whether it closes the stream) is not
        // visible from this class - confirm against StreamUtil.
        String rules = StreamUtil.streamAsString(
                SimpleTokenizer.class.getResourceAsStream(RULES_RESOURCE), RULES_ENCODING);
        this.wordIterator = new RuleBasedBreakIterator(rules);
    }

    /**
     * Inserts a single space at every boundary reported by the break iterator and then
     * collapses each run of whitespace into one space.
     *
     * <p>Whitespace in the input is collapsed but not trimmed.
     *
     * @param str the text to tokenize
     * @return the tokenized text, or {@code str} itself when {@code Util.isEmpty(str)} is true
     *         (presumably for {@code null} or zero-length input - confirm against Util)
     */
    public String tokenize(String str) {
        if (Util.isEmpty(str)) {
            return str;
        }

        wordIterator.setText(str);
        StringBuilder tokens = new StringBuilder(str.length());

        int start = 0;
        // DONE is inherited from BreakIterator and marks the end of the boundary sequence.
        for (int end = wordIterator.next();
                end != RuleBasedBreakIterator.DONE;
                end = wordIterator.next()) {
            // Copy the segment straight from the source text; no temporary substring needed.
            tokens.append(str, start, end).append(' ');
            start = end;
        }

        // Drop the separator appended after the last segment. The guard keeps this safe
        // even if the iterator reports no boundary at all.
        if (tokens.length() > 0) {
            tokens.setLength(tokens.length() - 1);
        }
        return WHITESPACE_RUN.matcher(tokens).replaceAll(" ");
    }
}
