package edu.emory.clir.clearnlp.bin;

import edu.emory.clir.clearnlp.component.utils.NLPUtils;
import edu.emory.clir.clearnlp.constituent.CTTagEn;
import edu.emory.clir.clearnlp.tokenization.AbstractTokenizer;
import edu.emory.clir.clearnlp.util.BinUtils;
import edu.emory.clir.clearnlp.util.FileUtils;
import edu.emory.clir.clearnlp.util.IOUtils;
import edu.emory.clir.clearnlp.util.Joiner;
import edu.emory.clir.clearnlp.util.constant.StringConst;
import edu.emory.clir.clearnlp.util.lang.TLanguage;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.List;
import org.kohsuke.args4j.Option;

/* loaded from: input_file:edu/emory/clir/clearnlp/bin/Tokenize.class */
/**
 * Command-line driver that tokenizes text files and writes the result next to each input.
 *
 * <p>For every input file {@code f} selected by {@code -i}/{@code -ie}, the output is written to
 * {@code f + "." + <output extension>}. Each output line holds the tokens of one unit (a segment
 * produced by the tokenizer, or one input line when {@code -line} is set) joined by
 * {@link StringConst#SPACE}.
 */
public class Tokenize {

    /** Path handed to {@code FileUtils.getFileList} to collect the input files. */
    @Option(name = "-i", usage = "input path (required)", required = true, metaVar = "<filepath>")
    private String s_inputPath;

    /** Language name resolved through {@code TLanguage.getType}; defaults to English. */
    @Option(name = "-l", usage = "language (default: english)", required = false, metaVar = "<language>")
    private String s_language = TLanguage.ENGLISH.toString();

    /**
     * Extension filter handed to {@code FileUtils.getFileList}.
     * NOTE(review): the usage text advertises "*" as the default, so {@code CTTagEn.E_NULL}
     * presumably equals "*" (likely a decompiler constant substitution) - confirm.
     */
    @Option(name = "-ie", usage = "input file extension (default: *)", required = false, metaVar = "<regex>")
    private String s_inputExt = CTTagEn.E_NULL;

    /** Extension appended (after a dot) to each input path to form the output path. */
    @Option(name = "-oe", usage = "output file extension (default: tok)", required = false, metaVar = "<string>")
    private String s_outputExt = "tok";

    /** When set, every input line is tokenized on its own instead of segmenting the whole file. */
    @Option(name = "-line", usage = "if set, treat each line as one sentence", required = false, metaVar = "<boolean>")
    private boolean b_line = false;

    /** Creates an instance with default option values; performs no work. */
    public Tokenize() {
    }

    /**
     * Parses the command-line arguments and tokenizes every selected input file.
     *
     * <p>Each input path is echoed to standard output before it is processed. An
     * {@link IOException} raised while listing or processing files is printed via
     * {@code printStackTrace()} and ends the run; files after the failing one are not processed.
     *
     * @param strArr the raw command-line arguments
     */
    public Tokenize(String[] strArr) {
        BinUtils.initArgs(strArr, this);
        try {
            AbstractTokenizer tokenizer = NLPUtils.getTokenizer(TLanguage.getType(this.s_language));
            for (String inputFile : FileUtils.getFileList(this.s_inputPath, this.s_inputExt, false)) {
                System.out.println(inputFile);
                String outputFile = inputFile + "." + this.s_outputExt;
                if (this.b_line) {
                    tokenizeLines(tokenizer, inputFile, outputFile);
                } else {
                    tokenize(tokenizer, inputFile, outputFile);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Segments the whole input file with the tokenizer and writes one segment per output line.
     *
     * @param abstractTokenizer the tokenizer whose {@code segmentize} is applied to the input stream
     * @param str path of the input file
     * @param str2 path of the output file
     * @throws IOException if a stream cannot be opened or closed
     */
    public void tokenize(AbstractTokenizer abstractTokenizer, String str, String str2) throws IOException {
        // try-with-resources closes both streams even when segmentation or opening the
        // output stream fails; previously they were closed only on the success path.
        try (FileInputStream in = IOUtils.createFileInputStream(str);
             PrintStream out = IOUtils.createBufferedPrintStream(str2)) {
            for (List<String> tokens : abstractTokenizer.segmentize(in)) {
                out.println(Joiner.join(tokens, StringConst.SPACE));
            }
        }
    }

    /**
     * Tokenizes the input file line by line and writes one output line per input line.
     *
     * @param abstractTokenizer the tokenizer whose {@code tokenize} is applied to each line
     * @param str path of the input file
     * @param str2 path of the output file
     * @throws IOException if a stream cannot be opened, read, or closed
     */
    public void tokenizeLines(AbstractTokenizer abstractTokenizer, String str, String str2) throws IOException {
        // try-with-resources closes reader and writer on every exit path, including a
        // failure while reading or tokenizing a line.
        try (BufferedReader reader = IOUtils.createBufferedReader(str);
             PrintStream out = IOUtils.createBufferedPrintStream(str2)) {
            String line;
            while ((line = reader.readLine()) != null) {
                out.println(Joiner.join(abstractTokenizer.tokenize(line), StringConst.SPACE));
            }
        }
    }

    /**
     * Program entry point; all work happens in {@link #Tokenize(String[])}.
     *
     * @param strArr the raw command-line arguments
     */
    public static void main(String[] strArr) {
        new Tokenize(strArr);
    }
}
