package edu.umd.hooka;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

/* loaded from: input_file:edu/umd/hooka/HBitextCompiler.class */
public class HBitextCompiler {
    // Job-configuration keys shared by main() (which sets them) and
    // BitextCompilerMapper.configure() (which reads them).

    // Base path of the compiled output: the mapper writes the sequence file at
    // this path and derives the ".voc.e", ".voc.f" and ".metadata" files from it.
    static final String OUTPUT_BASENAME = "bitextcomp.outputbasename";
    // Path of the text file the mapper reads as the "e" side (one sentence per line).
    static final String EN_PATH = "bitextcomp.enpath";
    // Path of the text file the mapper reads as the "f" side (one sentence per line).
    static final String FR_PATH = "bitextcomp.frpath";
    // Path of the optional alignment file; a missing or empty value disables alignments.
    static final String AL_PATH = "bitextcomp.alpath";

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:edu/umd/hooka/HBitextCompiler$BitextCompilerCounters.class */
    /**
     * Hadoop counters reported by {@link BitextCompilerMapper} while it
     * compiles a bitext.
     */
    public enum BitextCompilerCounters {
        /** Incremented by the word count of each successfully compiled "e"-side sentence. */
        EN_WORDS,

        /** Incremented by the word count of each successfully compiled "f"-side sentence. */
        FR_WORDS,

        /** Incremented once for every line read from the "e"-side file. */
        LINES,

        /** Incremented once for every sentence pair whose processing threw an exception. */
        ENCODING_ERRORS;
    }

    /* loaded from: input_file:edu/umd/hooka/HBitextCompiler$BitextCompilerMapper.class */
    /**
     * Mapper that compiles a line-aligned parallel corpus into a
     * {@link SequenceFile} of {@link PhrasePair}s plus vocabulary and metadata
     * side files.
     *
     * <p>The incoming key/value record is never read: every call to
     * {@link #map} performs the complete compilation of the files named in the
     * job configuration. NOTE(review): the job input is presumably expected to
     * contain exactly one record, otherwise the work is repeated per record.
     *
     * <p>The explicit erased {@code map(Object, Object, ...)} bridge method
     * emitted by the decompiler has been removed; javac generates that bridge
     * itself, and declaring it by hand is a name clash with
     * {@code Mapper.map}.
     */
    public static class BitextCompilerMapper extends MapReduceBase implements Mapper<LongWritable, Text, LongWritable, Text> {
        /** Base path for all outputs, read from {@link HBitextCompiler#OUTPUT_BASENAME}. */
        String outputBase = null;
        /** "f"-side input file, read from {@link HBitextCompiler#FR_PATH}. */
        Path pf = null;
        /** "e"-side input file, read from {@link HBitextCompiler#EN_PATH}. */
        Path pe = null;
        /** Optional alignment file; stays {@code null} when none is configured. */
        Path pa = null;

        /**
         * Reads the output base name and input paths from the job configuration.
         *
         * @param jobConf job configuration carrying the {@code bitextcomp.*} keys
         */
        public void configure(JobConf jobConf) {
            this.outputBase = jobConf.get(HBitextCompiler.OUTPUT_BASENAME);
            this.pe = new Path(jobConf.get(HBitextCompiler.EN_PATH));
            this.pf = new Path(jobConf.get(HBitextCompiler.FR_PATH));
            String alignmentPath = jobConf.get(HBitextCompiler.AL_PATH);
            // A missing or empty alignment path means "compile without alignments".
            if (alignmentPath != null && !alignmentPath.isEmpty()) {
                this.pa = new Path(alignmentPath);
            }
        }

        /**
         * Compiles the configured bitext and emits a single {@code (0, "done")}
         * record on success.
         *
         * <p>Outputs, all derived from the configured base name: the sequence
         * file itself, {@code .voc.e}, {@code .voc.f} and {@code .metadata}.
         *
         * @param key      ignored
         * @param value    ignored
         * @param output   receives the {@code (0, "done")} completion record
         * @param reporter used for progress reporting and counters
         * @throws IOException      if reading the inputs or writing the outputs fails
         * @throws RuntimeException if the input files do not have the same number of lines
         */
        public void map(LongWritable key, Text value, OutputCollector<LongWritable, Text> output, Reporter reporter) throws IOException {
            // NOTE(review): a fresh Configuration is used rather than the JobConf
            // passed to configure(); kept as in the original - confirm this is intended.
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(conf);
            VocabularyWritable enVocab = new VocabularyWritable();
            VocabularyWritable frVocab = new VocabularyWritable();

            int lineCount = compileSentencePairs(fs, conf, enVocab, frVocab, reporter);

            writeVocabulary(fs, new Path(this.outputBase + ".voc.e"), enVocab);
            writeVocabulary(fs, new Path(this.outputBase + ".voc.f"), frVocab);
            writeMetadata(fs, new Path(this.outputBase + ".metadata"),
                    new Metadata(lineCount, enVocab.size(), frVocab.size()));

            output.collect(new LongWritable(0L), new Text("done"));
        }

        /**
         * Reads the input files in lock-step and appends one {@link PhrasePair}
         * per line to the sequence file at {@link #outputBase}.
         *
         * @return the number of lines read from the "e"-side file
         */
        private int compileSentencePairs(FileSystem fs, Configuration conf, VocabularyWritable enVocab,
                VocabularyWritable frVocab, Reporter reporter) throws IOException {
            SequenceFile.Writer writer = null;
            BufferedReader enReader = null;
            BufferedReader frReader = null;
            BufferedReader alReader = null;
            try {
                writer = SequenceFile.createWriter(fs, conf, new Path(this.outputBase), IntWritable.class, PhrasePair.class);
                enReader = openUtf8(fs, this.pe);
                frReader = openUtf8(fs, this.pf);
                if (this.pa != null) {
                    alReader = openUtf8(fs, this.pa);
                }

                // Presumably touches the counter so it is reported even when no line fails.
                reporter.incrCounter(BitextCompilerCounters.ENCODING_ERRORS, 0L);

                IntWritable lineKey = new IntWritable(0);
                int lineNo = 0;
                String enLine;
                while ((enLine = enReader.readLine()) != null) {
                    lineNo++;
                    if (lineNo % 100 == 0) {
                        reporter.progress();
                    }
                    reporter.incrCounter(BitextCompilerCounters.LINES, 1L);

                    String frLine = frReader.readLine();
                    if (frLine == null) {
                        throw new RuntimeException(this.pf + " has fewer lines than " + this.pe);
                    }

                    // Consume the alignment line here, outside the try below, so a
                    // sentence that fails to parse cannot leave the alignment reader
                    // one line behind and misalign every following pair.
                    String alLine = null;
                    if (alReader != null) {
                        alLine = alReader.readLine();
                        if (alLine == null) {
                            throw new RuntimeException(this.pa + " has fewer lines than " + this.pe);
                        }
                    }

                    try {
                        Phrase enPhrase = Phrase.fromString(0, enLine, enVocab);
                        Phrase frPhrase = Phrase.fromString(1, frLine, frVocab);
                        PhrasePair pair = new PhrasePair(frPhrase, enPhrase);
                        if (alReader != null) {
                            pair.setAlignment(new Alignment(frPhrase.size(), enPhrase.size(), alLine));
                        }
                        lineKey.set(lineNo);
                        writer.append(lineKey, pair);
                        reporter.incrCounter(BitextCompilerCounters.EN_WORDS, enPhrase.getWords().length);
                        reporter.incrCounter(BitextCompilerCounters.FR_WORDS, frPhrase.getWords().length);
                        reporter.progress();
                    } catch (Exception e) {
                        // NOTE(review): deliberately broad, as in the original - any failure
                        // for this pair (including one from append) is counted and skipped.
                        System.err.println("\nAt line " + lineNo + " caught: " + e);
                        reporter.incrCounter(BitextCompilerCounters.ENCODING_ERRORS, 1L);
                    }
                }

                if (frReader.readLine() != null) {
                    throw new RuntimeException(this.pf + " has more lines than " + this.pe);
                }

                // Close on the success path outside the quiet cleanup so that a
                // failure to finish the output file is reported to the caller.
                SequenceFile.Writer finished = writer;
                writer = null;
                finished.close();
                return lineNo;
            } finally {
                closeQuietly(alReader);
                closeQuietly(frReader);
                closeQuietly(enReader);
                if (writer != null) {
                    // Only reached when an exception is already propagating.
                    try {
                        writer.close();
                    } catch (IOException e) {
                        System.err.println("Failed to close sequence file writer: " + e);
                    }
                }
            }
        }

        /** Opens {@code path} on {@code fs} as a buffered UTF-8 character stream. */
        private static BufferedReader openUtf8(FileSystem fs, Path path) throws IOException {
            return new BufferedReader(new InputStreamReader(fs.open(path), "UTF8"));
        }

        /** Closes {@code reader} if non-null, logging rather than propagating a close failure. */
        private static void closeQuietly(BufferedReader reader) {
            if (reader == null) {
                return;
            }
            try {
                reader.close();
            } catch (IOException e) {
                System.err.println("Failed to close reader: " + e);
            }
        }

        /** Serializes {@code vocab} to {@code target}, always closing the stream. */
        private static void writeVocabulary(FileSystem fs, Path target, VocabularyWritable vocab) throws IOException {
            DataOutputStream out = new DataOutputStream(new BufferedOutputStream(fs.create(target)));
            try {
                vocab.write(out);
            } finally {
                out.close();
            }
        }

        /** Java-serializes {@code metadata} to {@code target}, always closing the stream. */
        private static void writeMetadata(FileSystem fs, Path target, Metadata metadata) throws IOException {
            ObjectOutputStream out = new ObjectOutputStream(new BufferedOutputStream(fs.create(target)));
            try {
                out.writeObject(metadata);
            } finally {
                out.close();
            }
        }
    }

    /**
     * Configures and runs the map-only {@code bitext.compile} job with a
     * single map task.
     *
     * <p>Optional positional arguments override the built-in defaults:
     * <ol>
     *   <li>output base name (default {@code /shared/bitexts/ep700k+nc.de-en/ep700k+nc})</li>
     *   <li>"f"-side input path (default {@code filt.lc.de})</li>
     *   <li>"e"-side input path (default {@code filt.lc.en})</li>
     *   <li>alignment path (default empty, i.e. no alignments)</li>
     * </ol>
     * The job's own input is the path {@code dummy} and its output directory
     * {@code dummy.out} is deleted before the run. The process exits with
     * status 1 if the job fails with an {@link IOException}.
     *
     * @param args optional overrides as described above; may be empty
     */
    public static void main(String[] args) {
        JobConf jobConf = new JobConf(HBitextCompiler.class);
        jobConf.set(OUTPUT_BASENAME, argOrDefault(args, 0, "/shared/bitexts/ep700k+nc.de-en/ep700k+nc"));
        jobConf.set(FR_PATH, argOrDefault(args, 1, "filt.lc.de"));
        jobConf.set(EN_PATH, argOrDefault(args, 2, "filt.lc.en"));
        jobConf.set(AL_PATH, argOrDefault(args, 3, ""));
        jobConf.setJobName("bitext.compile");
        jobConf.setOutputKeyClass(LongWritable.class);
        jobConf.setOutputValueClass(Text.class);
        jobConf.setMapperClass(BitextCompilerMapper.class);
        jobConf.setNumMapTasks(1);
        jobConf.setNumReduceTasks(0);
        FileInputFormat.setInputPaths(jobConf, new Path("dummy"));
        Path jobOutput = new Path("dummy.out");
        try {
            // Remove any previous output directory; recursive delete matches
            // what the deprecated single-argument delete(Path) did.
            FileSystem.get(jobConf).delete(jobOutput, true);
            FileOutputFormat.setOutputPath(jobConf, jobOutput);
            jobConf.setOutputFormat(SequenceFileOutputFormat.class);
            JobClient.runJob(jobConf);
        } catch (IOException e) {
            System.err.println("Caught " + e);
            e.printStackTrace();
            // Signal the failure to the calling shell instead of exiting with 0.
            System.exit(1);
        }
    }

    /** Returns {@code args[index]} when present and non-null, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        if (args == null || index >= args.length || args[index] == null) {
            return fallback;
        }
        return args[index];
    }
}
