package edu.umd.cloud9.collection.wikipedia;

import edu.umd.hooka.Vocab;
import java.io.IOException;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

/* loaded from: input_file:edu/umd/cloud9/collection/wikipedia/RepackWikipedia.class */
public class RepackWikipedia extends Configured implements Tool {
    /** Logger shared by the driver and the mapper. */
    private static final Logger LOG = Logger.getLogger(RepackWikipedia.class);
    /** Job configuration key under which the docno mapping file path is stored by run() and read back by the mapper. */
    private static final String DOCNO_MAPPING_FIELD = "DocnoMappingDataFile";
    // Names of the command-line options.
    private static final String INPUT_OPTION = "input";
    private static final String OUTPUT_OPTION = "output";
    private static final String MAPPING_FILE_OPTION = "mapping_file";
    private static final String COMPRESSION_TYPE_OPTION = "compression_type";
    private static final String LANGUAGE_OPTION = "wiki_language";

    /* loaded from: input_file:edu/umd/cloud9/collection/wikipedia/RepackWikipedia$MyMapper.class */
    /**
     * Mapper that re-keys every {@link WikipediaPage} by the docno obtained from the
     * docno mapping file named in the job configuration.
     *
     * <p>Every input record increments {@link Records#TOTAL}. Pages whose docid is
     * {@code null}, or for which the mapping returns a negative docno, are dropped
     * without being emitted.
     */
    private static class MyMapper extends MapReduceBase implements Mapper<LongWritable, WikipediaPage, IntWritable, WikipediaPage> {
        /** Reusable output key; overwritten for each emitted page. */
        private static final IntWritable docno = new IntWritable();
        /** Docid-to-docno mapping, populated in {@link #configure(JobConf)}. */
        private static final WikipediaDocnoMapping docnoMapping = new WikipediaDocnoMapping();
        /** Value of the "wiki.language" job setting, or {@code null} when it was not set. */
        String language;

        private MyMapper() {
        }

        /**
         * Loads the docno mapping from the path stored under the docno mapping
         * configuration key and reads the optional "wiki.language" setting.
         *
         * @param jobConf the job configuration
         * @throws RuntimeException if the mapping file is missing or cannot be loaded;
         *         the underlying failure is attached as the cause
         */
        public void configure(JobConf jobConf) {
            try {
                Path path = new Path(jobConf.get(RepackWikipedia.DOCNO_MAPPING_FIELD));
                RepackWikipedia.LOG.info("Loading docno mapping: " + path);
                FileSystem fileSystem = FileSystem.get(jobConf);
                if (!fileSystem.exists(path)) {
                    throw new RuntimeException(path + " does not exist!");
                }
                docnoMapping.loadMapping(path, fileSystem);
                this.language = jobConf.get("wiki.language");
            } catch (Exception e) {
                // Keep the original failure as the cause so the task log shows why loading failed.
                throw new RuntimeException("Error loading docno mapping data file!", e);
            }
        }

        /**
         * Emits the page keyed by its docno, tagging it with the configured language
         * when one is set.
         *
         * @param longWritable input key (unused)
         * @param wikipediaPage the page to repack
         * @param outputCollector receives the (docno, page) pair
         * @param reporter used to count processed records
         * @throws IOException if the collector fails
         */
        public void map(LongWritable longWritable, WikipediaPage wikipediaPage, OutputCollector<IntWritable, WikipediaPage> outputCollector, Reporter reporter) throws IOException {
            reporter.incrCounter(Records.TOTAL, 1L);

            String docid = wikipediaPage.getDocid();
            if (docid == null) {
                return;
            }

            int mappedDocno = docnoMapping.getDocno(docid);
            if (mappedDocno < 0) {
                // The mapping has no docno for this docid; skip the page.
                return;
            }

            docno.set(mappedDocno);
            if (this.language != null) {
                wikipediaPage.setLanguage(this.language);
            }
            outputCollector.collect(docno, wikipediaPage);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:edu/umd/cloud9/collection/wikipedia/RepackWikipedia$Records.class */
    /** Hadoop counters reported by this job. */
    public enum Records {
        /** Number of input records seen by the mapper. */
        TOTAL;

        /**
         * Returns a freshly allocated array holding all constants of this enum,
         * in declaration order.
         *
         * @return a new array the caller may modify freely
         */
        public static Records[] valuesCustom() {
            return values().clone();
        }
    }

    /**
     * Parses the command line, configures a map-only job that repacks the input
     * into a SequenceFile keyed by docno, and runs it.
     *
     * <p>The input, output, mapping file and compression type options are required;
     * the language option is optional and, when given, must be two characters long.
     * The output path is deleted recursively before the job is submitted.
     *
     * @param strArr command-line arguments
     * @return 0 once the job has run, or -1 if the arguments are missing, invalid
     *         or cannot be parsed
     * @throws Exception if configuring or running the job fails
     */
    public int run(String[] strArr) throws Exception {
        Options options = buildOptions();

        CommandLine cmdline;
        try {
            cmdline = new GnuParser().parse(options, strArr);
        } catch (ParseException e) {
            System.err.println("Error parsing command line: " + e.getMessage());
            return -1;
        }

        boolean hasAllRequired = cmdline.hasOption(INPUT_OPTION)
                && cmdline.hasOption(OUTPUT_OPTION)
                && cmdline.hasOption(MAPPING_FILE_OPTION)
                && cmdline.hasOption(COMPRESSION_TYPE_OPTION);
        if (!hasAllRequired) {
            new HelpFormatter().printHelp(getClass().getName(), options);
            ToolRunner.printGenericCommandUsage(System.out);
            return -1;
        }

        String inputPath = cmdline.getOptionValue(INPUT_OPTION);
        String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
        String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION);
        String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION);

        boolean useBlock = "block".equals(compressionType);
        boolean useRecord = "record".equals(compressionType);
        boolean useNone = "none".equals(compressionType);
        if (!(useBlock || useRecord || useNone)) {
            System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
            return -1;
        }

        String language = null;
        if (cmdline.hasOption(LANGUAGE_OPTION)) {
            language = cmdline.getOptionValue(LANGUAGE_OPTION);
            if (language.length() != 2) {
                System.err.println("Error: \"" + language + "\" unknown language!");
                return -1;
            }
        }

        // Block size applied to block-compressed output.
        int blockSize = Vocab.MAX_VOCAB_INDEX;

        JobConf jobConf = new JobConf(getConf(), RepackWikipedia.class);
        jobConf.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]",
                INPUT_OPTION, inputPath,
                OUTPUT_OPTION, outputPath,
                COMPRESSION_TYPE_OPTION, compressionType,
                LANGUAGE_OPTION, language));
        // The mapper reads the mapping file location back from the job configuration.
        jobConf.set(DOCNO_MAPPING_FIELD, mappingFile);

        LOG.info("Tool name: " + getClass().getName());
        LOG.info(" - XML dump file: " + inputPath);
        LOG.info(" - output path: " + outputPath);
        LOG.info(" - docno mapping data file: " + mappingFile);
        LOG.info(" - compression type: " + compressionType);
        LOG.info(" - language: " + language);
        if (useBlock) {
            LOG.info(" - block size: " + blockSize);
        }

        // Map-only job: no reducers.
        jobConf.setNumMapTasks(10);
        jobConf.setNumReduceTasks(0);

        Path outputDir = new Path(outputPath);
        SequenceFileInputFormat.addInputPath(jobConf, new Path(inputPath));
        SequenceFileOutputFormat.setOutputPath(jobConf, outputDir);

        if (useNone) {
            SequenceFileOutputFormat.setCompressOutput(jobConf, false);
        } else {
            SequenceFileOutputFormat.setCompressOutput(jobConf, true);
            if (useRecord) {
                SequenceFileOutputFormat.setOutputCompressionType(jobConf, SequenceFile.CompressionType.RECORD);
            } else {
                SequenceFileOutputFormat.setOutputCompressionType(jobConf, SequenceFile.CompressionType.BLOCK);
                jobConf.setInt("io.seqfile.compress.blocksize", blockSize);
            }
        }

        if (language != null) {
            jobConf.set("wiki.language", language);
        }

        jobConf.setInputFormat(WikipediaPageInputFormat.class);
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);
        jobConf.setOutputKeyClass(IntWritable.class);
        jobConf.setOutputValueClass(WikipediaPage.class);
        jobConf.setMapperClass(MyMapper.class);

        // Remove any previous output so the job can write to the same location.
        FileSystem.get(jobConf).delete(outputDir, true);
        JobClient.runJob(jobConf);
        return 0;
    }

    /** Builds the command-line option set understood by {@link #run(String[])}. */
    @SuppressWarnings("static-access")
    private static Options buildOptions() {
        Options options = new Options();
        options.addOption(OptionBuilder.withArgName("path").hasArg()
                .withDescription("XML dump file").create(INPUT_OPTION));
        options.addOption(OptionBuilder.withArgName("path").hasArg()
                .withDescription("output location").create(OUTPUT_OPTION));
        options.addOption(OptionBuilder.withArgName("path").hasArg()
                .withDescription("mapping file").create(MAPPING_FILE_OPTION));
        options.addOption(OptionBuilder.withArgName("block|record|none").hasArg()
                .withDescription("compression type").create(COMPRESSION_TYPE_OPTION));
        options.addOption(OptionBuilder.withArgName("en|fr|de|zh").hasArg()
                .withDescription("two-letter language code").create(LANGUAGE_OPTION));
        return options;
    }

    /**
     * Command-line entry point: runs the tool through {@link ToolRunner} so that
     * generic Hadoop options are handled before {@link #run(String[])} is invoked.
     *
     * @param strArr command-line arguments
     * @throws Exception if the tool fails
     */
    public static void main(String[] strArr) throws Exception {
        int status = ToolRunner.run(new RepackWikipedia(), strArr);
        if (status != 0) {
            // Surface failures (e.g. bad arguments) to the calling shell instead of exiting with 0.
            System.exit(status);
        }
    }
}
