package edu.umd.cloud9.collection;

import edu.umd.cloud9.collection.line.TextDocument;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.DriverUtil;
import edu.umd.cloud9.webgraph.TrecExtractLinks;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/* loaded from: input_file:edu/umd/cloud9/collection/ExtractHTMLFieldCollection.class */
public class ExtractHTMLFieldCollection extends PowerTool {
    /** Class-wide log4j logger; {@link #runTool()} uses it to report the job settings. */
    private static final Logger LOG = Logger.getLogger(ExtractHTMLFieldCollection.class);
    /**
     * Configuration keys handed back by {@link #getRequiredParameters()}: input path, input
     * format class name, output path and target tag. {@link #main(String[])} fills them, in
     * this order, from its four command-line arguments.
     */
    public static final String[] RequiredParameters = {"Cloud9.InputPath", "Cloud9.InputFormat", "Cloud9.OutputPath", "Cloud9.TargetTag"};

    /* loaded from: input_file:edu/umd/cloud9/collection/ExtractHTMLFieldCollection$MyMapper.class */
    public static class MyMapper extends Mapper<LongWritable, Indexable, LongWritable, TextDocument> {
        private static String tag;
        private static NodeFilter filter;
        private static final Parser parser = new Parser();
        private static final LongWritable myKey = new LongWritable();
        private static final TextDocument myValue = new TextDocument();
        private static final StringBuffer strBuf = new StringBuffer();

        /* loaded from: input_file:edu/umd/cloud9/collection/ExtractHTMLFieldCollection$MyMapper$HeadingTagFilter.class */
        public static class HeadingTagFilter implements NodeFilter {
            private static final long serialVersionUID = 3848416345122090905L;
            private final Pattern pattern = Pattern.compile("h[123456]", 2);

            public boolean accept(Node node) {
                return this.pattern.matcher(node.getText()).matches();
            }
        }

        public void setup(Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context context) throws IOException {
            tag = context.getConfiguration().get("Cloud9.TargetTag");
            if (tag.equalsIgnoreCase("heading")) {
                filter = new HeadingTagFilter();
            } else {
                filter = new TagNameFilter(tag);
            }
        }

        public void map(LongWritable longWritable, Indexable indexable, Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context context) throws IOException, InterruptedException {
            context.getCounter(TrecExtractLinks.Map.LinkCounter.INPUT_DOCS).increment(1L);
            if (indexable.getDocid() == null || indexable.getContent() == null) {
                return;
            }
            myKey.set(longWritable.get());
            try {
                parser.setInputHTML(indexable.getContent());
                NodeList parse = parser.parse(filter);
                strBuf.setLength(0);
                strBuf.append("<DOC>\n<DOCNO>");
                strBuf.append(indexable.getDocid());
                strBuf.append("</DOCNO>\n");
                for (int i = 0; i < parse.size(); i++) {
                    strBuf.append(parse.elementAt(i).toHtml()).append("\n");
                }
                strBuf.append("</DOC>\n");
                myValue.setDocid(indexable.getDocid());
                myValue.setContent(strBuf.toString());
                context.write(myKey, myValue);
                context.getCounter(TrecExtractLinks.Map.LinkCounter.OUTPUT_DOCS).increment(1L);
            } catch (StackOverflowError e) {
                context.getCounter(TrecExtractLinks.Map.LinkCounter.PARSER_FAILED).increment(1L);
                myValue.setDocid(indexable.getDocid());
                myValue.setContent("<DOC>\n<DOCNO>" + indexable.getDocid() + "</DOCNO>\n<DOC>");
                context.write(myKey, myValue);
            } catch (ParserException e2) {
                context.getCounter(TrecExtractLinks.Map.LinkCounter.PARSER_FAILED).increment(1L);
                myValue.setDocid(indexable.getDocid());
                myValue.setContent("<DOC>\n<DOCNO>" + indexable.getDocid() + "</DOCNO>\n<DOC>");
                context.write(myKey, myValue);
            }
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, Mapper.Context context) throws IOException, InterruptedException {
            map((LongWritable) obj, (Indexable) obj2, (Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context) context);
        }
    }

    /**
     * Returns the configuration keys this tool declares as required.
     *
     * <p>The shared {@link #RequiredParameters} array itself is returned, not a copy.
     * NOTE(review): presumably the base class checks these keys before {@link #runTool()} is
     * invoked; confirm against {@code PowerTool}.
     *
     * @return the {@link #RequiredParameters} array
     */
    @Override // edu.umd.cloud9.util.PowerTool
    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    /**
     * Creates the tool on top of the given configuration, which is passed straight to the
     * {@code PowerTool} constructor.
     *
     * @param configuration configuration carrying the {@code Cloud9.*} settings read by
     *                      {@link #runTool()}
     */
    public ExtractHTMLFieldCollection(Configuration configuration) {
        super(configuration);
    }

    /**
     * Configures and runs the field-extraction job, blocking until it finishes.
     *
     * <p>Reads {@code Cloud9.InputPath}, {@code Cloud9.InputFormat}, {@code Cloud9.OutputPath}
     * and {@code Cloud9.TargetTag} from the configuration. Every file found under the input
     * path is added as job input, and the output is written as block-compressed sequence files
     * of {@code LongWritable} / {@code TextDocument} pairs.
     *
     * @return {@code 0} if the job succeeded, {@code -1} if it failed
     * @throws ClassNotFoundException if the configured input format class cannot be loaded
     * @throws ClassCastException     if the configured class is not an {@code InputFormat}
     * @throws Exception              for any other failure while setting up or running the job
     */
    @Override // edu.umd.cloud9.util.PowerTool
    public int runTool() throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf);
        String inputPath = conf.get("Cloud9.InputPath");
        String inputFormat = conf.get("Cloud9.InputFormat");
        String outputPath = conf.get("Cloud9.OutputPath");
        String targetTag = conf.get("Cloud9.TargetTag");

        job.setJobName("ExtractFieldCollection");
        job.setJarByClass(ExtractHTMLFieldCollection.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(Reducer.class);
        job.setNumReduceTasks(DriverUtil.DEFAULT_REDUCERS);

        // asSubclass both satisfies the generic bound of setInputFormatClass and rejects a
        // class name that does not refer to an InputFormat up front.
        job.setInputFormatClass(Class.forName(inputFormat).asSubclass(InputFormat.class));
        recursivelyAddInputPaths(job, inputPath);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(TextDocument.class);

        LOG.info("ExtractFieldCollection - " + targetTag);
        LOG.info(" - Input path: " + inputPath);
        LOG.info(" - Input format: " + inputFormat);
        LOG.info(" - Output path: " + outputPath);
        LOG.info(" - Target tag: " + targetTag);

        // Propagate job failure to the caller instead of always reporting success.
        return job.waitForCompletion(true) ? 0 : -1;
    }

    /**
     * Adds every file below {@code str} to the job's input paths, descending into
     * sub-directories. Entries whose name starts with {@code _} are skipped, together with
     * everything beneath them.
     *
     * @param job job whose input paths are extended
     * @param str path (as a URI string) of the directory to scan
     * @throws IOException      if the file system cannot be accessed or the path cannot be listed
     * @throws RuntimeException if {@code str} is not a valid URI; the
     *                          {@link URISyntaxException} is attached as the cause
     */
    public static void recursivelyAddInputPaths(Job job, String str) throws IOException {
        FileSystem fs;
        try {
            fs = FileSystem.get(new URI(str), job.getConfiguration());
        } catch (URISyntaxException e) {
            // Keep the original exception as the cause so the offending character is reported.
            throw new RuntimeException("Error recursively adding path -- " + str, e);
        }

        FileStatus[] entries = fs.listStatus(new Path(str));
        if (entries == null) {
            // Guard against file system implementations that signal a missing path with null.
            throw new IOException("Input path does not exist: " + str);
        }

        for (FileStatus entry : entries) {
            if (entry.getPath().getName().startsWith("_")) {
                continue;
            }
            if (entry.isDir()) {
                recursivelyAddInputPaths(job, entry.getPath().toString());
            } else {
                FileInputFormat.addInputPath(job, entry.getPath());
            }
        }
    }

    /**
     * Command-line entry point. Expects exactly four arguments (input path, input format class,
     * output path, target tag), stores them under the matching {@code Cloud9.*} configuration
     * keys and runs the tool through {@link ToolRunner}, exiting with its return code.
     *
     * @param strArr command-line arguments
     * @throws Exception if running the tool fails
     */
    public static void main(String[] strArr) throws Exception {
        Configuration configuration = new Configuration();
        if (strArr.length != 4) {
            System.err.println("Usage: ExtractFieldCollection [input-path] [input-format] [output-path] [target-tag]");
            System.exit(-1);
        }

        // Positional argument i is stored under configuration key i.
        String[] keys = {"Cloud9.InputPath", "Cloud9.InputFormat", "Cloud9.OutputPath", "Cloud9.TargetTag"};
        for (int i = 0; i < keys.length; i++) {
            configuration.set(keys[i], strArr[i]);
        }

        ExtractHTMLFieldCollection tool = new ExtractHTMLFieldCollection(configuration);
        int exitCode = ToolRunner.run(configuration, tool, strArr);
        System.exit(exitCode);
    }
}
