package edu.umd.cloud9.webgraph;

import edu.umd.cloud9.collection.DocnoMapping;
import edu.umd.cloud9.collection.WebDocument;
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.webgraph.data.AnchorTextConstants;
import edu.umd.cloud9.webgraph.normalizer.AnchorTextNormalizer;
import java.io.IOException;
import java.io.UTFDataFormatException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Iterator;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Logger;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/* loaded from: input_file:edu/umd/cloud9/webgraph/TrecExtractLinks.class */
public class TrecExtractLinks extends PowerTool {
    private static final Logger LOG = Logger.getLogger(TrecExtractLinks.class);
    public static final String[] RequiredParameters = {"Cloud9.InputPath", "Cloud9.OutputPath", "Cloud9.Mappers", "Cloud9.Reducers", "Cloud9.IncludeInternalLinks", "Cloud9.AnchorTextNormalizer", "Cloud9.DocnoMappingClass", "Cloud9.DocnoMappingFile"};
    CollectionConfigurationManager configer;

    /* loaded from: input_file:edu/umd/cloud9/webgraph/TrecExtractLinks$Map.class */
    public static class Map extends Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>> {
        private static String base;
        private static String baseHost;
        private static int docno;
        private static final Text keyWord = new Text();
        private static final ArrayListWritable<AnchorText> arrayList = new ArrayListWritable<>();
        private static DocnoMapping docnoMapping = null;
        private static final Parser parser = new Parser();
        private static final NodeFilter filter = new NodeClassFilter(LinkTag.class);
        private static NodeList list;
        private static boolean includeInternalLinks;
        private static AnchorTextNormalizer normalizer;

        /* loaded from: input_file:edu/umd/cloud9/webgraph/TrecExtractLinks$Map$LinkCounter.class */
        /**
         * Job counters maintained by the mapper.
         */
        public enum LinkCounter {
            /** Bumped once for every call to {@code map}. */
            INPUT_DOCS,
            /** Bumped after a document's own docno record has been written. */
            OUTPUT_DOCS,
            /** Bumped when the docno lookup for a document fails. */
            INVALID_DOCNO,
            /** Bumped when a document's URL cannot be normalized or has no host. */
            INVALID_URL,
            /** Bumped when a link record is rejected with a {@code UTFDataFormatException}. */
            TEXT_TOO_LONG,
            /** Bumped when the HTML parser gives up on a document. */
            PARSER_FAILED;

            /**
             * Decompiler-era twin of {@link #values()}.
             *
             * @return a freshly allocated array holding every constant in declaration order
             */
            public static LinkCounter[] valuesCustom() {
                return values().clone();
            }
        }

        /**
         * Initializes the per-task state from the job configuration: the docno mapping (and its
         * data file, when {@code Cloud9.DocnoMappingFile} is set), the internal-link switch and
         * the anchor text normalizer.
         *
         * <p>Each initialization step is guarded separately so that the exception message names
         * the step that actually failed, and the underlying cause is always attached.
         *
         * @param context task context supplying the job configuration
         * @throws IOException declared for interface compatibility
         * @throws RuntimeException if any of the configured components cannot be initialized
         */
        public void setup(Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>>.Context context) throws IOException {
            Configuration configuration = context.getConfiguration();

            try {
                docnoMapping = (DocnoMapping) Class.forName(configuration.get("Cloud9.DocnoMappingClass")).newInstance();
            } catch (Exception e) {
                throw new RuntimeException("Error initializing DocnoMapping class!", e);
            }

            String mappingFile = configuration.get("Cloud9.DocnoMappingFile", (String) null);
            if (mappingFile != null) {
                Path[] localCacheFiles;
                try {
                    localCacheFiles = DistributedCache.getLocalCacheFiles(configuration);
                } catch (IOException e) {
                    throw new RuntimeException("Unable to find DocnoMappingFile!", e);
                }

                // Prefer the locally cached copy; fall back to the configured path when the
                // cache has nothing to offer (null or empty).
                Path mappingPath = (localCacheFiles != null && localCacheFiles.length > 0)
                        ? localCacheFiles[0]
                        : new Path(mappingFile);
                try {
                    docnoMapping.loadMapping(mappingPath, FileSystem.getLocal(configuration));
                } catch (Exception e) {
                    throw new RuntimeException("Error initializing DocnoMapping!", e);
                }
            }

            includeInternalLinks = configuration.getBoolean("Cloud9.IncludeInternalLinks", false);

            try {
                normalizer = (AnchorTextNormalizer) Class.forName(configuration.get("Cloud9.AnchorTextNormalizer")).newInstance();
            } catch (Exception e) {
                throw new RuntimeException("Error initializing AnchorTextNormalizer", e);
            }
        }

        /* JADX WARN: Multi-variable type inference failed */
        /**
         * Emits one record keyed by the document's normalized URL carrying its docno, followed by
         * one record per outgoing hyperlink keyed by the normalized link target.
         *
         * <p>Malformed input is handled on a best-effort basis and only reflected in the
         * {@link LinkCounter} counters. Failures of {@code context.write} other than the
         * anchor-text-too-long case ({@code IOException}, {@code InterruptedException}) are
         * propagated to the framework instead of being counted as invalid URLs or dropped.
         *
         * @param longWritable input key (unused)
         * @param webDocument the document to extract links from
         * @param context task context used for output and counters
         * @throws IOException if writing an output record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        public void map(LongWritable longWritable, WebDocument webDocument, Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>>.Context context) throws IOException, InterruptedException {
            context.getCounter(LinkCounter.INPUT_DOCS).increment(1L);

            try {
                docno = docnoMapping.getDocno(webDocument.getDocid());
            } catch (NullPointerException e) {
                // A NullPointerException from the lookup is counted as an invalid docno.
                context.getCounter(LinkCounter.INVALID_DOCNO).increment(1L);
                return;
            }

            try {
                processDocument(webDocument, context);
            } catch (ParserException | StackOverflowError e) {
                // StackOverflowError is caught alongside parser errors, presumably for
                // pathologically nested markup.
                context.getCounter(LinkCounter.PARSER_FAILED).increment(1L);
            } catch (URISyntaxException | RuntimeException e) {
                // Any other unchecked failure while handling the document keeps being counted
                // as an invalid URL; checked I/O failures are deliberately not caught here.
                context.getCounter(LinkCounter.INVALID_URL).increment(1L);
            }
        }

        /**
         * Writes the docno record for the document and then one record per extracted link.
         */
        private void processDocument(WebDocument webDocument, Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>>.Context context) throws IOException, InterruptedException, ParserException, URISyntaxException {
            // Only the first line of the URL field is used.
            String str = webDocument.getURL().split("\n")[0];
            TrecExtractLinks.LOG.info("URI: " + str);
            base = normalizeURL(str);
            if (base == null) {
                context.getCounter(LinkCounter.INVALID_URL).increment(1L);
                return;
            }

            arrayList.clear();
            arrayList.add(new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, "", docno));
            keyWord.set(base);
            context.write(keyWord, arrayList);
            context.getCounter(LinkCounter.OUTPUT_DOCS).increment(1L);

            baseHost = new URI(base).getHost();
            if (baseHost == null) {
                context.getCounter(LinkCounter.INVALID_URL).increment(1L);
                return;
            }

            // Parse once, append a base-href tag carrying the document URL, and re-parse the
            // resulting HTML before extracting the link tags.
            parser.setInputHTML(webDocument.getContent());
            NodeList parse = parser.parse((NodeFilter) null);
            BaseHrefTag baseHrefTag = new BaseHrefTag();
            baseHrefTag.setBaseUrl(base);
            parse.add(baseHrefTag);
            parser.setInputHTML(parse.toHtml());
            list = parser.extractAllNodesThatMatch(filter);

            for (int i = 0; i < list.size(); i++) {
                // The filter only admits LinkTag nodes, so the cast is safe.
                LinkTag linkTag = (LinkTag) list.elementAt(i);
                String linkText = linkTag.getLinkText();
                String target = normalizeURL(linkTag.extractLink());
                // Skip unusable targets and self-links.
                if (target != null && !target.equals(base)) {
                    emitLink(target, linkText, context);
                }
            }
        }

        /**
         * Writes the record for a single link; a link whose target or anchor text cannot be
         * processed is skipped silently.
         */
        private void emitLink(String target, String linkText, Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>>.Context context) throws IOException, InterruptedException {
            try {
                String host = new URI(target).getHost();
                if (host == null) {
                    return;
                }
                String process = normalizer.process(linkText == null ? "" : linkText);

                arrayList.clear();
                if (!baseHost.equals(host)) {
                    arrayList.add(new AnchorText(AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, process, docno));
                } else if (includeInternalLinks) {
                    arrayList.add(new AnchorText(AnchorTextConstants.Type.INTERNAL_IN_LINK.val, process, docno));
                }
                // NOTE: when internal links are excluded, an internal link still produces a
                // record with an empty list; this matches the previous behavior.

                keyWord.set(target);
                try {
                    context.write(keyWord, arrayList);
                } catch (UTFDataFormatException e) {
                    // The record could not be serialized: retry with the same link type but
                    // an empty anchor text.
                    context.getCounter(LinkCounter.TEXT_TOO_LONG).increment(1L);
                    byte type = arrayList.get(0).getType();
                    arrayList.clear();
                    arrayList.add(new AnchorText(type, "", docno));
                    context.write(keyWord, arrayList);
                }
            } catch (URISyntaxException | RuntimeException ignored) {
                // Best effort: a single bad link must not abort the rest of the document.
            }
        }

        /**
         * Reduces a URL to the canonical form {@code scheme://host/path}: the URI is normalized,
         * scheme and host are lower-cased, trailing slashes are stripped from the path, and
         * everything else (port, user info, query, fragment) is dropped.
         *
         * <p>Lower-casing uses {@link Locale#ROOT} so the result does not depend on the JVM's
         * default locale (e.g. Turkish dotless i).
         *
         * @param str the URL to normalize; may be {@code null}
         * @return the normalized URL, or {@code null} if {@code str} is {@code null}, is not a
         *         valid URI, or lacks a scheme or host
         */
        private static String normalizeURL(String str) {
            if (str == null) {
                return null;
            }
            try {
                URI normalized = new URI(str).normalize();
                String scheme = normalized.getScheme();
                String host = normalized.getHost();
                if (scheme == null || host == null) {
                    return null;
                }

                String path = normalized.getPath();
                if (path != null) {
                    int end = path.length();
                    while (end > 0 && path.charAt(end - 1) == '/') {
                        end--;
                    }
                    path = path.substring(0, end);
                }

                return new URI(scheme.toLowerCase(Locale.ROOT), host.toLowerCase(Locale.ROOT), path, null).toString();
            } catch (URISyntaxException e) {
                return null;
            }
        }

        /**
         * Bridge overload (marked bridge/synthetic by the decompiler): casts the erased
         * arguments to their concrete types and delegates to the typed {@code map} method.
         */
        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, Mapper.Context context) throws IOException, InterruptedException {
            map((LongWritable) obj, (WebDocument) obj2, (Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>>.Context) context);
        }
    }

    /* loaded from: input_file:edu/umd/cloud9/webgraph/TrecExtractLinks$Reduce.class */
    public static class Reduce extends Reducer<Text, ArrayListWritable<AnchorText>, Text, ArrayListWritable<AnchorText>> {
        private static final ArrayListWritable<AnchorText> arrayList = new ArrayListWritable<>();
        private static boolean pushed;

        /* JADX WARN: Multi-variable type inference failed */
        /**
         * Merges all anchor-text lists received for one URL into a single list and writes it.
         *
         * <p>An incoming entry that matches an already collected one according to
         * {@code equalsIgnoreSources} has its documents folded into that entry; otherwise a
         * clone of the incoming entry is appended. The same logic is registered as the job's
         * combiner.
         *
         * @param text the URL key
         * @param iterable the anchor-text lists emitted for that key
         * @param context task context used for output
         * @throws IOException if writing the merged record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        public void reduce(Text text, Iterable<ArrayListWritable<AnchorText>> iterable, Reducer<Text, ArrayListWritable<AnchorText>, Text, ArrayListWritable<AnchorText>>.Context context) throws IOException, InterruptedException {
            arrayList.clear();
            for (ArrayListWritable<AnchorText> partial : iterable) {
                for (AnchorText anchorText : partial) {
                    pushed = false;
                    for (AnchorText existing : arrayList) {
                        if (existing.equalsIgnoreSources(anchorText)) {
                            existing.addDocumentsFrom(anchorText);
                            pushed = true;
                            break;
                        }
                    }
                    if (!pushed) {
                        // A clone is stored rather than the iterated instance itself.
                        arrayList.add(anchorText.m344clone());
                    }
                }
            }
            context.write(text, arrayList);
        }

        /**
         * Bridge overload (marked bridge/synthetic by the decompiler): casts the erased
         * arguments to their concrete types and delegates to the typed {@code reduce} method.
         */
        public /* bridge */ /* synthetic */ void reduce(Object obj, Iterable iterable, Reducer.Context context) throws IOException, InterruptedException {
            reduce((Text) obj, (Iterable<ArrayListWritable<AnchorText>>) iterable, (Reducer<Text, ArrayListWritable<AnchorText>, Text, ArrayListWritable<AnchorText>>.Context) context);
        }
    }

    /**
     * Lists the configuration keys this tool expects to be set.
     *
     * @return the shared {@link #RequiredParameters} array (not a copy)
     */
    @Override // edu.umd.cloud9.util.PowerTool
    public String[] getRequiredParameters() {
        return TrecExtractLinks.RequiredParameters;
    }

    /**
     * Creates the tool without a collection configuration manager; {@code configer} stays
     * {@code null}.
     *
     * @param configuration the Hadoop configuration handed to the superclass
     */
    public TrecExtractLinks(Configuration configuration) {
        this(configuration, null);
    }

    /**
     * Creates the tool with the manager that {@code runTool} uses to apply collection-specific
     * job settings.
     *
     * @param configuration the Hadoop configuration handed to the superclass
     * @param collectionConfigurationManager stored in {@code configer}
     */
    public TrecExtractLinks(Configuration configuration, CollectionConfigurationManager collectionConfigurationManager) {
        super(configuration);
        configer = collectionConfigurationManager;
    }

    /**
     * Configures and runs the link extraction job, blocking until it finishes.
     *
     * <p>The docno mapping file is checked for existence and shipped through the distributed
     * cache; {@link Reduce} serves as both combiner and reducer; output is written as
     * block-compressed sequence files.
     *
     * @return {@code 0} if the job succeeded, {@code -1} if it failed
     * @throws IllegalStateException if the tool was created without a
     *         {@code CollectionConfigurationManager}
     * @throws RuntimeException if the docno mapping file does not exist
     * @throws Exception on any failure while setting up or running the job
     */
    @Override // edu.umd.cloud9.util.PowerTool
    public int runTool() throws Exception {
        // Fail early with a clear message instead of a NullPointerException halfway through
        // job setup.
        if (this.configer == null) {
            throw new IllegalStateException("No CollectionConfigurationManager set; construct TrecExtractLinks with the two-argument constructor");
        }

        Configuration conf = getConf();
        conf.set("mapred.child.java.opts", "-Xmx3072m");
        conf.setInt("mapred.task.timeout", 60000000);
        Job job = new Job(conf);

        int numReducers = conf.getInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
        String inputPath = conf.get("Cloud9.InputPath");
        String outputPath = conf.get("Cloud9.OutputPath");
        String mappingFile = conf.get("Cloud9.DocnoMappingFile");

        if (!FileSystem.get(conf).exists(new Path(mappingFile))) {
            throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");
        }
        DistributedCache.addCacheFile(new Path(mappingFile).toUri(), job.getConfiguration());

        job.setJobName("ExtractLinks");
        job.setNumReduceTasks(numReducers);
        job.setJarByClass(TrecExtractLinks.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ArrayListWritable.class);
        this.configer.applyJobConfig(job);

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        recursivelyAddInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        LOG.info("ExtractLinks");
        LOG.info(" - input path: " + inputPath);
        LOG.info(" - output path: " + outputPath);
        LOG.info(" - mapping file: " + mappingFile);
        LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

        // Report job failure to the caller instead of unconditionally returning success.
        return job.waitForCompletion(true) ? 0 : -1;
    }

    /**
     * Adds every file found under {@code str} as a job input, descending into
     * sub-directories. Entries whose name starts with {@code "_"} are skipped.
     *
     * @param job the job to add input paths to
     * @param str the root path to scan
     * @throws IOException if the file system cannot be listed or the path does not exist
     * @throws RuntimeException if {@code str} is not a valid URI
     */
    public static void recursivelyAddInputPaths(Job job, String str) throws IOException {
        FileSystem fs;
        try {
            fs = FileSystem.get(new URI(str), job.getConfiguration());
        } catch (URISyntaxException e) {
            throw new RuntimeException("Error recursively adding path -- " + str, e);
        }

        FileStatus[] entries = fs.listStatus(new Path(str));
        if (entries == null) {
            // Some Hadoop releases report a missing path by returning null instead of throwing.
            throw new IOException("Input path does not exist: " + str);
        }

        for (FileStatus fileStatus : entries) {
            if (fileStatus.getPath().getName().startsWith("_")) {
                continue;
            }
            if (fileStatus.isDir()) {
                recursivelyAddInputPaths(job, fileStatus.getPath().toString());
            } else {
                FileInputFormat.addInputPath(job, fileStatus.getPath());
            }
        }
    }
}
