package edu.umd.cloud9.collection.wikipedia;

import edu.umd.cloud9.collection.Indexable;
import edu.umd.cloud9.webgraph.data.AnchorTextConstants;
import info.bliki.wiki.filter.PlainTextConverter;
import info.bliki.wiki.model.WikiModel;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.io.WritableUtils;

/* loaded from: input_file:edu/umd/cloud9/collection/wikipedia/WikipediaPage.class */
/**
 * A single {@code <page>} element of a Wikipedia XML dump.
 *
 * <p>The raw XML of the page is kept verbatim; the title, id, the bounds of the
 * {@code <text xml:space="preserve">} element and a few classification flags are
 * extracted from it by {@link #readPage(WikipediaPage, String)}. Serialization
 * ({@link #write(DataOutput)} / {@link #readFields(DataInput)}) stores only the raw
 * XML, encoded as UTF-8, and re-parses it on read.
 */
public class WikipediaPage extends Indexable {
    public static final String XML_START_TAG = "<page>";
    public static final String XML_END_TAG = "</page>";

    /** Opening tag of the element that holds the wiki markup. */
    private static final String TEXT_START_TAG = "<text xml:space=\"preserve\">";
    /** Closing tag of the element that holds the wiki markup. */
    private static final String TEXT_END_TAG = "</text>";
    private static final String TITLE_START_TAG = "<title>";
    private static final String TITLE_END_TAG = "</title>";
    private static final String ID_START_TAG = "<id>";
    private static final String ID_END_TAG = "</id>";

    /** Charset used for the serialized form, so that it does not depend on the platform default. */
    private static final String ENCODING = "UTF-8";

    /** Title prefixes that mark a page as something other than an article. */
    private static final String[] NON_ARTICLE_PREFIXES = {
        "File:", "Category:", "Special:", "Wikipedia:", "Template:", "Portal:"
    };

    private static final Pattern REF = Pattern.compile("<ref>.*?</ref>");
    private static final Pattern LANG_LINKS = Pattern.compile("\\[\\[[a-z\\-]+:[^\\]]+\\]\\]");
    private static final Pattern DOUBLE_CURLY = Pattern.compile("\\{\\{.*?\\}\\}");
    private static final Pattern URL = Pattern.compile("http://[^ <]+");
    private static final Pattern HTML_TAG = Pattern.compile("<[^!][^>]*>");
    private static final Pattern HTML_COMMENT = Pattern.compile("<!--.*?-->", Pattern.DOTALL);

    private String page;
    private String title;
    private String mId;
    /** Index of the opening text tag in {@link #page}, or -1 if the page has none. */
    private int textStart;
    /** Index of the closing text tag in {@link #page}, or -1 if none was found. */
    private int textEnd;
    private boolean isRedirect;
    private boolean isDisambig;
    private boolean isStub;
    private final WikiModel wikiModel = new WikiModel(AnchorTextConstants.EMPTY_STRING, AnchorTextConstants.EMPTY_STRING);
    private final PlainTextConverter textConverter = new PlainTextConverter();

    /**
     * Serializes this page as a variable-length int byte count followed by the raw
     * XML encoded as UTF-8.
     *
     * @param dataOutput sink to write to
     * @throws IOException if writing fails
     */
    public void write(DataOutput dataOutput) throws IOException {
        byte[] bytes = this.page.getBytes(ENCODING);
        WritableUtils.writeVInt(dataOutput, bytes.length);
        dataOutput.write(bytes, 0, bytes.length);
    }

    /**
     * Reads a page written by {@link #write(DataOutput)} and re-parses it into this
     * object via {@link #readPage(WikipediaPage, String)}.
     *
     * @param dataInput source to read from
     * @throws IOException if reading fails
     */
    public void readFields(DataInput dataInput) throws IOException {
        int length = WritableUtils.readVInt(dataInput);
        byte[] bytes = new byte[length];
        dataInput.readFully(bytes, 0, length);
        readPage(this, new String(bytes, ENCODING));
    }

    /**
     * Returns the contents of the first {@code <id>} element of the page.
     */
    @Override // edu.umd.cloud9.collection.Indexable
    public String getDocid() {
        return this.mId;
    }

    /**
     * Returns the title followed by a newline and a plain-text rendering of the wiki
     * markup.
     *
     * <p>Inter-language links are removed before rendering. After rendering, HTML
     * entities are unescaped twice and {@code <ref>} elements, HTML comments,
     * {@code http://} URLs, <code>{{...}}</code> spans and remaining HTML tags are
     * each replaced by a single space, in that order. For a page without a text
     * element (see {@link #isEmpty()}) nothing is rendered and the result is the
     * title followed by a newline.
     */
    @Override // edu.umd.cloud9.collection.Indexable
    public String getContent() {
        String markup = getWikiMarkup();
        String rendered = "";
        if (markup != null) {
            String withoutLangLinks = LANG_LINKS.matcher(markup).replaceAll(" ");
            this.wikiModel.setUp();
            try {
                rendered = this.wikiModel.render(this.textConverter, withoutLangLinks);
            } finally {
                // Release the model even when rendering fails.
                this.wikiModel.tearDown();
            }
        }

        String text = getTitle() + "\n" + rendered;
        text = StringEscapeUtils.unescapeHtml(StringEscapeUtils.unescapeHtml(text));
        text = REF.matcher(text).replaceAll(" ");
        text = HTML_COMMENT.matcher(text).replaceAll(" ");
        text = URL.matcher(text).replaceAll(" ");
        text = DOUBLE_CURLY.matcher(text).replaceAll(" ");
        return HTML_TAG.matcher(text).replaceAll(" ");
    }

    /**
     * Returns the title wrapped in an {@code <h1>} element, a newline, and the wiki
     * markup rendered by the wiki model, with <code>{{...}}</code> spans replaced
     * by a single space.
     */
    @Override // edu.umd.cloud9.collection.Indexable
    public String getDisplayContent() {
        String rendered;
        this.wikiModel.setUp();
        try {
            rendered = this.wikiModel.render(getWikiMarkup());
        } finally {
            // Release the model even when rendering fails.
            this.wikiModel.tearDown();
        }
        String html = "<h1>" + getTitle() + "</h1>\n" + rendered;
        return DOUBLE_CURLY.matcher(html).replaceAll(" ");
    }

    /**
     * Returns the MIME type of {@link #getDisplayContent()}, always {@code text/html}.
     */
    @Override // edu.umd.cloud9.collection.Indexable
    public String getDisplayContentType() {
        return "text/html";
    }

    /**
     * Returns the raw XML this page was parsed from.
     */
    public String getRawXML() {
        return this.page;
    }

    /**
     * Returns the text between the opening and closing text tags, or {@code null}
     * if the page has no opening text tag.
     */
    public String getWikiMarkup() {
        if (this.textStart == -1) {
            return null;
        }
        return this.page.substring(this.textStart + TEXT_START_TAG.length(), this.textEnd);
    }

    /**
     * Returns the page title with HTML entities unescaped.
     */
    public String getTitle() {
        return this.title;
    }

    /**
     * Returns whether the page contains <code>{{disambig</code> or
     * <code>{{Disambig</code>.
     */
    public boolean isDisambiguation() {
        return this.isDisambig;
    }

    /**
     * Returns whether the wiki markup starts with {@code #REDIRECT} or
     * {@code #redirect}.
     */
    public boolean isRedirect() {
        return this.isRedirect;
    }

    /**
     * Returns whether the page has no opening text tag.
     */
    public boolean isEmpty() {
        return this.textStart == -1;
    }

    /**
     * Returns whether the page contains <code>stub}}</code>.
     */
    public boolean isStub() {
        return this.isStub;
    }

    /**
     * Returns {@code false} if the title starts with one of the namespace prefixes
     * {@code File:}, {@code Category:}, {@code Special:}, {@code Wikipedia:},
     * {@code Template:} or {@code Portal:}; {@code true} otherwise.
     */
    public boolean isArticle() {
        String pageTitle = getTitle();
        for (String prefix : NON_ARTICLE_PREFIXES) {
            if (pageTitle.startsWith(prefix)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Looks up the first link of the form {@code [[lang:target]]} in the raw XML.
     *
     * @param lang language code that precedes the colon
     * @return the link target, or {@code null} if there is no such link, the link is
     *     not closed by {@code ]]}, or the target is empty or spans a line break
     */
    public String findInterlanguageLink(String lang) {
        String prefix = "[[" + lang + ":";
        int start = this.page.indexOf(prefix);
        if (start < 0) {
            return null;
        }
        int targetStart = start + prefix.length();
        int end = this.page.indexOf("]]", targetStart);
        if (end < 0) {
            return null;
        }
        String target = this.page.substring(targetStart, end);
        if (target.length() == 0 || target.indexOf("\n") != -1) {
            return null;
        }
        return target;
    }

    /**
     * Collects the destinations of all {@code [[...]]} links in the raw XML.
     *
     * <p>Empty links and links containing a colon are skipped. Anchor text (after
     * {@code |}) and section fragments (after {@code #}) are dropped, and links that
     * are empty afterwards are skipped as well. Each destination is trimmed before
     * it is added.
     *
     * @return the destinations in order of appearance; never {@code null}
     */
    public List<String> extractLinkDestinations() {
        List<String> links = new ArrayList<String>();
        int cursor = 0;
        while (true) {
            int open = this.page.indexOf("[[", cursor);
            if (open < 0) {
                break;
            }
            int close = this.page.indexOf("]]", open);
            if (close < 0) {
                break;
            }
            // Resume scanning just past the first ']' of the closing marker.
            cursor = close + 1;

            String target = this.page.substring(open + 2, close);
            if (target.length() == 0 || target.indexOf(":") != -1) {
                continue;
            }
            int pipe = target.indexOf("|");
            if (pipe != -1) {
                target = target.substring(0, pipe);
            }
            int hash = target.indexOf("#");
            if (hash != -1) {
                target = target.substring(0, hash);
            }
            if (target.length() == 0) {
                continue;
            }
            links.add(target.trim());
        }
        return links;
    }

    /**
     * Parses the raw XML of one page into {@code wikipediaPage}, replacing all of its
     * previous state.
     *
     * <p>The title and id are taken from the first {@code <title>} and {@code <id>}
     * elements. If the page has no opening text tag, the page is marked as empty and
     * is never considered a redirect.
     *
     * @param wikipediaPage page object to populate
     * @param str raw XML of the page; must contain {@code <title>} and {@code <id>}
     *     elements
     */
    public static void readPage(WikipediaPage wikipediaPage, String str) {
        wikipediaPage.page = str;

        int titleStart = str.indexOf(TITLE_START_TAG);
        int titleEnd = str.indexOf(TITLE_END_TAG, titleStart);
        wikipediaPage.title =
            StringEscapeUtils.unescapeHtml(str.substring(titleStart + TITLE_START_TAG.length(), titleEnd));

        wikipediaPage.mId =
            str.substring(str.indexOf(ID_START_TAG) + ID_START_TAG.length(), str.indexOf(ID_END_TAG));

        int start = str.indexOf(TEXT_START_TAG);
        wikipediaPage.textStart = start;
        wikipediaPage.textEnd = str.indexOf(TEXT_END_TAG, start);

        wikipediaPage.isDisambig =
            str.indexOf("{{disambig", start) != -1 || str.indexOf("{{Disambig", start) != -1;

        if (start == -1) {
            wikipediaPage.isRedirect = false;
        } else {
            // startsWith with an offset returns false instead of throwing when fewer
            // characters than the marker remain after the opening tag.
            int bodyStart = start + TEXT_START_TAG.length();
            wikipediaPage.isRedirect =
                str.startsWith("#REDIRECT", bodyStart) || str.startsWith("#redirect", bodyStart);
        }

        wikipediaPage.isStub = str.indexOf("stub}}", start) != -1;
    }
}
