package it.tidalwave.bluebill.factsheet.wikipedia.photos;

import it.tidalwave.bluebill.taxonomy.Taxon;
import it.tidalwave.bluebill.taxonomy.birds.col.CatalogueOfLifeImporter;
import it.tidalwave.openrdf.elmo.impl.ElmoEntityFactory;
import it.tidalwave.semantic.EntityFactory;
import it.tidalwave.semantic.Wrapper;
import it.tidalwave.util.Initializer;
import it.tidalwave.xml.XPathProvider;
import it.tidalwave.xml.XmlParser;
import it.tidalwave.xml.dom4j.Dom4jXmlParser;
import it.tidalwave.xml.jaxen.JaxenXPathProvider;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.Proxy;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.annotation.Nonnull;
import javax.xml.namespace.QName;
import org.apache.commons.io.IOUtils;
import org.jaxen.XPath;
import org.openrdf.concepts.dc.DcResource;
import org.openrdf.concepts.foaf.FoafResource;
import org.openrdf.concepts.foaf.Image;
import org.openrdf.concepts.skos.core.Concept;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/* loaded from: input_file:it/tidalwave/bluebill/factsheet/wikipedia/photos/WikipediaPhotosImporter.class */
/**
 * Importer that, for every taxon of rank {@code SPECIES}, scrapes the matching Wikimedia Commons gallery
 * page, follows the links to the individual JPEG image pages and creates a {@link Description} for each
 * image (identifier, license, publisher and, when it can be found, the author). Each image is then marked
 * as depicting the taxon.
 *
 * <p>Authors are cached by identifier for the lifetime of the importer, so that the same {@link Person}
 * entity is reused across images. The cache is a plain {@link HashMap}; nothing in this class synchronizes
 * access to it.</p>
 */
public class WikipediaPhotosImporter extends CatalogueOfLifeImporter {
    public static final String DCMI_TYPE_STILL_IMAGE = "http://purl.org/dc/dcmitype/StillImage";
    public static final String LICENSE_GNU_FDL = "http://www.gnu.org/copyleft/fdl.html";
    private static final String URI_COMMONS_WIKIMEDIA = "http://commons.wikimedia.org";
    private static final String URN_BLUEBILL_WIKIMEDIA_CONTRIBUTOR = "urn:bluebill:wikimedia/contributor/";

    @Nonnull
    private final transient XmlParser xmlParser;

    @Nonnull
    private final transient XPathProvider xPathProvider;
    private transient Proxy proxy;

    // Gallery page: one <li class="gallerybox"> per picture, evaluated relative to each box.
    private static final String XPATH_LI_GALLERY_BOX = "//html:li[@class='gallerybox']";
    private static final String XPATH_A_HREF = ".//html:a[@class='image']/@href";
    private static final String XPATH_A_IMG_SRC = ".//html:a[@class='image']/html:img/@src";

    // Image page: full-size image URL and license link.
    private static final String XPATH_IMAGE_URL = "//html:div[@id='file']/html:a[1]/@href";
    private static final String XPATH_LICENSE_URL = "/html:html/html:head/html:link[@rel='copyright']/@href";

    // Image page: the various page layouts in which an author may be reported.
    private static final String XPATH_AUTHOR_P = "//html:tr[html:th='Author']/html:td/html:p/text()";
    private static final String XPATH_AUTHOR_A_NAME = "//html:tr[html:th='Author']/html:td/html:p/html:a/text()";
    private static final String XPATH_AUTHOR_A_URI = "//html:tr[html:th='Author']/html:td/html:p/html:a/@href";
    private static final String XPATH_AUTHOR_A_NAME3 = "//html:p[contains(html:b, 'Photographer:')]/html:a/text()";
    private static final String XPATH_AUTHOR_A_URI3 = "//html:p[contains(html:b, 'Photographer:')]/html:a/@href";
    private static final String XPATH_AUTHOR_A_NAME5 = "//html:p[contains(., 'Author:')]/html:a/text()";
    private static final String XPATH_AUTHOR_A_URI5 = "//html:p[contains(., 'Author:')]/html:a/@href";
    private static final String XPATH_AUTHOR_A_NAME6 = "//html:li[contains(html:b, 'Fotograf')]/html:a/text()";
    private static final String XPATH_AUTHOR_A_URI6 = "//html:li[contains(html:b, 'Fotograf')]/html:a/@href";
    private static final String XPATH_AUTHOR_A_NAME7 = "//html:i[contains(., 'grants anyone')]/html:a/text()";
    private static final String XPATH_AUTHOR_A_URI7 = "//html:i[contains(., 'grants anyone')]/html:a/@href";
    // Despite their names, the next two expressions select plain text ("Author: ..." / "Creator: ..."), not URIs.
    private static final String XPATH_AUTHOR_A_URI2 = "//html:p[contains(., 'Author:')]/text()";
    private static final String XPATH_AUTHOR_A_URI4 = "//html:li[contains(., 'Creator:')]/text()";

    /**
     * {name XPath, href XPath} pairs tried, in this order, after the primary "Author" table row did not
     * yield a linked author.
     */
    private static final String[][] XPATH_FALLBACK_LINKED_AUTHORS = {
        {XPATH_AUTHOR_A_NAME3, XPATH_AUTHOR_A_URI3},
        {XPATH_AUTHOR_A_NAME5, XPATH_AUTHOR_A_URI5},
        {XPATH_AUTHOR_A_NAME6, XPATH_AUTHOR_A_URI6},
        {XPATH_AUTHOR_A_NAME7, XPATH_AUTHOR_A_URI7},
    };

    protected final transient EntityFactory<Description, ElmoDescription> stillImageFactory;
    protected final transient EntityFactory<Organization, ElmoOrganization> organizationFactory;
    protected final transient EntityFactory<Person, ElmoPerson> authorFactory;
    protected transient Organization wikiMediaCommons;

    /** Authors already created, keyed by the identifier passed to {@link #findOrCreateAuthor}. */
    private final transient Map<String, Person> map;
    protected final transient EntityResolver entityResolver;
    private static final Logger log = LoggerFactory.getLogger(WikipediaPhotosImporter.class);

    /**
     * Creates an importer backed by a {@link Dom4jXmlParser} and a {@link JaxenXPathProvider}.
     *
     * @throws IOException propagated from the delegated constructor
     */
    public WikipediaPhotosImporter() throws IOException {
        this(new Dom4jXmlParser(), new JaxenXPathProvider());
    }

    /**
     * Creates an importer with explicit XML infrastructure.
     *
     * @param xmlParser      the parser used for the scraped pages; this importer's entity resolver is
     *                       installed on it
     * @param xPathProvider  the factory of the XPath expressions evaluated on the parsed pages
     * @throws IOException declared for compatibility with the superclass construction
     */
    WikipediaPhotosImporter(@Nonnull XmlParser xmlParser, @Nonnull XPathProvider xPathProvider) throws IOException {
        this.proxy = Proxy.NO_PROXY;
        this.stillImageFactory = new ElmoEntityFactory(Description.class, ElmoDescription.class, new Class[]{Image.class, DcResource.class});
        this.organizationFactory = new ElmoEntityFactory(Organization.class, ElmoOrganization.class, new Class[]{Concept.class, DcResource.class, org.openrdf.concepts.foaf.Organization.class, FoafResource.class});
        this.authorFactory = new ElmoEntityFactory(Person.class, ElmoPerson.class, new Class[]{Concept.class, DcResource.class, org.openrdf.concepts.foaf.Person.class});
        this.map = new HashMap<String, Person>();
        this.entityResolver = new EntityResolver() {
            /**
             * Serves the XHTML 1.0 Transitional DTD from a resource located next to this class.
             *
             * @return the bundled DTD, or {@code null} for any other system id so that the parser
             *         falls back to its default resolution
             */
            @Override
            public InputSource resolveEntity(String publicId, @Nonnull String systemId) throws SAXException, IOException {
                if ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd".equals(systemId)) {
                    return new InputSource(getClass().getResourceAsStream("xhtml1-transitional.dtd"));
                }
                return null;
            }
        };
        this.xmlParser = xmlParser;
        this.xPathProvider = xPathProvider;
        this.xmlParser.setEntityResolver(this.entityResolver);
    }

    /**
     * Runs the superclass initialization, then creates the "Wikimedia Commons" organization that is later
     * recorded as the publisher of every scraped image.
     *
     * @throws IOException propagated from the superclass initialization
     */
    @Override
    protected void initialize() throws IOException {
        super.initialize();
        this.wikiMediaCommons = (Organization) this.organizationFactory.create(new QName("http://commons.wikimedia.org/#organization"));
        ((Concept) this.wikiMediaCommons.getLookup().lookup(Concept.class)).setRdfsLabel("Wikimedia Commons");
    }

    /**
     * Scrapes the Wikimedia Commons gallery of a species and returns a builder whose initializer attaches
     * the images found to the taxon being built.
     *
     * <p>The gallery page is downloaded and scraped immediately; the individual image pages are downloaded
     * only when the returned builder runs its initializer.</p>
     *
     * @param str      only logged by this method
     * @param str2     the name used to build the gallery URL, with spaces replaced by underscores
     *                 (presumably the scientific name of the taxon - confirm against the caller)
     * @param builder  the builder of the taxon being imported
     * @return         the builder returned by {@code builder.withInitializer(...)}
     * @throws CatalogueOfLifeImporter.IgnoreException if the rank of the builder is not {@code SPECIES}
     * @throws Exception if the gallery page cannot be downloaded or scraped
     */
    @Nonnull
    protected Taxon.Builder process(@Nonnull String str, @Nonnull String str2, @Nonnull Taxon.Builder builder) throws Exception {
        log.info("process({}, {}, {})", new Object[]{str, str2, builder});
        if (builder.getRank() != Taxon.Rank.SPECIES) {
            log.info(">>>> ignoring rank {}", builder.getRank());
            throw new CatalogueOfLifeImporter.IgnoreException();
        }
        URL galleryUrl = new URL("http://commons.wikimedia.org/wiki/" + str2.replace(' ', '_'));
        log.debug(">>>> scraping page: {}", galleryUrl);
        final List<URI> imagePageUris = fetchImagePageUris(galleryUrl);
        log.debug(">>>> image page Urls: {}", imagePageUris);
        return builder.withInitializer(new Initializer<Taxon>() {
            @Nonnull
            public Taxon initialize(@Nonnull Taxon taxon) {
                for (URI imagePageUri : imagePageUris) {
                    WikipediaPhotosImporter.this.attachImage(taxon, imagePageUri);
                }
                return taxon;
            }
        });
    }

    /** Downloads a gallery page and scrapes it, always closing the connection exactly once. */
    private List<URI> fetchImagePageUris(@Nonnull URL galleryUrl) throws Exception {
        try (InputStream galleryStream = galleryUrl.openStream()) {
            return scrapeMainPage(galleryStream);
        }
    }

    /** Downloads an image page and scrapes it, closing the connection even when scraping fails. */
    private Description fetchImageDescription(@Nonnull URI imagePageUri) throws Exception {
        try (InputStream imagePageStream = imagePageUri.toURL().openStream()) {
            return scrapeImagePage(imagePageStream);
        }
    }

    /**
     * Scrapes a single image page and marks the resulting image as depicting the taxon. Pages whose path
     * does not end in {@code .jpg} / {@code .jpeg} are skipped. Failures are logged and swallowed on
     * purpose, so that one broken page does not prevent the remaining ones from being processed.
     */
    private void attachImage(@Nonnull Taxon taxon, @Nonnull URI imagePageUri) {
        try {
            log.info(">>>> processing image page: {}", imagePageUri);
            String path = imagePageUri.getPath().toLowerCase();
            if (!path.endsWith(".jpg") && !path.endsWith(".jpeg")) {
                log.warn("Ignoring non-jpeg resource: {}", imagePageUri);
                return;
            }
            Description image = fetchImageDescription(imagePageUri);
            ((Image) image.getLookup().lookup(Image.class)).getFoafDepicts().add(Wrapper.unwrap(taxon));
        } catch (Exception e) {
            log.error("Failed to process image page: " + imagePageUri, e);
        }
    }

    /**
     * Extracts the links to the image pages from a gallery page.
     *
     * @param inputStream  the contents of the gallery page; not closed by this method
     * @return             one absolute URI per gallery box found, in document order
     * @throws Exception if the page cannot be parsed or a link is not a valid URI
     */
    protected List<URI> scrapeMainPage(@Nonnull InputStream inputStream) throws Exception {
        Object document = this.xmlParser.parse(inputStream);
        XPath galleryBoxXPath = this.xPathProvider.createXPath(document, XPATH_LI_GALLERY_BOX);
        XPath pageLinkXPath = this.xPathProvider.createXPath(document, XPATH_A_HREF);
        XPath thumbnailXPath = this.xPathProvider.createXPath(document, XPATH_A_IMG_SRC);
        List<URI> imagePageUris = new ArrayList<URI>();
        for (Object galleryBox : galleryBoxXPath.selectNodes(document)) {
            String wikiPage = URI_COMMONS_WIKIMEDIA + pageLinkXPath.stringValueOf(galleryBox);
            log.debug("wikiPage: {} thumbnail: {}", wikiPage, thumbnailXPath.stringValueOf(galleryBox));
            imagePageUris.add(new URI(wikiPage));
        }
        return imagePageUris;
    }

    /**
     * Builds the description of an image out of its Wikimedia Commons page: type, format, publisher,
     * identifier (the full-size image URL), rights (the license URL) and, if one can be found, the creator.
     *
     * @param inputStream  the contents of the image page; fully read into memory before parsing, not closed
     *                     by this method
     * @return             the newly created description
     * @throws Exception if the page cannot be read or parsed
     */
    protected Description scrapeImagePage(@Nonnull InputStream inputStream) throws Exception {
        Object document = this.xmlParser.parse(new ByteArrayInputStream(IOUtils.toByteArray(inputStream)));
        String imageUrl = evaluate(document, XPATH_IMAGE_URL);
        String licenseUrl = evaluate(document, XPATH_LICENSE_URL);
        Description description = (Description) this.stillImageFactory.create(new QName(imageUrl));
        DcResource dcResource = (DcResource) description.getLookup().lookup(DcResource.class);
        dcResource.getDcTypes().add(DCMI_TYPE_STILL_IMAGE);
        dcResource.getDcFormats().add("image/jpeg");
        dcResource.getDcPublishers().add(Wrapper.unwrap(this.wikiMediaCommons));
        dcResource.getDcIdentifiers().add(imageUrl);
        dcResource.getDcRights().add(licenseUrl);
        Person author = resolveAuthor(document);
        if (author == null) {
            log.warn("No author for {}", imageUrl);
        } else {
            dcResource.getDcCreators().add(Wrapper.unwrap(author));
        }
        return description;
    }

    /** Evaluates an XPath expression against the whole document and returns its string value. */
    @Nonnull
    private String evaluate(@Nonnull Object document, @Nonnull String expression) throws Exception {
        return this.xPathProvider.createXPath(document, expression).stringValueOf(document);
    }

    /**
     * Looks for the author of an image page, trying the known page layouts in a fixed order of precedence:
     * the linked author of the "Author" table row, the other linked layouts, the plain text of the
     * "Author" table row, a free-text "Author:" paragraph and finally a free-text "Creator:" list item.
     *
     * @return the author, or {@code null} when no layout matched
     */
    private Person resolveAuthor(@Nonnull Object document) throws Exception {
        // A trailing comma (and any spaces following it) is stripped from the plain-text author.
        String authorText = evaluate(document, XPATH_AUTHOR_P).replaceAll(", *$", "");
        String authorLinkName = evaluate(document, XPATH_AUTHOR_A_NAME);
        String authorLinkUri = evaluate(document, XPATH_AUTHOR_A_URI);
        log.info(">>>> authorP: {}, authorAName: {}, authorAUri: {}", new Object[]{authorText, authorLinkName, authorLinkUri});
        if (!"".equals(authorLinkName)) {
            return findOrCreateAuthor(fixedUri(URI_COMMONS_WIKIMEDIA, authorLinkUri), authorLinkName);
        }
        for (String[] nameAndUri : XPATH_FALLBACK_LINKED_AUTHORS) {
            String linkName = evaluate(document, nameAndUri[0]);
            if (!"".equals(linkName)) {
                return findOrCreateAuthor(fixedUri(URI_COMMONS_WIKIMEDIA, evaluate(document, nameAndUri[1])), linkName);
            }
        }
        if (!"".equals(authorText)) {
            return findOrCreateAuthor(fixedUri(URN_BLUEBILL_WIKIMEDIA_CONTRIBUTOR, authorText), authorText);
        }
        String authorLine = evaluate(document, XPATH_AUTHOR_A_URI2);
        // A paragraph holding nothing but the label carries no author information.
        if (!"".equals(authorLine) && !"Author:".equals(authorLine.trim())) {
            return labelledAuthor(authorLine, "Author:");
        }
        String creatorLine = evaluate(document, XPATH_AUTHOR_A_URI4);
        if (!"".equals(creatorLine)) {
            return labelledAuthor(creatorLine, "Creator:");
        }
        return null;
    }

    /**
     * Creates (or retrieves) an author out of a free-text line such as {@code "Author: John Doe (talk)"}.
     * The display name is the line without the label; the identifier additionally drops any parenthesized
     * part and all the spaces.
     */
    @Nonnull
    private Person labelledAuthor(@Nonnull String line, @Nonnull String label) {
        String identifier = URN_BLUEBILL_WIKIMEDIA_CONTRIBUTOR
                + line.replace(label, "").replaceAll("\\(.*\\)", "").replace(" ", "").trim();
        String displayName = line.replace(label, "").trim();
        return findOrCreateAuthor(identifier, displayName);
    }

    /**
     * Makes a reference absolute: references that already start with {@code http://} or {@code https://}
     * are returned unchanged, anything else is appended to the given base.
     */
    @Nonnull
    private static String fixedUri(@Nonnull String base, @Nonnull String reference) {
        boolean absolute = reference.startsWith("http://") || reference.startsWith("https://");
        return absolute ? reference : base + reference;
    }

    /**
     * Returns the author registered under the given identifier, creating and caching it at the first
     * request. The display name is only applied when the author is created.
     */
    @Nonnull
    private Person findOrCreateAuthor(@Nonnull String identifier, @Nonnull String displayName) {
        Person person = this.map.get(identifier);
        if (person == null) {
            person = (Person) this.authorFactory.create(new QName(identifier));
            this.map.put(identifier, person);
            ((org.openrdf.concepts.foaf.Person) person.getLookup().lookup(org.openrdf.concepts.foaf.Person.class)).getFoafGivennames().add(displayName);
        }
        return person;
    }
}
