package it.unimi.di.mg4j.document;

import com.google.common.base.Charsets;
import it.unimi.di.mg4j.document.DocumentFactory;
import it.unimi.di.mg4j.document.PropertyBasedDocumentFactory;
import it.unimi.di.mg4j.util.MG4JClassParser;
import it.unimi.di.mg4j.util.parser.callback.AnchorExtractor;
import it.unimi.dsi.fastutil.chars.CharArrays;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.callback.ComposedCallbackBuilder;
import it.unimi.dsi.parser.callback.TextExtractor;
import it.unimi.dsi.util.Properties;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.nio.charset.Charset;
import org.apache.commons.configuration.ConfigurationException;

/* JADX WARN: Classes with same name are omitted:
  
 */
/* loaded from: input_file:it/unimi/di/mg4j/document/HtmlDocumentFactory.class */
public class HtmlDocumentFactory extends PropertyBasedDocumentFactory {
    private static final long serialVersionUID = 1;
    /** Initial length of the {@link #text} buffer allocated by {@link #init()}. */
    protected static final int DEFAULT_BUFFER_SIZE = 16384;
    /** Parser applied to each document's characters; created and wired to its callbacks in {@link #init()}. */
    protected transient BulletParser parser;
    /** Parser callback whose {@code text} and {@code title} are exposed as document fields 0 and 1. */
    protected transient TextExtractor textExtractor;
    /** Parser callback whose {@code anchors} are exposed as document field 2. */
    protected transient AnchorExtractor anchorExtractor;
    /** Word reader handed out for every field; a {@link FastBufferedReader} unless the default metadata names another one. */
    protected transient WordReader wordReader;
    /** First argument of the {@link AnchorExtractor} constructor; resolved in {@link #initVars()} with default 8. */
    protected int maxPreAnchor;
    /** Second argument of the {@link AnchorExtractor} constructor; resolved in {@link #initVars()} with default 256. */
    protected int maxAnchor;
    /** Third argument of the {@link AnchorExtractor} constructor; resolved in {@link #initVars()} with default 4. */
    protected int maxPostAnchor;
    /** Character buffer that document content is decoded into before parsing; grown on demand and reused across documents. */
    protected transient char[] text;

    /* JADX WARN: Classes with same name are omitted:
      
     */
    /* loaded from: input_file:it/unimi/di/mg4j/document/HtmlDocumentFactory$HtmlDocument.class */
    /**
     * A document backed by a raw HTML stream that is decoded and parsed lazily.
     *
     * <p>Parsing uses the buffer, parser and extractors of the enclosing factory,
     * so the extracted content lives in the factory, not in this object.
     */
    protected class HtmlDocument extends AbstractDocument {
        /** Metadata supplied for this document at creation time. */
        protected final Reference2ObjectMap<Enum<?>, Object> metadata;
        /** Set once {@link #rawContent} has been read and parsed. */
        protected boolean parsed;
        /** The undecoded document bytes. */
        protected final InputStream rawContent;

        /**
         * Stores the stream and metadata; nothing is read until content is requested.
         *
         * @param inputStream the raw document bytes.
         * @param reference2ObjectMap the metadata for this document.
         */
        protected HtmlDocument(InputStream inputStream, Reference2ObjectMap<Enum<?>, Object> reference2ObjectMap) {
            this.metadata = reference2ObjectMap;
            this.rawContent = inputStream;
        }

        /**
         * Reads the whole raw stream into the factory buffer and parses it, once.
         *
         * @throws IOException if reading the raw stream fails.
         */
        protected void ensureParsed() throws IOException {
            if (!this.parsed) {
                final HtmlDocumentFactory factory = HtmlDocumentFactory.this;

                Charset encoding = Charsets.ISO_8859_1;
                try {
                    final Object declared = factory.resolveNotNull(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, this.metadata);
                    encoding = Charset.forName((String) declared);
                } catch (RuntimeException ignored) {
                    // Best effort: with no usable encoding we keep the ISO-8859-1 default.
                }

                final InputStreamReader reader = new InputStreamReader(this.rawContent, encoding);
                int filled = 0;
                int count;
                while ((count = reader.read(factory.text, filled, factory.text.length - filled)) > 0) {
                    filled += count;
                    // Guarantee at least one free slot so the next read never asks for zero characters.
                    factory.text = CharArrays.grow(factory.text, filled + 1);
                }

                factory.parser.parse(factory.text, 0, filled);
                factory.textExtractor.title.trim();
                this.parsed = true;
            }
        }

        /**
         * Returns the title from the metadata if present, otherwise the title extracted by the parser.
         *
         * @return the document title.
         * @throws RuntimeException wrapping any {@link IOException} raised while parsing.
         */
        @Override
        public CharSequence title() {
            try {
                ensureParsed();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            final Object explicitTitle = this.metadata.get(PropertyBasedDocumentFactory.MetadataKeys.TITLE);
            if (explicitTitle != null) {
                return (CharSequence) explicitTitle;
            }
            return HtmlDocumentFactory.this.textExtractor.title;
        }

        /**
         * Returns the URI if known, else the title once parsed, else {@code "[unparsed]"}.
         */
        @Override
        public String toString() {
            final CharSequence location = uri();
            if (location != null) {
                return location.toString();
            }
            if (this.parsed) {
                return title().toString();
            }
            return "[unparsed]";
        }

        /**
         * Returns the URI resolved from this document's metadata or the factory defaults.
         */
        @Override
        public CharSequence uri() {
            final Object resolved = HtmlDocumentFactory.this.resolve(PropertyBasedDocumentFactory.MetadataKeys.URI, this.metadata);
            return (CharSequence) resolved;
        }

        /**
         * Returns the content of a field, parsing the document first if necessary.
         *
         * @param i the field index: 0 is the text, 1 the title, 2 the anchors.
         * @return a reader over the text or title, or the extracted anchors.
         * @throws IOException if reading the raw stream fails.
         */
        @Override
        public Object content(int i) throws IOException {
            final HtmlDocumentFactory factory = HtmlDocumentFactory.this;
            factory.ensureFieldIndex(i);
            ensureParsed();
            if (i == 0) {
                return new FastBufferedReader(factory.textExtractor.text);
            }
            if (i == 1) {
                return new FastBufferedReader(factory.textExtractor.title);
            }
            if (i == 2) {
                return factory.anchorExtractor.anchors;
            }
            throw new IllegalArgumentException();
        }

        /**
         * Returns the factory's word reader, which is the same for every field.
         *
         * @param i the field index, checked for validity only.
         */
        @Override
        public WordReader wordReader(int i) {
            final HtmlDocumentFactory factory = HtmlDocumentFactory.this;
            factory.ensureFieldIndex(i);
            return factory.wordReader;
        }
    }

    /* JADX WARN: Classes with same name are omitted:
      
     */
    /* loaded from: input_file:it/unimi/di/mg4j/document/HtmlDocumentFactory$MetadataKeys.class */
    /**
     * Property keys specific to this factory. Each is parsed as an integer by
     * {@code parseProperty} and read back by {@code initVars}.
     */
    public enum MetadataKeys {
        /** Value for {@code maxPreAnchor}; defaults to 8 when absent. */
        MAXPREANCHOR,
        /** Value for {@code maxAnchor}; defaults to 256 when absent. */
        MAXANCHOR,
        /** Value for {@code maxPostAnchor}; defaults to 4 when absent. */
        MAXPOSTANCHOR
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Parses one property into the given metadata map.
     *
     * <p>Handles the MIME type, encoding, word reader and the three anchor limits;
     * any other key is passed to the superclass.
     *
     * @param str the property key.
     * @param strArr the property values; the keys handled here require exactly one.
     * @param reference2ObjectMap the metadata map to fill.
     * @return {@code true} if the key was handled here, otherwise the superclass result.
     * @throws ConfigurationException if the word-reader specification cannot be validated,
     *         or as raised by {@code ensureJustOne} or the superclass.
     */
    @Override // it.unimi.di.mg4j.document.PropertyBasedDocumentFactory
    public boolean parseProperty(String str, String[] strArr, Reference2ObjectMap<Enum<?>, Object> reference2ObjectMap) throws ConfigurationException {
        if (sameKey(PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, str)) {
            reference2ObjectMap.put(PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, ensureJustOne(str, strArr));
        } else if (sameKey(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, str)) {
            // Store the charset's own name rather than the string the user typed.
            final Charset charset = Charset.forName(ensureJustOne(str, strArr));
            reference2ObjectMap.put(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, charset.toString());
        } else if (sameKey(PropertyBasedDocumentFactory.MetadataKeys.WORDREADER, str)) {
            try {
                final String spec = ensureJustOne(str, strArr).toString();
                reference2ObjectMap.put(PropertyBasedDocumentFactory.MetadataKeys.WORDREADER, spec);
                // Instantiate once purely to validate the specification; the result is discarded.
                ObjectParser.fromSpec(spec, WordReader.class, MG4JClassParser.PACKAGE);
            } catch (Exception e) {
                throw new ConfigurationException(e);
            }
        } else if (sameKey(MetadataKeys.MAXPREANCHOR, str)) {
            reference2ObjectMap.put(MetadataKeys.MAXPREANCHOR, Integer.valueOf(ensureJustOne(str, strArr)));
        } else if (sameKey(MetadataKeys.MAXANCHOR, str)) {
            reference2ObjectMap.put(MetadataKeys.MAXANCHOR, Integer.valueOf(ensureJustOne(str, strArr)));
        } else if (sameKey(MetadataKeys.MAXPOSTANCHOR, str)) {
            reference2ObjectMap.put(MetadataKeys.MAXPOSTANCHOR, Integer.valueOf(ensureJustOne(str, strArr)));
        } else {
            return super.parseProperty(str, strArr, reference2ObjectMap);
        }
        return true;
    }

    /**
     * Builds the transient parsing machinery: the parser with its text and anchor
     * callbacks, the word reader and the character buffer.
     *
     * <p>Reads the anchor limits, so {@link #initVars()} must have run first
     * (or the fields must have been deserialized).
     *
     * @throws RuntimeException wrapping any exception raised while obtaining the word reader.
     */
    protected void init() {
        final BulletParser bulletParser = new BulletParser();
        final ComposedCallbackBuilder callbacks = new ComposedCallbackBuilder();
        this.parser = bulletParser;

        this.textExtractor = new TextExtractor();
        callbacks.add(this.textExtractor);

        this.anchorExtractor = new AnchorExtractor(this.maxPreAnchor, this.maxAnchor, this.maxPostAnchor);
        callbacks.add(this.anchorExtractor);

        bulletParser.setCallback(callbacks.compose());

        try {
            final Object spec = this.defaultMetadata.get(PropertyBasedDocumentFactory.MetadataKeys.WORDREADER);
            if (spec == null) {
                this.wordReader = new FastBufferedReader();
            } else {
                this.wordReader = (WordReader) ObjectParser.fromSpec(spec.toString(), WordReader.class, MG4JClassParser.PACKAGE);
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        this.text = new char[DEFAULT_BUFFER_SIZE];
    }

    /**
     * Resolves the three anchor limits from the default metadata, using
     * 8, 256 and 4 respectively when a key is absent.
     */
    protected void initVars() {
        final Object pre = resolve(MetadataKeys.MAXPREANCHOR, this.defaultMetadata, 8);
        this.maxPreAnchor = (Integer) pre;

        final Object anchor = resolve(MetadataKeys.MAXANCHOR, this.defaultMetadata, 256);
        this.maxAnchor = (Integer) anchor;

        final Object post = resolve(MetadataKeys.MAXPOSTANCHOR, this.defaultMetadata, 4);
        this.maxPostAnchor = (Integer) post;
    }

    /**
     * Returns a new factory built from this factory's default metadata.
     *
     * <p>The copy gets its own parser, extractors, word reader and buffer, since the
     * constructor runs {@link #init()}; the metadata map itself is passed on as is.
     */
    @Override // it.unimi.di.mg4j.document.DocumentFactory
    /* renamed from: copy, reason: merged with bridge method [inline-methods] */
    public HtmlDocumentFactory m10copy() {
        return new HtmlDocumentFactory(this.defaultMetadata);
    }

    /**
     * Creates a factory configured from a set of properties.
     *
     * @param properties the properties, handed to the superclass constructor.
     * @throws ConfigurationException as raised by the superclass constructor.
     */
    public HtmlDocumentFactory(Properties properties) throws ConfigurationException {
        super(properties);
        // Order matters: init() passes the anchor limits that initVars() resolves.
        initVars();
        init();
    }

    /**
     * Creates a factory using the given map as default metadata.
     *
     * @param reference2ObjectMap the default metadata, handed to the superclass constructor.
     */
    public HtmlDocumentFactory(Reference2ObjectMap<Enum<?>, Object> reference2ObjectMap) {
        super(reference2ObjectMap);
        // Order matters: init() passes the anchor limits that initVars() resolves.
        initVars();
        init();
    }

    /**
     * Creates a factory configured from an array of strings.
     *
     * @param strArr the configuration strings, handed to the superclass constructor.
     * @throws ConfigurationException as raised by the superclass constructor.
     */
    public HtmlDocumentFactory(String[] strArr) throws ConfigurationException {
        super(strArr);
        // Order matters: init() passes the anchor limits that initVars() resolves.
        initVars();
        init();
    }

    /**
     * Creates a factory with whatever default metadata the superclass no-argument
     * constructor provides.
     */
    public HtmlDocumentFactory() {
        // Order matters: init() passes the anchor limits that initVars() resolves.
        initVars();
        init();
    }

    /**
     * Returns the number of fields produced by this factory.
     *
     * @return 3, for the fields named "text", "title" and "anchor" by {@link #fieldName(int)}.
     */
    @Override // it.unimi.di.mg4j.document.DocumentFactory
    public int numberOfFields() {
        return 3;
    }

    /**
     * Returns the symbolic name of a field.
     *
     * @param i the field index, validated by {@code ensureFieldIndex}.
     * @return "text" for 0, "title" for 1, "anchor" for 2.
     * @throws IllegalArgumentException if the index passes validation but matches no field.
     */
    @Override // it.unimi.di.mg4j.document.DocumentFactory
    public String fieldName(int i) {
        ensureFieldIndex(i);
        if (i == 0) {
            return "text";
        }
        if (i == 1) {
            return "title";
        }
        if (i == 2) {
            return "anchor";
        }
        throw new IllegalArgumentException();
    }

    /**
     * Looks up a field by its symbolic name.
     *
     * @param str the field name to search for.
     * @return the index of the first field with that name, or -1 if there is none.
     */
    @Override // it.unimi.di.mg4j.document.DocumentFactory
    public int fieldIndex(String str) {
        int candidate = 0;
        while (candidate < numberOfFields()) {
            if (fieldName(candidate).equals(str)) {
                return candidate;
            }
            candidate++;
        }
        return -1;
    }

    /**
     * Returns the type of a field.
     *
     * @param i the field index, validated by {@code ensureFieldIndex}.
     * @return {@code TEXT} for fields 0 and 1, {@code VIRTUAL} for field 2.
     * @throws IllegalArgumentException if the index passes validation but matches no field.
     */
    @Override // it.unimi.di.mg4j.document.DocumentFactory
    public DocumentFactory.FieldType fieldType(int i) {
        ensureFieldIndex(i);
        if (i == 0 || i == 1) {
            return DocumentFactory.FieldType.TEXT;
        }
        if (i == 2) {
            return DocumentFactory.FieldType.VIRTUAL;
        }
        throw new IllegalArgumentException();
    }

    /**
     * Deserialization hook: restores the serialized fields, then rebuilds the
     * transient ones (parser, extractors, word reader, buffer) via {@link #init()}.
     *
     * @param objectInputStream the stream this object is being read from.
     * @throws IOException if reading from the stream fails.
     * @throws ClassNotFoundException if a serialized class cannot be found.
     */
    private void readObject(ObjectInputStream objectInputStream) throws IOException, ClassNotFoundException {
        // Must come first: init() reads the anchor limits restored here.
        objectInputStream.defaultReadObject();
        init();
    }

    /**
     * Wraps a raw stream and its metadata in a new {@link HtmlDocument}.
     *
     * <p>Nothing is read here; the stream is consumed later by the document's
     * {@code ensureParsed}.
     *
     * @param inputStream the raw document bytes.
     * @param reference2ObjectMap the metadata for this document.
     * @return the new document.
     * @throws IOException declared by the overridden method; not thrown by this implementation.
     */
    @Override // it.unimi.di.mg4j.document.DocumentFactory
    public Document getDocument(InputStream inputStream, Reference2ObjectMap<Enum<?>, Object> reference2ObjectMap) throws IOException {
        return new HtmlDocument(inputStream, reference2ObjectMap);
    }
}
