/*
 * Decompiled with CFR 0.152.
 */
package ai.platon.pulsar.common;

import ai.platon.pulsar.common.config.ImmutableConfig;
import ai.platon.pulsar.persist.WebPage;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class EncodingDetector {
    /** Logger used by the detector for warnings, debug and trace output. */
    public static final Logger LOG = LoggerFactory.getLogger(EncodingDetector.class);
    /** Sentinel meaning "no confidence threshold configured" / "clue carries no confidence". */
    public static final int NO_THRESHOLD = -1;
    /** Configuration key read by the config-based constructor for the minimum detector confidence. */
    public static final String MIN_CONFIDENCE_KEY = "encodingdetector.charset.min.confidence";
    /** Number of leading content bytes inspected when sniffing meta tags. */
    private static final int CHUNK_SIZE = 2000;
    /** Maps a canonical charset name to the replacement name this detector reports instead. */
    private static final HashMap<String, String> ALIASES = new HashMap<>();
    /** Content types for which statistical charset detection is attempted. */
    private static final HashSet<String> DETECTABLES = new HashSet<>();
    /** Content must be longer than this many bytes before statistical detection is attempted. */
    private static final int MIN_LENGTH = 4;
    /** Matches a {@code <meta ... http-equiv=content-type ...>} tag; group 1 is the attribute text. */
    private static final Pattern metaPattern = Pattern.compile("<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>", Pattern.CASE_INSENSITIVE);
    /** Extracts the charset name (group 1) from a {@code charset=...} fragment. */
    private static final Pattern charsetPattern = Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
    /** Matches an HTML5 {@code <meta charset=...>} tag; group 1 is the charset name. */
    private static final Pattern charsetPatternHTML5 = Pattern.compile("<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>", Pattern.CASE_INSENSITIVE);
    /** ICU4J detector instance reused across calls on this object. */
    private final CharsetDetector detector = new CharsetDetector();
    /** Clues collected so far, in the order they were added. */
    private final List<EncodingClue> clues = new ArrayList<EncodingClue>();
    /** Minimum confidence a detected clue needs to be chosen outright; negative disables detection. */
    private int minConfidence = NO_THRESHOLD;
    /** Charset name returned when no clue is selected. */
    private String defaultCharEncoding = "utf-8";

    /**
     * Creates a detector that keeps the field defaults: a minimum confidence of -1
     * (so {@code autoDetectClues} skips statistical detection) and a default
     * encoding of "utf-8".
     */
    public EncodingDetector() {
    }

    public EncodingDetector(ImmutableConfig conf) {
        this.minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
        this.defaultCharEncoding = conf.get("parser.character.encoding.default", "utf-8");
    }

    /**
     * Normalises a charset name: resolves it to the JVM's canonical name, applies
     * the replacement registered in {@code ALIASES} (if any) and lower-cases the
     * result.
     *
     * @param encoding charset name to resolve; may be {@code null}
     * @return the lower-cased resolved name, or {@code null} when {@code encoding}
     *         is {@code null}, syntactically illegal, or not supported by this JVM
     */
    public static String resolveEncodingAlias(String encoding) {
        if (encoding == null) {
            return null;
        }
        try {
            if (!Charset.isSupported(encoding)) {
                return null;
            }
            String canonicalName = Charset.forName(encoding).name();
            String encodingAlias = ALIASES.getOrDefault(canonicalName, canonicalName);
            // Locale.ROOT keeps the result stable regardless of the default locale
            // (e.g. a Turkish locale would otherwise turn 'I' into a dotless 'ı').
            return encodingAlias.toLowerCase(Locale.ROOT);
        }
        catch (RuntimeException e) {
            // Charset.isSupported/forName reject illegal names with unchecked exceptions.
            LOG.warn("Invalid encoding {} detected, using default.", encoding);
            return null;
        }
    }

    /**
     * Extracts the value of the {@code charset} parameter from a Content-Type
     * header value.
     *
     * <p>The parameter name is matched case-insensitively, so {@code Charset=} and
     * {@code CHARSET=} are recognised as well as {@code charset=}. The value ends
     * at the next {@code ';'} (or the end of the string), is trimmed, and loses one
     * pair of surrounding double quotes if present.
     *
     * @param contentTypeUtf8 the Content-Type header value; may be {@code null}
     * @return the charset value (possibly empty), or {@code null} when the input is
     *         {@code null} or contains no {@code charset=} parameter
     */
    public static String parseCharacterEncoding(CharSequence contentTypeUtf8) {
        if (contentTypeUtf8 == null) {
            return null;
        }
        final String key = "charset=";
        String contentType = contentTypeUtf8.toString();

        // Case-insensitive search for the first occurrence of the parameter name.
        int start = -1;
        for (int i = 0, last = contentType.length() - key.length(); i <= last; i++) {
            if (contentType.regionMatches(true, i, key, 0, key.length())) {
                start = i;
                break;
            }
        }
        if (start < 0) {
            return null;
        }

        String encoding = contentType.substring(start + key.length());
        int end = encoding.indexOf(';');
        if (end >= 0) {
            encoding = encoding.substring(0, end);
        }
        encoding = encoding.trim();
        // Strip one pair of enclosing double quotes, but only around a non-empty value.
        if (encoding.length() > 2 && encoding.startsWith("\"") && encoding.endsWith("\"")) {
            encoding = encoding.substring(1, encoding.length() - 1);
        }
        return encoding.trim();
    }

    /**
     * Returns the default character encoding.
     *
     * @return the charset name that {@code sniffEncoding} passes to
     *         {@code guessEncoding} as the default value
     */
    public String getDefaultCharEncoding() {
        return this.defaultCharEncoding;
    }

    /**
     * Sets the default character encoding.
     *
     * @param defaultCharEncoding the charset name that {@code sniffEncoding} passes
     *                            to {@code guessEncoding} as the default value;
     *                            stored as given, without validation
     */
    public void setDefaultCharEncoding(String defaultCharEncoding) {
        this.defaultCharEncoding = defaultCharEncoding;
    }

    /**
     * Returns the minimum confidence threshold.
     *
     * @return the threshold; a negative value means statistical detection is not
     *         run by {@code autoDetectClues} and no clue is chosen by confidence
     */
    public int getMinConfidence() {
        return this.minConfidence;
    }

    /**
     * Sets the minimum confidence threshold.
     *
     * @param minConfidence the new threshold; a negative value means statistical
     *                      detection is not run by {@code autoDetectClues}
     */
    public void setMinConfidence(int minConfidence) {
        this.minConfidence = minConfidence;
    }

    /**
     * Determines the character encoding of {@code page}.
     *
     * <p>If the page headers carry a "Q-Trusted-Content-Encoding" entry, that value
     * is returned unchanged. Otherwise previously collected clues are discarded,
     * fresh clues are gathered from the page (auto-detection with input filtering
     * enabled, plus a "sniffed" clue from the raw content) and the best guess is
     * returned, falling back to the default character encoding.
     *
     * @param page the page to inspect
     * @return the trusted, guessed, or default encoding name
     */
    public String sniffEncoding(WebPage page) {
        String trusted = page.getHeaders().get("Q-Trusted-Content-Encoding");
        if (trusted == null) {
            this.clearClues();
            this.autoDetectClues(page, true);

            String sniffed = this.sniffCharacterEncoding(page.getContentAsBytes());
            this.addClue(sniffed, "sniffed");

            return this.guessEncoding(page, this.defaultCharEncoding);
        }
        return trusted;
    }

    /**
     * Returns the clues collected so far.
     *
     * @return the detector's own clue list (not a copy), so changes made through
     *         the returned reference are visible to this detector
     */
    public List<EncodingClue> getClues() {
        return this.clues;
    }

    /**
     * Renders the collected clues as one string.
     *
     * @return the clues from {@link #getClues()}, joined with ", "
     */
    public String getCluesAsString() {
        List<EncodingClue> current = this.getClues();
        String separator = ", ";
        return StringUtils.join(current, separator);
    }

    /**
     * Collects encoding clues for {@code page} from its content and from the
     * charset declared in its "Content-Type" header.
     *
     * @param page   the page whose content, content type and headers are inspected
     * @param filter forwarded to the detector's input-filter switch
     */
    public void autoDetectClues(WebPage page, boolean filter) {
        String contentTypeHeader = page.getHeaders().get("Content-Type");
        ByteBuffer body = page.getContent();
        String mimeType = page.getContentType();
        String headerCharset = EncodingDetector.parseCharacterEncoding(contentTypeHeader);
        this.autoDetectClues(body, mimeType, headerCharset, filter);
    }

    /**
     * Looks for an explicit encoding declaration in the raw content.
     *
     * <p>The first 2000 bytes (at most) are decoded as US-ASCII and searched, in
     * order, for a {@code <meta http-equiv="content-type">} tag containing a
     * {@code charset=} value, then for an HTML5 {@code <meta charset=...>} tag.
     * If neither yields a name, the leading bytes are checked for a UTF-8,
     * UTF-16LE or UTF-16BE byte order mark.
     *
     * @param content the raw content bytes
     * @return the declared charset name as written in the content, "UTF-8" /
     *         "UTF-16LE" / "UTF-16BE" for a byte order mark, or {@code null} when
     *         nothing was found
     */
    public String sniffCharacterEncoding(byte[] content) {
        int window = Math.min(content.length, 2000);
        String head = new String(content, 0, window, StandardCharsets.US_ASCII);

        // 1. Legacy <meta http-equiv="content-type" content="...charset=X">.
        Matcher legacyMeta = metaPattern.matcher(head);
        if (legacyMeta.find()) {
            Matcher charsetAttr = charsetPattern.matcher(legacyMeta.group(1));
            if (charsetAttr.find()) {
                String declared = charsetAttr.group(1);
                if (LOG.isTraceEnabled()) {
                    LOG.trace("metaPattern: " + declared);
                }
                return declared;
            }
        }

        // 2. HTML5 <meta charset="X">.
        Matcher html5Meta = charsetPatternHTML5.matcher(head);
        if (html5Meta.find()) {
            String declared = html5Meta.group(1);
            if (LOG.isTraceEnabled()) {
                LOG.trace("charsetPatternHTML5: " + declared);
            }
            return declared;
        }

        // 3. Byte order mark at the very start of the content.
        String fromBom = null;
        if (content.length >= 3
                && content[0] == (byte) 0xEF && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
            fromBom = "UTF-8";
        } else if (content.length >= 2 && content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
            fromBom = "UTF-16LE";
        } else if (content.length >= 2 && content[0] == (byte) 0xFE && content[1] == (byte) 0xFF) {
            fromBom = "UTF-16BE";
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("BOM: " + fromBom);
        }
        return fromBom;
    }

    protected void autoDetectClues(ByteBuffer dataBuffer, String contentType, String encoding, boolean filter) {
        if (dataBuffer == null) {
            return;
        }
        int length = dataBuffer.remaining();
        if (this.minConfidence >= 0 && DETECTABLES.contains(contentType) && length > 4) {
            CharsetMatch[] matches = null;
            try {
                this.detector.enableInputFilter(filter);
                this.detector.setText((InputStream)new ByteArrayInputStream(dataBuffer.array(), dataBuffer.arrayOffset() + dataBuffer.position(), length));
                matches = this.detector.detectAll();
            }
            catch (Exception e) {
                LOG.debug("Exception from ICU4J (ignoring): ", (Throwable)e);
            }
            if (matches != null) {
                for (void var10_11 : matches) {
                    this.addClue(var10_11.getName(), "detect", var10_11.getConfidence());
                }
            }
        }
        this.addClue(encoding, "header");
    }

    /**
     * Records an encoding clue after normalising its charset name.
     *
     * <p>Nothing is recorded when {@code value} is {@code null} or empty, or when
     * {@code resolveEncodingAlias} cannot resolve it.
     *
     * @param value      charset name suggested by the clue
     * @param source     label identifying where the clue came from
     * @param confidence confidence of the clue
     */
    protected void addClue(String value, String source, int confidence) {
        boolean blank = value == null || value.isEmpty();
        if (blank) {
            return;
        }
        String resolved = EncodingDetector.resolveEncodingAlias(value);
        if (resolved == null) {
            return;
        }
        EncodingClue clue = new EncodingClue(resolved, source, confidence);
        this.clues.add(clue);
    }

    /**
     * Records an encoding clue with a confidence of -1.
     *
     * @param value  charset name suggested by the clue
     * @param source label identifying where the clue came from
     */
    public void addClue(String value, String source) {
        this.addClue(value, source, -1);
    }

    /**
     * Picks an encoding from the collected clues for {@code page}.
     *
     * @param page         the page; only its location is used, as the URL passed
     *                     on to the clue evaluation
     * @param defaultValue encoding to fall back on when no clue is selected
     * @return the selected encoding name
     */
    public String guessEncoding(WebPage page, String defaultValue) {
        String location = page.getLocation();
        return this.guessEncoding(location, defaultValue);
    }

    /**
     * Selects an encoding from the collected clues.
     *
     * <p>Clues are examined in insertion order. The first clue whose confidence
     * reaches a non-negative minimum confidence is returned immediately (after
     * alias resolution). Otherwise the first clue with a confidence of -1 is kept
     * as the best candidate; if there is none, the default value is used.
     *
     * @param baseUrl      URL reported when trace logging lists disagreeing clues
     * @param defaultValue encoding used when no clue is selected
     * @return the selected encoding name
     */
    private String guessEncoding(String baseUrl, String defaultValue) {
        if (LOG.isTraceEnabled()) {
            this.findDisagreements(baseUrl, this.clues);
        }

        final EncodingClue fallback = new EncodingClue(defaultValue, "default");
        EncodingClue chosen = fallback;
        int ordinal = 0;

        for (EncodingClue candidate : this.clues) {
            if (LOG.isTraceEnabled()) {
                ordinal++;
                LOG.trace(ordinal + ".\tcharset " + candidate);
            }

            // A sufficiently confident clue wins outright.
            boolean confidentEnough = this.minConfidence >= 0 && candidate.confidence >= this.minConfidence;
            if (confidentEnough) {
                String charset = candidate.value;
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Choosing encoding: " + charset + " with confidence " + candidate.confidence);
                }
                return EncodingDetector.resolveEncodingAlias(charset);
            }

            // Otherwise remember only the first clue that carries no confidence.
            boolean firstUnscored = candidate.confidence == -1 && chosen == fallback;
            if (firstUnscored) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Choose as best clue " + candidate);
                }
                chosen = candidate;
            }
        }

        if (LOG.isTraceEnabled()) {
            LOG.trace("Best clue: " + chosen);
        }
        return chosen.value.toLowerCase();
    }

    /**
     * Removes every clue collected so far.
     */
    public void clearClues() {
        this.clues.clear();
    }

    /**
     * Logs, at trace level, the full clue list when clues from different sources
     * disagree.
     *
     * <p>Only the first non-empty clue of each source is considered. A
     * disagreement exists when such a clue meets the threshold and names a value
     * not among the threshold-meeting values seen before it.
     *
     * @param url      URL included in the log message
     * @param newClues clues to compare
     */
    private void findDisagreements(String url, List<EncodingClue> newClues) {
        HashSet<String> acceptedValues = new HashSet<String>();
        HashSet<String> visitedSources = new HashSet<String>();
        boolean conflict = false;

        for (EncodingClue clue : newClues) {
            // add() returns false for a source that was already visited.
            if (!clue.isEmpty() && visitedSources.add(clue.source)) {
                if (clue.meetsThreshold()) {
                    if (!acceptedValues.isEmpty() && !acceptedValues.contains(clue.value)) {
                        conflict = true;
                    }
                    acceptedValues.add(clue.value);
                }
            }
        }

        if (!conflict) {
            return;
        }

        StringBuilder message = new StringBuilder();
        message.append("Disagreement: ").append(url).append("; ");
        String separator = "";
        for (EncodingClue clue : newClues) {
            message.append(separator).append(clue);
            separator = ", ";
        }
        LOG.trace(message.toString());
    }

    static {
        // Content types for which statistical charset detection is attempted.
        String[] detectableTypes = {
            "text/html",
            "text/plain",
            "text/richtext",
            "text/rtf",
            "text/sgml",
            "text/tab-separated-values",
            "text/xml",
            "application/rss+xml",
            "application/xhtml+xml",
        };
        for (String mimeType : detectableTypes) {
            DETECTABLES.add(mimeType);
        }

        // Canonical charset name -> name reported in its place.
        String[][] aliasPairs = {
            {"ISO-8859-1", "windows-1252"},
            {"EUC-KR", "x-windows-949"},
            {"x-EUC-CN", "GB18030"},
        };
        for (String[] pair : aliasPairs) {
            ALIASES.put(pair[0], pair[1]);
        }
    }

    /**
     * One piece of evidence about a document's character encoding: a charset name,
     * the source that suggested it, and an optional confidence.
     *
     * <p>This is an inner class because {@link #meetsThreshold()} reads the
     * enclosing detector's minimum confidence.
     */
    public class EncodingClue {
        /** Lower-cased charset name; {@code null} only if constructed with {@code null}. */
        private final String value;
        /** Label identifying where the clue came from. */
        private final String source;
        /** Confidence of the clue; negative means no confidence is attached. */
        private final int confidence;

        /**
         * Creates a clue with a confidence of -1.
         *
         * @param value  charset name
         * @param source label identifying where the clue came from
         */
        public EncodingClue(String value, String source) {
            this(value, source, -1);
        }

        /**
         * Creates a clue.
         *
         * @param value      charset name; stored lower-cased, {@code null} is kept
         *                   as {@code null} so that {@link #isEmpty()} can report it
         * @param source     label identifying where the clue came from
         * @param confidence confidence of the clue; negative means none
         */
        public EncodingClue(String value, String source, int confidence) {
            // Locale.ROOT keeps charset names stable under locales with special
            // case mappings (e.g. Turkish dotless 'ı').
            this.value = value == null ? null : value.toLowerCase(Locale.ROOT);
            this.source = source;
            this.confidence = confidence;
        }

        /**
         * @return the label identifying where the clue came from
         */
        public String getSource() {
            return this.source;
        }

        /**
         * @return the lower-cased charset name
         */
        public String getValue() {
            return this.value;
        }

        /**
         * Formats the clue as {@code value (source)}, or
         * {@code value (source, N% confidence)} when the confidence is non-negative.
         */
        @Override
        public String toString() {
            String confidencePart = this.confidence >= 0 ? ", " + this.confidence + "% confidence" : "";
            return this.value + " (" + this.source + confidencePart + ")";
        }

        /**
         * @return {@code true} when the charset name is {@code null} or empty
         */
        public boolean isEmpty() {
            return this.value == null || "".equals(this.value);
        }

        /**
         * @return {@code true} when the clue has no confidence attached (negative),
         *         or when the enclosing detector has a non-negative minimum
         *         confidence and this clue's confidence reaches it
         */
        public boolean meetsThreshold() {
            return this.confidence < 0 || EncodingDetector.this.minConfidence >= 0 && this.confidence >= EncodingDetector.this.minConfidence;
        }
    }
}

