// ai.platon.pulsar.skeleton.common.EncodingDetector Maven / Gradle / Ivy
//
// Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ai.platon.pulsar.skeleton.common;

import ai.platon.pulsar.common.HttpHeaders;
import ai.platon.pulsar.common.config.ImmutableConfig;
import ai.platon.pulsar.persist.WebPage;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayInputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A simple class for detecting character encodings.
 *
 * <p>
 * Broadly this encompasses two functions, which are distinctly separate:
 *
 * <ol>
 * <li>Auto detecting a set of "clues" from input text.</li>
 * <li>Taking a set of clues and making a "best guess" as to the "real"
 * encoding.</li>
 * </ol>
 *
 * <p>
 * A caller will often have some extra information about what the encoding might
 * be (e.g. from the HTTP header or HTML meta-tags, often wrong but still
 * potentially useful clues). The types of clues may differ from caller to
 * caller. Thus a typical calling sequence is:
 * <ul>
 * <li>Run step (1) to generate a set of auto-detected clues;</li>
 * <li>Combine these clues with the caller-dependent "extra clues" available;</li>
 * <li>Run step (2) to guess what the most probable answer is.</li>
 * </ul>
 *
 * TODO: Use Tika's EncodingDetector
 */
public class EncodingDetector {

    /** Logger used by this class. */
    public static final Logger LOG = LoggerFactory.getLogger(EncodingDetector.class);
    /** Confidence value of clues that carry no confidence and therefore ignore the threshold. */
    public static final int NO_THRESHOLD = -1;
    /** Configuration key holding the minimum confidence a detected charset has to reach. */
    public static final String MIN_CONFIDENCE_KEY = "encodingdetector.charset.min.confidence";
    // I used 1000 bytes at first, but found that some documents have
    // meta tag well past the first 1000 bytes.
    // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
    private static final int CHUNK_SIZE = 2000;
    // Canonical charset name -> charset that should be used instead of it.
    private static final HashMap<String, String> ALIASES = new HashMap<>();
    // Content types for which charset auto-detection is attempted.
    private static final HashSet<String> DETECTABLES = new HashSet<>();
    // CharsetDetector will die without a minimum amount of data.
    private static final int MIN_LENGTH = 4;
    // PULSAR-1006 Meta equiv with single quotes not accepted
    // Matches <meta ... http-equiv="content-type" ...>; group 1 is the attribute text of the tag.
    private static final Pattern metaPattern = Pattern.compile(
            "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
            Pattern.CASE_INSENSITIVE);
    // Extracts the charset name (group 1) from a "charset=..." parameter.
    private static final Pattern charsetPattern = Pattern.compile(
            "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
    // Matches the HTML5 form <meta charset="...">; group 1 is the charset name.
    private static final Pattern charsetPatternHTML5 = Pattern.compile(
            "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
            Pattern.CASE_INSENSITIVE);

    static {
        DETECTABLES.add("text/html");
        DETECTABLES.add("text/plain");
        DETECTABLES.add("text/richtext");
        DETECTABLES.add("text/rtf");
        DETECTABLES.add("text/sgml");
        DETECTABLES.add("text/tab-separated-values");
        DETECTABLES.add("text/xml");
        DETECTABLES.add("application/rss+xml");
        DETECTABLES.add("application/xhtml+xml");
    /*
     * the following map is not an alias mapping table, but maps character
     * encodings which are often used in mislabelled documents to their correct
     * encodings. For instance, there are a lot of documents labelled
     * 'ISO-8859-1' which contain characters not covered by ISO-8859-1 but
     * covered by windows-1252. Because windows-1252 is a superset of ISO-8859-1
     * (sharing code points for the common part), it's better to treat
     * ISO-8859-1 as synonymous with windows-1252 than to reject, as invalid,
     * documents labelled as ISO-8859-1 that have characters outside ISO-8859-1.
     */
        ALIASES.put("ISO-8859-1", "windows-1252");
        ALIASES.put("EUC-KR", "x-windows-949");
        ALIASES.put("x-EUC-CN", "GB18030");
        /**
         * GB18030有两个版本：GB18030-2000和GB18030-2005，
         * GB18030-2000是GBK的取代版本，它的主要特点是在GBK基础上增加了CJK统一汉字扩充A的汉字。
         * GB18030-2005的主要特点是在GB18030-2000基础上增加了CJK统一汉字扩充B的汉字。
         * @see http://www.fmddlmyy.cn/text24.html
         * */
        // ALIASES.put("GBK", "GB18030");
        // ALIASES.put("Big5", "Big5HKSCS");
        // ALIASES.put("TIS620", "Cp874");
        // ALIASES.put("ISO-8859-11", "Cp874");
    }

    // ICU4J detector used for statistical charset detection.
    private final CharsetDetector detector = new CharsetDetector();
    // Clues collected so far, in the order they were added.
    private final List<EncodingClue> clues = new ArrayList<>();
    // Minimum confidence a detected clue must reach; a negative value disables auto-detection.
    private int minConfidence = -1;
    // Encoding used when no clue yields an answer.
    private String defaultCharEncoding = "utf-8";

    /**
     * Creates a detector that keeps the field defaults: no confidence
     * threshold and "utf-8" as the default encoding.
     */
    public EncodingDetector() {
    }

    public EncodingDetector(ImmutableConfig conf) {
        this.minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
        this.defaultCharEncoding = conf.get("parser.character.encoding.default", "utf-8");
    }

    /**
     * Normalizes a charset name: resolves it to the JVM's canonical name,
     * applies the corrections registered in {@code ALIASES} and lower-cases
     * the result.
     *
     * @param encoding charset name to resolve, may be null
     * @return the lower-cased resolved name, or null if {@code encoding} is
     *         null, not supported by this JVM, or not a legal charset name
     */
    public static String resolveEncodingAlias(String encoding) {
        if (encoding == null) {
            return null;
        }

        try {
            // isSupported itself throws for syntactically illegal names, hence the try block.
            if (!Charset.isSupported(encoding)) {
                return null;
            }

            String canonicalName = Charset.forName(encoding).name();
            String encodingAlias = ALIASES.getOrDefault(canonicalName, canonicalName);
            // Charset names are ASCII; Locale.ROOT keeps the result independent of the
            // default locale (e.g. Turkish would turn "ISO" into "\u0131so").
            return encodingAlias.toLowerCase(Locale.ROOT);
        } catch (Exception e) {
            LOG.warn("Invalid encoding {} detected, using default.", encoding);
            return null;
        }
    }

    /**
     * Parses the character encoding from the specified content type header. If the
     * content type is null, or there is no explicit character encoding,
     * null is returned.
     * This method was copied from org.apache.catalina.util.RequestUtil, which is
     * licensed under the Apache License, Version 2.0 (the "License").
     *
     * <p>The {@code charset} parameter name is matched case-insensitively, so
     * {@code Charset=UTF-8} and {@code CHARSET=UTF-8} are recognized as well.
     *
     * @param contentTypeUtf8 value of a Content-Type header, e.g.
     *                        {@code text/html; charset=UTF-8}
     * @return the charset value with surrounding whitespace and double quotes
     *         removed, or null if there is no {@code charset=} parameter
     */
    public static String parseCharacterEncoding(CharSequence contentTypeUtf8) {
        if (contentTypeUtf8 == null) {
            return null;
        }

        final String marker = "charset=";
        String contentType = contentTypeUtf8.toString();

        // Case-insensitive search for the marker. regionMatches is used instead of
        // lower-casing the whole header so that indexes stay valid for any input.
        int start = -1;
        int lastCandidate = contentType.length() - marker.length();
        for (int i = 0; i <= lastCandidate; i++) {
            if (contentType.regionMatches(true, i, marker, 0, marker.length())) {
                start = i;
                break;
            }
        }
        if (start < 0) {
            return null;
        }

        // The value runs up to the next parameter separator, if any.
        String encoding = contentType.substring(start + marker.length());
        int end = encoding.indexOf(';');
        if (end >= 0) {
            encoding = encoding.substring(0, end);
        }

        encoding = encoding.trim();
        if ((encoding.length() > 2) && (encoding.startsWith("\"")) && (encoding.endsWith("\""))) {
            encoding = encoding.substring(1, encoding.length() - 1);
        }

        return encoding.trim();
    }

    /**
     * @return the encoding handed to the guessing step as the fallback value
     */
    public String getDefaultCharEncoding() {
        return defaultCharEncoding;
    }

    /**
     * Replaces the fallback encoding.
     *
     * @param defaultCharEncoding the new fallback encoding
     */
    public void setDefaultCharEncoding(String defaultCharEncoding) {
        this.defaultCharEncoding = defaultCharEncoding;
    }

    /**
     * @return the minimum confidence a detected clue has to reach; a negative
     *         value means auto-detection is not performed
     */
    public int getMinConfidence() {
        return this.minConfidence;
    }

    /**
     * Replaces the confidence threshold.
     *
     * @param minConfidence the new minimum confidence
     */
    public void setMinConfidence(int minConfidence) {
        this.minConfidence = minConfidence;
    }

    /**
     * Determines the encoding of a page. A trusted encoding header, when
     * present, is returned as is. Otherwise the clue list is rebuilt from
     * auto-detection, the HTTP header and the sniffed page content, and the
     * best guess is returned.
     *
     * @param page the page to inspect
     * @return the trusted encoding, or the guessed encoding
     */
    public String sniffEncoding(WebPage page) {
        final String trusted = page.getHeaders().get(HttpHeaders.Q_TRUSTED_CONTENT_ENCODING);
        if (trusted == null) {
            // Start from scratch: clues of a previous page must not leak into this one.
            clearClues();
            autoDetectClues(page, true);

            final String sniffed = sniffCharacterEncoding(page.getContentAsBytes());
            addClue(sniffed, "sniffed");

            return guessEncoding(page, defaultCharEncoding);
        }

        return trusted;
    }

    /**
     * @return the clues collected so far; this is the detector's own list,
     *         not a copy
     */
    public List<EncodingClue> getClues() {
        return clues;
    }

    /**
     * @return the string forms of all collected clues, separated by ", "
     */
    public String getCluesAsString() {
        final String separator = ", ";
        return StringUtils.join(getClues(), separator);
    }

    /**
     * Collects clues for a page from its content and from the charset
     * declared in its Content-Type header.
     *
     * @param page   the page to inspect
     * @param filter passed on to the charset detector's input filter switch
     */
    public void autoDetectClues(WebPage page, boolean filter) {
        final String contentTypeHeader = page.getHeaders().get(HttpHeaders.CONTENT_TYPE);
        final String headerEncoding = parseCharacterEncoding(contentTypeHeader);

        autoDetectClues(page.getContent(), page.getContentType(), headerEncoding, filter);
    }

    /**
     * Given a byte[] representing an html file of an
     * unknown encoding, read out 'charset' parameter in the meta tag
     * from the first CHUNK_SIZE bytes. If there's no meta tag for
     * Content-Type or no charset is specified, the content is checked for a
     * Unicode Byte Order Mark (BOM). This will also cover non-byte oriented
     * character encodings (UTF-16 only). If no character set can be determined,
     * null is returned.
     * See also
     * http://www.w3.org/International/questions/qa-html-encoding-declarations,
     * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
     * http://www.w3.org/TR/REC-xml/#sec-guessing
     *
     * @param content byte[] representation of an html file, may be null
     * @return the declared or BOM-implied charset name, or null if none was
     *         found or {@code content} is null
     */
    public String sniffCharacterEncoding(byte[] content) {
        if (content == null) {
            return null;
        }

        int length = Math.min(content.length, CHUNK_SIZE);

        // Only the ASCII markup matters here. Bytes outside US-ASCII are replaced
        // while decoding, so they can never become part of a charset name.
        String str = new String(content, 0, length, StandardCharsets.US_ASCII);

        String encoding = null;

        // 1. <meta http-equiv="content-type" content="...; charset=...">
        Matcher metaMatcher = metaPattern.matcher(str);
        if (metaMatcher.find()) {
            Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
            if (charsetMatcher.find()) {
                encoding = charsetMatcher.group(1);
                LOG.trace("metaPattern: {}", encoding);
            }
        }

        // 2. HTML5 <meta charset="...">
        if (encoding == null) {
            metaMatcher = charsetPatternHTML5.matcher(str);
            if (metaMatcher.find()) {
                encoding = metaMatcher.group(1);
                LOG.trace("charsetPatternHTML5: {}", encoding);
            }
        }

        // 3. Byte order mark
        if (encoding == null) {
            encoding = sniffByteOrderMark(content);
            LOG.trace("BOM: {}", encoding);
        }

        return encoding;
    }

    /**
     * Returns the charset implied by a UTF-8 or UTF-16 byte order mark at the
     * start of {@code content}, or null if there is none.
     */
    private static String sniffByteOrderMark(byte[] content) {
        if (content.length >= 3 && content[0] == (byte) 0xEF
                && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
            return "UTF-8";
        }
        if (content.length >= 2) {
            if (content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
                return "UTF-16LE";
            }
            if (content[0] == (byte) 0xFE && content[1] == (byte) 0xFF) {
                return "UTF-16BE";
            }
        }
        return null;
    }

    /**
     * Collects clues from raw content and from the encoding announced in the
     * HTTP response header.
     *
     * <p>Statistical detection only runs when a minimum confidence is
     * configured, the content type is one of the detectable types and more
     * than {@code MIN_LENGTH} bytes are available. The header clue is added
     * in any case, after the detected clues.
     *
     * @param dataBuffer  content to inspect, read from its current position;
     *                    nothing happens if it is null. The buffer's position
     *                    is not changed.
     * @param contentType content type used to decide whether detection is attempted
     * @param encoding    encoding taken from the HTTP header, may be null
     * @param filter      passed to {@code CharsetDetector.enableInputFilter}
     */
    protected void autoDetectClues(ByteBuffer dataBuffer, String contentType, String encoding, boolean filter) {
        if (dataBuffer == null) {
            return;
        }

        int length = dataBuffer.remaining();

        if (minConfidence >= 0 && DETECTABLES.contains(contentType) && length > MIN_LENGTH) {
            CharsetMatch[] matches = null;

            // do all these in a try/catch; setText and detect/detectAll
            // will sometimes throw exceptions
            try {
                detector.enableInputFilter(filter);

                byte[] bytes;
                int offset;
                if (dataBuffer.hasArray()) {
                    // Heap buffer: read the backing array in place.
                    bytes = dataBuffer.array();
                    offset = dataBuffer.arrayOffset() + dataBuffer.position();
                } else {
                    // Direct or read-only buffer: array() would throw, so copy the
                    // remaining bytes through a duplicate to leave the position untouched.
                    bytes = new byte[length];
                    dataBuffer.duplicate().get(bytes);
                    offset = 0;
                }

                detector.setText(new ByteArrayInputStream(bytes, offset, length));
                matches = detector.detectAll();
            } catch (Exception e) {
                LOG.debug("Exception from ICU4J (ignoring): ", e);
            }

            if (matches != null) {
                for (CharsetMatch match : matches) {
                    addClue(match.getName(), "detect", match.getConfidence());
                }
            }
        }

        // add character encoding coming from HTTP response header
        addClue(encoding, "header");
    }

    /**
     * Records a clue after normalizing its charset name. Null or empty values,
     * and names that cannot be resolved to a charset, are dropped silently.
     *
     * @param value      charset name suggested by the clue
     * @param source     label describing where the clue came from
     * @param confidence confidence of the clue, or {@link #NO_THRESHOLD}
     */
    protected void addClue(String value, String source, int confidence) {
        final boolean hasValue = value != null && !value.isEmpty();
        if (hasValue) {
            final String resolved = resolveEncodingAlias(value);
            if (resolved != null) {
                clues.add(new EncodingClue(resolved, source, confidence));
            }
        }
    }

    /**
     * Records a clue that carries no confidence value.
     *
     * @param value  charset name suggested by the clue
     * @param source label describing where the clue came from
     */
    public void addClue(String value, String source) {
        final int confidence = NO_THRESHOLD;
        addClue(value, source, confidence);
    }

    /**
     * Picks an encoding from the clues gathered so far for the given page.
     *
     * @param page         the page the clues belong to; its location is passed
     *                     on to the guessing step
     * @param defaultValue Default encoding to return if no encoding can be detected with
     *                     enough confidence. Note that this will not be normalized
     *                     with {@link EncodingDetector#resolveEncodingAlias}
     * @return Guessed encoding or defaultValue
     */
    public String guessEncoding(WebPage page, String defaultValue) {
        final String location = page.getLocation();
        return guessEncoding(location, defaultValue);
    }

    /**
     * Guess the encoding with the previously specified list of clues.
     *
     * <p>The clues are visited in insertion order. The first clue whose
     * confidence meets the configured threshold wins immediately. Failing that,
     * the first clue without a confidence value is used; if there is none
     * either, the default is returned.
     *
     * @param baseUrl      Base URL, only used in trace output
     * @param defaultValue Default encoding to return if no encoding can be detected with
     *                     enough confidence. Note that this will not be normalized
     *                     with {@link EncodingDetector#resolveEncodingAlias}; it is
     *                     only lower-cased. May be null.
     * @return Guessed encoding, or the lower-cased defaultValue (null if defaultValue is null)
     */
    private String guessEncoding(String baseUrl, String defaultValue) {
        /*
         * This algorithm could be replaced by something more sophisticated; ideally
         * we would gather a bunch of data on where various clues (autodetect, HTTP
         * headers, HTML meta tags, etc.) disagree, tag each with the correct
         * answer, and use machine learning/some statistical method to generate a
         * better heuristic.
         */
        if (LOG.isTraceEnabled()) {
            findDisagreements(baseUrl, clues);
        }

        /*
         * Go down the list of encoding "clues". Use a clue if:
         * 1. Has a confidence value which meets our confidence threshold, OR
         * 2. Doesn't meet the threshold, but is the best try, since nothing else is available.
         */
        EncodingClue bestClue = null;

        int i = 0;
        for (EncodingClue clue : clues) {
            if (LOG.isTraceEnabled()) {
                LOG.trace(++i + ".\tcharset " + clue);
            }

            if (minConfidence >= 0 && clue.confidence >= minConfidence) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Choosing encoding: " + clue.value + " with confidence " + clue.confidence);
                }
                return resolveEncodingAlias(clue.value);
            }

            // Only the first clue without a confidence value is remembered.
            if (bestClue == null && clue.confidence == NO_THRESHOLD) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Choose as best clue " + clue);
                }
                bestClue = clue;
            }
        }

        if (bestClue != null) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Best clue: " + bestClue);
            }
            // Clue values are lower-cased when the clue is created.
            return bestClue.value;
        }

        // Nothing usable: fall back to the default, tolerating a null default.
        String fallback = defaultValue == null ? null : defaultValue.toLowerCase(Locale.ROOT);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Best clue: " + fallback + " (default)");
        }
        return fallback;
    }

    /**
     * Discards every clue collected so far.
     */
    public void clearClues() {
        this.clues.clear();
    }

    /**
     * Strictly for analysis, look for "disagreements." The top guess from each
     * source is examined; if these meet the threshold and disagree, then we log
     * the information -- useful for testing or generating training data for a
     * better heuristic.
     *
     * @param url      URL the clues belong to, only used in the log line
     * @param newClues clues to examine, in priority order
     */
    private void findDisagreements(String url, List<EncodingClue> newClues) {
        HashSet<String> valsSeen = new HashSet<>();
        HashSet<String> sourcesSeen = new HashSet<>();
        boolean disagreement = false;

        for (EncodingClue clue : newClues) {
            // Only the first (top) clue of each source takes part in the comparison.
            if (clue.isEmpty() || sourcesSeen.contains(clue.source)) {
                continue;
            }

            if (clue.meetsThreshold()) {
                if (!valsSeen.isEmpty() && !valsSeen.contains(clue.value)) {
                    disagreement = true;
                }
                valsSeen.add(clue.value);
            }

            sourcesSeen.add(clue.source);
        }

        if (disagreement) {
            // dump all values in case of disagreement
            LOG.trace("Disagreement: " + url + "; " + StringUtils.join(newClues, ", "));
        }
    }

    /**
     * A single hint about the encoding of a document: the suggested charset,
     * where the hint came from, and optionally how confident the source is.
     * It is an inner class because {@link #meetsThreshold()} reads the
     * enclosing detector's confidence threshold.
     */
    public class EncodingClue {
        private final String value;
        private final String source;
        private final int confidence;

        /**
         * Constructor for clues with no confidence values (ignore thresholds).
         *
         * @param value  suggested charset name
         * @param source label describing where the clue came from
         */
        public EncodingClue(String value, String source) {
            this(value, source, NO_THRESHOLD);
        }

        /**
         * @param value      suggested charset name; stored lower-cased, may be null
         * @param source     label describing where the clue came from
         * @param confidence confidence of the clue, or {@link #NO_THRESHOLD}
         */
        public EncodingClue(String value, String source, int confidence) {
            // Locale.ROOT keeps charset names stable regardless of the default locale;
            // null is tolerated so that isEmpty() can report it.
            this.value = value == null ? null : value.toLowerCase(Locale.ROOT);
            this.source = source;
            this.confidence = confidence;
        }

        /** @return label describing where the clue came from */
        public String getSource() {
            return source;
        }

        /** @return the lower-cased charset name of this clue */
        public String getValue() {
            return value;
        }

        /**
         * @return {@code value (source)}, or {@code value (source, N% confidence)}
         *         when the clue carries a confidence value
         */
        @Override
        public String toString() {
            return value + " (" + source
                    + ((confidence >= 0) ? ", " + confidence + "% confidence" : "") + ")";
        }

        /** @return true if the clue has no charset name */
        public boolean isEmpty() {
            return value == null || value.isEmpty();
        }

        /**
         * @return true if the clue carries no confidence value, or if a
         *         threshold is configured and the clue's confidence reaches it
         */
        public boolean meetsThreshold() {
            return (confidence < 0 || (minConfidence >= 0 && confidence >= minConfidence));
        }
    }
}