All downloads are free. The search and download functionality uses the official Maven repository.

nu.validator.xml.LanguageDetectingXMLReaderWrapper Maven / Gradle / Ivy

Go to download

An HTML-checking library (used by https://html5.validator.nu and the HTML5 facet of the W3C Validator)

There is a newer version: 20.7.2
Show newest version
/*
 * Copyright (c) 2016-2017 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

package nu.validator.xml;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.List;

import javax.servlet.http.HttpServletRequest;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.cybozu.labs.langdetect.Language;
import com.ibm.icu.util.ULocale;
import io.mola.galimatias.URL;
import io.mola.galimatias.Host;
import io.mola.galimatias.GalimatiasParseException;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.LocatorImpl;

import org.apache.log4j.Logger;

public final class LanguageDetectingXMLReaderWrapper
        implements XMLReader, ContentHandler {

    /** Logger used by this class (for example, to record detections of less-common languages). */
    private static final Logger log4j = Logger.getLogger(
            LanguageDetectingXMLReaderWrapper.class);

    /**
     * Classpath resource listing the available language profiles, one per
     * line; the text before the first tab on each line is the language tag.
     */
    private static final String languageList = "nu/validator/localentities/files/"
            + "language-profiles-list.txt";

    /**
     * Classpath directory holding one profile resource per language tag
     * (resource name = this prefix + the language tag).
     */
    private static final String profilesDir = "nu/validator/localentities/files/"
            + "language-profiles/";

    /** Profile contents (one string per language) handed to the detector factory. */
    private static List<String> profiles = new ArrayList<>();

    /** Language tags read from {@link #languageList}. */
    private static List<String> languageTags = new ArrayList<>();

    /**
     * Maps a top-level domain (without a leading dot) to the language codes
     * that are accepted without running detection for documents served from
     * that domain. Each value array must be sorted, because it is searched
     * with {@link Arrays#binarySearch(Object[], Object)}.
     */
    private static final Map<String, String[]> LANG_TAGS_BY_TLD = new HashMap<>();

    static {
      LANG_TAGS_BY_TLD.put("ae", new String[] { "ar" });
      LANG_TAGS_BY_TLD.put("af", new String[] { "ps" });
      LANG_TAGS_BY_TLD.put("am", new String[] { "hy" });
      LANG_TAGS_BY_TLD.put("ar", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("at", new String[] { "de" });
      LANG_TAGS_BY_TLD.put("az", new String[] { "az" });
      LANG_TAGS_BY_TLD.put("ba", new String[] { "bs", "hr", "sr" });
      LANG_TAGS_BY_TLD.put("bd", new String[] { "bn" });
      LANG_TAGS_BY_TLD.put("be", new String[] { "de", "fr", "nl" });
      LANG_TAGS_BY_TLD.put("bg", new String[] { "bg" });
      LANG_TAGS_BY_TLD.put("bh", new String[] { "ar" });
      LANG_TAGS_BY_TLD.put("bo", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("br", new String[] { "pt" });
      LANG_TAGS_BY_TLD.put("by", new String[] { "be" });
      LANG_TAGS_BY_TLD.put("bz", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("ch", new String[] { "de", "fr", "it", "rm" });
      LANG_TAGS_BY_TLD.put("cl", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("co", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("cu", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("cr", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("cz", new String[] { "cs" });
      LANG_TAGS_BY_TLD.put("de", new String[] { "de" });
      LANG_TAGS_BY_TLD.put("dk", new String[] { "da" });
      LANG_TAGS_BY_TLD.put("do", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("ec", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("ee", new String[] { "et" });
      LANG_TAGS_BY_TLD.put("eg", new String[] { "ar" });
      LANG_TAGS_BY_TLD.put("es", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("fi", new String[] { "fi" });
      LANG_TAGS_BY_TLD.put("fr", new String[] { "fr" });
      LANG_TAGS_BY_TLD.put("ge", new String[] { "ka" });
      LANG_TAGS_BY_TLD.put("gr", new String[] { "el" });
      LANG_TAGS_BY_TLD.put("gt", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("hn", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("hr", new String[] { "hr" });
      LANG_TAGS_BY_TLD.put("hu", new String[] { "hu" });
      LANG_TAGS_BY_TLD.put("id", new String[] { "id" });
      LANG_TAGS_BY_TLD.put("is", new String[] { "is" });
      LANG_TAGS_BY_TLD.put("it", new String[] { "it" });
      LANG_TAGS_BY_TLD.put("il", new String[] { "iw" });
      LANG_TAGS_BY_TLD.put("in", new String[] { "bn", "gu", "hi", "kn", "ml", "mr", "pa", "ta", "te" });
      LANG_TAGS_BY_TLD.put("ja", new String[] { "jp" });
      LANG_TAGS_BY_TLD.put("jo", new String[] { "ar" });
      LANG_TAGS_BY_TLD.put("ke", new String[] { "sw" });
      LANG_TAGS_BY_TLD.put("kg", new String[] { "ky" });
      LANG_TAGS_BY_TLD.put("kh", new String[] { "km" });
      LANG_TAGS_BY_TLD.put("kr", new String[] { "ko" });
      LANG_TAGS_BY_TLD.put("kw", new String[] { "ar" });
      LANG_TAGS_BY_TLD.put("kz", new String[] { "kk" });
      LANG_TAGS_BY_TLD.put("la", new String[] { "lo" });
      LANG_TAGS_BY_TLD.put("li", new String[] { "de" });
      LANG_TAGS_BY_TLD.put("lb", new String[] { "ar" });
      LANG_TAGS_BY_TLD.put("lk", new String[] { "si", "ta" });
      LANG_TAGS_BY_TLD.put("lt", new String[] { "lt" });
      LANG_TAGS_BY_TLD.put("lu", new String[] { "de" });
      LANG_TAGS_BY_TLD.put("lv", new String[] { "lv" });
      LANG_TAGS_BY_TLD.put("md", new String[] { "mo" });
      LANG_TAGS_BY_TLD.put("mk", new String[] { "mk" });
      LANG_TAGS_BY_TLD.put("mn", new String[] { "mn" });
      LANG_TAGS_BY_TLD.put("mx", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("my", new String[] { "ms" });
      LANG_TAGS_BY_TLD.put("ni", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("nl", new String[] { "nl" });
      LANG_TAGS_BY_TLD.put("no", new String[] { "nn", "no" });
      LANG_TAGS_BY_TLD.put("np", new String[] { "ne" });
      LANG_TAGS_BY_TLD.put("pa", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("pe", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("ph", new String[] { "tl" });
      LANG_TAGS_BY_TLD.put("pl", new String[] { "pl" });
      LANG_TAGS_BY_TLD.put("pk", new String[] { "ur" });
      LANG_TAGS_BY_TLD.put("pr", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("pt", new String[] { "pt" });
      LANG_TAGS_BY_TLD.put("py", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("qa", new String[] { "ar" });
      LANG_TAGS_BY_TLD.put("ro", new String[] { "ro" });
      LANG_TAGS_BY_TLD.put("rs", new String[] { "sr" });
      LANG_TAGS_BY_TLD.put("ru", new String[] { "ru" });
      LANG_TAGS_BY_TLD.put("sa", new String[] { "ar" });
      LANG_TAGS_BY_TLD.put("se", new String[] { "sv" });
      LANG_TAGS_BY_TLD.put("si", new String[] { "sl" });
      LANG_TAGS_BY_TLD.put("sk", new String[] { "sk" });
      LANG_TAGS_BY_TLD.put("sv", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("th", new String[] { "th" });
      LANG_TAGS_BY_TLD.put("tj", new String[] { "tg" });
      LANG_TAGS_BY_TLD.put("tm", new String[] { "tk" });
      LANG_TAGS_BY_TLD.put("ua", new String[] { "uk" });
      LANG_TAGS_BY_TLD.put("uy", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("uz", new String[] { "uz" });
      LANG_TAGS_BY_TLD.put("ve", new String[] { "es" });
      LANG_TAGS_BY_TLD.put("vn", new String[] { "vi" });
      LANG_TAGS_BY_TLD.put("za", new String[] { "af" });
    }

    /**
     * Loads the language profiles from the classpath and (re)registers them
     * with the language-detector factory.
     *
     * <p>The language list resource is read line by line; the text before the
     * first tab on each line is taken as a language tag, and the first line of
     * the matching profile resource is taken as that language's profile. All
     * readers are closed, resources are decoded as UTF-8, and the static lists
     * are replaced rather than appended to, so the method can be called more
     * than once.
     *
     * @throws LangDetectException if the detector factory rejects a profile
     * @throws RuntimeException wrapping the {@link IOException} if a resource
     *         is missing, empty, or unreadable
     */
    public static void initialize() throws LangDetectException {
        ClassLoader loader = LanguageDetectingXMLReaderWrapper.class.getClassLoader();
        List<String> loadedTags = new ArrayList<>();
        List<String> loadedProfiles = new ArrayList<>();
        try {
            try (BufferedReader listReader = openClasspathResourceAsUtf8Reader(
                    loader, languageList)) {
                String languageTagAndName;
                while ((languageTagAndName = listReader.readLine()) != null) {
                    String languageTag = languageTagAndName.split("\t")[0];
                    // A blank line carries no tag and must not be resolved
                    // as a profile resource.
                    if (!languageTag.isEmpty()) {
                        loadedTags.add(languageTag);
                    }
                }
            }
            for (String languageTag : loadedTags) {
                String profilePath = profilesDir + languageTag;
                try (BufferedReader profileReader = openClasspathResourceAsUtf8Reader(
                        loader, profilePath)) {
                    String profile = profileReader.readLine();
                    if (profile == null) {
                        throw new IOException(
                                "Empty language profile: " + profilePath);
                    }
                    loadedProfiles.add(profile);
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        // Replace (not append to) the shared lists so a repeated call does not
        // register every language twice.
        languageTags.clear();
        languageTags.addAll(loadedTags);
        profiles.clear();
        profiles.addAll(loadedProfiles);
        DetectorFactory.clear();
        DetectorFactory.loadProfile(loadedProfiles);
    }

    /**
     * Opens a classpath resource as a buffered UTF-8 reader.
     *
     * @throws IOException if the resource does not exist
     */
    private static BufferedReader openClasspathResourceAsUtf8Reader(
            ClassLoader loader, String path) throws IOException {
        java.io.InputStream in = loader.getResourceAsStream(path);
        if (in == null) {
            throw new IOException("Missing classpath resource: " + path);
        }
        return new BufferedReader(new InputStreamReader(in,
                java.nio.charset.StandardCharsets.UTF_8));
    }

    // The reader being wrapped; this object installs itself as that reader's
    // content handler in the constructor.
    private final XMLReader wrappedReader;

    // Downstream handler to which every SAX event is forwarded. Initialised
    // from the wrapped reader's handler; all callbacks are no-ops when null.
    private ContentHandler contentHandler;

    // Receives the "consider adding a lang attribute" warning; may be null.
    private ErrorHandler errorHandler;

    // Request on which detection results are published as attributes; may be
    // null, in which case nothing is published.
    private HttpServletRequest request;

    // Address of the document being checked; used to derive the TLD and in
    // log output. May be null.
    private String systemId;

    // Text after the last "." of the host of systemId, or "" when unavailable.
    private String tld;

    // Locator supplied through setDocumentLocator(), if any.
    private Locator locator = null;

    // Snapshot of the locator taken at the "html" start tag.
    private Locator htmlStartTagLocator;

    // Character data collected since the last flush in endElement().
    private StringBuilder elementContent;

    // Accumulated text that is handed to the language detector.
    private StringBuilder documentContent;

    // Value of the HTTP Content-Language header, as passed to the constructor.
    private String httpContentLangHeader;

    // Value of the "lang" attribute found on the "html" element ("" if none).
    private String htmlElementLangAttrValue;

    // Language part of htmlElementLangAttrValue as reported by ULocale.
    private String declaredLangCode;

    // True once a "lang" attribute has been seen on the "html" element.
    private boolean htmlElementHasLang;

    // Value of the "dir" attribute found on the "html" element ("" if none).
    private String dirAttrValue;

    // True once a "dir" attribute has been seen on the "html" element.
    private boolean hasDir;

    // True between the "body" start tag and the "body" end tag.
    private boolean inBody;

    // Nesting depth inside an element whose "lang" differs from the one on
    // the "html" element; 0 when not inside such an element.
    private int currentOpenElementsInDifferentLang;

    // True once the style-in-body request attribute has been set.
    private boolean loggedStyleInBody;

    // Whether character data is currently being collected for detection.
    private boolean collectingCharacters;

    // Number of collected characters other than space, tab, CR and LF.
    private int nonWhitespaceCharacterCount;

    // Collection stops once this many non-whitespace characters were counted.
    private static final int MAX_CHARS = 30720;

    // Detection is skipped when fewer non-whitespace characters were counted.
    private static final int MIN_CHARS = 1024;

    // A candidate language is accepted only when its probability exceeds this.
    private static final double MIN_PROBABILITY = .90;

    // NOTE(review): presumably the right-to-left languages consulted by the
    // "dir" check; its use is not visible in this part of the file — confirm.
    private static final String[] RTL_LANGS = { "ar", "azb", "ckb", "dv", "fa",
            "he", "pnb", "ps", "sd", "ug", "ur" };

    // Candidates whose language is NOT in this list are logged. Must stay
    // sorted: it is searched with Arrays.binarySearch.
    private static final String[] COMMON_LANGS = { "ar", "ca", "cs", "da",
            "de", "el", "en", "es", "et", "fa", "fi", "fr", "he", "hi", "hu",
            "id", "it", "ja", "ka", "ko", "lt", "lv", "ms", "nl", "no", "pl",
            "pt", "ro", "ru", "sk", "sq", "sv", "th", "tr", "uk", "vi",
            "zh-hans", "zh-hant" };

    public LanguageDetectingXMLReaderWrapper(XMLReader wrappedReader,
            HttpServletRequest request, ErrorHandler errorHandler,
            String httpContentLangHeader, String systemId) {
        this.wrappedReader = wrappedReader;
        this.contentHandler = wrappedReader.getContentHandler();
        this.errorHandler = errorHandler;
        this.request = request;
        this.systemId = systemId;
        this.tld = "";
        this.htmlStartTagLocator = null;
        this.inBody = false;
        this.currentOpenElementsInDifferentLang = 0;
        this.loggedStyleInBody = false;
        this.collectingCharacters = false;
        this.nonWhitespaceCharacterCount = 0;
        this.elementContent = new StringBuilder();
        this.documentContent = new StringBuilder();
        this.httpContentLangHeader = httpContentLangHeader;
        this.htmlElementHasLang = false;
        this.htmlElementLangAttrValue = "";
        this.declaredLangCode = "";
        this.hasDir = false;
        this.dirAttrValue = "";
        try {
            if (systemId != null && systemId.startsWith("http")) {
                Host hostname = URL.parse(systemId).host();
                if (hostname != null) {
                    String host = hostname.toString();
                    this.tld = host.substring(host.lastIndexOf(".") + 1);
                }
            }
        } catch (GalimatiasParseException e) {
            throw new RuntimeException(e);
        }
        wrappedReader.setContentHandler(this);
    }

    /**
     * Forwards character data downstream and, while collection is active and
     * the cap has not been reached, buffers it for language detection.
     * Characters other than space, tab, CR and LF are counted towards the cap.
     *
     * @see org.xml.sax.ContentHandler#characters(char[], int, int)
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        if (contentHandler == null) {
            return;
        }
        if (collectingCharacters && nonWhitespaceCharacterCount < MAX_CHARS) {
            final int end = start + length;
            for (int pos = start; pos < end; pos++) {
                final char c = ch[pos];
                final boolean whitespace = c == ' ' || c == '\t'
                        || c == '\r' || c == '\n';
                if (!whitespace) {
                    nonWhitespaceCharacterCount++;
                }
            }
            elementContent.append(ch, start, length);
        }
        contentHandler.characters(ch, start, length);
    }

    /**
     * Handles the end of an element: flushes the text buffered for it into
     * the document text, updates the collection state, and forwards the event.
     *
     * <p>Collection state rules:
     * <ul>
     * <li>closing {@code body} stops collection;</li>
     * <li>while inside an element whose language differs from the document's,
     * each close reduces the nesting depth, and collection resumes when the
     * depth reaches zero;</li>
     * <li>otherwise, closing one of the elements for which collection is
     * suspended on open (including {@code textarea}) resumes collection.</li>
     * </ul>
     *
     * @see org.xml.sax.ContentHandler#endElement(java.lang.String,
     *      java.lang.String, java.lang.String)
     */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (contentHandler == null) {
            return;
        }
        if (nonWhitespaceCharacterCount < MAX_CHARS) {
            documentContent.append(elementContent);
            elementContent.setLength(0);
        }
        if ("body".equals(localName)) {
            inBody = false;
            collectingCharacters = false;
        }
        if (currentOpenElementsInDifferentLang > 0) {
            currentOpenElementsInDifferentLang--;
            if (currentOpenElementsInDifferentLang == 0) {
                collectingCharacters = true;
            }
        } else {
            // This list must mirror the elements that suspend collection in
            // startElement; "textarea" was missing, which left collection
            // switched off after every textarea.
            if (inBody && ("script".equals(localName) //
                    || "style".equals(localName) //
                    || "pre".equals(localName) //
                    || "a".equals(localName) //
                    || "td".equals(localName) //
                    || "select".equals(localName) //
                    || "ul".equals(localName) //
                    || "nav".equals(localName) //
                    || "textarea".equals(localName) //
                    || "form".equals(localName))) {
                collectingCharacters = true;
            }
        }
        contentHandler.endElement(uri, localName, qName);
    }

    /**
     * Clears the accumulated document text and forwards the event downstream.
     * Does nothing when there is no downstream handler.
     *
     * @see org.xml.sax.ContentHandler#startDocument()
     */
    @Override
    public void startDocument() throws SAXException {
        if (contentHandler != null) {
            documentContent.setLength(0);
            contentHandler.startDocument();
        }
    }

    /**
     * Handles the start of an element, then forwards the event downstream.
     *
     * <ul>
     * <li>{@code html}: remembers the start-tag location and records the
     * {@code lang} and {@code dir} attributes.</li>
     * <li>{@code link}: flags an {@code apple-touch-icon} link that also has
     * a {@code sizes} attribute.</li>
     * <li>first {@code style} inside {@code body}: flags it.</li>
     * <li>{@code body}: starts text collection.</li>
     * <li>any other element inside {@code body}: if its {@code lang} differs
     * from the (non-empty) one on {@code html}, collection is suspended until
     * that element closes.</li>
     * </ul>
     * Finally, collection is suspended for a fixed set of elements
     * ({@code script}, {@code style}, {@code pre}, {@code a}, {@code td},
     * {@code select}, {@code ul}, {@code nav}, {@code textarea},
     * {@code form}).
     *
     * @see org.xml.sax.ContentHandler#startElement(java.lang.String,
     *      java.lang.String, java.lang.String, org.xml.sax.Attributes)
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes atts) throws SAXException {
        if (contentHandler == null) {
            return;
        }
        // endElement decrements the depth for EVERY element closed inside a
        // different-language subtree, so every element opened there must be
        // counted, whichever branch below handles it.
        final boolean insideDifferentLang = inBody
                && currentOpenElementsInDifferentLang > 0;
        if (insideDifferentLang) {
            currentOpenElementsInDifferentLang++;
        }
        if ("html".equals(localName)) {
            // A document locator is optional in SAX; LocatorImpl's copy
            // constructor cannot take null.
            htmlStartTagLocator = (locator == null) ? null
                    : new LocatorImpl(locator);
            for (int i = 0; i < atts.getLength(); i++) {
                if ("lang".equals(atts.getLocalName(i))) {
                    if (request != null) {
                        request.setAttribute(
                                "http://validator.nu/properties/lang-found",
                                true);
                    }
                    htmlElementHasLang = true;
                    htmlElementLangAttrValue = atts.getValue(i);
                    declaredLangCode = new ULocale(
                            htmlElementLangAttrValue).getLanguage();
                } else if ("dir".equals(atts.getLocalName(i))) {
                    hasDir = true;
                    dirAttrValue = atts.getValue(i);
                }
            }
        } else if ("link".equals(localName)) {
            boolean hasAppleTouchIcon = false;
            boolean hasSizes = false;
            for (int i = 0; i < atts.getLength(); i++) {
                if ("rel".equals(atts.getLocalName(i))) {
                    if (atts.getValue(i).contains("apple-touch-icon")) {
                        hasAppleTouchIcon = true;
                    }
                } else if ("sizes".equals(atts.getLocalName(i))) {
                    hasSizes = true;
                }
            }
            if (request != null && hasAppleTouchIcon && hasSizes) {
                request.setAttribute(
                        "http://validator.nu/properties/apple-touch-icon-with-sizes-found",
                        true);
            }
        } else if (inBody && "style".equals(localName) && !loggedStyleInBody) {
            loggedStyleInBody = true;
            if (request != null) {
                request.setAttribute(
                        "http://validator.nu/properties/style-in-body-found",
                        true);
            }
        } else if ("body".equals(localName)) {
            inBody = true;
            collectingCharacters = true;
        } else if (inBody && !insideDifferentLang) {
            for (int i = 0; i < atts.getLength(); i++) {
                if ("lang".equals(atts.getLocalName(i))
                        && !"".equals(htmlElementLangAttrValue)
                        && !htmlElementLangAttrValue.equals(
                                atts.getValue(i))) {
                    currentOpenElementsInDifferentLang++;
                    collectingCharacters = false;
                    // One element opens exactly one level, even when it has
                    // several attributes with the local name "lang" (for
                    // example both lang and xml:lang).
                    break;
                }
            }
        }
        if ("script".equals(localName) //
                || "style".equals(localName) //
                || "pre".equals(localName) //
                || "a".equals(localName) //
                || "td".equals(localName) //
                || "select".equals(localName) //
                || "ul".equals(localName) //
                || "nav".equals(localName) //
                || "textarea".equals(localName) //
                || "form".equals(localName)) {
            collectingCharacters = false;
        }
        contentHandler.startElement(uri, localName, qName, atts);
    }

    /**
     * Remembers the document locator and passes it on to the downstream
     * handler. Ignored when there is no downstream handler.
     *
     * @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator)
     */
    @Override
    public void setDocumentLocator(Locator locator) {
        if (contentHandler != null) {
            this.locator = locator;
            contentHandler.setDocumentLocator(locator);
        }
    }

    /**
     * Returns the downstream handler that this wrapper forwards events to.
     *
     * @return the downstream content handler, possibly null
     */
    @Override
    public ContentHandler getContentHandler() {
        return this.contentHandler;
    }

    /**
     * Runs the language check on the collected text and then forwards the
     * end-of-document event. Does nothing when there is no downstream handler.
     *
     * @throws SAXException if the check or the downstream handler fails
     * @see org.xml.sax.ContentHandler#endDocument()
     */
    @Override
    public void endDocument() throws SAXException {
        if (contentHandler != null) {
            detectLanguageAndCheckAgainstDeclaredLanguage();
            contentHandler.endDocument();
        }
    }

    /**
     * Detects the language of the collected document text and compares it
     * with what the document declares.
     *
     * <p>Detection is skipped when too little text was collected, when the
     * declared language is {@code zxx}, {@code eo} or {@code la}, or when the
     * declared language is one of those accepted for the document's top-level
     * domain. Otherwise each candidate reported by the detector is examined:
     * one whose probability exceeds {@code MIN_PROBABILITY} becomes the
     * detected language; when at least two of Croatian, Latin-script Serbian
     * and Bosnian are among the candidates, the Serbo-Croatian handling is
     * used instead. If nothing is detected and the {@code html} element has no
     * {@code lang}, a warning suggesting one is emitted. Otherwise the
     * {@code lang}, {@code dir} and Content-Language checks are run.
     *
     * <p>This method does not forward {@code endDocument} itself; the caller
     * does that exactly once after this method returns.
     *
     * @throws SAXException if an error handler or one of the checks fails
     */
    private void detectLanguageAndCheckAgainstDeclaredLanguage()
            throws SAXException {
        if (nonWhitespaceCharacterCount < MIN_CHARS) {
            return;
        }
        if ("zxx".equals(declaredLangCode) // "No Linguistic Content"
                || "eo".equals(declaredLangCode) // Esperanto
                || "la".equals(declaredLangCode) // Latin
        ) {
            return;
        }
        // Single lookup instead of containsKey + get; the arrays are sorted,
        // which binarySearch requires.
        String[] tldLanguageTags = (String[]) LANG_TAGS_BY_TLD.get(tld);
        if (tldLanguageTags != null && Arrays.binarySearch(tldLanguageTags,
                declaredLangCode) >= 0) {
            return;
        }
        try {
            String textContent = documentContent.toString() //
                    .replaceAll("\\s+", " ");
            String detectedLanguage = "";
            Detector detector = DetectorFactory.create();
            detector.append(textContent);
            List<String> possibleLanguages = new ArrayList<>();
            List<Language> possibilities = detector.getProbabilities();
            for (Language possibility : possibilities) {
                possibleLanguages.add(possibility.lang);
                if (Arrays.binarySearch(COMMON_LANGS, possibility.lang) < 0
                        && systemId != null) {
                    ULocale plocale = new ULocale(possibility.lang);
                    log4j.info(
                            String.format("%s %s %s", plocale.getDisplayName(),
                                    possibility.prob, systemId));
                }
                if (possibility.prob > MIN_PROBABILITY) {
                    detectedLanguage = possibility.lang;
                    setDocumentLanguage(detectedLanguage);
                } else {
                    boolean hr = possibleLanguages.contains("hr");
                    boolean srLatn = possibleLanguages.contains("sr-latn");
                    boolean bs = possibleLanguages.contains("bs");
                    // At least two of the three closely related languages
                    // are among the candidates seen so far.
                    if ((hr && (srLatn || bs)) || (srLatn && bs)) {
                        if (htmlElementHasLang || systemId != null) {
                            detectedLanguage = getDetectedLanguageSerboCroatian();
                            setDocumentLanguage(detectedLanguage);
                        }
                        if ("sh".equals(detectedLanguage)) {
                            checkLangAttributeSerboCroatian();
                            return;
                        }
                    }
                }
            }
            if ("".equals(detectedLanguage)) {
                if (!htmlElementHasLang && errorHandler != null) {
                    String message = "Consider adding a \u201Clang\u201D"
                            + " attribute to the \u201Chtml\u201D"
                            + " start tag to declare the language"
                            + " of this document.";
                    SAXParseException spe = new SAXParseException(message,
                            htmlStartTagLocator);
                    errorHandler.warning(spe);
                }
                // Do not forward endDocument here: endDocument() forwards it
                // after this method returns, and forwarding it here as well
                // delivered the event downstream twice.
                return;
            }
            String detectedLanguageName = "";
            String preferredLanguageCode = "";
            ULocale locale = new ULocale(detectedLanguage);
            String detectedLanguageCode = locale.getLanguage();
            if ("no".equals(detectedLanguage)) {
                checkLangAttributeNorwegian();
                checkContentLanguageHeaderNorwegian(detectedLanguage,
                        detectedLanguageName, detectedLanguageCode);
                return;
            }
            if ("zh-hans".equals(detectedLanguage)) {
                detectedLanguageName = "Simplified Chinese";
                preferredLanguageCode = "zh-hans";
            } else if ("zh-hant".equals(detectedLanguage)) {
                detectedLanguageName = "Traditional Chinese";
                preferredLanguageCode = "zh-hant";
            } else if ("mhr".equals(detectedLanguage)) {
                detectedLanguageName = "Meadow Mari";
                preferredLanguageCode = "mhr";
            } else if ("mrj".equals(detectedLanguage)) {
                detectedLanguageName = "Hill Mari";
                preferredLanguageCode = "mrj";
            } else if ("nah".equals(detectedLanguage)) {
                detectedLanguageName = "Nahuatl";
                preferredLanguageCode = "nah";
            } else if ("pnb".equals(detectedLanguage)) {
                detectedLanguageName = "Western Panjabi";
                preferredLanguageCode = "pnb";
            } else if ("sr-cyrl".equals(detectedLanguage)) {
                detectedLanguageName = "Serbian";
                preferredLanguageCode = "sr";
            } else if ("sr-latn".equals(detectedLanguage)) {
                detectedLanguageName = "Serbian";
                preferredLanguageCode = "sr";
            } else if ("uz-cyrl".equals(detectedLanguage)) {
                detectedLanguageName = "Uzbek";
                preferredLanguageCode = "uz";
            } else if ("uz-latn".equals(detectedLanguage)) {
                detectedLanguageName = "Uzbek";
                preferredLanguageCode = "uz";
            } else if ("zxx".equals(detectedLanguage)) {
                detectedLanguageName = "Lorem ipsum text";
                preferredLanguageCode = "zxx";
            } else {
                detectedLanguageName = locale.getDisplayName();
                preferredLanguageCode = detectedLanguageCode;
            }
            checkLangAttribute(detectedLanguage, detectedLanguageName,
                    detectedLanguageCode, preferredLanguageCode);
            checkDirAttribute(detectedLanguage, detectedLanguageName,
                    detectedLanguageCode, preferredLanguageCode);
            checkContentLanguageHeader(detectedLanguage, detectedLanguageName,
                    detectedLanguageCode, preferredLanguageCode);
        } catch (LangDetectException e) {
            // Detection is best-effort: a detector failure must not fail the
            // validation run, but it should not vanish without a trace.
            log4j.debug("Language detection failed; skipping language checks",
                    e);
        }
    }

    /**
     * Publishes the detected document language on the request, if there is
     * one.
     *
     * @param languageTag the detected language tag
     */
    private void setDocumentLanguage(String languageTag) {
        if (request == null) {
            return;
        }
        final String propertyName = "http://validator.nu/properties/document-language";
        request.setAttribute(propertyName, languageTag);
    }

    /**
     * Chooses between Croatian, Serbian and Bosnian for text that the
     * detector could not tell apart, using the declared language first and
     * the top-level domain second.
     *
     * @return {@code "hr"}, {@code "sr-latn"} or {@code "bs"} when the
     *         declared language or the TLD points to one of them; otherwise
     *         {@code "sh"}
     */
    private String getDetectedLanguageSerboCroatian() throws SAXException {
        // tld holds the text after the host's last dot, so it never contains
        // a leading "."; comparing against ".rs" / ".ba" could never match.
        if ("hr".equals(declaredLangCode) || "hr".equals(tld)) {
            return "hr";
        }
        if ("sr".equals(declaredLangCode) || "rs".equals(tld)) {
            return "sr-latn";
        }
        if ("bs".equals(declaredLangCode) || "ba".equals(tld)) {
            return "bs";
        }
        return "sh";
    }

    /**
     * Warns when a document that looks Croatian, Serbian or Bosnian either
     * has no {@code lang} on its {@code html} element or declares a language
     * other than {@code hr}, {@code sr} or {@code bs}. Emits nothing when one
     * of those three is declared.
     */
    private void checkLangAttributeSerboCroatian() throws SAXException {
        final String declaredValue = htmlElementLangAttrValue.toLowerCase();
        final String message;
        if (!htmlElementHasLang) {
            message = "This document appears to be written in either"
                    + " Croatian, Serbian, or Bosnian. Consider adding either"
                    + " \u201Clang=\"hr\"\u201D, \u201Clang=\"sr\"\u201D, or"
                    + " \u201Clang=\"bs\"\u201D to the"
                    + " \u201Chtml\u201D start tag.";
        } else {
            final boolean declaresOneOfTheThree = "hr".equals(declaredLangCode)
                    || "sr".equals(declaredLangCode)
                    || "bs".equals(declaredLangCode);
            if (declaresOneOfTheThree) {
                // The declaration is compatible with the detection.
                return;
            }
            message = String.format(
                    "This document appears to be written in either Croatian,"
                            + " Serbian, or Bosnian, but the \u201Chtml\u201D"
                            + " start tag has %s. Consider using either"
                            + " \u201Clang=\"hr\"\u201D,"
                            + " \u201Clang=\"sr\"\u201D, or"
                            + " \u201Clang=\"bs\"\u201D instead.",
                    getAttValueExpr("lang", declaredValue));
        }
        if (!"".equals(message)) {
            warn(message);
        }
    }

    /**
     * Warns when a document detected as Norwegian either has no {@code lang}
     * on its {@code html} element or declares a language other than
     * {@code no}, {@code nn} or {@code nb}. Emits nothing when one of those
     * three is declared.
     */
    private void checkLangAttributeNorwegian() throws SAXException {
        String lowerCaseLang = htmlElementLangAttrValue.toLowerCase();
        String langWarning = "";
        if (!htmlElementHasLang) {
            // The first sentence needs its closing period; without it the
            // message read "...written in Norwegian Consider adding...".
            langWarning = "This document appears to be written in Norwegian."
                    + " Consider adding either"
                    + " \u201Clang=\"nn\"\u201D or \u201Clang=\"nb\"\u201D"
                    + " (or variant) to the \u201Chtml\u201D start tag.";
        } else if (!("no".equals(declaredLangCode)
                || "nn".equals(declaredLangCode)
                || "nb".equals(declaredLangCode))) {
            langWarning = String.format(
                    "This document appears to be written in Norwegian, but the"
                            + " \u201Chtml\u201D start tag has %s. Consider"
                            + " using either \u201Clang=\"nn\"\u201D or"
                            + " \u201Clang=\"nb\"\u201D (or variant) instead.",
                    getAttValueExpr("lang", lowerCaseLang));
        }
        if (!"".equals(langWarning)) {
            warn(langWarning);
        }
    }

    /**
     * Warns when a document detected as Norwegian is served with a
     * single-valued HTTP Content-Language header whose language is not
     * {@code no}, {@code nn} or {@code nb}. A missing, empty or multi-valued
     * (comma-separated) header is not checked.
     *
     * @param detectedLanguage unused; kept for signature compatibility
     * @param detectedLanguageName unused; kept for signature compatibility
     * @param detectedLanguageCode unused; kept for signature compatibility
     */
    private void checkContentLanguageHeaderNorwegian(String detectedLanguage,
            String detectedLanguageName, String detectedLanguageCode)
                    throws SAXException {
        // The header is a plain constructor argument and may be absent.
        if (httpContentLangHeader == null
                || "".equals(httpContentLangHeader)
                || httpContentLangHeader.contains(",")) {
            return;
        }
        String lowerCaseContentLang = httpContentLangHeader.toLowerCase();
        String contentLangCode = new ULocale(
                lowerCaseContentLang).getLanguage();
        if (!("no".equals(contentLangCode) || "nn".equals(contentLangCode)
                || "nb".equals(contentLangCode))) {
            // The two suggestions are Nynorsk ("nn") and Bokmål ("nb"); the
            // message previously offered "nn" twice.
            warn("This document appears to be written in Norwegian but the"
                    + " value of the HTTP \u201CContent-Language\u201D header"
                    + " is \u201C" + lowerCaseContentLang + "\u201D. Consider"
                    + " changing it to \u201Cnn\u201D or \u201Cnb\u201D"
                    + " (or variant) instead.");
        }
    }

    /**
     * Checks the {@code lang} attribute of the {@code html} start tag against
     * the detected document language and reports a warning when the
     * attribute is absent or declares a different language.
     *
     * <p>
     * When a servlet request is available, the declared value (or the fact
     * that it is empty, or that it is wrong) is also recorded as request
     * attributes.
     *
     * @param detectedLanguage
     *            the detected language tag; only consulted for the Chinese
     *            script-subtag comparison
     * @param detectedLanguageName
     *            the language name inserted into the warning text
     * @param detectedLanguageCode
     *            the code compared against the declared language code
     * @param preferredLanguageCode
     *            the code suggested to the author in the warning text
     * @throws SAXException
     *             if reporting the warning fails
     */
    private void checkLangAttribute(String detectedLanguage,
            String detectedLanguageName, String detectedLanguageCode,
            String preferredLanguageCode) throws SAXException {
        String langWarning = "";
        // Language tags are ASCII case-insensitive. Lowercasing with the root
        // locale keeps the result independent of the JVM default locale
        // (under a Turkish locale "I" would otherwise become a dotless i).
        // NOTE(review): assumes htmlElementLangAttrValue is non-null even
        // when the attribute is absent -- confirm against its initializer.
        String lowerCaseLang = htmlElementLangAttrValue.toLowerCase(
                java.util.Locale.ROOT);
        if (!htmlElementHasLang) {
            langWarning = String.format(
                    "This document appears to be written in %s."
                            + " Consider adding \u201Clang=\"%s\"\u201D"
                            + " (or variant) to the \u201Chtml\u201D"
                            + " start tag.",
                    detectedLanguageName, preferredLanguageCode);
        } else {
            if (request != null) {
                if ("".equals(lowerCaseLang)) {
                    request.setAttribute(
                            "http://validator.nu/properties/lang-empty", true);
                } else {
                    request.setAttribute(
                            "http://validator.nu/properties/lang-value",
                            lowerCaseLang);
                }
            }
            // Each of the following pairs (detected code, declared code) is
            // accepted without any warning: the method simply returns.
            if ("tl".equals(detectedLanguageCode)
                    && ("ceb".equals(declaredLangCode)
                            || "ilo".equals(declaredLangCode)
                            || "pag".equals(declaredLangCode)
                            || "war".equals(declaredLangCode))) {
                return;
            }
            if ("id".equals(detectedLanguageCode)
                    && "min".equals(declaredLangCode)) {
                return;
            }
            if ("ms".equals(detectedLanguageCode)
                    && "min".equals(declaredLangCode)) {
                return;
            }
            if ("hr".equals(detectedLanguageCode)
                    && ("sr".equals(declaredLangCode)
                            || "bs".equals(declaredLangCode)
                            || "sh".equals(declaredLangCode))) {
                return;
            }
            if ("sr".equals(detectedLanguageCode)
                    && ("hr".equals(declaredLangCode)
                            || "bs".equals(declaredLangCode)
                            || "sh".equals(declaredLangCode))) {
                return;
            }
            if ("bs".equals(detectedLanguageCode)
                    && ("hr".equals(declaredLangCode)
                            || "sr".equals(declaredLangCode)
                            || "sh".equals(declaredLangCode))) {
                return;
            }
            if ("de".equals(detectedLanguageCode)
                    && ("bar".equals(declaredLangCode)
                            || "gsw".equals(declaredLangCode)
                            || "lb".equals(declaredLangCode))) {
                return;
            }
            // This pair is matched against the whole lowercased attribute
            // value rather than the declared language code.
            if ("zh".equals(detectedLanguageCode)
                    && "yue".equals(lowerCaseLang)) {
                return;
            }
            if ("es".equals(detectedLanguageCode)
                    && ("an".equals(declaredLangCode)
                            || "ast".equals(declaredLangCode))) {
                return;
            }
            if ("it".equals(detectedLanguageCode)
                    && ("co".equals(declaredLangCode)
                            || "pms".equals(declaredLangCode)
                            || "vec".equals(declaredLangCode)
                            || "lmo".equals(declaredLangCode)
                            || "scn".equals(declaredLangCode)
                            || "nap".equals(declaredLangCode))) {
                return;
            }
            if ("rw".equals(detectedLanguageCode)
                    && "rn".equals(declaredLangCode)) {
                return;
            }
            if ("mhr".equals(detectedLanguageCode)
                    && ("chm".equals(declaredLangCode)
                            || "mrj".equals(declaredLangCode))) {
                return;
            }
            if ("mrj".equals(detectedLanguageCode)
                    && ("chm".equals(declaredLangCode)
                            || "mhr".equals(declaredLangCode))) {
                return;
            }
            String message = "This document appears to be written in %s"
                    + " but the \u201Chtml\u201D start tag has %s. Consider"
                    + " using \u201Clang=\"%s\"\u201D (or variant) instead.";
            // A mismatch is either a conflicting Chinese script subtag or a
            // plain difference between declared and detected language codes.
            if (zhSubtagMismatch(detectedLanguage, lowerCaseLang)
                    || !declaredLangCode.equals(detectedLanguageCode)) {
                if (request != null) {
                    request.setAttribute(
                            "http://validator.nu/properties/lang-wrong", true);
                }
                // The message quotes the attribute value as written, not the
                // lowercased copy.
                langWarning = String.format(message, detectedLanguageName,
                        getAttValueExpr("lang", htmlElementLangAttrValue),
                        preferredLanguageCode);
            }
        }
        if (!"".equals(langWarning)) {
            warn(langWarning);
        }
    }

    /**
     * Checks the HTTP {@code Content-Language} header value against the
     * detected document language, and against the {@code lang} attribute of
     * the {@code html} start tag when that attribute is present.
     *
     * <p>
     * Nothing is checked when the header value is empty or contains a comma.
     *
     * @param detectedLanguage
     *            the detected language tag; only consulted for the Chinese
     *            script-subtag comparison
     * @param detectedLanguageName
     *            the language name inserted into the warning text
     * @param detectedLanguageCode
     *            the code compared against the header's language code
     * @param preferredLanguageCode
     *            the code suggested to the author in the warning text
     * @throws SAXException
     *             if reporting a warning fails
     */
    private void checkContentLanguageHeader(String detectedLanguage,
            String detectedLanguageName, String detectedLanguageCode,
            String preferredLanguageCode) throws SAXException {
        if ("".equals(httpContentLangHeader)
                || httpContentLangHeader.contains(",")) {
            return;
        }
        // Language tags are ASCII case-insensitive. Lowercasing with the root
        // locale keeps the result independent of the JVM default locale.
        String lowerCaseContentLang = httpContentLangHeader.toLowerCase(
                java.util.Locale.ROOT);
        String contentLangCode = new ULocale(
                lowerCaseContentLang).getLanguage();
        // Each of the following pairs (detected code, header code) is
        // accepted without any warning: the method simply returns.
        if ("tl".equals(detectedLanguageCode) && ("ceb".equals(contentLangCode)
                || "ilo".equals(contentLangCode)
                || "pag".equals(contentLangCode)
                || "war".equals(contentLangCode))) {
            return;
        }
        if ("id".equals(detectedLanguageCode)
                && "min".equals(contentLangCode)) {
            return;
        }
        if ("ms".equals(detectedLanguageCode)
                && "min".equals(contentLangCode)) {
            return;
        }
        if ("hr".equals(detectedLanguageCode) && ("sr".equals(contentLangCode)
                || "bs".equals(contentLangCode)
                || "sh".equals(contentLangCode))) {
            return;
        }
        if ("sr".equals(detectedLanguageCode) && ("hr".equals(contentLangCode)
                || "bs".equals(contentLangCode)
                || "sh".equals(contentLangCode))) {
            return;
        }
        if ("bs".equals(detectedLanguageCode) && ("hr".equals(contentLangCode)
                || "sr".equals(contentLangCode)
                || "sh".equals(contentLangCode))) {
            return;
        }
        if ("de".equals(detectedLanguageCode) && ("bar".equals(contentLangCode)
                || "gsw".equals(contentLangCode)
                || "lb".equals(contentLangCode))) {
            return;
        }
        // This pair is matched against the whole lowercased header value
        // rather than the language code extracted from it.
        if ("zh".equals(detectedLanguageCode)
                && "yue".equals(lowerCaseContentLang)) {
            return;
        }
        if ("es".equals(detectedLanguageCode) && ("an".equals(contentLangCode)
                || "ast".equals(contentLangCode))) {
            return;
        }
        if ("it".equals(detectedLanguageCode) && ("co".equals(contentLangCode)
                || "pms".equals(contentLangCode)
                || "vec".equals(contentLangCode)
                || "lmo".equals(contentLangCode)
                || "scn".equals(contentLangCode)
                || "nap".equals(contentLangCode))) {
            return;
        }
        if ("rw".equals(detectedLanguageCode)
                && "rn".equals(contentLangCode)) {
            return;
        }
        if ("mhr".equals(detectedLanguageCode) && ("chm".equals(contentLangCode)
                || "mrj".equals(contentLangCode))) {
            return;
        }
        if ("mrj".equals(detectedLanguageCode) && ("chm".equals(contentLangCode)
                || "mhr".equals(contentLangCode))) {
            return;
        }
        if (zhSubtagMismatch(detectedLanguage, lowerCaseContentLang)
                || !contentLangCode.equals(detectedLanguageCode)) {
            String mismatchTemplate = "This document appears to be written in"
                    + " %s but the value"
                    + " of the HTTP \u201CContent-Language\u201D header is"
                    + " \u201C%s\u201D. Consider changing it to"
                    + " \u201C%s\u201D (or variant).";
            // Exactly one argument per placeholder.
            String warning = String.format(mismatchTemplate,
                    detectedLanguageName, lowerCaseContentLang,
                    preferredLanguageCode);
            if (errorHandler != null) {
                // Reported with a null locator, unlike warn(), which attaches
                // the html start tag locator.
                SAXParseException spe = new SAXParseException(warning, null);
                errorHandler.warning(spe);
            }
        }
        if (htmlElementHasLang) {
            String ignoredTemplate = "The value of the HTTP"
                    + " \u201CContent-Language\u201D"
                    + " header is \u201C%s\u201D but it will be ignored because"
                    + " the \u201Chtml\u201D start tag has %s.";
            String lowerCaseLang = htmlElementLangAttrValue.toLowerCase(
                    java.util.Locale.ROOT);
            if (zhSubtagMismatch(lowerCaseContentLang, lowerCaseLang)
                    || !contentLangCode.equals(declaredLangCode)) {
                // Both values are quoted as written, not lowercased.
                warn(String.format(ignoredTemplate, httpContentLangHeader,
                        getAttValueExpr("lang", htmlElementLangAttrValue)));
            }
        }
    }

    /**
     * Reports a warning when the detected language is one of
     * {@code RTL_LANGS} but the {@code html} start tag either has no
     * {@code dir} attribute or has one whose value is not {@code rtl}.
     *
     * @param detectedLanguage
     *            not used by this check
     * @param detectedLanguageName
     *            the language name inserted into the warning text
     * @param detectedLanguageCode
     *            the code looked up in {@code RTL_LANGS}
     * @param preferredLanguageCode
     *            not used by this check
     * @throws SAXException
     *             if reporting the warning fails
     */
    private void checkDirAttribute(String detectedLanguage,
            String detectedLanguageName, String detectedLanguageCode,
            String preferredLanguageCode) throws SAXException {
        // NOTE(review): binarySearch is only meaningful on a sorted array;
        // assumes RTL_LANGS is kept in sorted order -- confirm at its
        // declaration.
        if (Arrays.binarySearch(RTL_LANGS, detectedLanguageCode) < 0) {
            return;
        }
        if (!hasDir) {
            // The template has a single placeholder, so only the language
            // name is passed.
            warn(String.format(
                    "This document appears to be written in %s."
                            + " Consider adding \u201Cdir=\"rtl\"\u201D"
                            + " to the \u201Chtml\u201D start tag.",
                    detectedLanguageName));
        } else if (!"rtl".equals(dirAttrValue)) {
            String message = "This document appears to be written in %s"
                    + " but the \u201Chtml\u201D start tag has %s."
                    + " Consider using \u201Cdir=\"rtl\"\u201D instead.";
            warn(String.format(message, detectedLanguageName,
                    getAttValueExpr("dir", dirAttrValue)));
        }
    }

    /**
     * Tells whether a declared language value conflicts with the expected
     * Chinese script variant.
     *
     * @param expectedLanguage
     *            the expected tag; only {@code zh-hans} and {@code zh-hant}
     *            can produce a mismatch
     * @param declaredLanguage
     *            the declared value, searched for the opposing subtags
     * @return {@code true} if {@code zh-hans} is expected but the declared
     *         value contains {@code zh-tw} or {@code zh-hant}, or if
     *         {@code zh-hant} is expected but the declared value contains
     *         {@code zh-cn} or {@code zh-hans}; {@code false} otherwise
     */
    private boolean zhSubtagMismatch(String expectedLanguage,
            String declaredLanguage) {
        if ("zh-hans".equals(expectedLanguage)) {
            return declaredLanguage.contains("zh-tw")
                    || declaredLanguage.contains("zh-hant");
        }
        if ("zh-hant".equals(expectedLanguage)) {
            return declaredLanguage.contains("zh-cn")
                    || declaredLanguage.contains("zh-hans");
        }
        return false;
    }

    /**
     * Builds the phrase used in warning messages to describe an attribute as
     * it appears in the document.
     *
     * @param attName
     *            the attribute name
     * @param attValue
     *            the attribute value
     * @return a quoted {@code name="value"} expression, or a phrase saying
     *         that the attribute is empty when the value is the empty string
     */
    private String getAttValueExpr(String attName, String attValue) {
        if (!"".equals(attValue)) {
            return String.format("\u201C%s=\"%s\"\u201D", attName, attValue);
        }
        return String.format("an empty \u201c%s\u201d attribute", attName);
    }

    /**
     * Reports a warning located at the {@code html} start tag. Does nothing
     * when no error handler is set.
     *
     * @param message
     *            the warning text
     * @throws SAXException
     *             if the error handler throws it
     */
    private void warn(String message) throws SAXException {
        if (errorHandler == null) {
            return;
        }
        errorHandler.warning(
                new SAXParseException(message, htmlStartTagLocator));
    }

    /**
     * Passes the end of a namespace prefix mapping on to the content handler;
     * the event is dropped when no content handler is set.
     *
     * @param prefix
     *            the prefix whose mapping ends
     * @throws SAXException
     *             if the content handler throws it
     * @see org.xml.sax.ContentHandler#endPrefixMapping(java.lang.String)
     */
    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
        if (contentHandler != null) {
            contentHandler.endPrefixMapping(prefix);
        }
    }

    /**
     * Passes ignorable whitespace on to the content handler; the event is
     * dropped when no content handler is set.
     *
     * @param ch
     *            the character buffer
     * @param start
     *            the offset of the first character in the buffer
     * @param length
     *            the number of characters to use
     * @throws SAXException
     *             if the content handler throws it
     * @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int)
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        if (contentHandler != null) {
            contentHandler.ignorableWhitespace(ch, start, length);
        }
    }

    /**
     * Passes a processing instruction on to the content handler; the event is
     * dropped when no content handler is set.
     *
     * @param target
     *            the processing instruction target
     * @param data
     *            the processing instruction data
     * @throws SAXException
     *             if the content handler throws it
     * @see org.xml.sax.ContentHandler#processingInstruction(java.lang.String,
     *      java.lang.String)
     */
    @Override
    public void processingInstruction(String target, String data)
            throws SAXException {
        if (contentHandler != null) {
            contentHandler.processingInstruction(target, data);
        }
    }

    /**
     * Passes a skipped-entity notification on to the content handler; the
     * event is dropped when no content handler is set.
     *
     * @param name
     *            the name of the skipped entity
     * @throws SAXException
     *             if the content handler throws it
     * @see org.xml.sax.ContentHandler#skippedEntity(java.lang.String)
     */
    @Override
    public void skippedEntity(String name) throws SAXException {
        if (contentHandler != null) {
            contentHandler.skippedEntity(name);
        }
    }

    /**
     * Passes the start of a namespace prefix mapping on to the content
     * handler; the event is dropped when no content handler is set.
     *
     * @param prefix
     *            the prefix being mapped
     * @param uri
     *            the namespace URI the prefix is mapped to
     * @throws SAXException
     *             if the content handler throws it
     * @see org.xml.sax.ContentHandler#startPrefixMapping(java.lang.String,
     *      java.lang.String)
     */
    @Override
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
        if (contentHandler != null) {
            contentHandler.startPrefixMapping(prefix, uri);
        }
    }

    /**
     * Returns the DTD handler of the wrapped reader.
     *
     * @return whatever the wrapped reader reports as its DTD handler
     * @see org.xml.sax.XMLReader#getDTDHandler()
     */
    @Override
    public DTDHandler getDTDHandler() {
        return this.wrappedReader.getDTDHandler();
    }

    /**
     * Returns the entity resolver of the wrapped reader.
     *
     * @return whatever the wrapped reader reports as its entity resolver
     * @see org.xml.sax.XMLReader#getEntityResolver()
     */
    @Override
    public EntityResolver getEntityResolver() {
        return this.wrappedReader.getEntityResolver();
    }

    /**
     * Returns the error handler held by this wrapper itself, not the one
     * registered on the wrapped reader.
     *
     * <p>
     * NOTE(review): {@code setErrorHandler} in this class forwards to the
     * wrapped reader and does not assign this field, so the value returned
     * here is not affected by that setter -- confirm this is intended.
     *
     * @return the wrapper's own error handler, possibly {@code null}
     * @see org.xml.sax.XMLReader#getErrorHandler()
     */
    @Override
    public ErrorHandler getErrorHandler() {
        return this.errorHandler;
    }

    /**
     * Looks up a feature flag on the wrapped reader.
     *
     * @param name
     *            the feature name
     * @return the value the wrapped reader reports for the feature
     * @throws SAXNotRecognizedException
     *             if the wrapped reader throws it
     * @throws SAXNotSupportedException
     *             if the wrapped reader throws it
     * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
     */
    @Override
    public boolean getFeature(String name)
            throws SAXNotRecognizedException, SAXNotSupportedException {
        return this.wrappedReader.getFeature(name);
    }

    /**
     * Looks up a property value on the wrapped reader.
     *
     * @param name
     *            the property name
     * @return the value the wrapped reader reports for the property
     * @throws SAXNotRecognizedException
     *             if the wrapped reader throws it
     * @throws SAXNotSupportedException
     *             if the wrapped reader throws it
     * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
     */
    @Override
    public Object getProperty(String name)
            throws SAXNotRecognizedException, SAXNotSupportedException {
        return this.wrappedReader.getProperty(name);
    }

    /**
     * Has the wrapped reader parse the given input source.
     *
     * @param input
     *            the input source to parse
     * @throws IOException
     *             if the wrapped reader throws it
     * @throws SAXException
     *             if the wrapped reader throws it
     * @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource)
     */
    @Override
    public void parse(InputSource input) throws IOException, SAXException {
        this.wrappedReader.parse(input);
    }

    /**
     * Has the wrapped reader parse the document identified by a system
     * identifier.
     *
     * @param systemId
     *            the system identifier of the document to parse
     * @throws IOException
     *             if the wrapped reader throws it
     * @throws SAXException
     *             if the wrapped reader throws it
     * @see org.xml.sax.XMLReader#parse(java.lang.String)
     */
    @Override
    public void parse(String systemId) throws IOException, SAXException {
        this.wrappedReader.parse(systemId);
    }

    /**
     * Stores the content handler that this wrapper forwards content events
     * to. The handler is kept on the wrapper; this method does not pass it to
     * the wrapped reader.
     *
     * @param handler
     *            the content handler to forward to; when {@code null}, the
     *            forwarding methods drop their events
     * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler)
     */
    @Override
    public void setContentHandler(ContentHandler handler) {
        this.contentHandler = handler;
    }

    /**
     * Registers a DTD handler on the wrapped reader.
     *
     * @param handler
     *            the DTD handler to register
     * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
     */
    @Override
    public void setDTDHandler(DTDHandler handler) {
        this.wrappedReader.setDTDHandler(handler);
    }

    /**
     * Registers an entity resolver on the wrapped reader.
     *
     * @param resolver
     *            the entity resolver to register
     * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
     */
    @Override
    public void setEntityResolver(EntityResolver resolver) {
        this.wrappedReader.setEntityResolver(resolver);
    }

    /**
     * Registers an error handler on the wrapped reader.
     *
     * <p>
     * NOTE(review): only the wrapped reader is updated. The wrapper's own
     * {@code errorHandler} field, which {@code getErrorHandler()} returns and
     * {@code warn(String)} reports to, is not assigned here -- confirm this
     * is intended.
     *
     * @param handler
     *            the error handler to register on the wrapped reader
     * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
     */
    @Override
    public void setErrorHandler(ErrorHandler handler) {
        this.wrappedReader.setErrorHandler(handler);
    }

    /**
     * Sets a feature flag on the wrapped reader.
     *
     * @param name
     *            the feature name
     * @param value
     *            the requested state of the feature
     * @throws SAXNotRecognizedException
     *             if the wrapped reader throws it
     * @throws SAXNotSupportedException
     *             if the wrapped reader throws it
     * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
     */
    @Override
    public void setFeature(String name, boolean value)
            throws SAXNotRecognizedException, SAXNotSupportedException {
        this.wrappedReader.setFeature(name, value);
    }

    /**
     * Sets a property value on the wrapped reader.
     *
     * @param name
     *            the property name
     * @param value
     *            the requested value of the property
     * @throws SAXNotRecognizedException
     *             if the wrapped reader throws it
     * @throws SAXNotSupportedException
     *             if the wrapped reader throws it
     * @see org.xml.sax.XMLReader#setProperty(java.lang.String,
     *      java.lang.Object)
     */
    @Override
    public void setProperty(String name, Object value)
            throws SAXNotRecognizedException, SAXNotSupportedException {
        this.wrappedReader.setProperty(name, value);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy