// com.android.tools.lint.checks.TypoDetector
// From the lint-checks artifact: a packaging of the IntelliJ Community Edition
// lint-checks library (release number 1 of trunk branch 142).
/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.tools.lint.checks;

import static com.android.SdkConstants.ATTR_LOCALE;
import static com.android.SdkConstants.ATTR_TRANSLATABLE;
import static com.android.SdkConstants.FD_RES_VALUES;
import static com.android.SdkConstants.TAG_PLURALS;
import static com.android.SdkConstants.TAG_STRING;
import static com.android.SdkConstants.TAG_STRING_ARRAY;
import static com.android.SdkConstants.TOOLS_URI;
import static com.android.tools.lint.checks.TypoLookup.isLetter;
import static com.google.common.base.Objects.equal;

import com.android.annotations.NonNull;
import com.android.annotations.Nullable;
import com.android.ide.common.resources.configuration.LocaleQualifier;
import com.android.resources.ResourceFolderType;
import com.android.tools.lint.detector.api.Category;
import com.android.tools.lint.detector.api.Context;
import com.android.tools.lint.detector.api.Implementation;
import com.android.tools.lint.detector.api.Issue;
import com.android.tools.lint.detector.api.Location;
import com.android.tools.lint.detector.api.ResourceXmlDetector;
import com.android.tools.lint.detector.api.Scope;
import com.android.tools.lint.detector.api.Severity;
import com.android.tools.lint.detector.api.Speed;
import com.android.tools.lint.detector.api.TextFormat;
import com.android.tools.lint.detector.api.XmlContext;
import com.android.utils.Pair;
import com.google.common.base.Charsets;
import com.google.common.base.Splitter;

import org.w3c.dom.Attr;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

/**
 * Check which looks for likely typos in Strings.
 * <p>
 * TODO:
 * <ul>
 * <li> Add check of Java String literals too!
 * <li> Add support for additional languages. The typo detector is now
 *      multilingual and looks for typos-*locale*.txt files to use. However,
 *      we need to seed it with additional typo databases. I did some searching
 *      and came up with some alternatives. Here's the strategy I used:
 *      Used Google Translate to translate "Wikipedia Common Misspellings", and
 *      then I went to google.no, google.fr etc searching with that translation, and
 *      came up with what looks like wikipedia language local lists of typos.
 *      This is how I found the Norwegian one for example:
 *      <br>
 *         http://no.wikipedia.org/wiki/Wikipedia:Liste_over_alminnelige_stavefeil/Maskinform
 *      <br>
 *     Here are some additional possibilities not yet processed:
 *      <ul>
 *        <li> French: http://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Liste_de_fautes_d'orthographe_courantes
 *            (couldn't find a machine-readable version there?)
 *        <li> Swedish:
 *              http://sv.wikipedia.org/wiki/Wikipedia:Lista_%C3%B6ver_vanliga_spr%C3%A5kfel
 *              (couldn't find a machine-readable version there?)
 *        <li> German
 *              http://de.wikipedia.org/wiki/Wikipedia:Liste_von_Tippfehlern/F%C3%BCr_Maschinen
 *      </ul>
 * <li> Consider also digesting files like
 *       http://sv.wikipedia.org/wiki/Wikipedia:AutoWikiBrowser/Typos
 *       See http://en.wikipedia.org/wiki/Wikipedia:AutoWikiBrowser/User_manual.
 * </ul>
 */
public class TypoDetector extends ResourceXmlDetector {
    /**
     * Typo database for the current language/region, assigned in
     * {@link #beforeCheckFile(Context)}. Element visiting is skipped while this is null.
     */
    @Nullable private TypoLookup mLookup;
    /** Language that {@link #mLookup} was last requested for; used to avoid redundant lookups */
    @Nullable private String mLastLanguage;
    /** Region that {@link #mLookup} was last requested for; used to avoid redundant lookups */
    @Nullable private String mLastRegion;
    /** Language of the file currently being checked, set by {@link #initLocale(String)} */
    @Nullable private String mLanguage;
    /** Region of the file currently being checked, set by {@link #initLocale(String)} */
    @Nullable private String mRegion;

    /**
     * The main issue discovered by this detector. Both likely misspellings and
     * repeated words are reported under this single issue.
     */
    public static final Issue ISSUE = Issue.create(
            "Typos", //$NON-NLS-1$
            "Spelling error",

            "This check looks through the string definitions, and if it finds any words " +
            "that look like likely misspellings, they are flagged.",
            Category.MESSAGES,
            7, // NOTE(review): presumably the issue priority -- confirm against Issue.create
            Severity.WARNING,
            new Implementation(
                    TypoDetector.class,
                    Scope.RESOURCE_FILE_SCOPE));

    /**
     * Constructs a new detector. Nothing is initialized here; the typo lookup is
     * resolved per file in {@link #beforeCheckFile(Context)}.
     */
    public TypoDetector() {
    }

    /**
     * Restricts this detector to resource files that live in a values folder.
     *
     * @param folderType the type of the resource folder being considered
     * @return true only when the folder type is {@link ResourceFolderType#VALUES}
     */
    @Override
    public boolean appliesTo(@NonNull ResourceFolderType folderType) {
        return ResourceFolderType.VALUES == folderType;
    }

    /**
     * Looks up the language and region from the given parent folder name and stores
     * them in {@link #mLanguage} and {@link #mRegion}.
     * <p>
     * Both fields are reset to null first. They stay null when the folder name is
     * exactly {@code FD_RES_VALUES} (no qualifiers) or when
     * {@link #getLocale(String)} finds no locale in the name.
     *
     * @param parent the name of the folder containing the resource file
     */
    private void initLocale(@NonNull String parent) {
        mLanguage = null;
        mRegion = null;

        if (parent.equals(FD_RES_VALUES)) {
            // Unqualified folder: nothing to parse
            return;
        }

        // Typed pair so that getFirst()/getSecond() yield Strings rather than Objects
        Pair<String, String> locale = getLocale(parent);
        if (locale != null) {
            mLanguage = locale.getFirst();
            mRegion = locale.getSecond();
        }
    }

    /**
     * Returns the locale for the given parent folder.
     * <p>
     * The folder name is split on {@code '-'} and each qualifier is examined in order:
     * <ul>
     * <li> a two-letter lowercase qualifier is taken as the language;
     * <li> a three-character qualifier starting with {@code 'r'} ends the scan, and
     *      contributes the region (without the {@code 'r'}) if the remaining two
     *      characters are uppercase letters;
     * <li> a qualifier starting with {@link LocaleQualifier#PREFIX} is handed to
     *      {@link LocaleQualifier#parseBcp47(String)} and its result is returned as-is.
     * </ul>
     *
     * @param parent the name of the parent folder
     * @return null if the locale is not known, or a pair of language and region
     *        where one or the other but not both can be null
     */
    @Nullable
    public static Pair<String, String> getLocale(@NonNull String parent) {
        String language = null;
        String region = null;
        for (String qualifier : Splitter.on('-').split(parent)) {
            int qualifierLength = qualifier.length();
            if (qualifierLength == 2) {
                char first = qualifier.charAt(0);
                char second = qualifier.charAt(1);
                if (first >= 'a' && first <= 'z' && second >= 'a' && second <= 'z') {
                    language = qualifier;
                }
            } else if (qualifierLength == 3 && qualifier.charAt(0) == 'r') {
                char first = qualifier.charAt(1);
                char second = qualifier.charAt(2);
                if (first >= 'A' && first <= 'Z' && second >= 'A' && second <= 'Z') {
                    region = qualifier.substring(1); // Don't include the "r"
                }
                // Stop scanning once an r-prefixed three-character qualifier has been
                // seen, whether or not it turned out to be a valid region
                break;
            } else if (qualifier.startsWith(LocaleQualifier.PREFIX)) {
                return LocaleQualifier.parseBcp47(qualifier);
            }
        }

        if (language != null || region != null) {
            return Pair.of(language, region);
        } else {
            return null;
        }
    }

    /**
     * Returns the locale for the given context.
     * <p>
     * A non-empty {@code tools:locale} attribute on the document's root element takes
     * precedence; otherwise the locale is derived from the name of the folder that
     * contains the file.
     *
     * @param context the context to look up the locale for
     * @return null if the locale is not known, or a pair of language and region
     *        where one or the other but not both can be null
     */
    @Nullable
    public static Pair<String, String> getLocale(@NonNull XmlContext context) {
        Element root = context.document.getDocumentElement();
        if (root != null) {
            String locale = root.getAttributeNS(TOOLS_URI, ATTR_LOCALE);
            if (locale != null && !locale.isEmpty()) {
                return getLocale(locale);
            }
        }

        return getLocale(context.file.getParentFile().getName());
    }

    /**
     * Determines the language and region for the file about to be checked and makes
     * sure {@link #mLookup} corresponds to them.
     * <p>
     * The locale is first read from the parent folder name. If that yields no
     * language, a non-empty {@code tools:locale} attribute on the root element is
     * tried next, and finally the language falls back to {@code "en"}. The lookup is
     * only fetched again when the language or region differs from the previous file.
     *
     * @param context the context for the file about to be checked
     */
    @Override
    public void beforeCheckFile(@NonNull Context context) {
        initLocale(context.file.getParentFile().getName());

        // Check to see if the user has specified the language for this folder
        // using a tools:locale attribute
        if (mLanguage == null && context instanceof XmlContext) {
            Element root = ((XmlContext) context).document.getDocumentElement();
            String locale = root != null ? root.getAttributeNS(TOOLS_URI, ATTR_LOCALE) : null;
            if (locale != null && !locale.isEmpty()) {
                initLocale(FD_RES_VALUES + '-' + locale);
            }
        }

        if (mLanguage == null) {
            mLanguage = "en"; //$NON-NLS-1$
        }

        boolean sameLocale = equal(mLastLanguage, mLanguage) && equal(mLastRegion, mRegion);
        if (!sameLocale) {
            mLookup = TypoLookup.get(context.getClient(), mLanguage, mRegion);
            mLastLanguage = mLanguage;
            mLastRegion = mRegion;
        }
    }

    /**
     * Returns the speed classification of this detector.
     *
     * @return always {@link Speed#NORMAL}
     */
    @NonNull
    @Override
    public Speed getSpeed() {
        return Speed.NORMAL;
    }

    /**
     * Returns the XML element names this detector wants to visit: string,
     * string-array and plurals definitions.
     *
     * @return the tag names to be passed to {@link #visitElement(XmlContext, Element)}
     */
    @Override
    public Collection<String> getApplicableElements() {
        return Arrays.asList(
                TAG_STRING,
                TAG_STRING_ARRAY,
                TAG_PLURALS
        );
    }

    /**
     * Checks all text below the given element for typos, provided a typo lookup is
     * available for the current locale.
     *
     * @param context the XML context of the file being checked
     * @param element the string, string-array or plurals element to inspect
     */
    @Override
    public void visitElement(@NonNull XmlContext context, @NonNull Element element) {
        if (mLookup != null) {
            visit(context, element, element);
        }
    }

    /**
     * Walks the DOM subtree rooted at {@code node} and checks every text node in it.
     *
     * @param context the XML context of the file being checked
     * @param parent the element the walk started from; passed along unchanged so the
     *            check can consult its attributes
     * @param node the node currently being visited
     */
    private void visit(XmlContext context, Element parent, Node node) {
        if (node.getNodeType() != Node.TEXT_NODE) {
            // Not text: descend into each child in document order
            NodeList children = node.getChildNodes();
            int count = children.getLength();
            for (int i = 0; i < count; i++) {
                visit(context, parent, children.item(i));
            }
            return;
        }

        // TODO: Figure out how to deal with entities
        check(context, parent, node, node.getNodeValue());
    }

    /**
     * Scans the given text word by word, reporting known typos and repeated words.
     * <p>
     * Text whose first non-whitespace character is {@code '@'} or {@code '?'} is
     * treated as a resource reference and skipped entirely. As soon as a letter
     * with a code point of 0x80 or above is encountered inside a word, the rest
     * of the work is handed to the UTF-8 based overload and this method returns.
     *
     * @param context the XML context of the file being checked
     * @param element the enclosing element, consulted for its translatable attribute
     * @param node the text node the text came from; used for error locations
     * @param text the text to check
     */
    private void check(XmlContext context, Element element, Node node, String text) {
        int max = text.length();
        int index = 0;
        int lastWordBegin = -1;
        int lastWordEnd = -1;
        boolean checkedTypos = false;

        // Skip leading whitespace and look at the first real character
        for (; index < max; index++) {
            char c = text.charAt(index);
            if (!Character.isWhitespace(c)) {
                if (c == '@' || (c == '?')) {
                    // Don't look for typos in resource references; they are not
                    // user visible anyway
                    return;
                }
                break;
            }
        }

        while (index < max) {
            // Find the beginning of the next word. The character following a
            // backslash is skipped so escapes are not mistaken for word starts.
            for (; index < max; index++) {
                char c = text.charAt(index);
                if (c == '\\') {
                    index++;
                } else if (Character.isLetter(c)) {
                    break;
                }
            }
            if (index >= max) {
                return;
            }
            int begin = index;
            // Find the end of the word
            for (; index < max; index++) {
                char c = text.charAt(index);
                if (c == '\\') {
                    index++;
                    break;
                } else if (!Character.isLetter(c)) {
                    break;
                } else if (c >= 0x80) {
                    // Switch to UTF-8 handling for this string
                    if (checkedTypos) {
                        // If we've already checked words we may have reported typos
                        // so create a substring from the current word and on.
                        byte[] utf8Text = text.substring(begin).getBytes(Charsets.UTF_8);
                        check(context, element, node, utf8Text, 0, utf8Text.length, text, begin);
                    } else {
                        // If all we've done so far is skip whitespace (common scenario)
                        // then no need to substring the text, just re-search with the
                        // UTF-8 routines
                        byte[] utf8Text = text.getBytes(Charsets.UTF_8);
                        check(context, element, node, utf8Text, 0, utf8Text.length, text, 0);
                    }
                    return;
                }
            }

            int end = index;
            checkedTypos = true;
            assert mLookup != null;
            List<String> replacements = mLookup.getTypos(text, begin, end);
            if (replacements != null && isTranslatable(element)) {
                reportTypo(context, node, text, begin, replacements);
            }

            checkRepeatedWords(context, element, node, text, lastWordBegin, lastWordEnd, begin,
                    end);

            lastWordBegin = begin;
            lastWordEnd = end;
            // Resume one past the character that terminated the word
            index = end + 1;
        }
    }

    /**
     * Reports the current word if it is an exact repetition of the previous word
     * and only whitespace separates the two.
     * <p>
     * Nothing is reported when there is no previous word, when the two words have
     * different lengths, when the word is a single character, or when the enclosing
     * element is marked as not translatable.
     *
     * @param context the XML context of the file being checked
     * @param element the enclosing element, consulted for its translatable attribute
     * @param node the text node containing the words
     * @param text the text both words are offsets into
     * @param lastWordBegin start offset of the previous word, or -1 if there is none
     * @param lastWordEnd end offset (exclusive) of the previous word
     * @param begin start offset of the current word
     * @param end end offset (exclusive) of the current word
     */
    private static void checkRepeatedWords(XmlContext context, Element element, Node node,
            String text, int lastWordBegin, int lastWordEnd, int begin, int end) {
        int length = end - begin;
        if (lastWordBegin == -1 || length != lastWordEnd - lastWordBegin || length <= 1) {
            return;
        }

        // See whether we have a repeated word
        for (int offset = 0; offset < length; offset++) {
            if (text.charAt(lastWordBegin + offset) != text.charAt(begin + offset)) {
                return;
            }
        }

        if (onlySpace(text, lastWordEnd, begin) && isTranslatable(element)) {
            reportRepeatedWord(context, node, text, lastWordBegin, begin, end);
        }
    }

    /**
     * Tests whether a range of the given text consists solely of whitespace.
     *
     * @param text the text to inspect
     * @param fromInclusive offset of the first character to inspect
     * @param toExclusive offset just past the last character to inspect
     * @return true if every character in the range is whitespace (including when
     *         the range is empty), false otherwise
     */
    private static boolean onlySpace(String text, int fromInclusive, int toExclusive) {
        int position = fromInclusive;
        while (position < toExclusive && Character.isWhitespace(text.charAt(position))) {
            position++;
        }

        // Reaching the end of the range means no non-whitespace character was found
        return position >= toExclusive;
    }

    /**
     * UTF-8 variant of the typo scan, used once a non-ASCII letter has been seen.
     * <p>
     * Words are located in the byte array and looked up there, while a parallel
     * character offset into {@code text} is maintained so that errors can be
     * reported against the original string.
     *
     * @param context the XML context of the file being checked
     * @param element the enclosing element, consulted for its translatable attribute
     * @param node the text node the text came from; used for error locations
     * @param utf8Text the UTF-8 encoding of {@code text}, or of a suffix of it
     * @param byteStart offset in {@code utf8Text} to start scanning at
     * @param byteEnd offset in {@code utf8Text} to stop scanning at (exclusive)
     * @param text the original text
     * @param charStart the character offset in {@code text} that corresponds to
     *            {@code byteStart}
     */
    private void check(XmlContext context, Element element, Node node, byte[] utf8Text,
            int byteStart, int byteEnd, String text, int charStart) {
        assert mLookup != null;
        int lastWordBegin = -1;
        int lastWordEnd = -1;
        int index = byteStart;
        while (index < byteEnd) {
            // Find beginning of word
            while (index < byteEnd) {
                byte b = utf8Text[index];
                if (b == '\\') {
                    // Skip the backslash and then (below) the byte it escapes
                    index++;
                    charStart++;
                    if (index < byteEnd) {
                        b = utf8Text[index];
                    }
                } else if (isLetter(b)) {
                    break;
                }
                index++;
                charStart += utf16Length(b);
            }

            if (index >= byteEnd) {
                return;
            }
            int charEnd = charStart;
            int begin = index;

            // Find end of word. Unicode has the nice property that even 2nd, 3rd and 4th
            // bytes won't match these ASCII characters (because the high bit must be set there)
            while (index < byteEnd) {
                byte b = utf8Text[index];
                if (b == '\\') {
                    index++;
                    charEnd++;
                    if (index < byteEnd) {
                        b = utf8Text[index++];
                        charEnd += utf16Length(b);
                    }
                    break;
                } else if (!isLetter(b)) {
                    break;
                }
                index++;
                charEnd += utf16Length(b);
            }

            int end = index;
            List<String> replacements = mLookup.getTypos(utf8Text, begin, end);
            if (replacements != null && isTranslatable(element)) {
                reportTypo(context, node, text, charStart, replacements);
            }

            checkRepeatedWords(context, element, node, text, lastWordBegin, lastWordEnd, charStart,
                    charEnd);

            lastWordBegin = charStart;
            lastWordEnd = charEnd;
            charStart = charEnd;
        }
    }

    /**
     * Returns how many Java chars (UTF-16 code units) the UTF-8 sequence starting
     * with the given byte contributes to the decoded string.
     *
     * @param b a byte from a UTF-8 encoded string
     * @return 0 for a continuation byte (10xxxxxx), 2 for the lead byte of a
     *         four-byte sequence (which decodes to a surrogate pair), and 1 for
     *         ASCII bytes and all other lead bytes
     */
    private static int utf16Length(byte b) {
        if ((b & 0xC0) == 0x80) {
            return 0;
        }
        if ((b & 0xF8) == 0xF0) {
            return 2;
        }
        return 1;
    }

    /**
     * Tells whether the given element's text is subject to translation.
     *
     * @param element the element to inspect
     * @return true when the element has no translatable attribute, or when that
     *         attribute's value parses as boolean true
     */
    private static boolean isTranslatable(Element element) {
        Attr attribute = element.getAttributeNode(ATTR_TRANSLATABLE);
        if (attribute == null) {
            return true;
        }
        return Boolean.parseBoolean(attribute.getValue());
    }

    /**
     * Reports the typo found at the given offset and suggests the given replacements.
     * <p>
     * Nothing is reported when fewer than two entries are supplied, or when the
     * first suggestion is identical to the word as written. When the first
     * suggestion differs from the word only in case, a capitalization message is
     * produced instead of the misspelling message.
     *
     * @param context the XML context of the file being checked
     * @param node the text node containing the typo; used for the error location
     * @param text the text containing the typo
     * @param begin the character offset in {@code text} where the typo starts
     * @param replacements the typo itself at index 0, followed by one or more
     *            suggested replacements
     */
    private static void reportTypo(XmlContext context, Node node, String text, int begin,
            List<String> replacements) {
        if (replacements.size() < 2) {
            return;
        }

        String typo = replacements.get(0);
        // Use the word as it appears in the text (its case may differ from the
        // lookup's entry)
        String word = text.substring(begin, begin + typo.length());

        String first = null;
        String message;

        boolean isCapitalized = Character.isUpperCase(word.charAt(0));
        StringBuilder sb = new StringBuilder(40);
        for (int i = 1, n = replacements.size(); i < n; i++) {
            String replacement = replacements.get(i);
            if (first == null) {
                first = replacement;
            }
            if (sb.length() > 0) {
                sb.append(" or ");
            }
            sb.append('"');
            if (isCapitalized && !replacement.isEmpty()) {
                // Mirror the capitalization of the word as written
                sb.append(Character.toUpperCase(replacement.charAt(0)));
                sb.append(replacement.substring(1));
            } else {
                sb.append(replacement);
            }
            sb.append('"');
        }

        if (first != null && first.equalsIgnoreCase(word)) {
            if (first.equals(word)) {
                return;
            }
            message = String.format(
                    "\"%1$s\" is usually capitalized as \"%2$s\"",
                    word, first);
        } else {
            message = String.format(
                    "\"%1$s\" is a common misspelling; did you mean %2$s ?",
                    word, sb.toString());
        }

        int end = begin + word.length();
        context.report(ISSUE, node, context.getLocation(node, begin, end), message);
    }

    /**
     * Reports a repeated word. The reported location spans from the start of the
     * first occurrence to the end of the second one.
     *
     * @param context the XML context of the file being checked
     * @param node the text node containing the words
     * @param text the text containing the words
     * @param lastWordBegin start offset of the first occurrence
     * @param begin start offset of the second occurrence
     * @param end end offset (exclusive) of the second occurrence
     */
    private static void reportRepeatedWord(XmlContext context, Node node, String text,
            int lastWordBegin,
            int begin, int end) {
        String repeated = text.substring(begin, end);
        String message = String.format(
                "Repeated word \"%1$s\" in message: possible typo",
                repeated);
        context.report(ISSUE, node, context.getLocation(node, lastWordBegin, end), message);
    }

    /**
     * Returns the suggested replacements, if any, for the given typo. The error
     * message must be one supplied by lint.
     * <p>
     * The message is expected to contain the typo in double quotes, followed by
     * zero or more quoted suggestions. If the message does not contain a complete
     * quoted typo, an empty list is returned. This implementation never returns null.
     *
     * @param errorMessage the error message
     * @param format the format of the error message
     * @return a list of replacement words suggested by the error message
     */
    @Nullable
    public static List<String> getSuggestions(@NonNull String errorMessage,
            @NonNull TextFormat format) {
        errorMessage = format.toText(errorMessage);

        // The words are all in quotes; the first word is the misspelling,
        // the other words are the suggested replacements
        List<String> words = new ArrayList<String>();

        // Skip the typo. Without both of its quotes there is nothing reliable to
        // parse: searching on from a failed indexOf would restart at offset 0.
        int typoOpen = errorMessage.indexOf('"');
        if (typoOpen == -1) {
            return words;
        }
        int typoClose = errorMessage.indexOf('"', typoOpen + 1);
        if (typoClose == -1) {
            return words;
        }
        int index = typoClose + 1;

        while (true) {
            index = errorMessage.indexOf('"', index);
            if (index == -1) {
                break;
            }
            index++;
            int start = index;
            index = errorMessage.indexOf('"', index);
            if (index == -1) {
                // Unterminated suggestion: take everything up to the end
                index = errorMessage.length();
            }
            words.add(errorMessage.substring(start, index));
            index++;
        }

        return words;
    }

    /**
     * Extracts the typo word from an error message produced by this detector.
     * The typo is the first double-quoted word in the message.
     *
     * @param errorMessage the error message produced earlier by this detector
     * @param format the format of the error message
     * @return the typo, or null if the message contains no complete quoted word
     */
    @Nullable
    public static String getTypo(@NonNull String errorMessage, @NonNull TextFormat format) {
        String plainText = format.toText(errorMessage);

        // The words are all in quotes
        int wordStart = plainText.indexOf('"') + 1;
        int closingQuote = plainText.indexOf('"', wordStart);
        return closingQuote == -1 ? null : plainText.substring(wordStart, closingQuote);
    }
}