// org.languagetool.rules.en.AvsAnRule Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of language-en Show documentation
// There is a newer version: 6.5
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.en;

import org.languagetool.*;
import org.languagetool.rules.*;
import org.languagetool.tools.StringTools;
import org.languagetool.tools.Tools;

import java.util.*;
import java.util.regex.Pattern;

import static java.util.regex.Pattern.*;
import static org.languagetool.rules.en.AvsAnData.getWordsRequiringA;
import static org.languagetool.rules.en.AvsAnData.getWordsRequiringAn;

/**
 * Check if the determiner (if any) preceding a word is:
 * <ul>
 *   <li><i>an</i> if the next word starts with a vowel
 *   <li><i>a</i> if the next word does not start with a vowel
 * </ul>
 *  This rule loads some exceptions from external files {@code det_a.txt} and
 *  {@code det_an.txt} (e.g. for <i>an hour</i>).
 * 
 * @author Daniel Naber
 */
public class AvsAnRule extends Rule {

  /** The indefinite article that should precede a given word. */
  enum Determiner {
    A, AN, A_OR_AN, UNKNOWN
  }

  /** Characters outside this set are removed from a word before it is looked up. */
  private static final Pattern CLEANUP_PATTERN = Pattern.compile("[^αa-zA-Z0-9.;,:']");
  /** Tokens made up only of these delimiters may sit between the article and its word. */
  private static final Pattern DELIM_PATTERN = Pattern.compile("[-\"“'‘()\\[\\]]+");
  /** Splits compounds such as "one-way" so that only the first part is considered. */
  private static final Pattern DASH_QUOTE_PATTERN = Pattern.compile("[-']");

  /**
   * Creates the rule and registers its category, issue type, help URL and example pair.
   *
   * @param messages resource bundle used to resolve the category
   */
  public AvsAnRule(ResourceBundle messages) {
    super.setCategory(Categories.MISC.getCategory(messages));
    setLocQualityIssueType(ITSIssueType.Misspelling);
    setUrl(Tools.getUrl("https://languagetool.org/insights/post/indefinite-articles/"));
    addExamplePair(Example.wrong("The train arrived a hour ago."),
                   Example.fixed("The train arrived an hour ago."));
  }

  @Override
  public String getId() {
    return "EN_A_VS_AN";
  }

  @Override
  public String getDescription() {
    return "Use of 'a' vs. 'an'";
  }

  @Override
  public int estimateContextForSureMatch() {
    return 1;
  }

  /**
   * Reports every "a"/"an" whose following word requires the other article.
   *
   * <p>A token tagged {@code DT} is remembered as the candidate article; the token after it
   * (optionally skipping pure delimiter tokens such as an opening quote) is then checked via
   * {@link #getCorrectDeterminerFor(AnalyzedTokenReadings)}.
   *
   * @param sentence the analyzed sentence to check
   * @return the matches found, possibly empty
   */
  @Override
  public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    int prevTokenIndex = 0;  // index of the pending determiner token, 0 if there is none
    for (int i = 1; i < tokens.length; i++) {  // ignoring token 0, i.e., SENT_START
      AnalyzedTokenReadings token = tokens[i];
      String prevTokenStr = prevTokenIndex > 0 ? tokens[prevTokenIndex].getToken() : null;
      boolean isSentenceStart = prevTokenIndex == 1;
      boolean equalsA;
      boolean equalsAn;
      if (isSentenceStart) {
        // the first word of a sentence may be capitalized ("A", "An")
        equalsA = "a".equalsIgnoreCase(prevTokenStr);
        equalsAn = "an".equalsIgnoreCase(prevTokenStr);
      } else {
        equalsA = "a".equals(prevTokenStr);
        equalsAn = "an".equals(prevTokenStr);
      }
      if (equalsA || equalsAn) {
        Determiner determiner = getCorrectDeterminerFor(token);
        String msg = null;
        // The <suggestion> markup lets RuleMatch offer the replacement as a suggested fix.
        if (equalsA && determiner == Determiner.AN) {
          String replacement = StringTools.startsWithUppercase(prevTokenStr) ? "An" : "an";
          msg = "Use <suggestion>" + replacement + "</suggestion> instead of '" + prevTokenStr + "' if the following "+
                  "word starts with a vowel sound, e.g. 'an article', 'an hour'.";
        } else if (equalsAn && determiner == Determiner.A) {
          String replacement = StringTools.startsWithUppercase(prevTokenStr) ? "A" : "a";
          msg = "Use <suggestion>" + replacement + "</suggestion> instead of '" + prevTokenStr + "' if the following "+
                  "word doesn't start with a vowel sound, e.g. 'a sentence', 'a university'.";
        }
        if (msg != null) {
          // First position pair: the article itself; second pair: article through the current token.
          AnalyzedTokenReadings article = tokens[prevTokenIndex];
          RuleMatch match = new RuleMatch(
              this, sentence, article.getStartPos(), article.getEndPos(),
                  article.getStartPos(), token.getEndPos(), msg, "Wrong article");
          ruleMatches.add(match);
        }
      }
      String nextToken = "";
      if (i + 1 < tokens.length) {
        nextToken = tokens[i + 1].getToken();
      }
      if (token.hasPosTag("DT")) {
        prevTokenIndex = i;
      } else if (nextToken.length() > 1 && DELIM_PATTERN.matcher(token.getToken()).matches()) {
        // skip e.g. the quote in >>an "industry party"<<, keeping the pending determiner
      } else {
        prevTokenIndex = 0;
      }
    }
    return toRuleMatchArray(ruleMatches);
  }

  /**
   * Adds "a" or "an" to the English noun. Used for suggesting the proper form of the indefinite article.
   * For the rare cases where both "a" and "an" are considered okay (e.g. for "historical"), "a" is returned.
   * @param origWord Word that needs an article.
   * @return String containing the word with a determiner, or just the word if no determiner
   *         could be decided on (e.g. for an abbreviation).
   */
  public String suggestAorAn(String origWord) {
    AnalyzedTokenReadings token = new AnalyzedTokenReadings(new AnalyzedToken(origWord, null, null), 0);
    Determiner determiner = getCorrectDeterminerFor(token);
    if (determiner == Determiner.A || determiner == Determiner.A_OR_AN) {
      return "a " + StringTools.lowercaseFirstCharIfCapitalized(origWord);
    } else if (determiner == Determiner.AN) {
      return "an " + StringTools.lowercaseFirstCharIfCapitalized(origWord);
    } else {
      return origWord;
    }
  }

  /**
   * Decides which indefinite article the given token requires.
   *
   * <p>The word is reduced to its first dash/apostrophe-separated part and stripped of
   * characters not accepted by the cleanup pattern. It is then looked up in the "requires a"
   * and "requires an" word lists. If neither list contains it, the first character decides,
   * except for all-uppercase or mixed-case words, which yield {@link Determiner#UNKNOWN}.
   *
   * @param token the token that follows the article
   * @return the required determiner, {@link Determiner#A_OR_AN} if both lists contain the word,
   *         or {@link Determiner#UNKNOWN} if no decision can be made
   */
  static Determiner getCorrectDeterminerFor(AnalyzedTokenReadings token) {
    String word = token.getToken();
    String[] parts = DASH_QUOTE_PATTERN.split(word);  // for example, in "one-way" only "one" is relevant
    if (parts.length >= 1 && !parts[0].equalsIgnoreCase("a")) {  // avoid false alarm on "A-levels are..."
      word = parts[0];
    }
    if (token.isWhitespaceBefore() || !"-".equals(word)) { // e.g., 'a- or anti- are prefixes'
      word = CLEANUP_PATTERN.matcher(word).replaceAll("");         // e.g. >>an "industry party"<<
      if (StringTools.isEmpty(word)) {
        return Determiner.UNKNOWN;
      }
    }
    // Locale.ROOT keeps the lookup independent of the JVM's default locale
    // (e.g. Turkish would lowercase "I" to a dotless "ı").
    String lowercaseWord = word.toLowerCase(Locale.ROOT);
    boolean requiresA = getWordsRequiringA().contains(lowercaseWord) || getWordsRequiringA().contains(word);
    boolean requiresAn = getWordsRequiringAn().contains(lowercaseWord) || getWordsRequiringAn().contains(word);
    if (requiresA && requiresAn) {
      return Determiner.A_OR_AN;   // e.g. for 'historical'
    }
    if (requiresAn) {
      return Determiner.AN;
    }
    if (requiresA) {
      return Determiner.A;
    }
    if (StringTools.isAllUppercase(word) || StringTools.isMixedCase(word)) {
      // we don't know how all-uppercase words (often abbreviations) are pronounced,
      // so never complain about these
      return Determiner.UNKNOWN;
    }
    return isVowel(word.charAt(0)) ? Determiner.AN : Determiner.A;
  }

  /** Returns whether {@code c} is one of a, e, i, o, u, ignoring case. */
  private static boolean isVowel(char c) {
    char lc = Character.toLowerCase(c);
    return lc == 'a' || lc == 'e' || lc == 'i' || lc == 'o' || lc == 'u';
  }

}