All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.languagetool.rules.en.AvsAnRule Maven / Gradle / Ivy

There is a newer version: 6.5
Show newest version
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.en;

import org.languagetool.*;
import org.languagetool.rules.*;
import org.languagetool.tools.StringTools;
import org.languagetool.tools.Tools;

import java.util.*;
import java.util.regex.Pattern;

import static java.util.regex.Pattern.*;
import static org.languagetool.rules.en.AvsAnData.getWordsRequiringA;
import static org.languagetool.rules.en.AvsAnData.getWordsRequiringAn;

/**
 * Check if the determiner (if any) preceding a word is:
 * 
    *
  • an if the next word starts with a vowel *
  • a if the next word does not start with a vowel *
* This rule loads some exceptions from external files {@code det_a.txt} and * {@code det_an.txt} (e.g. for an hour). * * @author Daniel Naber */ public class AvsAnRule extends Rule { enum Determiner { A, AN, A_OR_AN, UNKNOWN } private static final Pattern cleanupPattern = compile("[^αa-zA-Z0-9.;,:']"); private static final Pattern delimPattern = compile("[-\"“'‘()\\[\\]]+"); private static final Pattern dashQuotePattern = compile("[-']"); public AvsAnRule(ResourceBundle messages) { super.setCategory(Categories.MISC.getCategory(messages)); setLocQualityIssueType(ITSIssueType.Misspelling); setUrl(Tools.getUrl("https://languagetool.org/insights/post/indefinite-articles/")); addExamplePair(Example.wrong("The train arrived a hour ago."), Example.fixed("The train arrived an hour ago.")); } @Override public String getId() { return "EN_A_VS_AN"; } @Override public String getDescription() { return "Use of 'a' vs. 'an'"; } @Override public int estimateContextForSureMatch() { return 1; } @Override public RuleMatch[] match(AnalyzedSentence sentence) { List ruleMatches = new ArrayList<>(); AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace(); int prevTokenIndex = 0; boolean isSentenceStart; boolean equalsA; boolean equalsAn; for (int i = 1; i < tokens.length; i++) { // ignoring token 0, i.e., SENT_START AnalyzedTokenReadings token = tokens[i]; String prevTokenStr = prevTokenIndex > 0 ? tokens[prevTokenIndex].getToken() : null; isSentenceStart = prevTokenIndex == 1; if (!isSentenceStart) { equalsA = "a".equals(prevTokenStr); equalsAn = "an".equals(prevTokenStr); } else { equalsA = "a".equalsIgnoreCase(prevTokenStr); equalsAn = "an".equalsIgnoreCase(prevTokenStr); } if (equalsA || equalsAn) { Determiner determiner = getCorrectDeterminerFor(token); String msg = null; if (equalsA && determiner == Determiner.AN) { String replacement = StringTools.startsWithUppercase(prevTokenStr) ? "An" : "an"; msg = "Use " + replacement + " instead of '" + prevTokenStr + "' if the following "+ "word starts with a vowel sound, e.g. 'an article', 'an hour'."; } else if (equalsAn && determiner == Determiner.A) { String replacement = StringTools.startsWithUppercase(prevTokenStr) ? "A" : "a"; msg = "Use " + replacement + " instead of '" + prevTokenStr + "' if the following "+ "word doesn't start with a vowel sound, e.g. 'a sentence', 'a university'."; } if (msg != null) { RuleMatch match = new RuleMatch( this, sentence, tokens[prevTokenIndex].getStartPos(), tokens[prevTokenIndex].getEndPos(), tokens[prevTokenIndex].getStartPos(), token.getEndPos(), msg, "Wrong article"); ruleMatches.add(match); } } String nextToken = ""; if (i + 1 < tokens.length) { nextToken = tokens[i + 1].getToken(); } if (token.hasPosTag("DT")) { prevTokenIndex = i; } else if (nextToken.length() > 1 && delimPattern.matcher(token.getToken()).matches()) { // skip e.g. the quote in >>an "industry party"<< } else { prevTokenIndex = 0; } } return toRuleMatchArray(ruleMatches); } /** * Adds "a" or "an" to the English noun. Used for suggesting the proper form of the indefinite article. * For the rare cases where both "a" and "an" are considered okay (e.g. for "historical"), "a" is returned. * @param origWord Word that needs an article. * @return String containing the word with a determiner, or just the word if the word is an abbreviation. */ public String suggestAorAn(String origWord) { AnalyzedTokenReadings token = new AnalyzedTokenReadings(new AnalyzedToken(origWord, null, null), 0); Determiner determiner = getCorrectDeterminerFor(token); if (determiner == Determiner.A || determiner == Determiner.A_OR_AN) { return "a " + StringTools.lowercaseFirstCharIfCapitalized(origWord); } else if (determiner == Determiner.AN) { return "an " + StringTools.lowercaseFirstCharIfCapitalized(origWord); } else { return origWord; } } static Determiner getCorrectDeterminerFor(AnalyzedTokenReadings token) { String word = token.getToken(); Determiner determiner = Determiner.UNKNOWN; String[] parts = dashQuotePattern.split(word); // for example, in "one-way" only "one" is relevant if (parts.length >= 1 && !parts[0].equalsIgnoreCase("a")) { // avoid false alarm on "A-levels are..." word = parts[0]; } if (token.isWhitespaceBefore() || !"-".equals(word)) { // e.g., 'a- or anti- are prefixes' word = cleanupPattern.matcher(word).replaceAll(""); // e.g. >>an "industry party"<< if (StringTools.isEmpty(word)) { return Determiner.UNKNOWN; } } if (getWordsRequiringA().contains(word.toLowerCase()) || getWordsRequiringA().contains(word)) { determiner = Determiner.A; } if (getWordsRequiringAn().contains(word.toLowerCase()) || getWordsRequiringAn().contains(word)) { if (determiner == Determiner.A) { determiner = Determiner.A_OR_AN; // e.g. for 'historical' } else { determiner = Determiner.AN; } } if (determiner == Determiner.UNKNOWN) { char tokenFirstChar = word.charAt(0); if (StringTools.isAllUppercase(word) || StringTools.isMixedCase(word)) { // we don't know how all-uppercase words (often abbreviations) are pronounced, // so never complain about these determiner = Determiner.UNKNOWN; } else if (isVowel(tokenFirstChar)) { determiner = Determiner.AN; } else { determiner = Determiner.A; } } return determiner; } private static boolean isVowel(char c) { char lc = Character.toLowerCase(c); return lc == 'a' || lc == 'e' || lc == 'i' || lc == 'o' || lc == 'u'; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy