All downloads are free. The search and download functionality uses the official Maven repository.

org.languagetool.rules.AbstractSimpleReplaceRule Maven / Gradle / Ivy

Go to download

LanguageTool is open-source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it detects some grammar problems.

There is a newer version: 6.5
Show newest version
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.ResourceBundle;
import java.util.stream.Collectors;

import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.tools.StringTools;

/**
 * A rule that matches words which should not be used and suggests
 * correct ones instead. Loads the relevant words from
 * rules/XX/replace.txt, where XX is a code of the language.
 * 
 * @author Andriy Rysin
 */
public abstract class AbstractSimpleReplaceRule extends Rule {

  protected boolean ignoreTaggedWords = false;
  private boolean checkLemmas = true;

  protected abstract Map> getWrongWords();

  protected static Map> load(String path) {
    return new SimpleReplaceDataLoader().loadWords(path);
  }

  /**
   * Indicates if the rule is case-sensitive. Default value is true.
   * 
   * @return true if the rule is case-sensitive, false otherwise.
   */
  public boolean isCaseSensitive() {
    return true;
  }

  /**
   * @return the locale used for case conversion when {@link #isCaseSensitive()}
   *         is set to false.
   */
  public Locale getLocale() {
    return Locale.getDefault();
  }

  /**
   * Skip words that are known in the POS tagging dictionary, assuming they
   * cannot be incorrect.
   * @since 2.3
   */
  public void setIgnoreTaggedWords() {
    ignoreTaggedWords = true;
  }

  public AbstractSimpleReplaceRule(ResourceBundle messages)
      throws IOException {
    super.setCategory(Categories.MISC.getCategory(messages));
  }

  @Override
  public String getId() {
    return "SIMPLE_REPLACE";
  }

  @Override
  public String getDescription() {
    return "Checks for wrong words/phrases";
  }

  public String getMessage(String tokenStr, List replacements) {
    return tokenStr + " is not valid. Use: "
        + String.join(", ", replacements) + ".";
  }

  public String getShort() {
    return "Wrong word";
  }

  private String cleanup(String word) {
    return isCaseSensitive() ? word : word.toLowerCase(getLocale()); 
  }

  @Override
  public RuleMatch[] match(AnalyzedSentence sentence) {
    List ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();

    for (AnalyzedTokenReadings tokenReadings : tokens) {

      // short for SENT_START
      if( JLanguageTool.SENTENCE_START_TAGNAME.equals(tokenReadings.getAnalyzedToken(0).getPOSTag()) )
        continue;

      //this rule is used mostly for spelling, so ignore both immunized
      // and speller-ignorable rules
      if (tokenReadings.isImmunized() || tokenReadings.isIgnoredBySpeller()) {
        continue;
      }

      if (ignoreTaggedWords && isTagged(tokenReadings)) {
        continue;
      }

      List matchesForToken = findMatches(tokenReadings, sentence);
      ruleMatches.addAll( matchesForToken );
    }
    
    return toRuleMatchArray(ruleMatches);
  }

  protected List findMatches(AnalyzedTokenReadings tokenReadings, AnalyzedSentence sentence) {
    List ruleMatches = new ArrayList<>();

    String originalTokenStr = tokenReadings.getToken();
    String tokenString = cleanup(originalTokenStr);

    // try first with the original word, then with the all lower-case version
    List possibleReplacements = getWrongWords().get(originalTokenStr);
    if (possibleReplacements == null) {
      possibleReplacements = getWrongWords().get(tokenString);
    }

    if (possibleReplacements == null && checkLemmas) {
      possibleReplacements = new ArrayList<>();

      List lemmas = new ArrayList<>();
      for (AnalyzedToken analyzedToken : tokenReadings.getReadings()) {
        String lemma = analyzedToken.getLemma();
        if (lemma != null && getWrongWords().containsKey(lemma) && ! lemmas.contains(lemma) ) {
          lemmas.add(cleanup(lemma));
        }
      }

      for (String lemma: lemmas) {
        List replacements = getWrongWords().get(lemma);
        if (replacements != null) {
          possibleReplacements.addAll(replacements);
        }
      }

      possibleReplacements = possibleReplacements.stream().distinct().collect(Collectors.toList());
    }

    if (possibleReplacements != null && possibleReplacements.size() > 0) {
      List replacements = new ArrayList<>();
      replacements.addAll(possibleReplacements);
      if (replacements.contains(originalTokenStr)) {
        replacements.remove(originalTokenStr);
      }
      if (replacements.size() > 0) {
        RuleMatch potentialRuleMatch = createRuleMatch(tokenReadings, replacements, sentence);
        ruleMatches.add(potentialRuleMatch);
      }
    }
    
    return ruleMatches;
  }

  /**
   * This method allows to override which tags will mark token as tagged
   * @return returns true if token has valid tag
   */
  protected boolean isTagged(AnalyzedTokenReadings tokenReadings) {
    return tokenReadings.isTagged();
  }

  protected RuleMatch createRuleMatch(AnalyzedTokenReadings tokenReadings,
                                      List replacements, AnalyzedSentence sentence) {
    String tokenString = tokenReadings.getToken();
    int pos = tokenReadings.getStartPos();

    RuleMatch potentialRuleMatch = new RuleMatch(this, sentence, pos, pos
        + tokenString.length(), getMessage(tokenString, replacements), getShort());

    if (!isCaseSensitive() && StringTools.startsWithUppercase(tokenString)) {
      for (int i = 0; i < replacements.size(); i++) {
        replacements.set(i, StringTools.uppercaseFirstChar(replacements.get(i)));
      }
    }

    potentialRuleMatch.setSuggestedReplacements(replacements);

    return potentialRuleMatch;
  }

  /**
   * @since 2.5
   */
  public boolean isCheckLemmas() {
    return checkLemmas;
  }

  /**
   * Used to disable matching lemmas.
   * @since 2.5
   */
  public void setCheckLemmas(boolean checkLemmas) {
    this.checkLemmas = checkLemmas;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy