All downloads are free. The search and download functionality uses the official Maven repository.

org.languagetool.rules.AbstractSimpleReplaceRule Maven / Gradle / Ivy

Go to download

LanguageTool is open-source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it also detects some grammar problems.

There is a newer version: 6.4
Show newest version
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules;

import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;

import org.jetbrains.annotations.Nullable;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.tools.StringTools;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A rule that matches words which should not be used and suggests
 * correct ones instead. Loads the relevant words from
 * rules/XX/replace.txt, where XX is a code of the language.
 * 
 * @author Andriy Rysin
 */
public abstract class AbstractSimpleReplaceRule extends Rule {

  protected boolean ignoreTaggedWords = false;
  protected boolean subRuleSpecificIds;

  private static final Logger logger = LoggerFactory.getLogger(AbstractSimpleReplaceRule.class);
  private boolean checkLemmas = true;

  protected abstract Map> getWrongWords();

  protected static Map> loadFromPath(String path) {
    return new SimpleReplaceDataLoader().loadWords(path);
  }

  /**
   * @since 5.0
   */
  protected static Map> loadFromPath(String... paths) {
    SimpleReplaceDataLoader loader = new SimpleReplaceDataLoader();
    Map> map = new HashMap<>();
    for (String path : paths) {
      map.putAll(loader.loadWords(path));
    }
    return map;
  }

  /**
   * Indicates if the rule is case-sensitive. Default value is true.
   * 
   * @return true if the rule is case-sensitive, false otherwise.
   */
  public boolean isCaseSensitive() {
    return true;
  }

  /**
   * @return the locale used for case conversion when {@link #isCaseSensitive()}
   *         is set to false.
   */
  public Locale getLocale() {
    return Locale.getDefault();
  }

  /**
   * Skip words that are known in the POS tagging dictionary, assuming they
   * cannot be incorrect.
   * @since 2.3
   */
  public void setIgnoreTaggedWords() {
    ignoreTaggedWords = true;
  }

  public AbstractSimpleReplaceRule(ResourceBundle messages) {
    super(messages);
    super.setCategory(Categories.MISC.getCategory(messages));
  }

  @Override
  public String getId() {
    return "SIMPLE_REPLACE";
  }

  @Override
  public String getDescription() {
    return "Checks for wrong words/phrases";
  }

  public String getMessage(String tokenStr, List replacements) {
    return tokenStr + " is not valid. Use: "
        + String.join(", ", replacements) + ".";
  }

  public String getShort() {
    return "Wrong word";
  }

  private String cleanup(String word) {
    return isCaseSensitive() ? word : word.toLowerCase(getLocale()); 
  }

  @Override
  public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
    List ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    for (AnalyzedTokenReadings tokenReadings : tokens) {
      // short for SENT_START
      if( JLanguageTool.SENTENCE_START_TAGNAME.equals(tokenReadings.getAnalyzedToken(0).getPOSTag()) ||
          tokenReadings.isImmunized() ||        //this rule is used mostly for spelling, so ignore both immunized
          tokenReadings.isIgnoredBySpeller() || //and speller-ignorable rules
          isTokenException(tokenReadings) ||
          (ignoreTaggedWords && isTagged(tokenReadings))
      ) {
        continue;
      }
      List matchesForToken = findMatches(tokenReadings, sentence);
      ruleMatches.addAll(matchesForToken);
    }
    return toRuleMatchArray(ruleMatches);
  }

  protected List findMatches(AnalyzedTokenReadings tokenReadings, AnalyzedSentence sentence) throws IOException {
    List ruleMatches = new ArrayList<>();

    String originalTokenStr = tokenReadings.getToken();
    String tokenString = cleanup(originalTokenStr);
    boolean isAllUppercase = StringTools.isAllUppercase(originalTokenStr);

    // try first with the original word, then with the all lower-case version
    List possibleReplacements = getWrongWords().get(originalTokenStr);
    if (possibleReplacements == null) {
      possibleReplacements = getWrongWords().get(tokenString);
    }

    if (possibleReplacements == null && checkLemmas) {
      possibleReplacements = new ArrayList<>();

      List lemmas = new ArrayList<>();
      for (AnalyzedToken analyzedToken : tokenReadings.getReadings()) {
        String lemma = analyzedToken.getLemma();
        if (lemma != null && getWrongWords().containsKey(lemma) && ! lemmas.contains(lemma) ) {
          lemmas.add(cleanup(lemma));
        }
      }

      for (String lemma : lemmas) {
        List replacements = getWrongWords().get(lemma);
        if (replacements != null) {
          Synthesizer synth = getSynthesizer();
          if (synth != null) {
            for (String replacementLemma : replacements) {
              for (AnalyzedToken at : tokenReadings.getReadings()) {
                if (at.getLemma() == null) {
                  logger.warn("at.getLemma() == null for " + at + ", replacementLemma: " + replacementLemma);
                }
                AnalyzedToken newAt = new AnalyzedToken(at.getLemma(), at.getPOSTag(), replacementLemma);
                String[] s = synth.synthesize(newAt, at.getPOSTag());
                possibleReplacements.addAll(Arrays.asList(s));
              }
            }
          } else {
            possibleReplacements.addAll(replacements);
          }
        }
      }
      possibleReplacements = possibleReplacements.stream().distinct().collect(Collectors.toList());
    }

    if (possibleReplacements != null && possibleReplacements.size() > 0) {
      List replacements = new ArrayList<>();
      if (isAllUppercase) {
        for (String s: possibleReplacements) {
          replacements.add(s.toUpperCase());
        }
      } else {
        replacements = new ArrayList<>(possibleReplacements);  
      }
      replacements.remove(originalTokenStr);
      if (replacements.size() > 0) {
        RuleMatch potentialRuleMatch = createRuleMatch(tokenReadings, replacements, sentence, originalTokenStr);
        ruleMatches.add(potentialRuleMatch);
      }
    }
    
    return ruleMatches;
  }

  /**
   * This method allows to override which tags will mark token as tagged
   * @return returns true if token has valid tag
   */
  protected boolean isTagged(AnalyzedTokenReadings tokenReadings) {
    return tokenReadings.isTagged();
  }

  protected RuleMatch createRuleMatch(AnalyzedTokenReadings tokenReadings,
                                      List replacements, AnalyzedSentence sentence, String originalTokenStr) {
    String tokenString = tokenReadings.getToken();
    int pos = tokenReadings.getStartPos();
    
    RuleMatch potentialRuleMatch = null;
    potentialRuleMatch = new RuleMatch(this, sentence, pos, pos
        + tokenString.length(), getMessage(tokenString, replacements), getShort());
    if (subRuleSpecificIds) {
      potentialRuleMatch.setSpecificRuleId(StringTools.toId(getId() + "_" + originalTokenStr));
    }
    if (!isCaseSensitive() && StringTools.startsWithUppercase(tokenString)) {
      for (int i = 0; i < replacements.size(); i++) {
        replacements.set(i, StringTools.uppercaseFirstChar(replacements.get(i)));
      }
    }

    potentialRuleMatch.setSuggestedReplacements(replacements);

    return potentialRuleMatch;
  }

  /**
   * @since 2.5
   */
  public boolean isCheckLemmas() {
    return checkLemmas;
  }

  /**
   * Used to disable matching lemmas.
   * @since 2.5
   */
  public void setCheckLemmas(boolean checkLemmas) {
    this.checkLemmas = checkLemmas;
  }

  /**
   * Synthesizer to generate inflected suggestions
   * @since 5.1
   */
  @Nullable
  public Synthesizer getSynthesizer() {
    return null;
  }

  /*
   * @since 5.2
   */
  protected boolean isTokenException(AnalyzedTokenReadings atr) {
    return false;
  }
  
  /**
   * If this is set, each replacement pair will have its own rule ID, making rule deactivations more specific.
   * @since 5.5
   */
  public void useSubRuleSpecificIds() {
    subRuleSpecificIds = true;
  }
    
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy