org.languagetool.rules.AbstractCompoundRule Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
Show all versions of languagetool-core Show documentation
LanguageTool is open-source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it also detects some grammar problems.
/* LanguageTool, a natural language style checker
* Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules;
import org.apache.commons.lang3.StringUtils;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.LinguServices;
import org.languagetool.UserConfig;
import org.languagetool.tools.StringTools;
import java.io.IOException;
import java.util.*;
import java.util.stream.Stream;
/**
* Checks that compounds (if in the list) are not written as separate words.
*
* @author Daniel Naber, Marcin Miłkowski (refactoring)
*/
public abstract class AbstractCompoundRule extends Rule {

  /** Maximum number of previous tokens kept in the sliding window that candidates are built from. */
  static final int MAX_TERMS = 5;

  private final String withHyphenMessage;
  private final String withoutHyphenMessage;
  private final String withOrWithoutHyphenMessage;
  private final String shortDesc;

  /** Linguistic Service of LO/OO, used for the LO/OO extension; {@code null} in other cases. */
  protected final LinguServices linguServices;
  /** Language of this rule; used by the LO/OO Linguistic Service and for building sub-rule ids. */
  protected final Language lang;
  /** If true, the first word will be uncapitalized before it is compared to the entries in CompoundRuleData. */
  protected boolean sentenceStartsWithUpperCase = true;
  /** If true, each match is reported with a {@code SpecificIdRule} whose id includes the matched compound. */
  protected boolean subRuleSpecificIds;

  @Override
  public abstract String getId();

  @Override
  public abstract String getDescription();

  @Override
  public int estimateContextForSureMatch() {
    return 1;
  }

  /**
   * Switches this rule to reporting matches under compound-specific rule ids
   * (see {@link #match(AnalyzedSentence)}).
   */
  public void useSubRuleSpecificIds() {
    subRuleSpecificIds = true;
  }

  /**
   * Provides the compound lists that {@link #match(AnalyzedSentence)} looks candidates up in.
   * @since 3.0
   */
  public abstract CompoundRuleData getCompoundRuleData();

  /**
   * Creates the rule without a short message; delegates to the seven-argument constructor
   * with {@code null} as {@code shortMessage}.
   *
   * @param messages resource bundle used to look up the rule category
   * @param lang language of this rule
   * @param userConfig user configuration, may be {@code null}
   * @param withHyphenMessage message used when only the hyphenated spelling is suggested
   * @param withoutHyphenMessage message used when only the joined spelling is suggested
   * @param withOrWithoutHyphenMessage message used when both spellings are suggested
   * @throws IOException declared for subclasses; not thrown by the code in this constructor
   * @since 3.0
   */
  public AbstractCompoundRule(ResourceBundle messages, Language lang, UserConfig userConfig,
                              String withHyphenMessage, String withoutHyphenMessage,
                              String withOrWithoutHyphenMessage) throws IOException {
    this(messages, lang, userConfig, withHyphenMessage, withoutHyphenMessage, withOrWithoutHyphenMessage, null);
  }

  /**
   * Creates the rule, puts it into the MISC category and marks it as a misspelling issue type.
   *
   * @param messages resource bundle used to look up the rule category
   * @param lang language of this rule
   * @param userConfig user configuration; when {@code null}, no linguistic service is used
   * @param withHyphenMessage message used when only the hyphenated spelling is suggested
   * @param withoutHyphenMessage message used when only the joined spelling is suggested
   * @param withOrWithoutHyphenMessage message used when both spellings are suggested
   * @param shortMessage short message passed to every created {@code RuleMatch}, may be {@code null}
   * @throws IOException declared for subclasses; not thrown by the code in this constructor
   * @since 3.0
   */
  public AbstractCompoundRule(ResourceBundle messages, Language lang, UserConfig userConfig,
                              String withHyphenMessage, String withoutHyphenMessage,
                              String withOrWithoutHyphenMessage, String shortMessage) throws IOException {
    super.setCategory(Categories.MISC.getCategory(messages));
    this.withHyphenMessage = withHyphenMessage;
    this.withoutHyphenMessage = withoutHyphenMessage;
    this.withOrWithoutHyphenMessage = withOrWithoutHyphenMessage;
    this.shortDesc = shortMessage;
    setLocQualityIssueType(ITSIssueType.Misspelling);
    this.lang = lang;
    this.linguServices = userConfig != null ? userConfig.getLinguServices() : null;
  }

  /**
   * Slides a window of up to {@link #MAX_TERMS} tokens over the sentence and reports every
   * window prefix that is listed as an incorrect compound, suggesting the hyphenated and/or
   * joined spelling.
   *
   * @param sentence the sentence to check
   * @return the matches found, possibly empty
   * @throws IOException if filtering the suggestions fails
   */
  @Override
  public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
    if (tokens.length == 0) {
      // without a first token the window stays empty and the padding below would dereference null
      return toRuleMatchArray(ruleMatches);
    }
    // NOTE(review): assumes getCompoundRuleData() returns the same data on every call
    CompoundRuleData data = getCompoundRuleData();
    RuleMatch prevRuleMatch = null;
    ArrayDeque<AnalyzedTokenReadings> prevTokens = new ArrayDeque<>(MAX_TERMS);
    for (int i = 0; i < tokens.length + MAX_TERMS; i++) {
      AnalyzedTokenReadings token;
      if (i >= tokens.length) {
        // we need to extend the token list so we find matches at the end of the original list:
        token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
      } else {
        token = tokens[i];
      }
      if (i == 0) {
        // the first token only seeds the window
        addToQueue(token, prevTokens);
        continue;
      }
      if (token.isImmunized()) {
        // immunized tokens neither trigger a check nor enter the window
        continue;
      }
      AnalyzedTokenReadings firstMatchToken = prevTokens.peek();
      List<String> stringsToCheck = new ArrayList<>();      // no hyphens spelling
      List<String> origStringsToCheck = new ArrayList<>();  // original upper/lowercase and hyphens spelling
      Map<String, AnalyzedTokenReadings> stringToToken =
          getStringToTokenMap(prevTokens, stringsToCheck, origStringsToCheck);
      // iterate backwards over all potentially incorrect strings to make
      // sure we match longer strings first:
      for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
        String stringToCheck = stringsToCheck.get(k);
        String origStringToCheck = origStringsToCheck.get(k);
        boolean literalHit = data.getIncorrectCompounds().contains(stringToCheck);
        // The digit pattern form is only tried for candidates that themselves have a purely
        // numeric part, and only when the literal lookup failed. The decision is made per
        // candidate so it cannot leak from one candidate to the next.
        boolean digitsHit = false;
        if (!literalHit && Stream.of(stringToCheck.split(" ")).anyMatch(s -> StringUtils.isNumeric(s))) {
          String digitsRegexp = stringToCheck.replaceAll("\\d+", "\\\\d+");
          digitsHit = data.getIncorrectCompounds().contains(digitsRegexp);
        }
        if (!literalHit && !digitsHit) {
          continue;
        }
        AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
        String msg = null;
        List<String> replacement = new ArrayList<>();
        boolean dashSuggested = data.getDashSuggestion().contains(stringToCheck);
        if (dashSuggested && !origStringToCheck.contains(" ")) {
          // It is already joined
          break;
        }
        if (dashSuggested || digitsHit) {
          replacement.add(origStringToCheck.replace(' ', '-'));
          msg = withHyphenMessage;
        }
        if (isNotAllUppercase(origStringToCheck) && data.getJoinedSuggestion().contains(stringToCheck)) {
          boolean uncapitalizeMidWords =
              data.getJoinedLowerCaseSuggestion().stream().anyMatch(s -> stringToCheck.contains(s));
          replacement.add(mergeCompound(origStringToCheck, uncapitalizeMidWords));
          msg = withoutHyphenMessage;
        }
        String[] parts = stringToCheck.split(" ");
        if (parts.length > 0 && parts[0].length() == 1) {
          // a one-character first part: only the hyphenated spelling is suggested
          replacement.clear();
          replacement.add(origStringToCheck.replace(' ', '-'));
          msg = withHyphenMessage;
        } else if (replacement.isEmpty() || replacement.size() == 2) { // isEmpty shouldn't happen
          msg = withOrWithoutHyphenMessage;
        }
        int startPos = firstMatchToken.getStartPos();
        int endPos = atr.getEndPos();
        replacement = filterReplacements(replacement, sentence.getText().substring(startPos, endPos));
        if (replacement.isEmpty()) {
          break;
        }
        Rule matchRule = this;
        if (subRuleSpecificIds) {
          String id = StringTools.toId(getId() + "_" + stringToCheck, lang);
          String description = getDescription().replace("$match", origStringToCheck);
          matchRule = new SpecificIdRule(id, description, isPremium(), getCategory(),
              getLocQualityIssueType(), getTags());
        }
        RuleMatch ruleMatch = new RuleMatch(matchRule, sentence, startPos, endPos, msg, shortDesc);
        ruleMatch.setSuggestedReplacements(replacement);
        // avoid duplicate matches: a match starting where the previous one started is not reported,
        // but it still becomes the reference for the next comparison
        boolean duplicate = prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos();
        prevRuleMatch = ruleMatch;
        if (!duplicate) {
          ruleMatches.add(ruleMatch);
        }
        break;
      }
      addToQueue(token, prevTokens);
    }
    return toRuleMatchArray(ruleMatches);
  }

  /**
   * Cleans up suggestions: collapses runs of two or more hyphens into a single hyphen and keeps
   * only suggestions that differ from the original text and pass the spell check.
   *
   * @param replacements candidate suggestions
   * @param original the text covered by the match
   * @return a new list with the accepted suggestions, possibly empty
   * @throws IOException if the spell check fails
   */
  protected List<String> filterReplacements(List<String> replacements, String original) throws IOException {
    List<String> newReplacements = new ArrayList<>();
    for (String replacement : replacements) {
      String newReplacement = replacement.replaceAll("\\-\\-+", "-");
      if (!newReplacement.equals(original) && isCorrectSpell(newReplacement)) {
        newReplacements.add(newReplacement);
      }
    }
    return newReplacements;
  }

  /**
   * Builds all candidate strings that start at the head of the window: the first token, the first
   * two tokens, and so on. A candidate consisting only of a sentence-start token is skipped.
   *
   * @param prevTokens the current token window, oldest token first
   * @param stringsToCheck receives the normalized candidates (hyphens replaced by spaces)
   * @param origStringsToCheck receives the candidates in their original spelling, same order
   * @return for each normalized candidate, the last token of the first window prefix producing it
   */
  private Map<String, AnalyzedTokenReadings> getStringToTokenMap(Queue<AnalyzedTokenReadings> prevTokens,
      List<String> stringsToCheck, List<String> origStringsToCheck) {
    StringBuilder sb = new StringBuilder();
    Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<>();
    int j = 0;
    boolean isFirstSentStart = false;
    for (AnalyzedTokenReadings atr : prevTokens) {
      if (atr.isWhitespaceBefore()) {
        sb.append(' ');
      }
      sb.append(atr.getToken());
      if (j == 0) {
        isFirstSentStart = atr.hasPosTag(JLanguageTool.SENTENCE_START_TAGNAME);
      }
      if (j >= 1 || !isFirstSentStart) {
        String stringToCheck = normalize(sb.toString());
        if (sentenceStartsWithUpperCase && isFirstSentStart) {
          stringToCheck = StringUtils.uncapitalize(stringToCheck);
        }
        stringsToCheck.add(stringToCheck);
        origStringsToCheck.add(sb.toString().trim());
        // keep the token of the first (shortest) prefix that yields this normalized string
        stringToToken.putIfAbsent(stringToCheck, atr);
      }
      j++;
    }
    return stringToToken;
  }

  /** Trims the string, turns every hyphen into a space and collapses whitespace runs into one space. */
  private String normalize(String inStr) {
    String str = inStr.trim();
    str = str.replace(" - ", " ");
    str = str.replace('-', ' ');
    str = str.replaceAll("\\s+", " ");
    return str;
  }

  /** Returns false as soon as one space-separated part (other than a lone '-') is all uppercase. */
  private boolean isNotAllUppercase(String str) {
    for (String part : str.split(" ")) {
      // do not treat '-' as an upper-case word
      if (!"-".equals(part) && StringTools.isAllUppercase(part)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Joins the parts of a compound into one word, dropping spaces and hyphens.
   *
   * @param str the compound, with parts separated by spaces and/or hyphens
   * @param uncapitalizeMidWords if true, every part except the first is uncapitalized before joining
   * @return the joined compound
   */
  public String mergeCompound(String str, boolean uncapitalizeMidWords) {
    String[] stringParts = str.replace('-', ' ').split(" ");
    StringBuilder sb = new StringBuilder();
    for (int k = 0; k < stringParts.length; k++) {
      boolean keepAsIs = k == 0 || !uncapitalizeMidWords;
      sb.append(keepAsIs ? stringParts[k] : StringUtils.uncapitalize(stringParts[k]));
    }
    return sb.toString();
  }

  /** Appends the token to the window, dropping the oldest token first when the window is full. */
  private static void addToQueue(AnalyzedTokenReadings token, ArrayDeque<AnalyzedTokenReadings> prevTokens) {
    if (prevTokens.size() == MAX_TERMS) {
      prevTokens.poll();
    }
    prevTokens.offer(token);
  }

  /**
   * Checks a suggestion with the linguistic service if one is available, otherwise with
   * {@link #isMisspelled(String)}.
   */
  private boolean isCorrectSpell(String word) throws IOException {
    if (linguServices == null) {
      return !isMisspelled(word);
    }
    return linguServices.isCorrectSpell(word, lang);
  }

  /**
   * Spell check used for suggestions when no linguistic service is set. This implementation
   * accepts every word.
   *
   * @param word the suggestion to check
   * @return always {@code false} in this class
   * @throws IOException not thrown by this implementation
   */
  public boolean isMisspelled(String word) throws IOException {
    return false;
  }
}