org.languagetool.rules.AbstractCompoundRule Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
Show all versions of languagetool-core Show documentation
LanguageTool is an Open Source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect like mixing up there/their and it detects some grammar problems.
/* LanguageTool, a natural language style checker
* Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules;
import org.apache.commons.lang3.StringUtils;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.tools.StringTools;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
/**
* Checks that compounds (if in the list) are not written as separate words.
*
* @author Daniel Naber, Marcin Miłkowski (refactoring)
*/
public abstract class AbstractCompoundRule extends Rule {
static final int MAX_TERMS = 5;
private final String withHyphenMessage;
private final String withoutHyphenMessage;
private final String withOrWithoutHyphenMessage;
private final String shortDesc;
// if true, the first word will be uncapitalized before compared to the entries in CompoundRuleData
protected boolean sentenceStartsWithUpperCase = false;
@Override
public abstract String getId();
@Override
public abstract String getDescription();
/** @since 3.0 */
protected abstract CompoundRuleData getCompoundRuleData();
/**
* @since 3.0
*/
public AbstractCompoundRule(ResourceBundle messages,
String withHyphenMessage, String withoutHyphenMessage, String withOrWithoutHyphenMessage) throws IOException {
this(messages, withHyphenMessage, withoutHyphenMessage, withOrWithoutHyphenMessage, null);
}
/**
* @since 3.0
*/
public AbstractCompoundRule(ResourceBundle messages,
String withHyphenMessage, String withoutHyphenMessage, String withOrWithoutHyphenMessage,
String shortMessage) throws IOException {
super.setCategory(Categories.MISC.getCategory(messages));
this.withHyphenMessage = withHyphenMessage;
this.withoutHyphenMessage = withoutHyphenMessage;
this.withOrWithoutHyphenMessage = withOrWithoutHyphenMessage;
this.shortDesc = shortMessage;
setLocQualityIssueType(ITSIssueType.Misspelling);
}
/**
* Flag to indicate if the hyphen is ignored in the text entered by the user.
* Set this to false if you want the rule to offer suggestions for words
* like [ro] "câte-și-trei" (with hyphen), not only for "câte și trei" (with spaces)
* This is only available for languages with hyphen as a word separator (ie: not
* available for English, available for Romanian). See Language.getWordTokenizer()
*/
public boolean isHyphenIgnored() {
return true;
}
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
List ruleMatches = new ArrayList<>();
AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
RuleMatch prevRuleMatch = null;
Queue prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
for (int i = 0; i < tokens.length + MAX_TERMS-1; i++) {
AnalyzedTokenReadings token;
// we need to extend the token list so we find matches at the end of the original list:
if (i >= tokens.length) {
token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
} else {
token = tokens[i];
}
if (i == 0) {
addToQueue(token, prevTokens);
continue;
} else if (token.isImmunized()) {
continue;
}
AnalyzedTokenReadings firstMatchToken = prevTokens.peek();
List stringsToCheck = new ArrayList<>();
List origStringsToCheck = new ArrayList<>(); // original upper/lowercase spelling
Map stringToToken =
getStringToTokenMap(prevTokens, stringsToCheck, origStringsToCheck);
// iterate backwards over all potentially incorrect strings to make
// sure we match longer strings first:
for (int k = stringsToCheck.size()-1; k >= 0; k--) {
String stringToCheck = stringsToCheck.get(k);
String origStringToCheck = origStringsToCheck.get(k);
if (getCompoundRuleData().getIncorrectCompounds().contains(stringToCheck)) {
AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
String msg = null;
List replacement = new ArrayList<>();
if (!getCompoundRuleData().getNoDashSuggestion().contains(stringToCheck)) {
replacement.add(origStringToCheck.replace(' ', '-'));
msg = withHyphenMessage;
}
if (isNotAllUppercase(origStringToCheck) && !getCompoundRuleData().getOnlyDashSuggestion().contains(stringToCheck)) {
replacement.add(mergeCompound(origStringToCheck, getCompoundRuleData().getNoDashLowerCaseSuggestion().stream().anyMatch(s -> origStringsToCheck.contains(s))));
msg = withoutHyphenMessage;
}
String[] parts = stringToCheck.split(" ");
if (parts.length > 0 && parts[0].length() == 1) {
replacement.clear();
replacement.add(origStringToCheck.replace(' ', '-'));
msg = withHyphenMessage;
} else if (replacement.isEmpty() || replacement.size() == 2) { // isEmpty shouldn't happen
msg = withOrWithoutHyphenMessage;
}
RuleMatch ruleMatch = new RuleMatch(this, sentence, firstMatchToken.getStartPos(), atr.getEndPos(), msg, shortDesc);
ruleMatch.setSuggestedReplacements(replacement);
// avoid duplicate matches:
if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
prevRuleMatch = ruleMatch;
break;
}
prevRuleMatch = ruleMatch;
ruleMatches.add(ruleMatch);
break;
}
}
addToQueue(token, prevTokens);
}
return toRuleMatchArray(ruleMatches);
}
private Map getStringToTokenMap(Queue prevTokens,
List stringsToCheck, List origStringsToCheck) {
StringBuilder sb = new StringBuilder();
Map stringToToken = new HashMap<>();
int j = 0;
for (AnalyzedTokenReadings atr : prevTokens) {
sb.append(' ');
sb.append(atr.getToken());
if (j >= 1) {
String stringToCheck = normalize(sb.toString());
if (sentenceStartsWithUpperCase && j == 1 ) {
stringToCheck = StringUtils.uncapitalize(stringToCheck);
}
stringsToCheck.add(stringToCheck);
origStringsToCheck.add(sb.toString().trim());
if (!stringToToken.containsKey(stringToCheck)) {
stringToToken.put(stringToCheck, atr);
}
}
j++;
}
return stringToToken;
}
private String normalize(String inStr) {
String str = inStr.trim();
if (str.indexOf('-') != -1 && str.indexOf(' ') != -1) {
if (isHyphenIgnored()) {
// e.g. "E-Mail Adresse" -> "E Mail Adresse" so the error can be detected:
str = str.replace('-', ' ');
} else {
str = str.replace(" - ", " ");
}
}
return str;
}
private boolean isNotAllUppercase(String str) {
String[] parts = str.split(" ");
for (String part : parts) {
if (isHyphenIgnored() || !"-".equals(part)) { // do not treat '-' as an upper-case word
if (StringTools.isAllUppercase(part)) {
return false;
}
}
}
return true;
}
private String mergeCompound(String str, boolean uncapitalizeMidWords) {
String[] stringParts = str.split(" ");
StringBuilder sb = new StringBuilder();
for (int k = 0; k < stringParts.length; k++) {
if (isHyphenIgnored() || !"-".equals(stringParts[k])) {
if (k == 0) {
sb.append(stringParts[0]);
} else {
sb.append(uncapitalizeMidWords ? StringUtils.uncapitalize(stringParts[k]) : stringParts[k]);
}
}
}
return sb.toString();
}
private void addToQueue(AnalyzedTokenReadings token, Queue prevTokens) {
boolean inserted = prevTokens.offer(token);
if (!inserted) {
prevTokens.poll();
prevTokens.offer(token);
}
}
}