org.languagetool.rules.GenericUnpairedBracketsRule Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
Show all versions of languagetool-core Show documentation
LanguageTool is open-source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it also detects some grammar problems.
/* LanguageTool, a natural language style checker
* Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules;
import com.google.common.base.Suppliers;
import org.jetbrains.annotations.Nullable;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import java.text.MessageFormat;
import java.util.*;
import java.util.function.Supplier;
import java.util.regex.Pattern;
/**
* Rule that finds unpaired quotes, brackets etc.
*
* @author Marcin Miłkowski
*/
public class GenericUnpairedBracketsRule extends TextLevelRule {
/**
 * Default numeral pattern: one or two digits optionally followed by letters/apostrophes,
 * or a Roman numeral (case-insensitive). Used to recognise enumeration markers in front of ")".
 */
private static final Pattern NUMERALS_EN =
    Pattern.compile("(?i)\\d{1,2}?[a-z']*|M*(D?C{0,3}|C[DM])(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])$");

/** A single punctuation character, including ellipsis and en/em dashes. */
private static final Pattern PUNCTUATION = Pattern.compile("[\\p{Punct}…–—]");

/**
 * A single punctuation character other than the dot, or an elided article/pronoun.
 * "[ldmnst]'" allows dealing with apostrophed words in Catalan (i.e. l'«home)
 */
private static final Pattern PUNCTUATION_NO_DOT =
    Pattern.compile("[ldmnstLDMNST]'|[–—\\p{Punct}&&[^.]]");

/** Opening symbols; the symbol at index {@code k} pairs with {@code endSymbols.get(k)}. */
private final List<String> startSymbols;
/** Closing symbols, parallel to {@link #startSymbols}. */
private final List<String> endSymbols;
/** For every closing symbol: {@code true} if it occurs exactly once in {@link #endSymbols}. */
private final Map<String, Boolean> uniqueMap;
/** Identifier returned by {@link #getId()}. */
private final String ruleId;
/** Pattern for tokens that count as enumeration numerals before a closing parenthesis. */
private final Pattern numerals;
/**
 * Creates the rule with an explicit id and the default numeral pattern ({@code NUMERALS_EN}).
 *
 * @param ruleId rule id; {@code null} selects the default id
 * @param messages resource bundle used for the rule's messages
 * @param startSymbols opening symbols, parallel to {@code endSymbols}
 * @param endSymbols closing symbols, parallel to {@code startSymbols}
 */
public GenericUnpairedBracketsRule(String ruleId, ResourceBundle messages,
                                   List<String> startSymbols, List<String> endSymbols) {
  this(ruleId, messages, startSymbols, endSymbols, NUMERALS_EN);
}
/**
 * Creates the rule with an explicit id, symbol pairs and numeral pattern.
 *
 * @param ruleId rule id; when {@code null}, {@code "UNPAIRED_BRACKETS"} is used
 * @param messages resource bundle used for the rule's messages
 * @param startSymbols opening symbols; must have the same size as {@code endSymbols}
 * @param endSymbols closing symbols; the symbol at index {@code k} closes {@code startSymbols.get(k)}
 * @param numerals pattern for tokens treated as enumeration numerals before ")"
 * @throws IllegalArgumentException if the two symbol lists differ in size
 * @throws NullPointerException if {@code numerals} is {@code null}
 * @since 3.7
 */
public GenericUnpairedBracketsRule(String ruleId, ResourceBundle messages,
                                   List<String> startSymbols, List<String> endSymbols, Pattern numerals) {
  super(messages);
  this.ruleId = ruleId != null ? ruleId : "UNPAIRED_BRACKETS";
  super.setCategory(Categories.PUNCTUATION.getCategory(messages));
  if (startSymbols.size() != endSymbols.size()) {
    throw new IllegalArgumentException("Different number of start and end symbols: " + startSymbols + " vs. " + endSymbols);
  }
  this.startSymbols = startSymbols;
  this.endSymbols = endSymbols;
  this.numerals = Objects.requireNonNull(numerals);
  // must run after endSymbols is assigned: the map is derived from that list
  this.uniqueMap = uniqueMapInit();
  setLocQualityIssueType(ITSIssueType.Typographical);
}
/**
 * Creates the rule with the default id and the default numeral pattern.
 *
 * @param messages resource bundle used for the rule's messages
 * @param startSymbols start symbols like "(" - note that the list must be of equal length as the next parameter
 *                     and the sequence of starting symbols must match exactly the sequence of ending symbols.
 * @param endSymbols end symbols like ")"
 */
public GenericUnpairedBracketsRule(ResourceBundle messages, List<String> startSymbols, List<String> endSymbols) {
  this(null, messages, startSymbols, endSymbols);
}
/**
 * Creates the rule with the default id and a custom numeral pattern.
 *
 * @param messages resource bundle used for the rule's messages
 * @param startSymbols opening symbols, parallel to {@code endSymbols}
 * @param endSymbols closing symbols, parallel to {@code startSymbols}
 * @param numerals pattern for tokens treated as enumeration numerals before ")"
 * @since 3.7
 */
public GenericUnpairedBracketsRule(ResourceBundle messages, List<String> startSymbols,
                                   List<String> endSymbols, Pattern numerals) {
  this(null, messages, startSymbols, endSymbols, numerals);
}
/**
 * Creates the rule with the default id and the default symbol pairs:
 * {@code []}, {@code ()}, {@code {}}, straight double quotes and straight single quotes.
 *
 * @param messages resource bundle used for the rule's messages
 */
public GenericUnpairedBracketsRule(ResourceBundle messages) {
  this(
      null,
      messages,
      Arrays.asList("[", "(", "{", "\"", "'"),
      Arrays.asList("]", ")", "}", "\"", "'"));
}
/** Returns the id given at construction time, or the default id if none was given. */
@Override
public String getId() {
  return this.ruleId;
}
/** Returns the description stored in the message bundle under {@code desc_unpaired_brackets}. */
@Override
public String getDescription() {
  final String descriptionKey = "desc_unpaired_brackets";
  return messages.getString(descriptionKey);
}
/** A token that is an http(s) URL; compiled once instead of on every call. */
private static final Pattern HTTP_URL = Pattern.compile("https?://.+");

/**
 * Generic hook to specify an exception, i.e. a symbol occurrence the rule must ignore.
 * This default implementation ignores a symbol that directly follows a URL token containing
 * "(", and parentheses that are part of a common smiley such as :-) ;-) :) or ;( .
 * Apart from {@code tokens} and {@code i}, the parameters are not used here; they are
 * available to overriding implementations.
 *
 * @param token String token
 * @param tokens Sentence tokens
 * @param i Current token index
 * @param j index of the symbol pair (in the start/end symbol lists) currently being tested
 * @param precSpace is preceded with space
 * @param follSpace is followed with space
 * @param symbolStack stack of the symbols that are unpaired so far
 * @return {@code true} if the symbol is to be processed normally (no exception),
 *         {@code false} if it must be ignored
 */
protected boolean isNoException(String token,
                                AnalyzedTokenReadings[] tokens, int i, int j,
                                boolean precSpace,
                                boolean follSpace, UnsyncStack<SymbolLocator> symbolStack) {
  if (i < 1) {
    return true; // nothing precedes the token, so neither exception can apply
  }
  String prevToken = tokens[i - 1].getToken();
  // bracket right after a URL that itself contains "(", e.g. a wiki link
  if (prevToken.contains("(") && HTTP_URL.matcher(prevToken).matches()) {
    return false;
  }
  String tokenStr = tokens[i].getToken();
  boolean isParenthesis = ")".equals(tokenStr) || "(".equals(tokenStr);
  if (!isParenthesis) {
    return true;
  }
  // Smileys with a nose: ":-)" ":-(" ";-)" ";-("
  if (i >= 2 && "-".equals(prevToken) && isSmileyEyes(tokens[i - 2].getToken())) {
    return false;
  }
  // Smileys without a nose: ":)" ":(" ";)" ";(" - only when written without a space
  if (isSmileyEyes(prevToken) && !tokens[i].isWhitespaceBefore()) {
    return false;
  }
  // TODO: treating ")))" as a smiley as well needs more testing
  return true;
}

/** Returns whether the token is ":" or ";", the "eyes" of a smiley. */
private static boolean isSmileyEyes(String s) {
  return ":".equals(s) || ";".equals(s);
}
/**
 * Checks the whole text for unpaired symbols. Symbols are paired across sentence
 * boundaries: one stack of still-unpaired symbols is kept for all sentences.
 *
 * <p>Every symbol left on the stack is reported, with two restrictions: if the stack has
 * an odd size &gt; 2 and is symmetric (e.g. <code>({"})</code>), only the middle symbol is reported;
 * and an unpaired <em>opening</em> symbol is only reported if its sentence ends with
 * ".", "?" or "!" or if it is not in the last sentence.
 *
 * @param sentences the analyzed sentences of the text, in order
 * @return the matches found, possibly empty
 */
@Override
public final RuleMatch[] match(List<AnalyzedSentence> sentences) {
  UnsyncStack<SymbolLocator> symbolStack = new UnsyncStack<>();  // the stack for pairing symbols
  UnsyncStack<SymbolLocator> ruleMatchStack = new UnsyncStack<>();
  List<RuleMatch> ruleMatches = new ArrayList<>();
  int startPosBase = 0;
  int sentenceIdx = 0;
  for (AnalyzedSentence sentence : sentences) {
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    for (int i = 1; i < tokens.length; i++) {
      // try each symbol pair until one of them consumes the token
      for (int j = 0; j < startSymbols.size(); j++) {
        if (fillSymbolStack(startPosBase, tokens, i, j, symbolStack, sentence, sentenceIdx)) {
          break;
        }
      }
    }
    startPosBase += sentence.getCorrectedTextLength();
    sentenceIdx++;
  }

  // if the stack is odd and symmetric match only the symbol in the middle, e. g. ({"})
  int ssSize = symbolStack.size();
  boolean isSymmetric = ssSize > 2 && ssSize % 2 == 1;
  for (int i = 0; isSymmetric && i < ssSize / 2; i++) {
    int openIdx = startSymbols.indexOf(symbolStack.get(i).getSymbol().symbol);
    // compare with the mirrored element, not always with the last one
    int closeIdx = endSymbols.indexOf(symbolStack.get(ssSize - 1 - i).getSymbol().symbol);
    // openIdx >= 0: two "not found" results (-1) must not count as a matching pair
    isSymmetric = openIdx >= 0 && openIdx == closeIdx;
  }

  Supplier<String> lazyFullText = Suppliers.memoize(() -> {
    StringBuilder fullText = new StringBuilder();
    for (AnalyzedSentence aSentence : sentences) {
      fullText.append(aSentence.getText());
    }
    return fullText.toString();
  });

  if (isSymmetric) {
    SymbolLocator loc = symbolStack.get(ssSize / 2);
    RuleMatch rMatch = createMatch(ruleMatches, ruleMatchStack, loc.getStartPos(),
        loc.getSymbol(), loc.getSentence(), loc.getSentenceIndex(), lazyFullText);
    if (rMatch != null) {
      ruleMatches.add(rMatch);
    }
  } else {
    for (SymbolLocator sLoc : symbolStack) {
      // createMatch must be called for every symbol: it also maintains ruleMatchStack
      RuleMatch rMatch = createMatch(ruleMatches, ruleMatchStack, sLoc.getStartPos(), sLoc.getSymbol(),
          sLoc.getSentence(), sLoc.getSentenceIndex(), lazyFullText);
      if (rMatch == null) {
        continue;
      }
      boolean isClosing = sLoc.getSymbol().symbolType == Symbol.Type.Closing;
      boolean isBeforeLastSentence = sentences.size() - 1 > sLoc.getSentenceIndex();
      if (isClosing || endsLikeRealSentence(sLoc.getSentence().getText()) || isBeforeLastSentence) {
        ruleMatches.add(rMatch);
      }
    }
  }
  return toRuleMatchArray(ruleMatches);
}
/**
 * Returns whether the text, ignoring surrounding whitespace, ends with
 * a full stop, a question mark or an exclamation mark.
 */
private boolean endsLikeRealSentence(String r) {
  String trimmed = r.trim();
  if (trimmed.isEmpty()) {
    return false;
  }
  char last = trimmed.charAt(trimmed.length() - 1);
  return last == '.' || last == '?' || last == '!';
}
/**
 * Builds an unmodifiable map from each closing symbol to whether it occurs exactly once
 * in {@code endSymbols}.
 */
private Map<String, Boolean> uniqueMapInit() {
  Map<String, Boolean> uniqueness = new HashMap<>();
  for (String endSymbol : endSymbols) {
    uniqueness.put(endSymbol, Collections.frequency(endSymbols, endSymbol) == 1);
  }
  return Collections.unmodifiableMap(uniqueness);
}
/**
 * Tests the token at index {@code i} against the symbol pair at index {@code j} and
 * updates the stack of unpaired symbols accordingly.
 *
 * @param startPosBase offset of the current sentence in the text
 * @param tokens tokens of the current sentence, without whitespace tokens
 * @param i index of the token to test
 * @param j index of the symbol pair to test against
 * @param symbolStack stack of the symbols that are unpaired so far; modified by this method
 * @param sentence the current sentence
 * @param sentenceIdx index of the current sentence
 * @return {@code true} if the token was pushed as opening/closing symbol or closed the symbol
 *         on top of the stack; {@code false} if this pair did not consume the token, so that
 *         the caller may try the next pair
 */
private boolean fillSymbolStack(int startPosBase, AnalyzedTokenReadings[] tokens, int i, int j,
                                UnsyncStack<SymbolLocator> symbolStack, AnalyzedSentence sentence, int sentenceIdx) {
  String token = tokens[i].getToken();
  String opening = startSymbols.get(j);
  String closing = endSymbols.get(j);
  boolean isOpeningToken = token.equals(opening);
  boolean isClosingToken = token.equals(closing);
  if (!isOpeningToken && !isClosingToken) {
    return false;
  }
  int startPos = startPosBase + tokens[i].getStartPos();
  boolean precededByWhitespace = getPrecededByWhitespace(tokens, i, j);
  boolean isSpecialCase = getSpecialCase(tokens, i, j);
  if (!isNoException(token, tokens, i, j, precededByWhitespace, isSpecialCase, symbolStack)) {
    return false;
  }

  if (precededByWhitespace && isOpeningToken) {
    symbolStack.push(new SymbolLocator(new Symbol(opening, Symbol.Type.Opening), i, startPos, sentence, sentenceIdx));
    return true;
  }
  if (!isClosingToken || !(isSpecialCase || tokens[i].isSentenceEnd())) {
    return false;
  }
  if (isEnumerationParenthesis(tokens, i, closing, symbolStack)) {
    return false; // ")" of an item marker such as "1)" or "a.)" needs no partner
  }

  if (!symbolStack.empty() && symbolStack.peek().getSymbol().symbol.equals(opening)) {
    symbolStack.pop(); // properly closes the symbol on top of the stack
    return true;
  }
  // An unpaired closing symbol. If several pairs share this closing symbol, only the last
  // pair records it, so that the pairs after this one still get a chance to close the
  // symbol on top of the stack.
  if (symbolStack.empty() || isEndSymbolUnique(closing) || j == endSymbols.size() - 1) {
    symbolStack.push(new SymbolLocator(new Symbol(closing, Symbol.Type.Closing), i, startPos, sentence, sentenceIdx));
    return true;
  }
  return false;
}

/**
 * Returns whether the closing symbol is a ")" that follows a numeral ("1)" or, at the
 * start of a sentence or after whitespace, "1.)") while no "(" is on top of the stack.
 */
private boolean isEnumerationParenthesis(AnalyzedTokenReadings[] tokens, int i, String closing,
                                         UnsyncStack<SymbolLocator> symbolStack) {
  if (!")".equals(closing)) {
    return false;
  }
  if (!symbolStack.empty() && "(".equals(symbolStack.peek().getSymbol().symbol)) {
    return false; // there is an open "(" this ")" can close
  }
  boolean numeralDotParen = i > 2
      && (tokens[i - 3].hasPosTag("SENT_START") || tokens[i - 2].isWhitespaceBefore())
      && tokens[i - 1].getToken().equals(".")
      && numerals.matcher(tokens[i - 2].getToken()).matches();
  return numeralDotParen
      || (i > 1 && numerals.matcher(tokens[i - 1].getToken()).matches());
}
/**
 * Returns whether the token at index {@code i} may act as an opening symbol of pair {@code j}.
 * For pairs with distinct opening and closing symbols this is always {@code true}. For
 * pairs using the same symbol on both sides it is {@code true} only if the token is at the
 * sentence start, preceded by whitespace, or preceded by punctuation (other than a dot)
 * or by another opening symbol.
 */
private boolean getPrecededByWhitespace(AnalyzedTokenReadings[] tokens, int i, int j) {
  boolean sameSymbolOnBothSides = startSymbols.get(j).equals(endSymbols.get(j));
  if (!sameSymbolOnBothSides) {
    return true;
  }
  AnalyzedTokenReadings previous = tokens[i - 1];
  if (previous.isSentenceStart() || tokens[i].isWhitespaceBefore()) {
    return true;
  }
  String previousToken = previous.getToken();
  return PUNCTUATION_NO_DOT.matcher(previousToken).matches()
      || startSymbols.contains(previousToken);
}
/**
 * Returns whether the token at index {@code i} may act as a closing symbol of pair {@code j}.
 * This is always {@code true} for the last token and for pairs with distinct opening and
 * closing symbols. For pairs using the same symbol on both sides, the context must look
 * like the end of a quotation (see the individual conditions below).
 */
private boolean getSpecialCase(AnalyzedTokenReadings[] tokens, int i, int j) {
  boolean isLastToken = i >= tokens.length - 1;
  if (isLastToken || !startSymbols.get(j).equals(endSymbols.get(j))) {
    return true;
  }
  AnalyzedTokenReadings next = tokens[i + 1];
  String nextToken = next.getToken();
  if (next.isWhitespaceBefore()
      || PUNCTUATION.matcher(nextToken).matches()
      || endSymbols.contains(nextToken)) {
    return true;
  }
  if (i >= 1 && tokens[i - 1].getToken().endsWith("-")) {
    return true;  // e.g. >>xxx-"yyy yyy"-zzz<<
  }
  if (nextToken.startsWith("-")) {
    return true;  // e.g. >>"Go"-button<<
  }
  return "s".equals(nextToken);  // e.g. >>"I"s<< has and needs no space
}
/**
 * Returns whether the given closing symbol occurs exactly once among the closing symbols.
 * A string that is not a known closing symbol yields {@code false}.
 */
private boolean isEndSymbolUnique(String str) {
  // null-safe: avoids unboxing a missing map entry
  return Boolean.TRUE.equals(uniqueMap.get(str));
}
/**
 * Creates the match for one unpaired symbol, or cancels an earlier match that this symbol pairs with.
 *
 * <p>If the symbol is the closing counterpart of the symbol on top of {@code ruleMatchStack},
 * the match recorded for that symbol is removed from {@code ruleMatches}, the stack is popped
 * and {@code null} is returned. Otherwise the symbol is pushed onto {@code ruleMatchStack}
 * and a match is built, unless the symbol is the ")" of a letter item marker such as
 * "b) foo item" at the start of a line or of the text, or {@link #preventMatch} vetoes it.
 *
 * @param ruleMatches matches collected so far; an entry may be removed by this method
 * @param ruleMatchStack stack of the symbols processed so far; modified by this method
 * @param startPos start position of the symbol in the full text
 * @param symbol the unpaired symbol
 * @param sentence the sentence containing the symbol
 * @param sentenceIdx index of that sentence
 * @param lazyFullText supplier of the full text
 * @return the new match, or {@code null} if no match is to be added
 */
@Nullable
private RuleMatch createMatch(List<RuleMatch> ruleMatches, UnsyncStack<SymbolLocator> ruleMatchStack, int startPos,
                              Symbol symbol, AnalyzedSentence sentence, int sentenceIdx, Supplier<String> lazyFullText) {
  if (!ruleMatchStack.empty()) {
    int index = endSymbols.indexOf(symbol.symbol);
    if (index >= 0) {
      SymbolLocator rLoc = ruleMatchStack.peek();
      if (rLoc.getSymbol().symbol.equals(startSymbols.get(index))
          && ruleMatches.size() > rLoc.getIndex()) {
        ruleMatches.remove(rLoc.getIndex());
        ruleMatchStack.pop();
        return null;
      }
    }
  }
  // remember at which position of ruleMatches the match for this symbol would be stored
  ruleMatchStack.push(new SymbolLocator(symbol, ruleMatches.size(), startPos, sentence, sentenceIdx));
  String otherSymbol = findCorrespondingSymbol(symbol);
  String message = MessageFormat.format(messages.getString("unpaired_brackets"), otherSymbol);
  String fullText = lazyFullText.get();
  int endPos = startPos + symbol.symbol.length();
  if (endPos < fullText.length()) {
    if (startPos >= 2) {
      String context = fullText.substring(startPos - 2, endPos);
      if (context.matches("\n[a-zA-Z]\\)")) {  // prevent error for "b) foo item"
        return null;
      }
    } else if (startPos >= 1) {
      String context = fullText.substring(startPos - 1, endPos);
      if (context.matches("[a-zA-Z]\\)")) {  // prevent error for "a) foo item" at text start
        return null;
      }
    }
  }
  if (preventMatch(sentence)) {
    return null;
  }
  RuleMatch match = new RuleMatch(this, sentence, startPos, endPos, message);
  List<String> repl = getSuggestions(lazyFullText, startPos, endPos, symbol, otherSymbol);
  if (repl != null) {
    match.setSuggestedReplacements(repl);
  }
  return match;
}
/**
 * Hook consulted before a match is created: returning {@code true} suppresses the match
 * for a symbol in the given sentence. This default implementation never suppresses a match.
 *
 * @param sentence the sentence containing the unpaired symbol
 */
protected boolean preventMatch(AnalyzedSentence sentence) {
return false;
}
/**
 * Hook for providing replacement suggestions for an unpaired symbol.
 * This default implementation provides none.
 *
 * @param text supplier of the full text
 * @param startPos start position of the symbol in the full text
 * @param endPos end position (exclusive) of the symbol in the full text
 * @param symbol the unpaired symbol
 * @param otherSymbol the counterpart of {@code symbol}
 * @return the suggestions, or {@code null} if no suggestions are to be set on the match
 */
@Nullable
protected List<String> getSuggestions(Supplier<String> text, int startPos, int endPos, Symbol symbol, String otherSymbol) {
  return null;
}
/**
 * Returns the counterpart of the given symbol: the closing symbol if it is a known
 * opening symbol, otherwise the opening symbol at the index where it is found
 * among the closing symbols.
 */
protected String findCorrespondingSymbol(Symbol symbol) {
  int openingIdx = startSymbols.indexOf(symbol.symbol);
  return openingIdx >= 0
      ? endSymbols.get(openingIdx)
      : startSymbols.get(endSymbols.indexOf(symbol.symbol));
}
// NOTE(review): -1 presumably tells the framework that the whole text has to be
// checked, since symbols are paired across sentences - confirm against the superclass contract.
@Override
public int minToCheckParagraph() {
return -1;
}
/** A bracket or quote character sequence together with the role it plays in the text. */
protected static class Symbol {

  /** Whether a symbol opens or closes a pair. */
  public enum Type {Opening, Closing}

  /** The symbol text, e.g. "(" */
  String symbol;
  /** The role of this occurrence of the symbol. */
  public Type symbolType;

  protected Symbol(String symbol, Type symbolType) {
    this.symbolType = symbolType;
    this.symbol = symbol;
  }

  /** Returns the symbol text. */
  @Override
  public String toString() {
    return this.symbol;
  }
}
}