org.languagetool.rules.GenericUnpairedQuotesRule Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
Show all versions of languagetool-core Show documentation
LanguageTool is open-source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it also detects some grammar problems.
/* LanguageTool, a natural language style checker
* Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import java.text.MessageFormat;
import java.util.*;
import java.util.regex.Pattern;
/**
 * Rule that finds unpaired quotes across the sentences of a text.
 *
 * <p>Quote symbols are configured as two parallel lists: the symbol at index {@code n} of the
 * start list is closed by the symbol at index {@code n} of the end list. A symbol may occur in
 * both lists (for example {@code "} or {@code '}); in that case the surrounding whitespace and
 * punctuation decide whether an occurrence is treated as opening or closing.
 *
 * @author Fred Kruse
 * @since 6.4
 */
public class GenericUnpairedQuotesRule extends TextLevelRule {

  /** Characters that may be an apostrophe rather than a quotation mark. */
  private static final Pattern POSSIBLE_APOSTROPHE = Pattern.compile("[‘’']");
  /** Text containing a digit immediately followed by {@code "}, i.e. a possible inch sign. */
  private static final Pattern INCH_PATTERN = Pattern.compile(".*\\d\".*", Pattern.DOTALL);
  /** A single punctuation character, excluding the quote characters {@code "}, {@code '} and {@code _}. */
  private static final Pattern PUNCTUATION = Pattern.compile("[\\p{Punct}…–—&&[^\"'_]]");
  /** Question mark, full stop, exclamation mark or comma. */
  private static final Pattern PUNCT_MARKS = Pattern.compile("[\\?\\.!,]");

  private final List<String> startSymbols;
  private final List<String> endSymbols;
  private final String ruleId;

  /**
   * @param ruleId id reported by {@link #getId()}; {@code null} selects the default {@code "UNPAIRED_QUOTES"}
   * @param messages resource bundle used for the rule description and match messages
   * @param startSymbols start symbols like "“" - the list must have the same size as {@code endSymbols}
   *                     and the sequence of start symbols must match exactly the sequence of end symbols
   * @param endSymbols end symbols like "”"
   * @throws IllegalArgumentException if the two lists differ in size
   */
  public GenericUnpairedQuotesRule(String ruleId, ResourceBundle messages, List<String> startSymbols, List<String> endSymbols) {
    super(messages);
    this.ruleId = ruleId != null ? ruleId : "UNPAIRED_QUOTES";
    super.setCategory(Categories.PUNCTUATION.getCategory(messages));
    if (startSymbols.size() != endSymbols.size()) {
      throw new IllegalArgumentException("Different number of start and end symbols: " + startSymbols + " vs. " + endSymbols);
    }
    this.startSymbols = startSymbols;
    this.endSymbols = endSymbols;
    setLocQualityIssueType(ITSIssueType.Typographical);
  }

  /**
   * Construct the rule with the default rule id.
   *
   * @param startSymbols start symbols like "“" - note that the list must be of equal length as the next parameter
   *                     and the sequence of starting symbols must match exactly the sequence of ending symbols.
   * @param endSymbols end symbols like "”"
   */
  public GenericUnpairedQuotesRule(ResourceBundle messages, List<String> startSymbols, List<String> endSymbols) {
    this(null, messages, startSymbols, endSymbols);
  }

  /**
   * Construct rule with a set of default start and end symbols: “” "" ‘’ ''
   */
  public GenericUnpairedQuotesRule(ResourceBundle messages) {
    this(null, messages, Arrays.asList("“", "\"", "‘", "'"), Arrays.asList("”", "\"", "’", "'"));
  }

  @Override
  public String getId() {
    return ruleId;
  }

  @Override
  public String getDescription() {
    return messages.getString("desc_unpaired_quotes");
  }

  /**
   * Scans all sentences in order, keeping a list of currently open quotes, and reports every
   * quote symbol for which no partner is found.
   *
   * @param sentences the analyzed sentences of the text, in document order
   * @return one match per unpaired quote symbol
   */
  @Override
  public final RuleMatch[] match(List<AnalyzedSentence> sentences) {
    List<SymbolLocator> openingQuotes = new ArrayList<>();
    List<RuleMatch> ruleMatches = new ArrayList<>();
    // Last symbol that was skipped because it looked like an apostrophe attached to a following word.
    String lastApostropheSymbol = null;
    // Set when a closing '"' was paired in a sentence that also matches the inch pattern.
    boolean wasInch = false;
    // Offset of the current sentence within the whole text.
    int startPosBase = 0;
    for (AnalyzedSentence sentence : sentences) {
      AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
      // Index 0 is skipped; the helpers below read tokens[i - 1].
      for (int i = 1; i < tokens.length; i++) {
        if (isOpeningQuote(tokens, i)) {
          String symbol = tokens[i].getToken();
          if (!isNotBeginningApostrophe(tokens, i)) {
            // Apostrophe-like symbol glued to a following word: remember it, but do not open a quote.
            lastApostropheSymbol = symbol;
            continue;
          }
          if ("\"".equals(symbol)) {
            wasInch = false;
          }
          if (lastApostropheSymbol != null && lastApostropheSymbol.equals(symbol)) {
            lastApostropheSymbol = null;
          }
          int index = indexOfOpeningQuote(openingQuotes, symbol);
          if (index >= 0) {
            // The same symbol is already open: report it and everything opened after it.
            removeAllOpenInnerQuotes(index - 1, openingQuotes, ruleMatches);
          }
          openingQuotes.add(new SymbolLocator(symbol, tokens[i].getStartPos() + startPosBase, sentence));
        } else if (isClosingQuote(tokens, i, openingQuotes)) {
          String symbol = tokens[i].getToken();
          if (!isNotBeginningApostrophe(tokens, i)) {
            lastApostropheSymbol = symbol;
            continue;
          }
          boolean isInchSymb = "\"".equals(symbol);
          boolean isInch = isInchSymb && isInchQuote(sentence.getText());
          String startSymbol = findCorrespondingSymbol(symbol);
          int index = indexOfOpeningQuote(openingQuotes, startSymbol);
          if (index >= 0) {
            // Partner found: quotes opened after it were never closed, so report them first.
            removeAllOpenInnerQuotes(index, openingQuotes, ruleMatches);
            openingQuotes.remove(index);
            if (lastApostropheSymbol != null && lastApostropheSymbol.equals(startSymbol)) {
              lastApostropheSymbol = null;
            }
            if (isInch) {
              wasInch = true;
            }
          } else if (isNotEndingApostrophe(tokens, i)) {
            if (!isInch && (!isInchSymb || !wasInch)) {
              if (lastApostropheSymbol == null || !lastApostropheSymbol.equals(symbol)) {
                addMatch(new SymbolLocator(symbol, tokens[i].getStartPos() + startPosBase, sentence), ruleMatches);
              } else {
                // The earlier skipped apostrophe-like symbol is consumed as this symbol's partner.
                lastApostropheSymbol = null;
              }
            } else {
              wasInch = false;
            }
          }
        }
      }
      startPosBase += sentence.getCorrectedTextLength();
    }
    // Whatever is still open at the end of the text is unpaired.
    removeAllOpenInnerQuotes(-1, openingQuotes, ruleMatches);
    return toRuleMatchArray(ruleMatches);
  }

  /**
   * Walks backwards from the token before {@code i} over a run of start symbols that differ from
   * token {@code i}.
   *
   * @return {@code true} if one of them follows the sentence start or whitespace, or if the walk
   *         reaches index 0 without a decision; {@code false} as soon as a token is met that equals
   *         token {@code i} or is not a start symbol
   */
  private boolean isStartSymbolbefore(AnalyzedTokenReadings[] tokens, int i) {
    for (int j = i - 1; j > 0; j--) {
      if (!tokens[i].getToken().equals(tokens[j].getToken()) && startSymbols.contains(tokens[j].getToken())) {
        if (tokens[j - 1].isSentenceStart() || tokens[j].isWhitespaceBefore()) {
          return true;
        }
      } else {
        return false;
      }
    }
    return true;
  }

  /**
   * @return {@code false} only if the symbol pair at index {@code j} uses the same symbol for start
   *         and end and that symbol is currently among the open quotes
   */
  private boolean isNotOpenSymbol(int j, List<SymbolLocator> openingQuotes) {
    if (endSymbols.get(j).equals(startSymbols.get(j))) {
      for (SymbolLocator openingQuote : openingQuotes) {
        if (endSymbols.get(j).equals(openingQuote.getSymbol())) {
          return false;
        }
      }
    }
    return true;
  }

  /**
   * Decides whether token {@code i} should not be treated as a quote of the symbol pair {@code j}.
   *
   * @return {@code true} if the token is isolated (sentence start or whitespace before, and end of
   *         tokens or whitespace after), or if start and end symbol are identical and the token sits,
   *         without whitespace, between two punctuation tokens of which the following one is not "."
   */
  private boolean isNotQuote (AnalyzedTokenReadings[] tokens, int i, int j) {
    if ((tokens[i - 1].isSentenceStart() || tokens[i].isWhitespaceBefore())
        && (i >= tokens.length - 1 || tokens[i + 1].isWhitespaceBefore())) {
      return true;
    }
    if (endSymbols.get(j).equals(startSymbols.get(j))) {
      if (i < tokens.length - 1
          && !tokens[i].isWhitespaceBefore()
          && !tokens[i + 1].isWhitespaceBefore()
          && PUNCTUATION.matcher(tokens[i - 1].getToken()).matches()
          && !".".equals(tokens[i + 1].getToken())
          && PUNCTUATION.matcher(tokens[i + 1].getToken()).matches()) {
        return true;
      }
    }
    return false;
  }

  /**
   * @return {@code true} if token {@code i} is a start symbol used as an opening quote. Symbols that
   *         are also end symbols additionally need an opening context: sentence start or whitespace
   *         before, punctuation or a trailing hyphen before with a directly attached following token,
   *         or a preceding run of other start symbols
   */
  private boolean isOpeningQuote(AnalyzedTokenReadings[] tokens, int i) {
    for (int j = 0; j < startSymbols.size(); j++) {
      if (startSymbols.get(j).equals(tokens[i].getToken())) {
        if (isNotQuote (tokens, i, j)) {
          return false;
        }
        if (endSymbols.contains(startSymbols.get(j))) {
          return (tokens[i - 1].isSentenceStart()
              || tokens[i].isWhitespaceBefore()
              || (i < tokens.length - 1 && !tokens[i + 1].isWhitespaceBefore()
                  && ((!PUNCT_MARKS.matcher(tokens[i + 1].getToken()).matches()
                      && PUNCTUATION.matcher(tokens[i - 1].getToken()).matches())
                    || (tokens[i - 1].getToken().endsWith("-"))))
              || isStartSymbolbefore(tokens, i));
        }
        return true;
      }
    }
    return false;
  }

  /**
   * @return {@code true} if token {@code i} is an end symbol, unless it is ruled out by
   *         {@code isNotQuote} and no identical symmetric symbol is currently open
   */
  private boolean isClosingQuote(AnalyzedTokenReadings[] tokens, int i, List<SymbolLocator> openingQuotes) {
    for (int j = 0; j < endSymbols.size(); j++) {
      if (endSymbols.get(j).equals(tokens[i].getToken())) {
        if (isNotQuote (tokens, i, j) && isNotOpenSymbol(j, openingQuotes)) {
          return false;
        }
        return true;
      }
    }
    return false;
  }

  /** @return {@code true} if {@code text} contains a digit immediately followed by {@code "} */
  private boolean isInchQuote(String text) {
    return INCH_PATTERN.matcher(text).matches();
  }

  /**
   * @return {@code false} only if token {@code i} is an apostrophe-like character that is directly
   *         followed (no whitespace) by a token that is not a non-word
   */
  protected boolean isNotBeginningApostrophe(AnalyzedTokenReadings[] tokens, int i) {
    return !POSSIBLE_APOSTROPHE.matcher(tokens[i].getToken()).matches()
        || i >= tokens.length - 1 || tokens[i + 1].isNonWord() || tokens[i + 1].isWhitespaceBefore();
  }

  /**
   * @return {@code false} only if token {@code i} is an apostrophe-like character that is directly
   *         attached (no whitespace) to a preceding token that is not a non-word
   */
  protected boolean isNotEndingApostrophe(AnalyzedTokenReadings[] tokens, int i) {
    return !POSSIBLE_APOSTROPHE.matcher(tokens[i].getToken()).matches()
        || tokens[i].isWhitespaceBefore()
        || tokens[i - 1].isNonWord();
  }

  /** @return index of the first open quote with the given symbol, or -1 if there is none */
  private int indexOfOpeningQuote(List<SymbolLocator> openingQuotes, String symbol) {
    for (int i = 0; i < openingQuotes.size(); i++) {
      if (symbol.equals(openingQuotes.get(i).getSymbol())) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Adds a match covering the given symbol; the message names the missing partner symbol.
   */
  private void addMatch(SymbolLocator openingQuote, List<RuleMatch> ruleMatches) {
    String message = MessageFormat.format(messages.getString("unpaired_brackets"), findCorrespondingSymbol(openingQuote.getSymbol()));
    RuleMatch match = new RuleMatch(this, openingQuote.getSentence(), openingQuote.getStartPos(),
        openingQuote.getStartPos() + openingQuote.getSymbol().length(), message);
    ruleMatches.add(match);
  }

  /**
   * Reports and removes every open quote whose position in {@code openingQuotes} is greater than
   * {@code index}, starting with the most recently opened one.
   */
  private void removeAllOpenInnerQuotes(int index, List<SymbolLocator> openingQuotes, List<RuleMatch> ruleMatches) {
    for (int i = openingQuotes.size() - 1; i > index; i--) {
      addMatch(openingQuotes.get(i), ruleMatches);
      openingQuotes.remove(i);
    }
  }

  /**
   * Returns the partner of a quote symbol: the end symbol for a start symbol, otherwise the start
   * symbol for an end symbol. The start list is consulted first.
   *
   * @throws IndexOutOfBoundsException if {@code symbol} is in neither list
   */
  protected String findCorrespondingSymbol(String symbol) {
    int idx1 = startSymbols.indexOf(symbol);
    if (idx1 >= 0) {
      return endSymbols.get(idx1);
    } else {
      int idx2 = endSymbols.indexOf(symbol);
      return startSymbols.get(idx2);
    }
  }

  // NOTE(review): -1 presumably tells the framework that the whole text is needed - confirm against TextLevelRule.
  @Override
  public int minToCheckParagraph() {
    return -1;
  }

  /** A quote symbol together with its absolute start position and the sentence it occurs in. */
  protected class SymbolLocator {
    private final String symbol;
    private final int startPos;
    private final AnalyzedSentence sentence;

    SymbolLocator(String symbol, int startPos, AnalyzedSentence sentence) {
      this.symbol = symbol;
      this.startPos = startPos;
      this.sentence = sentence;
    }

    public String getSymbol() {
      return symbol;
    }

    int getStartPos() {
      return startPos;
    }

    AnalyzedSentence getSentence() {
      return sentence;
    }
  }
}