All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.languagetool.rules.AbstractSimpleReplaceRule2 Maven / Gradle / Ivy

Go to download

LanguageTool is an Open Source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect like mixing up there/their and it detects some grammar problems.

There is a newer version: 6.5
Show newest version
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import org.apache.commons.lang3.StringUtils;
import org.languagetool.*;
import org.languagetool.tools.StringTools;
import org.languagetool.tools.Tools;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;

import static org.languagetool.JLanguageTool.getDataBroker;

/**
 * A rule that matches words which should not be used and suggests correct ones instead.
 * 

Unlike AbstractSimpleReplaceRule, it supports phrases (Ex: "aqua forte" -> "acvaforte"). * Note: Merge this into {@link AbstractSimpleReplaceRule} * * @author Ionuț Păduraru */ public abstract class AbstractSimpleReplaceRule2 extends Rule { private volatile boolean initialized; private Map mStartSpace; private Map mStartNoSpace; private Map mFullSpace; private Map mFullNoSpace; protected boolean isCheckingCase() { return false; } private final static int MAX_TOKENS_IN_MULTIWORD = 20; //only for CheckCaseRule private boolean ignoreShortUppercaseWords = true; //only for CheckCaseRule private int MAX_LENGTH_SHORT_WORDS = 4; private boolean ruleHasSuggestions = true; public enum CaseSensitivy {CS, CI, CSExceptAtSentenceStart} protected final Language language; protected boolean subRuleSpecificIds = false; public abstract List getFileNames(); public List getFilePaths() { return null; } @Override public abstract String getId(); /** * @return A string where {@code $match} will be replaced with the matching word. */ @Override public abstract String getDescription(); public abstract String getShort(); /** * @return A string where {@code $match} will be replaced with the matching word * and {@code $suggestions} will be replaced with the alternatives. This is the string * shown to the user. */ public abstract String getMessage(); /** * @return the word used to separate multiple suggestions; used only before last suggestion, the rest are * comma-separated. */ public String getSuggestionsSeparator() { return ", "; } /** * locale used on case-conversion */ public abstract Locale getLocale(); public AbstractSimpleReplaceRule2(ResourceBundle messages, Language language) { super(messages); this.language = Objects.requireNonNull(language); super.setCategory(Categories.MISC.getCategory(messages)); } /** * If this is set, each replacement pair will have its own rule ID, making rule deactivations more specific. * * @since 5.1 */ public void useSubRuleSpecificIds() { subRuleSpecificIds = true; } public CaseSensitivy getCaseSensitivy() { return CaseSensitivy.CI; } /** * Used if each input form the replacement file has its specific id. */ public String getDescription(String details) { return null; } @Override public RuleMatch[] match(AnalyzedSentence sentence) { lazyInit(); List ruleMatches = new ArrayList<>(); AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace(); int sentStart = 1; while (sentStart < tokens.length && isPunctuationStart(tokens[sentStart].getToken())) { sentStart++; } int[] checkCaseCoveredUpto = new int[1]; checkCaseCoveredUpto[0] = 0; for (int startIndex = sentStart; startIndex < tokens.length; startIndex++) { if (isTokenException(tokens[startIndex])) { continue; } String tok = tokens[startIndex].getToken(); if (tok.length() < 1) { continue; } // If there is no whitespace to the next token, concatenate the next token int k = startIndex + 1; while (k < tokens.length && !tokens[k].isWhitespaceBefore()) { tok = tok + tokens[k].getToken(); k++; } if (getCaseSensitivy() == CaseSensitivy.CI) { tok = tok.toLowerCase(); } if (mStartSpace.containsKey(tok)) { StringBuilder keyBuilder = new StringBuilder(); int maxTokenLen = mStartSpace.get(tok); int endIndex = startIndex; while (endIndex < tokens.length && endIndex - startIndex < MAX_TOKENS_IN_MULTIWORD) { if (endIndex > startIndex && tokens[endIndex].isWhitespaceBefore()) { keyBuilder.append(" "); } keyBuilder.append(tokens[endIndex].getToken()); String originalStr = keyBuilder.toString(); int numberOfSpaces = StringTools.numberOf(originalStr, " "); if (numberOfSpaces + 1 > maxTokenLen) { break; } if (numberOfSpaces > 0) { String keyStr = originalStr; if (getCaseSensitivy() == CaseSensitivy.CI) { keyStr = keyStr.toLowerCase(); } SuggestionWithMessage suggestionWithMessage = mFullSpace.get(keyStr); createMatch(ruleMatches, suggestionWithMessage, startIndex, endIndex, originalStr, tokens, sentence, sentStart, checkCaseCoveredUpto); //No language uses this. It could be removed. if (suggestionWithMessage == null && sentStart == startIndex && getCaseSensitivy() == CaseSensitivy.CSExceptAtSentenceStart && !keyStr.equals(StringTools.lowercaseFirstChar(keyStr))) { keyStr = StringTools.lowercaseFirstChar(keyStr); suggestionWithMessage = mFullSpace.get(keyStr); createMatch(ruleMatches, suggestionWithMessage, startIndex, endIndex, originalStr, tokens, sentence, sentStart, checkCaseCoveredUpto); } } endIndex++; } } if (mStartNoSpace.containsKey(tok.substring(0, 1))) { int endIndex = startIndex; StringBuilder keyBuilder = new StringBuilder(); while (endIndex < tokens.length && endIndex - startIndex < MAX_TOKENS_IN_MULTIWORD) { if (endIndex > startIndex && tokens[endIndex].isWhitespaceBefore()) { break; } keyBuilder.append(tokens[endIndex].getToken()); String originalStr = keyBuilder.toString(); String keyStr = originalStr; if (getCaseSensitivy() == CaseSensitivy.CI) { keyStr = keyStr.toLowerCase(); } SuggestionWithMessage suggestionWithMessage = mFullNoSpace.get(keyStr); createMatch(ruleMatches, suggestionWithMessage, startIndex, endIndex, originalStr, tokens, sentence, sentStart, checkCaseCoveredUpto); //No language uses this. It could be removed. if (suggestionWithMessage == null && sentStart == startIndex && getCaseSensitivy() == CaseSensitivy.CSExceptAtSentenceStart && !keyStr.equals(StringTools.lowercaseFirstChar(keyStr))) { keyStr = StringTools.lowercaseFirstChar(keyStr); suggestionWithMessage = mFullNoSpace.get(keyStr); createMatch(ruleMatches, suggestionWithMessage, startIndex, endIndex, originalStr, tokens, sentence, sentStart, checkCaseCoveredUpto); } endIndex++; } } } return toRuleMatchArray(ruleMatches); } private void createMatch(List ruleMatches, SuggestionWithMessage suggestionWithMessage, int startIndex, int endIndex, String originalStr, AnalyzedTokenReadings[] tokens, AnalyzedSentence sentence, int sentStart, int[] checkCaseCoveredUpto) { if (suggestionWithMessage == null || isException(originalStr)) { return; } List replacements = Arrays.asList(suggestionWithMessage.getSuggestion().split("\\|")); int fromPos = tokens[startIndex].getStartPos(); int toPos = tokens[endIndex].getEndPos(); //keep only the longest match if (ruleMatches.size() > 0) { RuleMatch lastRuleMatch = ruleMatches.get(ruleMatches.size() - 1); if (lastRuleMatch.getFromPos() <= fromPos && lastRuleMatch.getToPos() >= toPos) { return; } } boolean firstWordInSuggIsCamelCase = replacements.stream().anyMatch(k -> StringTools.isCamelCase(k.split(" ")[0])); boolean isAllUppercase = StringTools.isAllUppercase(originalStr); boolean isCapitalized = StringTools.isCapitalizedWord(originalStr.split(" ")[0]); // exceptions for checking case if (isCheckingCase()) { if (endIndex <= checkCaseCoveredUpto[0]) { return; } String replacementCheckCase = replacements.get(0); if ((sentStart == startIndex && originalStr.equals(StringTools.uppercaseFirstChar(replacementCheckCase))) || originalStr.equals(replacementCheckCase)) { if (ruleMatches.size() > 0) { // remove last match if is contained in a correct phrase RuleMatch lastRuleMatch = ruleMatches.get(ruleMatches.size() - 1); if (lastRuleMatch.getToPos() > fromPos) { ruleMatches.remove(ruleMatches.size() - 1); } } checkCaseCoveredUpto[0] = endIndex; return; } //Allow all-upper case, except for CamelCase and short words in Dutch (length MAX_LENGTH_SHORT_WORDS) { checkCaseCoveredUpto[0] = endIndex; return; } } } List finalReplacements = new ArrayList<>(); // adjust casing of suggestions for (String repl : replacements) { String finalRepl = repl; if (!firstWordInSuggIsCamelCase && (sentStart == startIndex || (isCapitalized && !isCheckingCase()))) { finalRepl = StringTools.uppercaseFirstChar(repl); } if (!isCheckingCase() && isAllUppercase) { finalRepl = repl.toUpperCase(); } if (!repl.equals(originalStr) && !finalRepl.equals(originalStr) && !finalReplacements.contains(finalRepl)) { finalReplacements.add(finalRepl); } } if (ruleHasSuggestions && finalReplacements.isEmpty()) { return; } // Begin of match creation String msg = suggestionWithMessage.getMessage(); String url = null; if (msg != null && (msg.startsWith("http://") || msg.startsWith("https://"))) { msg = null; url = msg; } if (msg == null) { String msgSuggestions = ""; for (int k = 0; k < replacements.size(); k++) { if (k > 0) { msgSuggestions += (k == replacements.size() - 1 ? getSuggestionsSeparator() : ", "); } msgSuggestions += "" + replacements.get(k) + ""; } msg = getMessage().replaceFirst("\\$match", originalStr).replaceFirst("\\$suggestions", msgSuggestions); } RuleMatch ruleMatch = new RuleMatch(this, sentence, fromPos, toPos, msg, getShort()); if (subRuleSpecificIds) { String id = StringTools.toId(getId() + "_" + originalStr, language); String desc = getDescription().replace("$match", originalStr); SpecificIdRule specificIdRule = new SpecificIdRule(id, desc, isPremium(), getCategory(), getLocQualityIssueType(), getTags()); ruleMatch = new RuleMatch(specificIdRule, sentence, fromPos, toPos, msg, getShort()); } if (url != null) { ruleMatch.setUrl(Tools.getUrl(url)); } if (ruleHasSuggestions) { ruleMatch.setSuggestedReplacements(finalReplacements); } // End of match creation if (isRuleMatchException(ruleMatch)) { return; } //keep only the longest match if (ruleMatches.size() > 0) { RuleMatch lastRuleMatch = ruleMatches.get(ruleMatches.size() - 1); if (lastRuleMatch.getFromPos() >= fromPos && lastRuleMatch.getToPos() <= toPos) { ruleMatches.remove(ruleMatches.size() - 1); } } ruleMatches.add(ruleMatch); } protected boolean isRuleMatchException(RuleMatch ruleMatch) { return false; } protected boolean isException(String matchedText) { return false; } protected boolean isTokenException(AnalyzedTokenReadings atr) { return false; } /** * Create a warning if a key word of the replacement rule is not allowed by the speller rule. */ public boolean checkKeyWordsAreKnownToSpeller() { return false; } /** * Create a warning if a key word of the replacement rule is allowed by the speller rule. */ public boolean checkKeyWordsAreUnknownToSpeller() { return false; } public boolean separateKeyWordsBySpeller() { return false; } protected boolean isPunctuationStart(String word) { return StringUtils.getDigits(word).length() > 0 // e.g. postal codes || StringTools.isPunctuationMark(word) || StringTools.isNotWordCharacter(word); } private void lazyInit() { if (initialized) { return; } synchronized (this) { if (initialized) return; Object2IntOpenHashMap mStartSpace = new Object2IntOpenHashMap<>(); Object2IntOpenHashMap mStartNoSpace = new Object2IntOpenHashMap<>(); Object2ObjectOpenHashMap mFullSpace = new Object2ObjectOpenHashMap<>(); Object2ObjectOpenHashMap mFullNoSpace = new Object2ObjectOpenHashMap<>(); fillMaps(mStartSpace, mStartNoSpace, mFullSpace, mFullNoSpace); mStartSpace.trim(); mStartNoSpace.trim(); mFullSpace.trim(); mFullNoSpace.trim(); this.mStartSpace = mStartSpace; this.mStartNoSpace = mStartNoSpace; this.mFullSpace = mFullSpace; this.mFullNoSpace = mFullNoSpace; initialized = true; } } private void fillMaps(Map mStartSpace, Map mStartNoSpace, Map mFullSpace, Map mFullNoSpace) { try { for (URL filePath : getAllFilePaths()) { BufferedReader br = new BufferedReader(new InputStreamReader(filePath.openStream(), StandardCharsets.UTF_8)); String line; while ((line = br.readLine()) != null) { line = line.trim(); if (line.isEmpty() || line.charAt(0) == '#') { // ignore comments continue; } if (line.contains(" ") && !language.getShortCode().equals("ar")) { throw new RuntimeException("More than one consecutive space in " + filePath + " - use a tab character as " + "a delimiter for the message: " + line); } line = StringUtils.substringBefore(line, "#").trim(); if (isCheckingCase()) { // It is a rule for checking chase String[] parts = line.split("="); line = parts[0].toLowerCase().trim() + "=" + parts[0].trim(); if (parts.length == 2) { line = line + "\t" + parts[1].trim(); } } String[] parts = line.split("\t"); String confPair = parts[0]; String msg; if (parts.length == 1) { msg = null; } else if (parts.length == 2) { msg = parts[1]; } else { throw new IOException("Format error in file " + filePath + ". Expected at most 1 '=' character and at " + "most 1 tab character. Line: " + line); } String[] confPairParts = confPair.split("="); String suggestion = ""; if (ruleHasSuggestions) { if (confPairParts.length < 2) { throw new IOException("Format error in file " + filePath + ". Missing suggestion after character '='. Line: " + line); } suggestion = confPairParts[1]; } String[] wrongForms = confPairParts[0].split("\\|"); // multiple incorrect forms for (String wrongForm : wrongForms) { String searchKey = getCaseSensitivy() == CaseSensitivy.CI ? wrongForm.toLowerCase() : wrongForm; if (!isCheckingCase() && searchKey.equals(suggestion)) { throw new IOException("Format error in file " + filePath + ". Found same word on left and right side of '='. Line: " + line); } SuggestionWithMessage suggestionWithMessage = new SuggestionWithMessage(suggestion, msg); boolean containsSpace = wrongForm.indexOf(' ') > 0; if (!containsSpace) { String firstChar = searchKey.substring(0, 1); if (mStartNoSpace.containsKey(firstChar)) { if (mStartNoSpace.get(firstChar) < searchKey.length()) { mStartNoSpace.put(firstChar, searchKey.length()); } } else { mStartNoSpace.put(firstChar, searchKey.length()); } mFullNoSpace.put(searchKey, suggestionWithMessage); } else { String[] tokens = searchKey.split(" "); String firstToken = tokens[0]; if (mStartSpace.containsKey(firstToken)) { if (mStartSpace.get(firstToken) < tokens.length) { mStartSpace.put(firstToken, tokens.length); } } else { mStartSpace.put(firstToken, tokens.length); } mFullSpace.put(searchKey, suggestionWithMessage); } } } } } catch (IOException e) { throw new RuntimeException(e); } } private List getAllFilePaths() { List filePaths = new ArrayList<>(); if (getFilePaths() != null) { filePaths.addAll(getFilePaths()); } for (String fileName : getFileNames()) { filePaths.add(getDataBroker().getFromRulesDirAsUrl(fileName)); } return filePaths; } // Only for checking the spelling of the suggestions public List> getWrongWords() { lazyInit(); List> wrongWords = new ArrayList<>(); wrongWords.add(mFullSpace); wrongWords.add(mFullNoSpace); return wrongWords; } // only for check_case rule protected boolean isIgnoreShortUppercaseWords() { return ignoreShortUppercaseWords; } // only for check_case rule protected void setIgnoreShortUppercaseWords(boolean value) { ignoreShortUppercaseWords = value; } protected void setRuleHasSuggestions(boolean value) { ruleHasSuggestions = value; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy