org.languagetool.JLanguageTool Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
Show all versions of languagetool-core Show documentation
LanguageTool is an Open Source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect like mixing up there/their and it detects some grammar problems.
/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.languagetool.databroker.DefaultResourceDataBroker;
import org.languagetool.databroker.ResourceDataBroker;
import org.languagetool.language.CommonWords;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.markup.AnnotatedText;
import org.languagetool.markup.AnnotatedTextBuilder;
import org.languagetool.rules.*;
import org.languagetool.rules.neuralnetwork.Word2VecModel;
import org.languagetool.rules.patterns.AbstractPatternRule;
import org.languagetool.rules.patterns.FalseFriendRuleLoader;
import org.languagetool.rules.patterns.PatternRule;
import org.languagetool.rules.patterns.PatternRuleLoader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.net.JarURLConnection;
import java.net.URL;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.FutureTask;
import java.util.function.Function;
import java.util.jar.Manifest;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* The main class used for checking text against different rules:
*
* - built-in Java rules (for English: a vs. an, whitespace after commas, ...)
*
- built-in pattern rules loaded from external XML files (usually called {@code grammar.xml})
*
- your own implementation of the abstract {@link Rule} classes added with {@link #addRule(Rule)}
*
*
* You will probably want to use the sub class {@link MultiThreadedJLanguageTool} for best performance.
*
*
Thread-safety: this class is not thread safe. Create one instance per thread,
* but create the language only once (e.g. {@code new AmericanEnglish()}) and use it for all
* instances of JLanguageTool.
*
* @see MultiThreadedJLanguageTool
*/
public class JLanguageTool {
private static final Logger logger = LoggerFactory.getLogger(JLanguageTool.class);
/** LanguageTool version as a string like {@code 2.3} or {@code 2.4-SNAPSHOT}. */
public static final String VERSION = "4.9";
/** LanguageTool build date and time like {@code 2013-10-17 16:10} or {@code null} if not run from JAR. */
@Nullable public static final String BUILD_DATE = getBuildDate();
/**
* Abbreviated git id or {@code null} if not available.
* @since 4.5
*/
@Nullable public static final String GIT_SHORT_ID = getShortGitId();
/** The name of the file with error patterns. */
public static final String PATTERN_FILE = "grammar.xml";
/** The name of the file with false friend information. */
public static final String FALSE_FRIEND_FILE = "false-friends.xml";
/** The internal tag used to mark the beginning of a sentence. */
public static final String SENTENCE_START_TAGNAME = "SENT_START";
/** The internal tag used to mark the end of a sentence. */
public static final String SENTENCE_END_TAGNAME = "SENT_END";
/** The internal tag used to mark the end of a paragraph. */
public static final String PARAGRAPH_END_TAGNAME = "PARA_END";
/** Name of the message bundle for translations. */
public static final String MESSAGE_BUNDLE = "org.languagetool.MessagesBundle";
/** Extension of dictionary files read by Spellers */
public static final String DICTIONARY_FILENAME_EXTENSION = ".dict";
private final ResultCache cache;
private final UserConfig userConfig;
private final GlobalConfig globalConfig;
private final ShortDescriptionProvider descProvider;
private float maxErrorsPerWordRate;
/**
* Returns the build date or {@code null} if not run from JAR.
*/
@Nullable
private static String getBuildDate() {
try {
URL res = JLanguageTool.class.getResource(JLanguageTool.class.getSimpleName() + ".class");
if (res == null) {
// this will happen on Android, see http://stackoverflow.com/questions/15371274/
return null;
}
Object connObj = res.openConnection();
if (connObj instanceof JarURLConnection) {
JarURLConnection conn = (JarURLConnection) connObj;
Manifest manifest = conn.getManifest();
return manifest.getMainAttributes().getValue("Implementation-Date");
} else {
return null;
}
} catch (IOException e) {
throw new RuntimeException("Could not get build date from JAR", e);
}
}
/**
* Returns the abbreviated git id or {@code null}.
*/
@Nullable
private static String getShortGitId() {
try {
InputStream in = JLanguageTool.class.getClassLoader().getResourceAsStream("git.properties");
if (in != null) {
Properties props = new Properties();
props.load(in);
return props.getProperty("git.commit.id.abbrev");
}
return null;
} catch (IOException e) {
throw new RuntimeException("Could not get git id from 'git.properties'", e);
}
}
/**
* @since 4.2
*/
public static boolean isPremiumVersion() {
return false;
}
private static ResourceDataBroker dataBroker = new DefaultResourceDataBroker();
private final List builtinRules;
private final List userRules = new ArrayList<>(); // rules added via addRule() method
// rules fetched via getRelevantLanguageModelCapableRules()
private final Set optionalLanguageModelRules = new HashSet<>();
private final Set disabledRules = new HashSet<>();
private final Set disabledRuleCategories = new HashSet<>();
private final Set enabledRules = new HashSet<>();
private final Set enabledRuleCategories = new HashSet<>();
private final Language language;
private final List altLanguages;
private final Language motherTongue;
private final List matchFilters = new LinkedList<>();
private PrintStream printStream;
private boolean listUnknownWords;
private Set unknownWords;
private boolean cleanOverlappingMatches;
/**
* Constants for correct paragraph-rule handling.
*/
public enum ParagraphHandling {
/**
* Handle normally - all kinds of rules run.
*/
NORMAL,
/**
* Run only paragraph-level rules.
*/
ONLYPARA,
/**
* Run only sentence-level rules.
*/
ONLYNONPARA
}
public enum Mode {
// IMPORTANT: directly logged via toString into check_log database table.
// column is varchar(32), so take care to not exceed this length here
/** Use all active rules for checking. */
ALL,
/** Use only text-level rules for checking. This is typically much faster then using all rules or {@code ALL_BUT_TEXTLEVEL_ONLY}. */
TEXTLEVEL_ONLY,
/** Use all activate rules for checking except the text-level rules. */
ALL_BUT_TEXTLEVEL_ONLY
}
private static final List temporaryFiles = new ArrayList<>();
/**
* Create a JLanguageTool and setup the built-in rules for the
* given language and false friend rules for the text language / mother tongue pair.
*
* @param lang the language of the text to be checked
* @param motherTongue the user's mother tongue, used for false friend rules, or null
.
* The mother tongue may also be used as a source language for checking bilingual texts.
*/
public JLanguageTool(Language lang, Language motherTongue) {
this(lang, motherTongue, null);
}
/**
* Create a JLanguageTool and setup the built-in Java rules for the
* given language.
*
* @param language the language of the text to be checked
*/
public JLanguageTool(Language language) {
this(language, null, null, null);
}
/**
* Create a JLanguageTool and setup the built-in rules for the
* given language and false friend rules for the text language / mother tongue pair.
*
* @param language the language of the text to be checked
* @param motherTongue the user's mother tongue, used for false friend rules, or null
.
* The mother tongue may also be used as a source language for checking bilingual texts.
* @param cache a cache to speed up checking if the same sentences get checked more than once,
* e.g. when LT is running as a server and texts are re-checked due to changes
* @since 3.7
*/
public JLanguageTool(Language language, Language motherTongue, ResultCache cache) {
this(language, motherTongue, cache, null);
}
/**
* Create a JLanguageTool and setup the built-in rules for the
* given language and false friend rules for the text language / mother tongue pair.
*
* @param language the language of the text to be checked
* @param cache a cache to speed up checking if the same sentences get checked more than once,
* e.g. when LT is running as a server and texts are re-checked due to changes. Use
* {@code null} to deactivate the cache.
* @since 4.2
*/
@Experimental
public JLanguageTool(Language language, ResultCache cache, UserConfig userConfig) {
this(language, null, cache, userConfig);
}
/**
* Create a JLanguageTool and setup the built-in rules for the
* given language and false friend rules for the text language / mother tongue pair.
*
* @param language the language of the text to be checked
* @param altLanguages The languages that are accepted as alternative languages - currently this means
* words are accepted if they are in an alternative language and not similar to
* a word from {@code language}. If there's a similar word in {@code language},
* there will be an error of type {@link RuleMatch.Type#Hint} (EXPERIMENTAL)
* @param motherTongue the user's mother tongue, used for false friend rules, or null
.
* The mother tongue may also be used as a source language for checking bilingual texts.
* @param cache a cache to speed up checking if the same sentences get checked more than once,
* e.g. when LT is running as a server and texts are re-checked due to changes
* @since 4.3
*/
@Experimental
public JLanguageTool(Language language, List altLanguages, Language motherTongue, ResultCache cache,
GlobalConfig globalConfig, UserConfig userConfig) {
this.language = Objects.requireNonNull(language, "language cannot be null");
this.altLanguages = Objects.requireNonNull(altLanguages, "altLanguages cannot be null (but empty)");
this.motherTongue = motherTongue;
if (userConfig == null) {
this.userConfig = new UserConfig();
} else {
this.userConfig = userConfig;
}
this.globalConfig = globalConfig;
ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
builtinRules = getAllBuiltinRules(language, messages, userConfig, globalConfig);
this.cleanOverlappingMatches = true;
try {
activateDefaultPatternRules();
if (!language.hasNGramFalseFriendRule(motherTongue)) {
// use the old false friends, which always match, not depending on context
activateDefaultFalseFriendRules();
}
updateOptionalLanguageModelRules(null); // start out with rules without language model
} catch (Exception e) {
throw new RuntimeException("Could not activate rules", e);
}
this.cache = cache;
descProvider = new ShortDescriptionProvider();
}
/**
* Create a JLanguageTool and setup the built-in rules for the
* given language and false friend rules for the text language / mother tongue pair.
*
* @param language the language of the text to be checked
* @param motherTongue the user's mother tongue, used for false friend rules, or null
.
* The mother tongue may also be used as a source language for checking bilingual texts.
* @param cache a cache to speed up checking if the same sentences get checked more than once,
* e.g. when LT is running as a server and texts are re-checked due to changes
* @since 4.2
*/
@Experimental
public JLanguageTool(Language language, Language motherTongue, ResultCache cache, UserConfig userConfig) {
this(language, Collections.emptyList(), motherTongue, cache, null, userConfig);
}
/**
* The grammar checker needs resources from following
* directories:
*
* - {@code /resource}
* - {@code /rules}
*
* @return The currently set data broker which allows to obtain
* resources from the mentioned directories above. If no
* data broker was set, a new {@link DefaultResourceDataBroker} will
* be instantiated and returned.
* @since 1.0.1
*/
public static synchronized ResourceDataBroker getDataBroker() {
if (JLanguageTool.dataBroker == null) {
JLanguageTool.dataBroker = new DefaultResourceDataBroker();
}
return JLanguageTool.dataBroker;
}
/**
* The grammar checker needs resources from following
* directories:
*
* - {@code /resource}
* - {@code /rules}
*
* @param broker The new resource broker to be used.
* @since 1.0.1
*/
public static synchronized void setDataBroker(ResourceDataBroker broker) {
JLanguageTool.dataBroker = broker;
}
/**
* Whether the {@link #check(String)} methods store unknown words. If set to
* true
(default: false), you can get the list of unknown words
* using {@link #getUnknownWords()}.
*/
public void setListUnknownWords(boolean listUnknownWords) {
this.listUnknownWords = listUnknownWords;
}
/**
* Whether the {@link #check(String)} methods return overlapping errors. If set to
* true
(default: true), it removes overlapping errors according to
* the priorities established for the language.
* @since 3.6
*/
public void setCleanOverlappingMatches(boolean cleanOverlappingMatches) {
this.cleanOverlappingMatches = cleanOverlappingMatches;
}
/**
* Maximum errors per word rate, checking will stop with an exception if the rate is higher.
* For example, with a rate of 0.33, the checking would stop if the user's
* text has so many errors that more than every 3rd word causes a rule match.
* Note that this may not apply for very short texts.
* @since 4.0
*/
@Experimental
public void setMaxErrorsPerWordRate(float maxErrorsPerWordRate) {
this.maxErrorsPerWordRate = maxErrorsPerWordRate;
}
/**
* Gets the ResourceBundle (i18n strings) for the default language of the user's system.
*/
public static ResourceBundle getMessageBundle() {
return ResourceBundleTools.getMessageBundle();
}
/**
* Gets the ResourceBundle (i18n strings) for the given user interface language.
* @since 2.4 (public since 2.4)
*/
public static ResourceBundle getMessageBundle(Language lang) {
return ResourceBundleTools.getMessageBundle(lang);
}
private List getAllBuiltinRules(Language language, ResourceBundle messages, UserConfig userConfig, GlobalConfig globalConfig) {
try {
List rules = new ArrayList<>(language.getRelevantRules(messages, userConfig, motherTongue, altLanguages));
rules.addAll(language.getRelevantRulesGlobalConfig(messages, globalConfig, userConfig, motherTongue, altLanguages));
return rules;
} catch (IOException e) {
throw new RuntimeException("Could not get rules of language " + language, e);
}
}
/**
* Set a PrintStream that will receive verbose output. Set to
* {@code null} (which is the default) to disable verbose output.
*/
public void setOutput(PrintStream printStream) {
this.printStream = printStream;
}
/**
* Load pattern rules from an XML file. Use {@link #addRule(Rule)} to add these
* rules to the checking process.
* @param filename path to an XML file in the classpath or in the filesystem - the classpath is checked first
* @return a List of {@link PatternRule} objects
*/
public List loadPatternRules(String filename) throws IOException {
PatternRuleLoader ruleLoader = new PatternRuleLoader();
try (InputStream is = this.getClass().getResourceAsStream(filename)) {
if (is == null) {
// happens for external rules plugged in as an XML file or testing files:
if (filename.contains("-test-")) {
// ignore, for testing
return Collections.emptyList();
} else {
return ruleLoader.getRules(new File(filename));
}
} else {
return ruleLoader.getRules(is, filename);
}
}
}
/**
* Load false friend rules from an XML file. Only those pairs will be loaded
* that match the current text language and the mother tongue specified in the
* JLanguageTool constructor. Use {@link #addRule(Rule)} to add these rules to the
* checking process.
* @param filename path to an XML file in the classpath or in the filesystem - the classpath is checked first
* @return a List of {@link PatternRule} objects, or an empty list if mother tongue is not set
*/
public List loadFalseFriendRules(String filename)
throws ParserConfigurationException, SAXException, IOException {
if (motherTongue == null) {
return Collections.emptyList();
}
FalseFriendRuleLoader ruleLoader = new FalseFriendRuleLoader(motherTongue);
try (InputStream is = this.getClass().getResourceAsStream(filename)) {
if (is == null) {
return ruleLoader.getRules(new File(filename), language, motherTongue);
} else {
return ruleLoader.getRules(is, language, motherTongue);
}
}
}
/**
* Remove rules that can profit from a language model, recreate them with the given model and add them again
* @param lm the language model or null if none is available
*/
private void updateOptionalLanguageModelRules(@Nullable LanguageModel lm) {
ResourceBundle messages = getMessageBundle(language);
try {
List rules = language.getRelevantLanguageModelCapableRules(messages, lm, globalConfig, userConfig, motherTongue, altLanguages);
userRules.removeIf(rule -> optionalLanguageModelRules.contains(rule.getId()));
optionalLanguageModelRules.clear();
rules.stream().map(Rule::getId).forEach(optionalLanguageModelRules::add);
userRules.addAll(rules);
} catch(Exception e) {
throw new RuntimeException("Could not load language model capable rules.", e);
}
}
/**
* Activate rules that depend on pretrained neural network models.
* @param modelDir root dir of exported models
* @since 4.4
*/
public void activateNeuralNetworkRules(File modelDir) throws IOException {
ResourceBundle messages = getMessageBundle(language);
List rules = language.getRelevantNeuralNetworkModels(messages, modelDir);
userRules.addAll(rules);
}
/**
* Activate rules that depend on a language model. The language model currently
* consists of Lucene indexes with ngram occurrence counts.
* @param indexDir directory with a '3grams' sub directory which contains a Lucene index with 3gram occurrence counts
* @since 2.7
*/
public void activateLanguageModelRules(File indexDir) throws IOException {
LanguageModel languageModel = language.getLanguageModel(indexDir);
if (languageModel != null) {
ResourceBundle messages = getMessageBundle(language);
List rules = language.getRelevantLanguageModelRules(messages, languageModel, userConfig);
userRules.addAll(rules);
updateOptionalLanguageModelRules(languageModel);
}
}
private void transformRules(Function mapper, List rules) {
// transform this way because variables are final + could log where rule was changed
for (int i = 0; i < rules.size(); i++) {
Rule original = rules.get(i);
Rule transformed = mapper.apply(original);
if (transformed != original) {
rules.set(i, transformed);
}
}
}
public void activateRemoteRules(@Nullable File configFile) throws IOException {
try {
List configs;
if (configFile != null) {
configs = RemoteRuleConfig.load(configFile);
} else {
configs = Collections.emptyList();
}
List rules = language.getRelevantRemoteRules(getMessageBundle(language), configs,
globalConfig, userConfig, motherTongue, altLanguages);
userRules.addAll(rules);
Function enhanced = language.getRemoteEnhancedRules(getMessageBundle(language), configs, userConfig, motherTongue, altLanguages);
transformRules(enhanced, builtinRules);
transformRules(enhanced, userRules);
} catch (IOException e) {
throw new IOException("Could not load remote rules.", e);
} catch (ExecutionException e) {
throw new IOException("Could not load remote rules configuration at " + configFile.getAbsolutePath(), e);
}
}
/**
* Activate rules that depend on a word2vec language model.
* @param indexDir directory with a subdirectories like 'en', each containing dictionary.txt and final_embeddings.txt
* @since 4.0
*/
public void activateWord2VecModelRules(File indexDir) throws IOException {
Word2VecModel word2vecModel = language.getWord2VecModel(indexDir);
if (word2vecModel != null) {
ResourceBundle messages = getMessageBundle(language);
List rules = language.getRelevantWord2VecModelRules(messages, word2vecModel);
userRules.addAll(rules);
}
}
/**
* Loads and activates the pattern rules from
* {@code org/languagetool/rules//grammar.xml}.
*/
private void activateDefaultPatternRules() throws IOException {
List patternRules = language.getPatternRules();
List enabledRules = language.getDefaultEnabledRulesForVariant();
List disabledRules = language.getDefaultDisabledRulesForVariant();
if (!enabledRules.isEmpty() || !disabledRules.isEmpty()) {
for (AbstractPatternRule patternRule : patternRules) {
if (enabledRules.contains(patternRule.getId())) {
patternRule.setDefaultOn();
}
if (disabledRules.contains(patternRule.getId())) {
patternRule.setDefaultOff();
}
}
}
userRules.addAll(patternRules);
}
/**
* Loads and activates the false friend rules from
* rules/false-friends.xml
.
*/
private void activateDefaultFalseFriendRules()
throws ParserConfigurationException, SAXException, IOException {
String falseFriendRulesFilename = JLanguageTool.getDataBroker().getRulesDir() + "/" + FALSE_FRIEND_FILE;
userRules.addAll(loadFalseFriendRules(falseFriendRulesFilename));
}
/**
* Add a {@link RuleMatchFilter} for post-processing of rule matches
* Filters are called sequentially in the same order as added
* @since 4.7
* @param filter filter to add
*/
public void addMatchFilter(@NotNull RuleMatchFilter filter) {
matchFilters.add(Objects.requireNonNull(filter));
}
/**
* Add a rule to be used by the next call to the check methods like {@link #check(String)}.
*/
public void addRule(Rule rule) {
userRules.add(rule);
}
/**
* Disable a given rule so the check methods like {@link #check(String)} won't use it.
* @param ruleId the id of the rule to disable - no error will be thrown if the id does not exist
* @see #enableRule(String)
*/
public void disableRule(String ruleId) {
disabledRules.add(ruleId);
enabledRules.remove(ruleId);
}
/**
* Disable the given rules so the check methods like {@link #check(String)} won't use them.
* @param ruleIds the ids of the rules to disable - no error will be thrown if the id does not exist
* @since 2.4
*/
public void disableRules(List ruleIds) {
disabledRules.addAll(ruleIds);
enabledRules.removeAll(ruleIds);
}
/**
* Disable the given rule category so the check methods like {@link #check(String)} won't use it.
* @param id the id of the category to disable - no error will be thrown if the id does not exist
* @since 3.3
* @see #enableRuleCategory(CategoryId)
*/
public void disableCategory(CategoryId id) {
disabledRuleCategories.add(id);
enabledRuleCategories.remove(id);
}
/**
* Returns true if a category is explicitly disabled.
*
* @param id the id of the category to check - no error will be thrown if the id does not exist
* @return true if this category is explicitly disabled.
* @since 3.5
* @see #disableCategory(org.languagetool.rules.CategoryId)
*/
public boolean isCategoryDisabled(CategoryId id) {
return disabledRuleCategories.contains(id);
}
/**
* Get the language that was used to configure this instance.
*/
public Language getLanguage() {
return language;
}
/**
* Get rule ids of the rules that have been explicitly disabled.
*/
public Set getDisabledRules() {
return disabledRules;
}
/**
* Enable a given rule so the check methods like {@link #check(String)} will use it.
* This will not throw an exception if the given rule id doesn't exist.
* @param ruleId the id of the rule to enable
* @see #disableRule(String)
*/
public void enableRule(String ruleId) {
disabledRules.remove(ruleId);
enabledRules.add(ruleId);
}
/**
* Enable all rules of the given category so the check methods like {@link #check(String)} will use it.
* This will not throw an exception if the given rule id doesn't exist.
* @since 3.3
* @see #disableCategory(org.languagetool.rules.CategoryId)
*/
public void enableRuleCategory(CategoryId id) {
disabledRuleCategories.remove(id);
enabledRuleCategories.add(id);
}
/**
* Tokenizes the given text into sentences.
*/
public List sentenceTokenize(String text) {
return language.getSentenceTokenizer().tokenize(text);
}
/**
* The main check method. Tokenizes the text into sentences and matches these
* sentences against all currently active rules.
*
* @param text the text to be checked
* @return a List of {@link RuleMatch} objects
*/
public List check(String text) throws IOException {
return check(text, true, ParagraphHandling.NORMAL);
}
/**
* The main check method. Tokenizes the text into sentences and matches these
* sentences against all currently active rules.
*
* @param text the text to be checked
* @return a List of {@link RuleMatch} objects
* @since 3.7
*/
public List check(String text, RuleMatchListener listener) throws IOException {
return check(text, true, ParagraphHandling.NORMAL, listener);
}
public List check(String text, boolean tokenizeText, ParagraphHandling paraMode) throws IOException {
return check(new AnnotatedTextBuilder().addText(text).build(), tokenizeText, paraMode);
}
/**
* @since 3.7
*/
public List check(String text, boolean tokenizeText, ParagraphHandling paraMode, RuleMatchListener listener) throws IOException {
return check(new AnnotatedTextBuilder().addText(text).build(), tokenizeText, paraMode, listener);
}
/**
* The main check method. Tokenizes the text into sentences and matches these
* sentences against all currently active rules, adjusting error positions so they refer
* to the original text including markup.
* @since 2.3
*/
public List check(AnnotatedText text) throws IOException {
return check(text, true, ParagraphHandling.NORMAL);
}
/**
* @since 3.9
*/
public List check(AnnotatedText text, RuleMatchListener listener) throws IOException {
return check(text, true, ParagraphHandling.NORMAL, listener);
}
/**
* The main check method. Tokenizes the text into sentences and matches these
* sentences against all currently active rules.
* @param annotatedText The text to be checked, created with {@link AnnotatedTextBuilder}.
* Call this method with the complete text to be checked. If you call it
* repeatedly with smaller chunks like paragraphs or sentence, those rules that work across
* paragraphs/sentences won't work (their status gets reset whenever this method is called).
* @param tokenizeText If true, then the text is tokenized into sentences.
* Otherwise, it is assumed it's already tokenized, i.e. it is only one sentence
* @param paraMode Uses paragraph-level rules only if true.
* @return a List of {@link RuleMatch} objects, describing potential errors in the text
* @since 2.3
*/
public List check(AnnotatedText annotatedText, boolean tokenizeText, ParagraphHandling paraMode) throws IOException {
return check(annotatedText, tokenizeText, paraMode, null);
}
/**
* The main check method. Tokenizes the text into sentences and matches these
* sentences against all currently active rules.
* @since 3.7
*/
public List check(AnnotatedText annotatedText, boolean tokenizeText, ParagraphHandling paraMode, RuleMatchListener listener) throws IOException {
Mode mode;
if(paraMode == ParagraphHandling.ONLYNONPARA) {
mode = Mode.ALL_BUT_TEXTLEVEL_ONLY;
} else if(paraMode == ParagraphHandling.ONLYPARA) {
mode = Mode.TEXTLEVEL_ONLY;
} else {
mode = Mode.ALL;
}
return check(annotatedText, tokenizeText, paraMode, listener, mode);
}
/**
* The main check method. Tokenizes the text into sentences and matches these
* sentences against all currently active rules depending on {@code mode}.
* @since 4.3
*/
public List check(AnnotatedText annotatedText, boolean tokenizeText, ParagraphHandling paraMode, RuleMatchListener listener, Mode mode) throws IOException {
return check(annotatedText, tokenizeText, paraMode, listener, mode, null);
}
/**
* The main check method. Tokenizes the text into sentences and matches these
* sentences against all currently active rules depending on {@code mode}.
* @param remoteRulesThreadPool when given, starts evaluating remote rules asynchronously before checking other rules,
* then waits on result afterwards
* @since 4.6
*/
public List check(AnnotatedText annotatedText, boolean tokenizeText, ParagraphHandling paraMode, RuleMatchListener listener, Mode mode, @Nullable ExecutorService remoteRulesThreadPool) throws IOException {
List sentences;
if (tokenizeText) {
sentences = sentenceTokenize(annotatedText.getPlainText());
} else {
sentences = new ArrayList<>();
sentences.add(annotatedText.getPlainText());
}
List allRules = getAllRules();
if (printStream != null) {
printIfVerbose(allRules.size() + " rules activated for language " + language);
}
unknownWords = new HashSet<>();
List analyzedSentences = analyzeSentences(sentences);
List remoteMatches = Collections.emptyList();
List>> remoteRuleTasks = null;
if (remoteRulesThreadPool != null && mode != Mode.TEXTLEVEL_ONLY) {
remoteRuleTasks = allRules.stream()
.filter(rule -> rule instanceof RemoteRule)
.map(rule -> ((RemoteRule) rule).run(analyzedSentences))
.collect(Collectors.toList());
remoteRuleTasks.forEach(remoteRulesThreadPool::submit);
}
List ruleMatches = performCheck(analyzedSentences, sentences, allRules,
paraMode, annotatedText, listener, mode, remoteRulesThreadPool == null);
if (remoteRuleTasks != null) {
remoteMatches = remoteRuleTasks.stream()
.flatMap(task -> {
try {
return task.get().stream(); // can wait without timeout here, implemented in RemoteRule and TextChecker
} catch (InterruptedException | ExecutionException e) {
logger.warn("Failed to fetch result from remote rule.", e);
return Stream.empty();
}
}).collect(Collectors.toList());
for (RuleMatch match : remoteMatches) {
match.setSuggestedReplacementObjects(extendSuggestions(match.getSuggestedReplacementObjects()));
}
}
ruleMatches.addAll(remoteMatches);
ruleMatches = new SameRuleGroupFilter().filter(ruleMatches);
// no sorting: SameRuleGroupFilter sorts rule matches already
if (cleanOverlappingMatches) {
ruleMatches = new CleanOverlappingFilter(language).filter(ruleMatches);
}
ruleMatches = new LanguageDependentFilter(language, this.enabledRules, this.disabledRuleCategories).filter(ruleMatches);
ruleMatches = applyCustomFilters(ruleMatches, annotatedText);
return ruleMatches;
}
/**
* Use this method if you want to access LanguageTool's otherwise
* internal analysis of the text. For actual text checking, use the {@code check...} methods instead.
* @param text The text to be analyzed
* @since 2.5
*/
public List analyzeText(String text) throws IOException {
List sentences = sentenceTokenize(text);
return analyzeSentences(sentences);
}
protected List analyzeSentences(List sentences) throws IOException {
List analyzedSentences = new ArrayList<>();
int j = 0;
for (String sentence : sentences) {
AnalyzedSentence analyzedSentence = getAnalyzedSentence(sentence);
rememberUnknownWords(analyzedSentence);
if (++j == sentences.size()) {
AnalyzedTokenReadings[] anTokens = analyzedSentence.getTokens();
anTokens[anTokens.length - 1].setParagraphEnd();
analyzedSentence = new AnalyzedSentence(anTokens);
}
analyzedSentences.add(analyzedSentence);
printSentenceInfo(analyzedSentence);
}
return analyzedSentences;
}
protected void printSentenceInfo(AnalyzedSentence analyzedSentence) {
if (printStream != null) {
printIfVerbose(analyzedSentence.toString());
printIfVerbose(analyzedSentence.getAnnotations());
}
}
protected List performCheck(List analyzedSentences, List sentences,
List allRules, ParagraphHandling paraMode, AnnotatedText annotatedText, Mode mode) throws IOException {
return performCheck(analyzedSentences, sentences, allRules, paraMode, annotatedText, null, mode, true);
}
/**
* @since 3.7
*/
protected List performCheck(List analyzedSentences, List sentences,
List allRules, ParagraphHandling paraMode, AnnotatedText annotatedText, RuleMatchListener listener, Mode mode, boolean checkRemoteRules) throws IOException {
Callable> matcher = new TextCheckCallable(allRules, sentences, analyzedSentences, paraMode, annotatedText, 0, 0, 1, listener, mode, checkRemoteRules);
try {
return matcher.call();
} catch (IOException e) {
throw e;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/**
* This is an internal method that's public only for technical reasons, please use one
* of the {@link #check(String)} methods instead.
* @since 2.3
*/
public List checkAnalyzedSentence(ParagraphHandling paraMode,
List rules, AnalyzedSentence analyzedSentence) throws IOException {
return checkAnalyzedSentence(paraMode, rules, analyzedSentence, false);
}
/**
* This is an internal method that's public only for technical reasons, please use one
* of the {@link #check(String)} methods instead.
* @since 4.9
*/
public List checkAnalyzedSentence(ParagraphHandling paraMode,
List rules, AnalyzedSentence analyzedSentence, boolean checkRemoteRules) throws IOException {
List sentenceMatches = new ArrayList<>();
RuleLoggerManager logger = RuleLoggerManager.getInstance();
for (Rule rule : rules) {
if (rule instanceof TextLevelRule) {
continue;
}
if (!checkRemoteRules && rule instanceof RemoteRule) {
continue;
}
if (ignoreRule(rule)) {
continue;
}
if (rule instanceof PatternRule && ((PatternRule)rule).canBeIgnoredFor(analyzedSentence)) {
// this is a performance optimization, it should have no effect on matching logic
continue;
}
if (paraMode == ParagraphHandling.ONLYPARA) {
continue;
}
long time = System.currentTimeMillis();
RuleMatch[] thisMatches = rule.match(analyzedSentence);
logger.log(new RuleCheckTimeMessage(rule.getId(), language.getShortCodeWithCountryAndVariant(),
time, analyzedSentence.getText().length()), Level.FINE);
for (RuleMatch elem : thisMatches) {
sentenceMatches.add(elem);
}
}
AnnotatedText text = new AnnotatedTextBuilder().addText(analyzedSentence.getText()).build();
return applyCustomFilters(new SameRuleGroupFilter().filter(sentenceMatches),text);
}
private boolean ignoreRule(Rule rule) {
Category ruleCategory = rule.getCategory();
boolean isCategoryDisabled = (disabledRuleCategories.contains(ruleCategory.getId()) || rule.getCategory().isDefaultOff())
&& !enabledRuleCategories.contains(ruleCategory.getId());
boolean isRuleDisabled = disabledRules.contains(rule.getId())
|| (rule.isDefaultOff() && !enabledRules.contains(rule.getId()));
boolean isDisabled;
if (isCategoryDisabled) {
isDisabled = !enabledRules.contains(rule.getId());
} else {
isDisabled = isRuleDisabled;
}
return isDisabled;
}
/**
* Change RuleMatch positions so they are relative to the complete text,
* not just to the sentence.
* @param charCount Count of characters in the sentences before
* @param columnCount Current column number
* @param lineCount Current line number
* @param sentence The text being checked
* @return The RuleMatch object with adjustments
*/
public RuleMatch adjustRuleMatchPos(RuleMatch match, int charCount,
int columnCount, int lineCount, String sentence, AnnotatedText annotatedText) {
int fromPos = match.getFromPos() + charCount;
int toPos = match.getToPos() + charCount;
if (annotatedText != null) {
fromPos = annotatedText.getOriginalTextPositionFor(fromPos, false);
toPos = annotatedText.getOriginalTextPositionFor(toPos -1, true) + 1;
}
RuleMatch thisMatch = new RuleMatch(match);
thisMatch.setOffsetPosition(fromPos, toPos);
List replacements = match.getSuggestedReplacementObjects();
thisMatch.setSuggestedReplacementObjects(extendSuggestions(replacements));
String sentencePartToError = sentence.substring(0, match.getFromPos());
String sentencePartToEndOfError = sentence.substring(0, match.getToPos());
int lastLineBreakPos = sentencePartToError.lastIndexOf('\n');
int column;
int endColumn;
if (lastLineBreakPos == -1) {
column = sentencePartToError.length() + columnCount;
} else {
column = sentencePartToError.length() - lastLineBreakPos;
}
int lastLineBreakPosInError = sentencePartToEndOfError.lastIndexOf('\n');
if (lastLineBreakPosInError == -1) {
endColumn = sentencePartToEndOfError.length() + columnCount;
} else {
endColumn = sentencePartToEndOfError.length() - lastLineBreakPosInError;
}
int lineBreaksToError = countLineBreaks(sentencePartToError);
int lineBreaksToEndOfError = countLineBreaks(sentencePartToEndOfError);
thisMatch.setLine(lineCount + lineBreaksToError);
thisMatch.setEndLine(lineCount + lineBreaksToEndOfError);
thisMatch.setColumn(column);
thisMatch.setEndColumn(endColumn);
return thisMatch;
}
private List extendSuggestions(List replacements) {
List extended = new ArrayList<>();
for (SuggestedReplacement replacement : replacements) {
SuggestedReplacement newReplacement = new SuggestedReplacement(replacement);
if (replacement.getShortDescription() == null) { // don't overwrite more specific suggestions from the rule
String descOrNull = descProvider.getShortDescription(replacement.getReplacement(), language);
newReplacement.setShortDescription(descOrNull);
newReplacement.setSuffix(replacement.getSuffix());
}
extended.add(newReplacement);
}
return extended;
}
protected void rememberUnknownWords(AnalyzedSentence analyzedText) {
if (listUnknownWords) {
AnalyzedTokenReadings[] atr = analyzedText.getTokensWithoutWhitespace();
for (AnalyzedTokenReadings reading : atr) {
if (!reading.isTagged()) {
unknownWords.add(reading.getToken());
}
}
}
}
/**
* Get the alphabetically sorted list of unknown words in the latest run of one of the {@link #check(String)} methods.
* @throws IllegalStateException if {@link #setListUnknownWords(boolean)} has been set to {@code false}
*/
public List getUnknownWords() {
if (!listUnknownWords) {
throw new IllegalStateException("listUnknownWords is set to false, unknown words not stored");
}
List words = new ArrayList<>(unknownWords);
Collections.sort(words);
return words;
}
// non-private only for test case
static int countLineBreaks(String s) {
int pos = -1;
int count = 0;
while (true) {
int nextPos = s.indexOf('\n', pos + 1);
if (nextPos == -1) {
break;
}
pos = nextPos;
count++;
}
return count;
}
/**
* Tokenizes the given {@code sentence} into words and analyzes it,
* and then disambiguates POS tags.
* @param sentence sentence to be analyzed
*/
public AnalyzedSentence getAnalyzedSentence(String sentence) throws IOException {
SimpleInputSentence cacheKey = new SimpleInputSentence(sentence, language);
AnalyzedSentence cachedSentence = cache != null ? cache.getIfPresent(cacheKey) : null;
if (cachedSentence != null) {
return cachedSentence;
} else {
AnalyzedSentence raw = getRawAnalyzedSentence(sentence);
AnalyzedSentence disambig = language.getDisambiguator().disambiguate(raw);
AnalyzedSentence analyzedSentence = new AnalyzedSentence(disambig.getTokens(), raw.getTokens());
if (language.getPostDisambiguationChunker() != null) {
language.getPostDisambiguationChunker().addChunkTags(Arrays.asList(analyzedSentence.getTokens()));
}
if (cache != null) {
cache.put(cacheKey, analyzedSentence);
}
return analyzedSentence;
}
}
/**
* Tokenizes the given {@code sentence} into words and analyzes it.
* This is the same as {@link #getAnalyzedSentence(String)} but it does not run
* the disambiguator.
* @param sentence sentence to be analyzed
* @since 0.9.8
*/
public AnalyzedSentence getRawAnalyzedSentence(String sentence) throws IOException {
List tokens = language.getWordTokenizer().tokenize(sentence);
Map softHyphenTokens = replaceSoftHyphens(tokens);
List aTokens = language.getTagger().tag(tokens);
if (language.getChunker() != null) {
language.getChunker().addChunkTags(aTokens);
}
AnalyzedTokenReadings[] tokenArray = new AnalyzedTokenReadings[tokens.size() + 1];
AnalyzedToken[] startTokenArray = new AnalyzedToken[1];
int toArrayCount = 0;
AnalyzedToken sentenceStartToken = new AnalyzedToken("", SENTENCE_START_TAGNAME, null);
startTokenArray[0] = sentenceStartToken;
tokenArray[toArrayCount++] = new AnalyzedTokenReadings(startTokenArray, 0);
int startPos = 0;
for (AnalyzedTokenReadings posTag : aTokens) {
posTag.setStartPos(startPos);
tokenArray[toArrayCount++] = posTag;
startPos += posTag.getToken().length();
}
int numTokens = aTokens.size();
int posFix = 0;
for (int i = 0; i < numTokens; i++) {
if( i > 0 ) {
aTokens.get(i).setWhitespaceBefore(aTokens.get(i - 1).getToken());
aTokens.get(i).setStartPos(aTokens.get(i).getStartPos() + posFix);
}
if (!softHyphenTokens.isEmpty() && softHyphenTokens.get(i) != null) {
// addReading() modifies a readings.token if last token is longer - need to use it first
posFix += softHyphenTokens.get(i).length() - aTokens.get(i).getToken().length();
AnalyzedToken newToken = language.getTagger().createToken(softHyphenTokens.get(i), null);
aTokens.get(i).addReading(newToken, "softHyphenTokens");
}
}
// add additional tags
int lastToken = toArrayCount - 1;
// make SENT_END appear at last not whitespace token
for (int i = 0; i < toArrayCount - 1; i++) {
if (!tokenArray[lastToken - i].isWhitespace()) {
lastToken -= i;
break;
}
}
tokenArray[lastToken].setSentEnd();
if (tokenArray.length == lastToken + 1 && tokenArray[lastToken].isLinebreak()) {
tokenArray[lastToken].setParagraphEnd();
}
return new AnalyzedSentence(tokenArray);
}
private Map replaceSoftHyphens(List tokens) {
Pattern ignoredCharacterRegex = language.getIgnoredCharactersRegex();
Map ignoredCharsTokens = new HashMap<>();
if (ignoredCharacterRegex == null) {
return ignoredCharsTokens;
}
for (int i = 0; i < tokens.size(); i++) {
Matcher matcher = ignoredCharacterRegex.matcher(tokens.get(i));
if (matcher.find()) {
ignoredCharsTokens.put(i, tokens.get(i));
tokens.set(i, matcher.replaceAll(""));
}
}
return ignoredCharsTokens;
}
/**
* Get all rule categories for the current language.
*
* @return a map of {@link Category Categories}, keyed by their {@link CategoryId id}.
* @since 3.5
*/
public Map getCategories() {
Map map = new HashMap<>();
for (Rule rule : getAllRules()) {
map.put(rule.getCategory().getId(), rule.getCategory());
}
return map;
}
/**
* Get all rules for the current language that are built-in or that have been
* added using {@link #addRule(Rule)}. Please note that XML rules that are grouped
* will appear as multiple rules with the same id. To tell them apart, check if
* they are of type {@code AbstractPatternRule}, cast them to that type and call
* their {@link AbstractPatternRule#getSubId()} method.
* @return a List of {@link Rule} objects
*/
public List getAllRules() {
List rules = new ArrayList<>();
rules.addAll(builtinRules);
rules.addAll(userRules);
return rules;
}
/**
* Get all active (not disabled) rules for the current language that are built-in or that
* have been added using e.g. {@link #addRule(Rule)}. See {@link #getAllRules()} for hints
* about rule ids.
* @return a List of {@link Rule} objects
*/
public List getAllActiveRules() {
List rules = new ArrayList<>();
List rulesActive = new ArrayList<>();
rules.addAll(builtinRules);
rules.addAll(userRules);
// Some rules have an internal state so they can do checks over sentence
// boundaries. These need to be reset so the checks don't suddenly
// work on different texts with the same data. However, it could be useful
// to keep the state information if we're checking a continuous text.
for (Rule rule : rules) {
if (!ignoreRule(rule)) {
rulesActive.add(rule);
}
}
return rulesActive;
}
/**
* Works like getAllActiveRules but overrides defaults by office defaults
* @return a List of {@link Rule} objects
* @since 4.0
*/
public List getAllActiveOfficeRules() {
List rules = new ArrayList<>();
List rulesActive = new ArrayList<>();
rules.addAll(builtinRules);
rules.addAll(userRules);
for (Rule rule : rules) {
if (!ignoreRule(rule) && !rule.isOfficeDefaultOff()) {
rulesActive.add(rule);
} else if (rule.isOfficeDefaultOn() && !disabledRules.contains(rule.getId())) {
rulesActive.add(rule);
enableRule(rule.getId());
} else if (!ignoreRule(rule) && rule.isOfficeDefaultOff() && !enabledRules.contains(rule.getId())) {
disableRule(rule.getId());
}
}
return rulesActive;
}
/**
* Get pattern rules by Id and SubId. This returns a list because rules that use {@code ... }
* are internally expanded into several rules.
* @return a List of {@link Rule} objects
* @since 2.3
*/
public List getPatternRulesByIdAndSubId(String id, String subId) {
List rules = getAllRules();
List rulesById = new ArrayList<>();
for (Rule rule : rules) {
if (rule instanceof AbstractPatternRule &&
rule.getId().equals(id) && ((AbstractPatternRule) rule).getSubId().equals(subId)) {
rulesById.add((AbstractPatternRule) rule);
}
}
return rulesById;
}
protected void printIfVerbose(String s) {
if (printStream != null) {
printStream.println(s);
}
}
/**
* Adds a temporary file to the internal list
* (internal method, you should never need to call this as a user of LanguageTool)
* @param file the file to be added.
*/
public static void addTemporaryFile(File file) {
temporaryFiles.add(file);
}
/**
* Clean up all temporary files, if there are any.
*/
public static void removeTemporaryFiles() {
for (File file : temporaryFiles) {
file.delete();
}
}
/**
* should be called just once with complete list of matches, before returning them to caller
* @param matches matches after applying rules and default filters
* @param text text that matches refer to
* @return transformed matches (after applying filters in {@link matchFilters})
* @since 4.7
*/
protected List applyCustomFilters(List matches, AnnotatedText text) {
List transformed = matches;
for (RuleMatchFilter filter : matchFilters) {
transformed = filter.filter(transformed, text);
}
return transformed;
}
class TextCheckCallable implements Callable> {
private final List rules;
private final boolean checkRemoteRules;
private final ParagraphHandling paraMode;
private final AnnotatedText annotatedText;
private final List sentences;
private final List analyzedSentences;
private final RuleMatchListener listener;
private final Mode mode;
private int charCount;
private int lineCount;
private int columnCount;
TextCheckCallable(List rules, List sentences, List analyzedSentences,
ParagraphHandling paraMode, AnnotatedText annotatedText, int charCount, int lineCount, int columnCount,
RuleMatchListener listener, Mode mode, boolean checkRemoteRules) {
this.rules = rules;
this.checkRemoteRules = checkRemoteRules;
if (sentences.size() != analyzedSentences.size()) {
throw new IllegalArgumentException("sentences and analyzedSentences do not have the same length : " + sentences.size() + " != " + analyzedSentences.size());
}
this.sentences = Objects.requireNonNull(sentences);
this.analyzedSentences = Objects.requireNonNull(analyzedSentences);
this.paraMode = Objects.requireNonNull(paraMode);
this.annotatedText = Objects.requireNonNull(annotatedText);
this.charCount = charCount;
this.lineCount = lineCount;
this.columnCount = columnCount;
this.listener = listener;
this.mode = Objects.requireNonNull(mode);
}
@Override
public List call() throws Exception {
List ruleMatches = new ArrayList<>();
if (mode == Mode.ALL) {
ruleMatches.addAll(getTextLevelRuleMatches());
ruleMatches.addAll(getOtherRuleMatches());
} else if (mode == Mode.ALL_BUT_TEXTLEVEL_ONLY) {
ruleMatches.addAll(getOtherRuleMatches());
} else if (mode == Mode.TEXTLEVEL_ONLY) {
ruleMatches.addAll(getTextLevelRuleMatches());
} else {
throw new IllegalArgumentException("Unknown mode: " + mode);
}
// can't call applyCustomRuleFilters here, done in performCheck ->
// should run just once w/ complete list of matches
return ruleMatches;
}
private List getTextLevelRuleMatches() throws IOException {
List ruleMatches = new ArrayList<>();
RuleLoggerManager logger = RuleLoggerManager.getInstance();
String lang = language.getShortCodeWithCountryAndVariant();
for (Rule rule : rules) {
if (rule instanceof TextLevelRule && !ignoreRule(rule) && paraMode != ParagraphHandling.ONLYNONPARA) {
long time = System.currentTimeMillis();
RuleMatch[] matches = ((TextLevelRule) rule).match(analyzedSentences, annotatedText);
logger.log(new RuleCheckTimeMessage(rule.getId(), lang,
time, annotatedText.getPlainText().length()), Level.FINE);
List adaptedMatches = new ArrayList<>();
for (RuleMatch match : matches) {
LineColumnRange range = getLineColumnRange(match);
int newFromPos = annotatedText.getOriginalTextPositionFor(match.getFromPos(), false);
int newToPos = annotatedText.getOriginalTextPositionFor(match.getToPos() - 1, true) + 1;
RuleMatch newMatch = new RuleMatch(match);
newMatch.setOffsetPosition(newFromPos, newToPos);
newMatch.setLine(range.from.line);
newMatch.setEndLine(range.to.line);
if (match.getLine() == 0) {
newMatch.setColumn(range.from.column + 1);
} else {
newMatch.setColumn(range.from.column);
}
newMatch.setEndColumn(range.to.column);
newMatch.setSuggestedReplacementObjects(extendSuggestions(match.getSuggestedReplacementObjects()));
adaptedMatches.add(newMatch);
}
ruleMatches.addAll(adaptedMatches);
if (listener != null) {
for (RuleMatch adaptedMatch : adaptedMatches) {
listener.matchFound(adaptedMatch);
}
}
}
}
return ruleMatches;
}
private List getOtherRuleMatches() {
List ruleMatches = new ArrayList<>();
int i = 0;
int wordCounter = 0;
for (AnalyzedSentence analyzedSentence : analyzedSentences) {
String sentence = sentences.get(i++);
wordCounter += analyzedSentence.getTokensWithoutWhitespace().length;
try {
List sentenceMatches = null;
InputSentence cacheKey = null;
if (cache != null) {
cacheKey = new InputSentence(analyzedSentence.getText(), language, motherTongue,
disabledRules, disabledRuleCategories,
enabledRules, enabledRuleCategories, userConfig, altLanguages, mode);
sentenceMatches = cache.getIfPresent(cacheKey);
}
if (sentenceMatches == null) {
sentenceMatches = checkAnalyzedSentence(paraMode, rules, analyzedSentence, checkRemoteRules);
}
if (cache != null) {
cache.put(cacheKey, sentenceMatches);
}
List adaptedMatches = new ArrayList<>();
for (RuleMatch elem : sentenceMatches) {
RuleMatch thisMatch = adjustRuleMatchPos(elem, charCount, columnCount, lineCount, sentence, annotatedText);
adaptedMatches.add(thisMatch);
if (listener != null) {
listener.matchFound(thisMatch);
}
}
ruleMatches.addAll(adaptedMatches);
float errorsPerWord = ruleMatches.size() / (float)wordCounter;
//System.out.println("errorPerWord " + errorsPerWord + " (matches: " + ruleMatches.size() + " / " + wordCounter + ")");
if (maxErrorsPerWordRate > 0 && errorsPerWord > maxErrorsPerWordRate && wordCounter > 25) {
CommonWords commonWords = new CommonWords();
throw new ErrorRateTooHighException("Text checking was stopped due to too many errors (more than " + String.format("%.0f", maxErrorsPerWordRate*100) +
"% of words seem to have an error). Are you sure you have set the correct text language? Language set: " + JLanguageTool.this.language.getName() +
", text length: " + annotatedText.getPlainText().length() + ", common word count: " + commonWords.getKnownWordsPerLanguage(annotatedText.getPlainText()));
}
charCount += sentence.length();
lineCount += countLineBreaks(sentence);
// calculate matching column:
int lineBreakPos = sentence.lastIndexOf('\n');
if (lineBreakPos == -1) {
columnCount += sentence.length();
} else {
if (lineBreakPos == 0) {
columnCount = sentence.length();
if (!language.getSentenceTokenizer().singleLineBreaksMarksPara()) {
columnCount--;
}
} else {
columnCount = sentence.length() - lineBreakPos;
}
}
} catch (ErrorRateTooHighException e) {
throw e;
} catch (Exception e) {
throw new RuntimeException("Could not check sentence (language: " + language + "): '"
+ StringUtils.abbreviate(analyzedSentence.toTextString(), 500) + "'", e);
}
}
return ruleMatches;
}
private LineColumnRange getLineColumnRange(RuleMatch match) {
LineColumnPosition fromPos = new LineColumnPosition(-1, -1);
LineColumnPosition toPos = new LineColumnPosition(-1, -1);
LineColumnPosition pos = new LineColumnPosition(0, 0);
int charCount = 0;
for (AnalyzedSentence analyzedSentence : analyzedSentences) {
for (AnalyzedTokenReadings readings : analyzedSentence.getTokens()) {
String token = readings.getToken();
if ("\n".equals(token)) {
pos.line++;
pos.column = 0;
}
pos.column += token.length();
charCount += token.length();
if (charCount == match.getFromPos()) {
fromPos = new LineColumnPosition(pos.line, pos.column);
}
if (charCount == match.getToPos()) {
toPos = new LineColumnPosition(pos.line, pos.column);
}
}
}
return new LineColumnRange(fromPos, toPos);
}
private class LineColumnPosition {
int line;
int column;
private LineColumnPosition(int line, int column) {
this.line = line;
this.column = column;
}
}
private class LineColumnRange {
LineColumnPosition from;
LineColumnPosition to;
private LineColumnRange(LineColumnPosition from, LineColumnPosition to) {
this.from = from;
this.to = to;
}
}
}
public void setConfigValues(Map v) {
userConfig.insertConfigValues(v);
}
}