org.languagetool.JLanguageTool

LanguageTool is Open Source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it detects some grammar problems.
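Before the source listing, here is a minimal usage sketch built only from the public API visible below (the JLanguageTool(Language) constructor and check(String)). It assumes an English language module providing org.languagetool.language.AmericanEnglish is on the classpath; the class name CheckExample and the sample text are illustrative only.

import java.util.List;

import org.languagetool.JLanguageTool;
import org.languagetool.language.AmericanEnglish;
import org.languagetool.rules.RuleMatch;

public class CheckExample {
  public static void main(String[] args) throws Exception {
    // Create the Language once; create one JLanguageTool per thread
    // (see the thread-safety note in the class javadoc below).
    JLanguageTool lt = new JLanguageTool(new AmericanEnglish());
    List<RuleMatch> matches = lt.check("This are an example with a error.");
    for (RuleMatch match : matches) {
      System.out.println("Potential error at characters "
          + match.getFromPos() + "-" + match.getToPos() + ": " + match.getMessage());
      System.out.println("Suggestion(s): " + match.getSuggestedReplacements());
    }
  }
}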

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool;

import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
import org.languagetool.databroker.DefaultResourceDataBroker;
import org.languagetool.databroker.ResourceDataBroker;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.markup.AnnotatedText;
import org.languagetool.markup.AnnotatedTextBuilder;
import org.languagetool.rules.*;
import org.languagetool.rules.neuralnetwork.Word2VecModel;
import org.languagetool.rules.patterns.AbstractPatternRule;
import org.languagetool.rules.patterns.FalseFriendRuleLoader;
import org.languagetool.rules.patterns.PatternRule;
import org.languagetool.rules.patterns.PatternRuleLoader;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.net.JarURLConnection;
import java.net.URL;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.jar.Manifest;
import java.util.regex.Pattern;

/**
 * The main class used for checking text against different rules:
 * <ul>
 *   <li>built-in Java rules (for English: a vs. an, whitespace after commas, ...)</li>
 *   <li>built-in pattern rules loaded from external XML files (usually called {@code grammar.xml})</li>
 *   <li>your own implementation of the abstract {@link Rule} classes added with {@link #addRule(Rule)}</li>
 * </ul>
 *
 * <p>You will probably want to use the sub class {@link MultiThreadedJLanguageTool} for best performance.
 *
 * <p>Thread-safety: this class is not thread safe. Create one instance per thread,
 * but create the language only once (e.g. {@code new AmericanEnglish()}) and use it for all
 * instances of JLanguageTool.
 *
 * @see MultiThreadedJLanguageTool
 */
public class JLanguageTool {

  /** LanguageTool version as a string like {@code 2.3} or {@code 2.4-SNAPSHOT}. */
  public static final String VERSION = "4.1";
  /** LanguageTool build date and time like {@code 2013-10-17 16:10} or {@code null} if not run from JAR. */
  @Nullable
  public static final String BUILD_DATE = getBuildDate();

  /** The name of the file with error patterns. */
  public static final String PATTERN_FILE = "grammar.xml";
  /** The name of the file with false friend information. */
  public static final String FALSE_FRIEND_FILE = "false-friends.xml";
  /** The internal tag used to mark the beginning of a sentence. */
  public static final String SENTENCE_START_TAGNAME = "SENT_START";
  /** The internal tag used to mark the end of a sentence. */
  public static final String SENTENCE_END_TAGNAME = "SENT_END";
  /** The internal tag used to mark the end of a paragraph. */
  public static final String PARAGRAPH_END_TAGNAME = "PARA_END";
  /** Name of the message bundle for translations. */
  public static final String MESSAGE_BUNDLE = "org.languagetool.MessagesBundle";

  private final ResultCache cache;

  private float maxErrorsPerWordRate;

  /**
   * Returns the build date or {@code null} if not run from JAR.
   */
  @Nullable
  private static String getBuildDate() {
    try {
      URL res = JLanguageTool.class.getResource(JLanguageTool.class.getSimpleName() + ".class");
      if (res == null) {
        // this will happen on Android, see http://stackoverflow.com/questions/15371274/
        return null;
      }
      Object connObj = res.openConnection();
      if (connObj instanceof JarURLConnection) {
        JarURLConnection conn = (JarURLConnection) connObj;
        Manifest manifest = conn.getManifest();
        return manifest.getMainAttributes().getValue("Implementation-Date");
      } else {
        return null;
      }
    } catch (IOException e) {
      throw new RuntimeException("Could not get build date from JAR", e);
    }
  }

  private static ResourceDataBroker dataBroker = new DefaultResourceDataBroker();

  private final List<Rule> builtinRules;
  private final List<Rule> userRules = new ArrayList<>();  // rules added via addRule() method
  private final Set<String> disabledRules = new HashSet<>();
  private final Set<CategoryId> disabledRuleCategories = new HashSet<>();
  private final Set<String> enabledRules = new HashSet<>();
  private final Set<CategoryId> enabledRuleCategories = new HashSet<>();
  private final Language language;
  private final Language motherTongue;

  private PrintStream printStream;
  private boolean listUnknownWords;
  private Set<String> unknownWords;
  private boolean cleanOverlappingMatches;

  /**
   * Constants for correct paragraph-rule handling.
   */
  public enum ParagraphHandling {
    /**
     * Handle normally - all kinds of rules run.
     */
    NORMAL,
    /**
     * Run only paragraph-level rules.
     */
    ONLYPARA,
    /**
     * Run only sentence-level rules.
     */
    ONLYNONPARA
  }

  private static final List<File> temporaryFiles = new ArrayList<>();

  /**
   * Create a JLanguageTool and setup the built-in rules for the
   * given language and false friend rules for the text language / mother tongue pair.
   *
   * @param lang the language of the text to be checked
   * @param motherTongue the user's mother tongue, used for false friend rules, or null.
   *          The mother tongue may also be used as a source language for checking bilingual texts.
   */
  public JLanguageTool(Language lang, Language motherTongue) {
    this(lang, motherTongue, null);
  }

  /**
   * Create a JLanguageTool and setup the built-in Java rules for the
   * given language.
   *
   * @param language the language of the text to be checked
   */
  public JLanguageTool(Language language) {
    this(language, null, null);
  }

  /**
   * Create a JLanguageTool and setup the built-in rules for the
   * given language and false friend rules for the text language / mother tongue pair.
   *
   * @param language the language of the text to be checked
   * @param motherTongue the user's mother tongue, used for false friend rules, or null.
   *          The mother tongue may also be used as a source language for checking bilingual texts.
   * @param cache a cache to speed up checking if the same sentences get checked more than once,
   *          e.g. when LT is running as a server and texts are re-checked due to changes
   * @since 3.7
   */
  @Experimental
  public JLanguageTool(Language language, Language motherTongue, ResultCache cache) {
    this.language = Objects.requireNonNull(language, "language cannot be null");
    this.motherTongue = motherTongue;
    ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
    builtinRules = getAllBuiltinRules(language, messages);
    this.cleanOverlappingMatches = true;
    try {
      activateDefaultPatternRules();
      activateDefaultFalseFriendRules();
    } catch (Exception e) {
      throw new RuntimeException("Could not activate rules", e);
    }
    this.cache = cache;
  }

  /**
   * The grammar checker needs resources from following
   * directories:
   * <ul>
   *   <li>{@code /resource}</li>
   *   <li>{@code /rules}</li>
   * </ul>
   * @return The currently set data broker which allows to obtain
   *         resources from the mentioned directories above. If no
   *         data broker was set, a new {@link DefaultResourceDataBroker} will
   *         be instantiated and returned.
   * @since 1.0.1
   */
  public static synchronized ResourceDataBroker getDataBroker() {
    if (JLanguageTool.dataBroker == null) {
      JLanguageTool.dataBroker = new DefaultResourceDataBroker();
    }
    return JLanguageTool.dataBroker;
  }

  /**
   * The grammar checker needs resources from following
   * directories:
   * <ul>
   *   <li>{@code /resource}</li>
   *   <li>{@code /rules}</li>
   * </ul>
   * @param broker The new resource broker to be used.
   * @since 1.0.1
   */
  public static synchronized void setDataBroker(ResourceDataBroker broker) {
    JLanguageTool.dataBroker = broker;
  }

  /**
   * Whether the {@link #check(String)} methods store unknown words. If set to
   * true (default: false), you can get the list of unknown words
   * using {@link #getUnknownWords()}.
   */
  public void setListUnknownWords(boolean listUnknownWords) {
    this.listUnknownWords = listUnknownWords;
  }

  /**
   * Whether the {@link #check(String)} methods return overlapping errors. If set to
   * true (default: true), it removes overlapping errors according to
   * the priorities established for the language.
   * @since 3.6
   */
  public void setCleanOverlappingMatches(boolean cleanOverlappingMatches) {
    this.cleanOverlappingMatches = cleanOverlappingMatches;
  }

  /**
   * Maximum errors per word rate, checking will stop with an exception if the rate is higher.
   * For example, with a rate of 0.33, the checking would stop if the user's
   * text has so many errors that more than every 3rd word causes a rule match.
   * Note that this may not apply for very short texts.
   * @since 4.0
   */
  @Experimental
  public void setMaxErrorsPerWordRate(float maxErrorsPerWordRate) {
    this.maxErrorsPerWordRate = maxErrorsPerWordRate;
  }

  /**
   * Gets the ResourceBundle (i18n strings) for the default language of the user's system.
   */
  public static ResourceBundle getMessageBundle() {
    return ResourceBundleTools.getMessageBundle();
  }

  /**
   * Gets the ResourceBundle (i18n strings) for the given user interface language.
   * @since 2.4 (public since 2.4)
   */
  public static ResourceBundle getMessageBundle(Language lang) {
    return ResourceBundleTools.getMessageBundle(lang);
  }

  private List<Rule> getAllBuiltinRules(Language language, ResourceBundle messages) {
    try {
      return language.getRelevantRules(messages);
    } catch (IOException e) {
      throw new RuntimeException("Could not get rules of language " + language, e);
    }
  }

  /**
   * Set a PrintStream that will receive verbose output. Set to
   * {@code null} (which is the default) to disable verbose output.
   */
  public void setOutput(PrintStream printStream) {
    this.printStream = printStream;
  }

  /**
   * Load pattern rules from an XML file. Use {@link #addRule(Rule)} to add these
   * rules to the checking process.
   * @param filename path to an XML file in the classpath or in the filesystem - the classpath is checked first
   * @return a List of {@link PatternRule} objects
   */
  public List<AbstractPatternRule> loadPatternRules(String filename) throws IOException {
    PatternRuleLoader ruleLoader = new PatternRuleLoader();
    try (InputStream is = this.getClass().getResourceAsStream(filename)) {
      if (is == null) {
        // happens for external rules plugged in as an XML file:
        return ruleLoader.getRules(new File(filename));
      } else {
        return ruleLoader.getRules(is, filename);
      }
    }
  }

  /**
   * Load false friend rules from an XML file. Only those pairs will be loaded
   * that match the current text language and the mother tongue specified in the
   * JLanguageTool constructor. Use {@link #addRule(Rule)} to add these rules to the
   * checking process.
* @param filename path to an XML file in the classpath or in the filesystem - the classpath is checked first * @return a List of {@link PatternRule} objects, or an empty list if mother tongue is not set */ public List loadFalseFriendRules(String filename) throws ParserConfigurationException, SAXException, IOException { if (motherTongue == null) { return Collections.emptyList(); } FalseFriendRuleLoader ruleLoader = new FalseFriendRuleLoader(); try (InputStream is = this.getClass().getResourceAsStream(filename)) { if (is == null) { return ruleLoader.getRules(new File(filename), language, motherTongue); } else { return ruleLoader.getRules(is, language, motherTongue); } } } /** * Activate rules that depend on a language model. The language model currently * consists of Lucene indexes with ngram occurrence counts. * @param indexDir directory with a '3grams' sub directory which contains a Lucene index with 3gram occurrence counts * @since 2.7 */ public void activateLanguageModelRules(File indexDir) throws IOException { LanguageModel languageModel = language.getLanguageModel(indexDir); if (languageModel != null) { ResourceBundle messages = getMessageBundle(language); List rules = language.getRelevantLanguageModelRules(messages, languageModel); userRules.addAll(rules); } } /** * Activate rules that depend on a word2vec language model. * @param indexDir directory with a subdirectories like 'en', each containing dictionary.txt and final_embeddings.txt * @since 4.0 */ public void activateWord2VecModelRules(File indexDir) throws IOException { Word2VecModel word2vecModel = language.getWord2VecModel(indexDir); if (word2vecModel != null) { ResourceBundle messages = getMessageBundle(language); List rules = language.getRelevantWord2VecModelRules(messages, word2vecModel); userRules.addAll(rules); } } /** * Loads and activates the pattern rules from * {@code org/languagetool/rules//grammar.xml}. */ private void activateDefaultPatternRules() throws IOException { List patternRules = language.getPatternRules(); List enabledRules = language.getDefaultEnabledRulesForVariant(); List disabledRules = language.getDefaultDisabledRulesForVariant(); if (!enabledRules.isEmpty() || !disabledRules.isEmpty()) { for (AbstractPatternRule patternRule : patternRules) { if (enabledRules.contains(patternRule.getId())) { patternRule.setDefaultOn(); } if (disabledRules.contains(patternRule.getId())) { patternRule.setDefaultOff(); } } } userRules.addAll(patternRules); } /** * Loads and activates the false friend rules from * rules/false-friends.xml. */ private void activateDefaultFalseFriendRules() throws ParserConfigurationException, SAXException, IOException { String falseFriendRulesFilename = JLanguageTool.getDataBroker().getRulesDir() + "/" + FALSE_FRIEND_FILE; userRules.addAll(loadFalseFriendRules(falseFriendRulesFilename)); } /** * Add a rule to be used by the next call to the check methods like {@link #check(String)}. */ public void addRule(Rule rule) { userRules.add(rule); } /** * Disable a given rule so the check methods like {@link #check(String)} won't use it. * @param ruleId the id of the rule to disable - no error will be thrown if the id does not exist * @see #enableRule(String) */ public void disableRule(String ruleId) { disabledRules.add(ruleId); enabledRules.remove(ruleId); } /** * Disable the given rules so the check methods like {@link #check(String)} won't use them. 
* @param ruleIds the ids of the rules to disable - no error will be thrown if the id does not exist * @since 2.4 */ public void disableRules(List ruleIds) { disabledRules.addAll(ruleIds); enabledRules.removeAll(ruleIds); } /** * Disable the given rule category so the check methods like {@link #check(String)} won't use it. * @param id the id of the category to disable - no error will be thrown if the id does not exist * @since 3.3 * @see #enableRuleCategory(CategoryId) */ public void disableCategory(CategoryId id) { disabledRuleCategories.add(id); enabledRuleCategories.remove(id); } /** * Returns true if a category is explicitly disabled. * * @param id the id of the category to check - no error will be thrown if the id does not exist * @return true if this category is explicitly disabled. * @since 3.5 * @see #disableCategory(org.languagetool.rules.CategoryId) */ public boolean isCategoryDisabled(CategoryId id) { return disabledRuleCategories.contains(id); } /** * Get the language that was used to configure this instance. */ public Language getLanguage() { return language; } /** * Get rule ids of the rules that have been explicitly disabled. */ public Set getDisabledRules() { return disabledRules; } /** * Enable a given rule so the check methods like {@link #check(String)} will use it. * This will not throw an exception if the given rule id doesn't exist. * @param ruleId the id of the rule to enable * @see #disableRule(String) */ public void enableRule(String ruleId) { disabledRules.remove(ruleId); enabledRules.add(ruleId); } /** * Enable all rules of the given category so the check methods like {@link #check(String)} will use it. * This will not throw an exception if the given rule id doesn't exist. * @since 3.3 * @see #disableCategory(org.languagetool.rules.CategoryId) */ public void enableRuleCategory(CategoryId id) { disabledRuleCategories.remove(id); enabledRuleCategories.add(id); } /** * Tokenizes the given text into sentences. */ public List sentenceTokenize(String text) { return language.getSentenceTokenizer().tokenize(text); } /** * The main check method. Tokenizes the text into sentences and matches these * sentences against all currently active rules. * * @param text the text to be checked * @return a List of {@link RuleMatch} objects */ public List check(String text) throws IOException { return check(text, true, ParagraphHandling.NORMAL); } /** * The main check method. Tokenizes the text into sentences and matches these * sentences against all currently active rules. * * @param text the text to be checked * @return a List of {@link RuleMatch} objects * @since 3.7 */ @Experimental public List check(String text, RuleMatchListener listener) throws IOException { return check(text, true, ParagraphHandling.NORMAL, listener); } public List check(String text, boolean tokenizeText, ParagraphHandling paraMode) throws IOException { return check(new AnnotatedTextBuilder().addText(text).build(), tokenizeText, paraMode); } /** * @since 3.7 */ @Experimental public List check(String text, boolean tokenizeText, ParagraphHandling paraMode, RuleMatchListener listener) throws IOException { return check(new AnnotatedTextBuilder().addText(text).build(), tokenizeText, paraMode, listener); } /** * The main check method. Tokenizes the text into sentences and matches these * sentences against all currently active rules, adjusting error positions so they refer * to the original text including markup. 
* @since 2.3 */ public List check(AnnotatedText text) throws IOException { return check(text, true, ParagraphHandling.NORMAL); } /** * @since 3.9 */ @Experimental public List check(AnnotatedText text, RuleMatchListener listener) throws IOException { return check(text, true, ParagraphHandling.NORMAL, listener); } /** * The main check method. Tokenizes the text into sentences and matches these * sentences against all currently active rules. * @param annotatedText The text to be checked, created with {@link AnnotatedTextBuilder}. * Call this method with the complete text to be checked. If you call it * repeatedly with smaller chunks like paragraphs or sentence, those rules that work across * paragraphs/sentences won't work (their status gets reset whenever this method is called). * @param tokenizeText If true, then the text is tokenized into sentences. * Otherwise, it is assumed it's already tokenized, i.e. it is only one sentence * @param paraMode Uses paragraph-level rules only if true. * @return a List of {@link RuleMatch} objects, describing potential errors in the text * @since 2.3 */ public List check(AnnotatedText annotatedText, boolean tokenizeText, ParagraphHandling paraMode) throws IOException { return check(annotatedText, tokenizeText, paraMode, null); } /** * The main check method. Tokenizes the text into sentences and matches these * sentences against all currently active rules. * @since 3.7 */ @Experimental public List check(AnnotatedText annotatedText, boolean tokenizeText, ParagraphHandling paraMode, RuleMatchListener listener) throws IOException { List sentences; if (tokenizeText) { sentences = sentenceTokenize(annotatedText.getPlainText()); } else { sentences = new ArrayList<>(); sentences.add(annotatedText.getPlainText()); } List allRules = getAllRules(); if (printStream != null) { printIfVerbose(allRules.size() + " rules activated for language " + language); } unknownWords = new HashSet<>(); List analyzedSentences = analyzeSentences(sentences); List ruleMatches = performCheck(analyzedSentences, sentences, allRules, paraMode, annotatedText, listener); ruleMatches = new SameRuleGroupFilter().filter(ruleMatches); // no sorting: SameRuleGroupFilter sorts rule matches already if (cleanOverlappingMatches) { ruleMatches = new CleanOverlappingFilter(language).filter(ruleMatches); } return ruleMatches; } /** * Use this method if you want to access LanguageTool's otherwise * internal analysis of the text. For actual text checking, use the {@code check...} methods instead. 
* @param text The text to be analyzed * @since 2.5 */ public List analyzeText(String text) throws IOException { List sentences = sentenceTokenize(text); return analyzeSentences(sentences); } protected List analyzeSentences(List sentences) throws IOException { List analyzedSentences = new ArrayList<>(); int j = 0; for (String sentence : sentences) { AnalyzedSentence analyzedSentence = getAnalyzedSentence(sentence); rememberUnknownWords(analyzedSentence); if (++j == sentences.size()) { AnalyzedTokenReadings[] anTokens = analyzedSentence.getTokens(); anTokens[anTokens.length - 1].setParagraphEnd(); analyzedSentence = new AnalyzedSentence(anTokens); } analyzedSentences.add(analyzedSentence); printSentenceInfo(analyzedSentence); } return analyzedSentences; } protected void printSentenceInfo(AnalyzedSentence analyzedSentence) { if (printStream != null) { printIfVerbose(analyzedSentence.toString()); printIfVerbose(analyzedSentence.getAnnotations()); } } protected List performCheck(List analyzedSentences, List sentences, List allRules, ParagraphHandling paraMode, AnnotatedText annotatedText) throws IOException { return performCheck(analyzedSentences, sentences, allRules, paraMode, annotatedText, null); } /** * @since 3.7 */ @Experimental protected List performCheck(List analyzedSentences, List sentences, List allRules, ParagraphHandling paraMode, AnnotatedText annotatedText, RuleMatchListener listener) throws IOException { Callable> matcher = new TextCheckCallable(allRules, sentences, analyzedSentences, paraMode, annotatedText, 0, 0, 1, listener); try { return matcher.call(); } catch (IOException e) { throw e; } catch (Exception e) { throw new RuntimeException(e); } } /** * This is an internal method that's public only for technical reasons, please use one * of the {@link #check(String)} methods instead. * @since 2.3 */ public List checkAnalyzedSentence(ParagraphHandling paraMode, List rules, AnalyzedSentence analyzedSentence) throws IOException { List sentenceMatches = new ArrayList<>(); for (Rule rule : rules) { if (rule instanceof TextLevelRule) { continue; } if (ignoreRule(rule)) { continue; } if (rule instanceof PatternRule && ((PatternRule)rule).canBeIgnoredFor(analyzedSentence)) { // this is a performance optimization, it should have no effect on matching logic continue; } if (paraMode == ParagraphHandling.ONLYPARA) { continue; } RuleMatch[] thisMatches = rule.match(analyzedSentence); for (RuleMatch elem : thisMatches) { sentenceMatches.add(elem); } } return new SameRuleGroupFilter().filter(sentenceMatches); } private boolean ignoreRule(Rule rule) { Category ruleCategory = rule.getCategory(); boolean isCategoryDisabled = (disabledRuleCategories.contains(ruleCategory.getId()) || rule.getCategory().isDefaultOff()) && !enabledRuleCategories.contains(ruleCategory.getId()); boolean isRuleDisabled = disabledRules.contains(rule.getId()) || (rule.isDefaultOff() && !enabledRules.contains(rule.getId())); boolean isDisabled; if (isCategoryDisabled) { isDisabled = !enabledRules.contains(rule.getId()); } else { isDisabled = isRuleDisabled; } return isDisabled; } /** * Change RuleMatch positions so they are relative to the complete text, * not just to the sentence. 
* @param charCount Count of characters in the sentences before * @param columnCount Current column number * @param lineCount Current line number * @param sentence The text being checked * @return The RuleMatch object with adjustments */ public RuleMatch adjustRuleMatchPos(RuleMatch match, int charCount, int columnCount, int lineCount, String sentence, AnnotatedText annotatedText) { int fromPos = match.getFromPos() + charCount; int toPos = match.getToPos() + charCount; if (annotatedText != null) { fromPos = annotatedText.getOriginalTextPositionFor(fromPos); toPos = annotatedText.getOriginalTextPositionFor(toPos - 1) + 1; } RuleMatch thisMatch = new RuleMatch(match.getRule(), match.getSentence(), fromPos, toPos, match.getMessage(), match.getShortMessage()); thisMatch.setSuggestedReplacements(match.getSuggestedReplacements()); thisMatch.setUrl(match.getUrl()); String sentencePartToError = sentence.substring(0, match.getFromPos()); String sentencePartToEndOfError = sentence.substring(0, match.getToPos()); int lastLineBreakPos = sentencePartToError.lastIndexOf('\n'); int column; int endColumn; if (lastLineBreakPos == -1) { column = sentencePartToError.length() + columnCount; } else { column = sentencePartToError.length() - lastLineBreakPos; } int lastLineBreakPosInError = sentencePartToEndOfError.lastIndexOf('\n'); if (lastLineBreakPosInError == -1) { endColumn = sentencePartToEndOfError.length() + columnCount; } else { endColumn = sentencePartToEndOfError.length() - lastLineBreakPosInError; } int lineBreaksToError = countLineBreaks(sentencePartToError); int lineBreaksToEndOfError = countLineBreaks(sentencePartToEndOfError); thisMatch.setLine(lineCount + lineBreaksToError); thisMatch.setEndLine(lineCount + lineBreaksToEndOfError); thisMatch.setColumn(column); thisMatch.setEndColumn(endColumn); return thisMatch; } protected void rememberUnknownWords(AnalyzedSentence analyzedText) { if (listUnknownWords) { AnalyzedTokenReadings[] atr = analyzedText.getTokensWithoutWhitespace(); for (AnalyzedTokenReadings reading : atr) { if (!reading.isTagged()) { unknownWords.add(reading.getToken()); } } } } /** * Get the alphabetically sorted list of unknown words in the latest run of one of the {@link #check(String)} methods. * @throws IllegalStateException if {@link #setListUnknownWords(boolean)} has been set to {@code false} */ public List getUnknownWords() { if (!listUnknownWords) { throw new IllegalStateException("listUnknownWords is set to false, unknown words not stored"); } List words = new ArrayList<>(unknownWords); Collections.sort(words); return words; } // non-private only for test case static int countLineBreaks(String s) { int pos = -1; int count = 0; while (true) { int nextPos = s.indexOf('\n', pos + 1); if (nextPos == -1) { break; } pos = nextPos; count++; } return count; } /** * Tokenizes the given {@code sentence} into words and analyzes it, * and then disambiguates POS tags. * @param sentence sentence to be analyzed */ public AnalyzedSentence getAnalyzedSentence(String sentence) throws IOException { SimpleInputSentence cacheKey = new SimpleInputSentence(sentence, language); AnalyzedSentence cachedSentence = cache != null ? 
cache.getIfPresent(cacheKey) : null; if (cachedSentence != null) { return cachedSentence; } else { AnalyzedSentence analyzedSentence = language.getDisambiguator().disambiguate(getRawAnalyzedSentence(sentence)); if (language.getPostDisambiguationChunker() != null) { language.getPostDisambiguationChunker().addChunkTags(Arrays.asList(analyzedSentence.getTokens())); } if (cache != null) { cache.put(cacheKey, analyzedSentence); } return analyzedSentence; } } /** * Tokenizes the given {@code sentence} into words and analyzes it. * This is the same as {@link #getAnalyzedSentence(String)} but it does not run * the disambiguator. * @param sentence sentence to be analyzed * @since 0.9.8 */ public AnalyzedSentence getRawAnalyzedSentence(String sentence) throws IOException { List tokens = language.getWordTokenizer().tokenize(sentence); Map softHyphenTokens = replaceSoftHyphens(tokens); List aTokens = language.getTagger().tag(tokens); if (language.getChunker() != null) { language.getChunker().addChunkTags(aTokens); } int numTokens = aTokens.size(); int posFix = 0; for (int i = 1; i < numTokens; i++) { aTokens.get(i).setWhitespaceBefore(aTokens.get(i - 1).isWhitespace()); aTokens.get(i).setStartPos(aTokens.get(i).getStartPos() + posFix); if (!softHyphenTokens.isEmpty()) { if (softHyphenTokens.get(i) != null) { aTokens.get(i).addReading(language.getTagger().createToken(softHyphenTokens.get(i), null)); posFix += softHyphenTokens.get(i).length() - aTokens.get(i).getToken().length(); } } } AnalyzedTokenReadings[] tokenArray = new AnalyzedTokenReadings[tokens.size() + 1]; AnalyzedToken[] startTokenArray = new AnalyzedToken[1]; int toArrayCount = 0; AnalyzedToken sentenceStartToken = new AnalyzedToken("", SENTENCE_START_TAGNAME, null); startTokenArray[0] = sentenceStartToken; tokenArray[toArrayCount++] = new AnalyzedTokenReadings(startTokenArray, 0); int startPos = 0; for (AnalyzedTokenReadings posTag : aTokens) { posTag.setStartPos(startPos); tokenArray[toArrayCount++] = posTag; startPos += posTag.getToken().length(); } // add additional tags int lastToken = toArrayCount - 1; // make SENT_END appear at last not whitespace token for (int i = 0; i < toArrayCount - 1; i++) { if (!tokenArray[lastToken - i].isWhitespace()) { lastToken -= i; break; } } tokenArray[lastToken].setSentEnd(); if (tokenArray.length == lastToken + 1 && tokenArray[lastToken].isLinebreak()) { tokenArray[lastToken].setParagraphEnd(); } return new AnalyzedSentence(tokenArray); } private Map replaceSoftHyphens(List tokens) { Pattern ignoredCharacterRegex = language.getIgnoredCharactersRegex(); Map ignoredCharsTokens = new HashMap<>(); if (ignoredCharacterRegex == null) { return ignoredCharsTokens; } for (int i = 0; i < tokens.size(); i++) { if (ignoredCharacterRegex.matcher(tokens.get(i)).find()) { ignoredCharsTokens.put(i, tokens.get(i)); tokens.set(i, ignoredCharacterRegex.matcher(tokens.get(i)).replaceAll("")); } } return ignoredCharsTokens; } /** * Get all rule categories for the current language. * * @return a map of {@link Category Categories}, keyed by their {@link CategoryId id}. * @since 3.5 */ public Map getCategories() { Map map = new HashMap<>(); for (Rule rule : getAllRules()) { map.put(rule.getCategory().getId(), rule.getCategory()); } return map; } /** * Get all rules for the current language that are built-in or that have been * added using {@link #addRule(Rule)}. Please note that XML rules that are grouped * will appear as multiple rules with the same id. 
To tell them apart, check if * they are of type {@code AbstractPatternRule}, cast them to that type and call * their {@link AbstractPatternRule#getSubId()} method. * @return a List of {@link Rule} objects */ public List getAllRules() { List rules = new ArrayList<>(); rules.addAll(builtinRules); rules.addAll(userRules); return rules; } /** * Get all active (not disabled) rules for the current language that are built-in or that * have been added using e.g. {@link #addRule(Rule)}. See {@link #getAllRules()} for hints * about rule ids. * @return a List of {@link Rule} objects */ public List getAllActiveRules() { List rules = new ArrayList<>(); List rulesActive = new ArrayList<>(); rules.addAll(builtinRules); rules.addAll(userRules); // Some rules have an internal state so they can do checks over sentence // boundaries. These need to be reset so the checks don't suddenly // work on different texts with the same data. However, it could be useful // to keep the state information if we're checking a continuous text. for (Rule rule : rules) { if (!ignoreRule(rule)) { rulesActive.add(rule); } } return rulesActive; } /** * Works like getAllActiveRules but overrides defaults by officeefaults * @return a List of {@link Rule} objects * @since 4.0 */ public List getAllActiveOfficeRules() { List rules = new ArrayList<>(); List rulesActive = new ArrayList<>(); rules.addAll(builtinRules); rules.addAll(userRules); for (Rule rule : rules) { if (!ignoreRule(rule) && !rule.isOfficeDefaultOff()) { rulesActive.add(rule); } else if (rule.isOfficeDefaultOn()) { rulesActive.add(rule); enableRule(rule.getId()); } else if (!ignoreRule(rule) && rule.isOfficeDefaultOff()) { disableRule(rule.getId()); } } return rulesActive; } /** * Get pattern rules by Id and SubId. This returns a list because rules that use {@code ...} * are internally expanded into several rules. * @return a List of {@link Rule} objects * @since 2.3 */ public List getPatternRulesByIdAndSubId(String Id, String subId) { List rules = getAllRules(); List rulesById = new ArrayList<>(); for (Rule rule : rules) { if (rule instanceof AbstractPatternRule) { if (rule.getId().equals(Id) && ((AbstractPatternRule) rule).getSubId().equals(subId)) { rulesById.add((AbstractPatternRule) rule); } } } return rulesById; } protected void printIfVerbose(String s) { if (printStream != null) { printStream.println(s); } } /** * Adds a temporary file to the internal list * (internal method, you should never need to call this as a user of LanguageTool) * @param file the file to be added. */ public static void addTemporaryFile(File file) { temporaryFiles.add(file); } /** * Clean up all temporary files, if there are any. 
*/ public static void removeTemporaryFiles() { for (File file : temporaryFiles) { file.delete(); } } class TextCheckCallable implements Callable> { private final List rules; private final ParagraphHandling paraMode; private final AnnotatedText annotatedText; private final List sentences; private final List analyzedSentences; private final RuleMatchListener listener; private int charCount; private int lineCount; private int columnCount; TextCheckCallable(List rules, List sentences, List analyzedSentences, ParagraphHandling paraMode, AnnotatedText annotatedText, int charCount, int lineCount, int columnCount, RuleMatchListener listener) { this.rules = rules; if (sentences.size() != analyzedSentences.size()) { throw new IllegalArgumentException("sentences and analyzedSentences do not have the same length : " + sentences.size() + " != " + analyzedSentences.size()); } this.sentences = Objects.requireNonNull(sentences); this.analyzedSentences = Objects.requireNonNull(analyzedSentences); this.paraMode = Objects.requireNonNull(paraMode); this.annotatedText = Objects.requireNonNull(annotatedText); this.charCount = charCount; this.lineCount = lineCount; this.columnCount = columnCount; this.listener = listener; } @Override public List call() throws Exception { List ruleMatches = new ArrayList<>(); ruleMatches.addAll(getTextLevelRuleMatches()); ruleMatches.addAll(getOtherRuleMatches()); return ruleMatches; } private List getTextLevelRuleMatches() throws IOException { List ruleMatches = new ArrayList<>(); for (Rule rule : rules) { if (rule instanceof TextLevelRule && !ignoreRule(rule) && paraMode != ParagraphHandling.ONLYNONPARA) { RuleMatch[] matches = ((TextLevelRule) rule).match(analyzedSentences, annotatedText); List adaptedMatches = new ArrayList<>(); for (RuleMatch match : matches) { LineColumnRange range = getLineColumnRange(match); int newFromPos = annotatedText.getOriginalTextPositionFor(match.getFromPos()); int newToPos = annotatedText.getOriginalTextPositionFor(match.getToPos() - 1) + 1; RuleMatch newMatch = new RuleMatch(match.getRule(), match.getSentence(), newFromPos, newToPos, match.getMessage(), match.getShortMessage()); newMatch.setUrl(match.getUrl()); newMatch.setLine(range.from.line); newMatch.setEndLine(range.to.line); if (match.getLine() == 0) { newMatch.setColumn(range.from.column + 1); } else { newMatch.setColumn(range.from.column); } newMatch.setEndColumn(range.to.column); newMatch.setSuggestedReplacements(match.getSuggestedReplacements()); adaptedMatches.add(newMatch); } ruleMatches.addAll(adaptedMatches); if (listener != null) { for (RuleMatch adaptedMatch : adaptedMatches) { listener.matchFound(adaptedMatch); } } } } return ruleMatches; } private List getOtherRuleMatches() { List ruleMatches = new ArrayList<>(); int i = 0; int wordCounter = 0; for (AnalyzedSentence analyzedSentence : analyzedSentences) { String sentence = sentences.get(i++); wordCounter += analyzedSentence.getTokensWithoutWhitespace().length; try { List sentenceMatches = null; InputSentence cacheKey = null; if (cache != null) { cacheKey = new InputSentence(analyzedSentence.getText(), language, motherTongue, disabledRules, disabledRuleCategories, enabledRules, enabledRuleCategories); sentenceMatches = cache.getIfPresent(cacheKey); } if (sentenceMatches == null) { sentenceMatches = checkAnalyzedSentence(paraMode, rules, analyzedSentence); } if (cache != null) { cache.put(cacheKey, sentenceMatches); } List adaptedMatches = new ArrayList<>(); for (RuleMatch elem : sentenceMatches) { RuleMatch thisMatch = 
adjustRuleMatchPos(elem, charCount, columnCount, lineCount, sentence, annotatedText); adaptedMatches.add(thisMatch); if (listener != null) { listener.matchFound(thisMatch); } } ruleMatches.addAll(adaptedMatches); float errorsPerWord = ruleMatches.size() / (float)wordCounter; //System.out.println("errorPerWord " + errorsPerWord + " (matches: " + ruleMatches.size() + " / " + wordCounter + ")"); if (maxErrorsPerWordRate > 0 && errorsPerWord > maxErrorsPerWordRate && wordCounter > 25) { throw new ErrorRateTooHighException("Text checking was stopped due to too many errors (more than " + String.format("%.0f", maxErrorsPerWordRate*100) + "% of words seem to have an error). Are you sure you have set the correct text language? Language set: " + language.getName()); } charCount += sentence.length(); lineCount += countLineBreaks(sentence); // calculate matching column: int lineBreakPos = sentence.lastIndexOf('\n'); if (lineBreakPos == -1) { columnCount += sentence.length(); } else { if (lineBreakPos == 0) { columnCount = sentence.length(); if (!language.getSentenceTokenizer().singleLineBreaksMarksPara()) { columnCount--; } } else { columnCount = sentence.length() - lineBreakPos; } } } catch (ErrorRateTooHighException e) { throw e; } catch (Exception e) { throw new RuntimeException("Could not check sentence (language: " + language + "): '" + StringUtils.abbreviate(analyzedSentence.toTextString(), 200) + "'", e); } } return ruleMatches; } private LineColumnRange getLineColumnRange(RuleMatch match) { LineColumnPosition fromPos = new LineColumnPosition(-1, -1); LineColumnPosition toPos = new LineColumnPosition(-1, -1); LineColumnPosition pos = new LineColumnPosition(0, 0); int charCount = 0; for (AnalyzedSentence analyzedSentence : analyzedSentences) { for (AnalyzedTokenReadings readings : analyzedSentence.getTokens()) { String token = readings.getToken(); if ("\n".equals(token)) { pos.line++; pos.column = 0; } pos.column += token.length(); charCount += token.length(); if (charCount == match.getFromPos()) { fromPos = new LineColumnPosition(pos.line, pos.column); } if (charCount == match.getToPos()) { toPos = new LineColumnPosition(pos.line, pos.column); } } } return new LineColumnRange(fromPos, toPos); } private class LineColumnPosition { int line; int column; private LineColumnPosition(int line, int column) { this.line = line; this.column = column; } } private class LineColumnRange { LineColumnPosition from; LineColumnPosition to; private LineColumnRange(LineColumnPosition from, LineColumnPosition to) { this.from = from; this.to = to; } } } }
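The check(AnnotatedText, ...) variants above adjust match positions so that they refer to the original text including markup. A sketch of how that can be driven, assuming the addText/addMarkup builder methods of the imported AnnotatedTextBuilder behave as their names suggest (the class name MarkupCheckExample and the sample strings are illustrative only):

import org.languagetool.JLanguageTool;
import org.languagetool.language.AmericanEnglish;
import org.languagetool.markup.AnnotatedText;
import org.languagetool.markup.AnnotatedTextBuilder;
import org.languagetool.rules.RuleMatch;

public class MarkupCheckExample {
  public static void main(String[] args) throws Exception {
    JLanguageTool lt = new JLanguageTool(new AmericanEnglish());
    // Build an annotated text so that match positions point into the original
    // string with markup, not into the stripped plain text.
    AnnotatedText text = new AnnotatedTextBuilder()
        .addText("Here is ")
        .addMarkup("<b>")
        .addText("an mistake")
        .addMarkup("</b>")
        .addText(".")
        .build();
    for (RuleMatch match : lt.check(text)) {
      System.out.println(match.getFromPos() + "-" + match.getToPos() + ": " + match.getMessage());
    }
  }
}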



