org.languagetool.JLanguageTool

LanguageTool is Open Source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it detects some grammar problems.
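Before the source listing, here is a minimal usage sketch built only from the public API visible below (the JLanguageTool(Language) constructor and check(String)). It assumes an English language module providing org.languagetool.language.AmericanEnglish is on the classpath; the class name CheckExample and the sample text are illustrative only.

import java.util.List;

import org.languagetool.JLanguageTool;
import org.languagetool.language.AmericanEnglish;
import org.languagetool.rules.RuleMatch;

public class CheckExample {
  public static void main(String[] args) throws Exception {
    // Create the Language once; create one JLanguageTool per thread
    // (see the thread-safety note in the class javadoc below).
    JLanguageTool lt = new JLanguageTool(new AmericanEnglish());
    List<RuleMatch> matches = lt.check("This are an example with a error.");
    for (RuleMatch match : matches) {
      System.out.println("Potential error at characters "
          + match.getFromPos() + "-" + match.getToPos() + ": " + match.getMessage());
      System.out.println("Suggestion(s): " + match.getSuggestedReplacements());
    }
  }
}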

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool;

import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
import org.languagetool.databroker.DefaultResourceDataBroker;
import org.languagetool.databroker.ResourceDataBroker;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.markup.AnnotatedText;
import org.languagetool.markup.AnnotatedTextBuilder;
import org.languagetool.rules.*;
import org.languagetool.rules.neuralnetwork.Word2VecModel;
import org.languagetool.rules.patterns.AbstractPatternRule;
import org.languagetool.rules.patterns.FalseFriendRuleLoader;
import org.languagetool.rules.patterns.PatternRule;
import org.languagetool.rules.patterns.PatternRuleLoader;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.net.JarURLConnection;
import java.net.URL;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.jar.Manifest;
import java.util.regex.Pattern;

/**
 * The main class used for checking text against different rules:
 * <ul>
 *   <li>built-in Java rules (for English: a vs. an, whitespace after commas, ...)</li>
 *   <li>built-in pattern rules loaded from external XML files (usually called {@code grammar.xml})</li>
 *   <li>your own implementation of the abstract {@link Rule} classes added with {@link #addRule(Rule)}</li>
 * </ul>
 *
 * <p>You will probably want to use the sub class {@link MultiThreadedJLanguageTool} for best performance.
 *
 * <p>Thread-safety: this class is not thread safe. Create one instance per thread,
 * but create the language only once (e.g. {@code new AmericanEnglish()}) and use it for all
 * instances of JLanguageTool.
 *
 * @see MultiThreadedJLanguageTool
 */
public class JLanguageTool {

  /** LanguageTool version as a string like {@code 2.3} or {@code 2.4-SNAPSHOT}. */
  public static final String VERSION = "4.1";
  /** LanguageTool build date and time like {@code 2013-10-17 16:10} or {@code null} if not run from JAR. */
  @Nullable
  public static final String BUILD_DATE = getBuildDate();

  /** The name of the file with error patterns. */
  public static final String PATTERN_FILE = "grammar.xml";
  /** The name of the file with false friend information. */
  public static final String FALSE_FRIEND_FILE = "false-friends.xml";
  /** The internal tag used to mark the beginning of a sentence. */
  public static final String SENTENCE_START_TAGNAME = "SENT_START";
  /** The internal tag used to mark the end of a sentence. */
  public static final String SENTENCE_END_TAGNAME = "SENT_END";
  /** The internal tag used to mark the end of a paragraph. */
  public static final String PARAGRAPH_END_TAGNAME = "PARA_END";
  /** Name of the message bundle for translations. */
  public static final String MESSAGE_BUNDLE = "org.languagetool.MessagesBundle";

  private final ResultCache cache;

  private float maxErrorsPerWordRate;

  /**
   * Returns the build date or {@code null} if not run from JAR.
   */
  @Nullable
  private static String getBuildDate() {
    try {
      URL res = JLanguageTool.class.getResource(JLanguageTool.class.getSimpleName() + ".class");
      if (res == null) {
        // this will happen on Android, see http://stackoverflow.com/questions/15371274/
        return null;
      }
      Object connObj = res.openConnection();
      if (connObj instanceof JarURLConnection) {
        JarURLConnection conn = (JarURLConnection) connObj;
        Manifest manifest = conn.getManifest();
        return manifest.getMainAttributes().getValue("Implementation-Date");
      } else {
        return null;
      }
    } catch (IOException e) {
      throw new RuntimeException("Could not get build date from JAR", e);
    }
  }

  private static ResourceDataBroker dataBroker = new DefaultResourceDataBroker();

  private final List<Rule> builtinRules;
  private final List<Rule> userRules = new ArrayList<>();  // rules added via addRule() method
  private final Set<String> disabledRules = new HashSet<>();
  private final Set<CategoryId> disabledRuleCategories = new HashSet<>();
  private final Set<String> enabledRules = new HashSet<>();
  private final Set<CategoryId> enabledRuleCategories = new HashSet<>();
  private final Language language;
  private final Language motherTongue;

  private PrintStream printStream;
  private boolean listUnknownWords;
  private Set<String> unknownWords;
  private boolean cleanOverlappingMatches;

  /**
   * Constants for correct paragraph-rule handling.
   */
  public enum ParagraphHandling {
    /**
     * Handle normally - all kinds of rules run.
     */
    NORMAL,
    /**
     * Run only paragraph-level rules.
     */
    ONLYPARA,
    /**
     * Run only sentence-level rules.
     */
    ONLYNONPARA
  }

  private static final List<File> temporaryFiles = new ArrayList<>();

  /**
   * Create a JLanguageTool and setup the built-in rules for the
   * given language and false friend rules for the text language / mother tongue pair.
   *
   * @param lang the language of the text to be checked
   * @param motherTongue the user's mother tongue, used for false friend rules, or null.
   *          The mother tongue may also be used as a source language for checking bilingual texts.
   */
  public JLanguageTool(Language lang, Language motherTongue) {
    this(lang, motherTongue, null);
  }

  /**
   * Create a JLanguageTool and setup the built-in Java rules for the
   * given language.
   *
   * @param language the language of the text to be checked
   */
  public JLanguageTool(Language language) {
    this(language, null, null);
  }

  /**
   * Create a JLanguageTool and setup the built-in rules for the
   * given language and false friend rules for the text language / mother tongue pair.
   *
   * @param language the language of the text to be checked
   * @param motherTongue the user's mother tongue, used for false friend rules, or null.
   *          The mother tongue may also be used as a source language for checking bilingual texts.
   * @param cache a cache to speed up checking if the same sentences get checked more than once,
   *          e.g. when LT is running as a server and texts are re-checked due to changes
   * @since 3.7
   */
  @Experimental
  public JLanguageTool(Language language, Language motherTongue, ResultCache cache) {
    this.language = Objects.requireNonNull(language, "language cannot be null");
    this.motherTongue = motherTongue;
    ResourceBundle messages = ResourceBundleTools.getMessageBundle(language);
    builtinRules = getAllBuiltinRules(language, messages);
    this.cleanOverlappingMatches = true;
    try {
      activateDefaultPatternRules();
      activateDefaultFalseFriendRules();
    } catch (Exception e) {
      throw new RuntimeException("Could not activate rules", e);
    }
    this.cache = cache;
  }

  /**
   * The grammar checker needs resources from following
   * directories:
   * <ul>
   *   <li>{@code /resource}</li>
   *   <li>{@code /rules}</li>
   * </ul>
   * @return The currently set data broker which allows to obtain
   *         resources from the mentioned directories above. If no
   *         data broker was set, a new {@link DefaultResourceDataBroker} will
   *         be instantiated and returned.
   * @since 1.0.1
   */
  public static synchronized ResourceDataBroker getDataBroker() {
    if (JLanguageTool.dataBroker == null) {
      JLanguageTool.dataBroker = new DefaultResourceDataBroker();
    }
    return JLanguageTool.dataBroker;
  }

  /**
   * The grammar checker needs resources from following
   * directories:
   * <ul>
   *   <li>{@code /resource}</li>
   *   <li>{@code /rules}</li>
   * </ul>
   * @param broker The new resource broker to be used.
   * @since 1.0.1
   */
  public static synchronized void setDataBroker(ResourceDataBroker broker) {
    JLanguageTool.dataBroker = broker;
  }

  /**
   * Whether the {@link #check(String)} methods store unknown words. If set to
   * true (default: false), you can get the list of unknown words
   * using {@link #getUnknownWords()}.
   */
  public void setListUnknownWords(boolean listUnknownWords) {
    this.listUnknownWords = listUnknownWords;
  }

  /**
   * Whether the {@link #check(String)} methods return overlapping errors. If set to
   * true (default: true), it removes overlapping errors according to
   * the priorities established for the language.
   * @since 3.6
   */
  public void setCleanOverlappingMatches(boolean cleanOverlappingMatches) {
    this.cleanOverlappingMatches = cleanOverlappingMatches;
  }

  /**
   * Maximum errors per word rate, checking will stop with an exception if the rate is higher.
   * For example, with a rate of 0.33, the checking would stop if the user's
   * text has so many errors that more than every 3rd word causes a rule match.
   * Note that this may not apply for very short texts.
   * @since 4.0
   */
  @Experimental
  public void setMaxErrorsPerWordRate(float maxErrorsPerWordRate) {
    this.maxErrorsPerWordRate = maxErrorsPerWordRate;
  }

  /**
   * Gets the ResourceBundle (i18n strings) for the default language of the user's system.
   */
  public static ResourceBundle getMessageBundle() {
    return ResourceBundleTools.getMessageBundle();
  }

  /**
   * Gets the ResourceBundle (i18n strings) for the given user interface language.
   * @since 2.4 (public since 2.4)
   */
  public static ResourceBundle getMessageBundle(Language lang) {
    return ResourceBundleTools.getMessageBundle(lang);
  }

  private List<Rule> getAllBuiltinRules(Language language, ResourceBundle messages) {
    try {
      return language.getRelevantRules(messages);
    } catch (IOException e) {
      throw new RuntimeException("Could not get rules of language " + language, e);
    }
  }

  /**
   * Set a PrintStream that will receive verbose output. Set to
   * {@code null} (which is the default) to disable verbose output.
   */
  public void setOutput(PrintStream printStream) {
    this.printStream = printStream;
  }

  /**
   * Load pattern rules from an XML file. Use {@link #addRule(Rule)} to add these
   * rules to the checking process.
   * @param filename path to an XML file in the classpath or in the filesystem - the classpath is checked first
   * @return a List of {@link PatternRule} objects
   */
  public List<AbstractPatternRule> loadPatternRules(String filename) throws IOException {
    PatternRuleLoader ruleLoader = new PatternRuleLoader();
    try (InputStream is = this.getClass().getResourceAsStream(filename)) {
      if (is == null) {
        // happens for external rules plugged in as an XML file:
        return ruleLoader.getRules(new File(filename));
      } else {
        return ruleLoader.getRules(is, filename);
      }
    }
  }

  /**
   * Load false friend rules from an XML file. Only those pairs will be loaded
   * that match the current text language and the mother tongue specified in the
   * JLanguageTool constructor. Use {@link #addRule(Rule)} to add these rules to the
   * checking process.
* @param filename path to an XML file in the classpath or in the filesystem - the classpath is checked first * @return a List of {@link PatternRule} objects, or an empty list if mother tongue is not set */ public List loadFalseFriendRules(String filename) throws ParserConfigurationException, SAXException, IOException { if (motherTongue == null) { return Collections.emptyList(); } FalseFriendRuleLoader ruleLoader = new FalseFriendRuleLoader(); try (InputStream is = this.getClass().getResourceAsStream(filename)) { if (is == null) { return ruleLoader.getRules(new File(filename), language, motherTongue); } else { return ruleLoader.getRules(is, language, motherTongue); } } } /** * Activate rules that depend on a language model. The language model currently * consists of Lucene indexes with ngram occurrence counts. * @param indexDir directory with a '3grams' sub directory which contains a Lucene index with 3gram occurrence counts * @since 2.7 */ public void activateLanguageModelRules(File indexDir) throws IOException { LanguageModel languageModel = language.getLanguageModel(indexDir); if (languageModel != null) { ResourceBundle messages = getMessageBundle(language); List rules = language.getRelevantLanguageModelRules(messages, languageModel); userRules.addAll(rules); } } /** * Activate rules that depend on a word2vec language model. * @param indexDir directory with a subdirectories like 'en', each containing dictionary.txt and final_embeddings.txt * @since 4.0 */ public void activateWord2VecModelRules(File indexDir) throws IOException { Word2VecModel word2vecModel = language.getWord2VecModel(indexDir); if (word2vecModel != null) { ResourceBundle messages = getMessageBundle(language); List rules = language.getRelevantWord2VecModelRules(messages, word2vecModel); userRules.addAll(rules); } } /** * Loads and activates the pattern rules from * {@code org/languagetool/rules//grammar.xml}. */ private void activateDefaultPatternRules() throws IOException { List patternRules = language.getPatternRules(); List enabledRules = language.getDefaultEnabledRulesForVariant(); List disabledRules = language.getDefaultDisabledRulesForVariant(); if (!enabledRules.isEmpty() || !disabledRules.isEmpty()) { for (AbstractPatternRule patternRule : patternRules) { if (enabledRules.contains(patternRule.getId())) { patternRule.setDefaultOn(); } if (disabledRules.contains(patternRule.getId())) { patternRule.setDefaultOff(); } } } userRules.addAll(patternRules); } /** * Loads and activates the false friend rules from * rules/false-friends.xml. */ private void activateDefaultFalseFriendRules() throws ParserConfigurationException, SAXException, IOException { String falseFriendRulesFilename = JLanguageTool.getDataBroker().getRulesDir() + "/" + FALSE_FRIEND_FILE; userRules.addAll(loadFalseFriendRules(falseFriendRulesFilename)); } /** * Add a rule to be used by the next call to the check methods like {@link #check(String)}. */ public void addRule(Rule rule) { userRules.add(rule); } /** * Disable a given rule so the check methods like {@link #check(String)} won't use it. * @param ruleId the id of the rule to disable - no error will be thrown if the id does not exist * @see #enableRule(String) */ public void disableRule(String ruleId) { disabledRules.add(ruleId); enabledRules.remove(ruleId); } /** * Disable the given rules so the check methods like {@link #check(String)} won't use them. 
* @param ruleIds the ids of the rules to disable - no error will be thrown if the id does not exist * @since 2.4 */ public void disableRules(List ruleIds) { disabledRules.addAll(ruleIds); enabledRules.removeAll(ruleIds); } /** * Disable the given rule category so the check methods like {@link #check(String)} won't use it. * @param id the id of the category to disable - no error will be thrown if the id does not exist * @since 3.3 * @see #enableRuleCategory(CategoryId) */ public void disableCategory(CategoryId id) { disabledRuleCategories.add(id); enabledRuleCategories.remove(id); } /** * Returns true if a category is explicitly disabled. * * @param id the id of the category to check - no error will be thrown if the id does not exist * @return true if this category is explicitly disabled. * @since 3.5 * @see #disableCategory(org.languagetool.rules.CategoryId) */ public boolean isCategoryDisabled(CategoryId id) { return disabledRuleCategories.contains(id); } /** * Get the language that was used to configure this instance. */ public Language getLanguage() { return language; } /** * Get rule ids of the rules that have been explicitly disabled. */ public Set getDisabledRules() { return disabledRules; } /** * Enable a given rule so the check methods like {@link #check(String)} will use it. * This will not throw an exception if the given rule id doesn't exist. * @param ruleId the id of the rule to enable * @see #disableRule(String) */ public void enableRule(String ruleId) { disabledRules.remove(ruleId); enabledRules.add(ruleId); } /** * Enable all rules of the given category so the check methods like {@link #check(String)} will use it. * This will not throw an exception if the given rule id doesn't exist. * @since 3.3 * @see #disableCategory(org.languagetool.rules.CategoryId) */ public void enableRuleCategory(CategoryId id) { disabledRuleCategories.remove(id); enabledRuleCategories.add(id); } /** * Tokenizes the given text into sentences. */ public List sentenceTokenize(String text) { return language.getSentenceTokenizer().tokenize(text); } /** * The main check method. Tokenizes the text into sentences and matches these * sentences against all currently active rules. * * @param text the text to be checked * @return a List of {@link RuleMatch} objects */ public List check(String text) throws IOException { return check(text, true, ParagraphHandling.NORMAL); } /** * The main check method. Tokenizes the text into sentences and matches these * sentences against all currently active rules. * * @param text the text to be checked * @return a List of {@link RuleMatch} objects * @since 3.7 */ @Experimental public List check(String text, RuleMatchListener listener) throws IOException { return check(text, true, ParagraphHandling.NORMAL, listener); } public List check(String text, boolean tokenizeText, ParagraphHandling paraMode) throws IOException { return check(new AnnotatedTextBuilder().addText(text).build(), tokenizeText, paraMode); } /** * @since 3.7 */ @Experimental public List check(String text, boolean tokenizeText, ParagraphHandling paraMode, RuleMatchListener listener) throws IOException { return check(new AnnotatedTextBuilder().addText(text).build(), tokenizeText, paraMode, listener); } /** * The main check method. Tokenizes the text into sentences and matches these * sentences against all currently active rules, adjusting error positions so they refer * to the original text including markup. 
* @since 2.3 */ public List check(AnnotatedText text) throws IOException { return check(text, true, ParagraphHandling.NORMAL); } /** * @since 3.9 */ @Experimental public List check(AnnotatedText text, RuleMatchListener listener) throws IOException { return check(text, true, ParagraphHandling.NORMAL, listener); } /** * The main check method. Tokenizes the text into sentences and matches these * sentences against all currently active rules. * @param annotatedText The text to be checked, created with {@link AnnotatedTextBuilder}. * Call this method with the complete text to be checked. If you call it * repeatedly with smaller chunks like paragraphs or sentence, those rules that work across * paragraphs/sentences won't work (their status gets reset whenever this method is called). * @param tokenizeText If true, then the text is tokenized into sentences. * Otherwise, it is assumed it's already tokenized, i.e. it is only one sentence * @param paraMode Uses paragraph-level rules only if true. * @return a List of {@link RuleMatch} objects, describing potential errors in the text * @since 2.3 */ public List check(AnnotatedText annotatedText, boolean tokenizeText, ParagraphHandling paraMode) throws IOException { return check(annotatedText, tokenizeText, paraMode, null); } /** * The main check method. Tokenizes the text into sentences and matches these * sentences against all currently active rules. * @since 3.7 */ @Experimental public List check(AnnotatedText annotatedText, boolean tokenizeText, ParagraphHandling paraMode, RuleMatchListener listener) throws IOException { List sentences; if (tokenizeText) { sentences = sentenceTokenize(annotatedText.getPlainText()); } else { sentences = new ArrayList<>(); sentences.add(annotatedText.getPlainText()); } List allRules = getAllRules(); if (printStream != null) { printIfVerbose(allRules.size() + " rules activated for language " + language); } unknownWords = new HashSet<>(); List analyzedSentences = analyzeSentences(sentences); List ruleMatches = performCheck(analyzedSentences, sentences, allRules, paraMode, annotatedText, listener); ruleMatches = new SameRuleGroupFilter().filter(ruleMatches); // no sorting: SameRuleGroupFilter sorts rule matches already if (cleanOverlappingMatches) { ruleMatches = new CleanOverlappingFilter(language).filter(ruleMatches); } return ruleMatches; } /** * Use this method if you want to access LanguageTool's otherwise * internal analysis of the text. For actual text checking, use the {@code check...} methods instead. 
* @param text The text to be analyzed * @since 2.5 */ public List analyzeText(String text) throws IOException { List sentences = sentenceTokenize(text); return analyzeSentences(sentences); } protected List analyzeSentences(List sentences) throws IOException { List analyzedSentences = new ArrayList<>(); int j = 0; for (String sentence : sentences) { AnalyzedSentence analyzedSentence = getAnalyzedSentence(sentence); rememberUnknownWords(analyzedSentence); if (++j == sentences.size()) { AnalyzedTokenReadings[] anTokens = analyzedSentence.getTokens(); anTokens[anTokens.length - 1].setParagraphEnd(); analyzedSentence = new AnalyzedSentence(anTokens); } analyzedSentences.add(analyzedSentence); printSentenceInfo(analyzedSentence); } return analyzedSentences; } protected void printSentenceInfo(AnalyzedSentence analyzedSentence) { if (printStream != null) { printIfVerbose(analyzedSentence.toString()); printIfVerbose(analyzedSentence.getAnnotations()); } } protected List performCheck(List analyzedSentences, List sentences, List allRules, ParagraphHandling paraMode, AnnotatedText annotatedText) throws IOException { return performCheck(analyzedSentences, sentences, allRules, paraMode, annotatedText, null); } /** * @since 3.7 */ @Experimental protected List performCheck(List analyzedSentences, List sentences, List allRules, ParagraphHandling paraMode, AnnotatedText annotatedText, RuleMatchListener listener) throws IOException { Callable> matcher = new TextCheckCallable(allRules, sentences, analyzedSentences, paraMode, annotatedText, 0, 0, 1, listener); try { return matcher.call(); } catch (IOException e) { throw e; } catch (Exception e) { throw new RuntimeException(e); } } /** * This is an internal method that's public only for technical reasons, please use one * of the {@link #check(String)} methods instead. * @since 2.3 */ public List checkAnalyzedSentence(ParagraphHandling paraMode, List rules, AnalyzedSentence analyzedSentence) throws IOException { List sentenceMatches = new ArrayList<>(); for (Rule rule : rules) { if (rule instanceof TextLevelRule) { continue; } if (ignoreRule(rule)) { continue; } if (rule instanceof PatternRule && ((PatternRule)rule).canBeIgnoredFor(analyzedSentence)) { // this is a performance optimization, it should have no effect on matching logic continue; } if (paraMode == ParagraphHandling.ONLYPARA) { continue; } RuleMatch[] thisMatches = rule.match(analyzedSentence); for (RuleMatch elem : thisMatches) { sentenceMatches.add(elem); } } return new SameRuleGroupFilter().filter(sentenceMatches); } private boolean ignoreRule(Rule rule) { Category ruleCategory = rule.getCategory(); boolean isCategoryDisabled = (disabledRuleCategories.contains(ruleCategory.getId()) || rule.getCategory().isDefaultOff()) && !enabledRuleCategories.contains(ruleCategory.getId()); boolean isRuleDisabled = disabledRules.contains(rule.getId()) || (rule.isDefaultOff() && !enabledRules.contains(rule.getId())); boolean isDisabled; if (isCategoryDisabled) { isDisabled = !enabledRules.contains(rule.getId()); } else { isDisabled = isRuleDisabled; } return isDisabled; } /** * Change RuleMatch positions so they are relative to the complete text, * not just to the sentence. 
* @param charCount Count of characters in the sentences before * @param columnCount Current column number * @param lineCount Current line number * @param sentence The text being checked * @return The RuleMatch object with adjustments */ public RuleMatch adjustRuleMatchPos(RuleMatch match, int charCount, int columnCount, int lineCount, String sentence, AnnotatedText annotatedText) { int fromPos = match.getFromPos() + charCount; int toPos = match.getToPos() + charCount; if (annotatedText != null) { fromPos = annotatedText.getOriginalTextPositionFor(fromPos); toPos = annotatedText.getOriginalTextPositionFor(toPos - 1) + 1; } RuleMatch thisMatch = new RuleMatch(match.getRule(), match.getSentence(), fromPos, toPos, match.getMessage(), match.getShortMessage()); thisMatch.setSuggestedReplacements(match.getSuggestedReplacements()); thisMatch.setUrl(match.getUrl()); String sentencePartToError = sentence.substring(0, match.getFromPos()); String sentencePartToEndOfError = sentence.substring(0, match.getToPos()); int lastLineBreakPos = sentencePartToError.lastIndexOf('\n'); int column; int endColumn; if (lastLineBreakPos == -1) { column = sentencePartToError.length() + columnCount; } else { column = sentencePartToError.length() - lastLineBreakPos; } int lastLineBreakPosInError = sentencePartToEndOfError.lastIndexOf('\n'); if (lastLineBreakPosInError == -1) { endColumn = sentencePartToEndOfError.length() + columnCount; } else { endColumn = sentencePartToEndOfError.length() - lastLineBreakPosInError; } int lineBreaksToError = countLineBreaks(sentencePartToError); int lineBreaksToEndOfError = countLineBreaks(sentencePartToEndOfError); thisMatch.setLine(lineCount + lineBreaksToError); thisMatch.setEndLine(lineCount + lineBreaksToEndOfError); thisMatch.setColumn(column); thisMatch.setEndColumn(endColumn); return thisMatch; } protected void rememberUnknownWords(AnalyzedSentence analyzedText) { if (listUnknownWords) { AnalyzedTokenReadings[] atr = analyzedText.getTokensWithoutWhitespace(); for (AnalyzedTokenReadings reading : atr) { if (!reading.isTagged()) { unknownWords.add(reading.getToken()); } } } } /** * Get the alphabetically sorted list of unknown words in the latest run of one of the {@link #check(String)} methods. * @throws IllegalStateException if {@link #setListUnknownWords(boolean)} has been set to {@code false} */ public List getUnknownWords() { if (!listUnknownWords) { throw new IllegalStateException("listUnknownWords is set to false, unknown words not stored"); } List words = new ArrayList<>(unknownWords); Collections.sort(words); return words; } // non-private only for test case static int countLineBreaks(String s) { int pos = -1; int count = 0; while (true) { int nextPos = s.indexOf('\n', pos + 1); if (nextPos == -1) { break; } pos = nextPos; count++; } return count; } /** * Tokenizes the given {@code sentence} into words and analyzes it, * and then disambiguates POS tags. * @param sentence sentence to be analyzed */ public AnalyzedSentence getAnalyzedSentence(String sentence) throws IOException { SimpleInputSentence cacheKey = new SimpleInputSentence(sentence, language); AnalyzedSentence cachedSentence = cache != null ? 
cache.getIfPresent(cacheKey) : null; if (cachedSentence != null) { return cachedSentence; } else { AnalyzedSentence analyzedSentence = language.getDisambiguator().disambiguate(getRawAnalyzedSentence(sentence)); if (language.getPostDisambiguationChunker() != null) { language.getPostDisambiguationChunker().addChunkTags(Arrays.asList(analyzedSentence.getTokens())); } if (cache != null) { cache.put(cacheKey, analyzedSentence); } return analyzedSentence; } } /** * Tokenizes the given {@code sentence} into words and analyzes it. * This is the same as {@link #getAnalyzedSentence(String)} but it does not run * the disambiguator. * @param sentence sentence to be analyzed * @since 0.9.8 */ public AnalyzedSentence getRawAnalyzedSentence(String sentence) throws IOException { List tokens = language.getWordTokenizer().tokenize(sentence); Map softHyphenTokens = replaceSoftHyphens(tokens); List aTokens = language.getTagger().tag(tokens); if (language.getChunker() != null) { language.getChunker().addChunkTags(aTokens); } int numTokens = aTokens.size(); int posFix = 0; for (int i = 1; i < numTokens; i++) { aTokens.get(i).setWhitespaceBefore(aTokens.get(i - 1).isWhitespace()); aTokens.get(i).setStartPos(aTokens.get(i).getStartPos() + posFix); if (!softHyphenTokens.isEmpty()) { if (softHyphenTokens.get(i) != null) { aTokens.get(i).addReading(language.getTagger().createToken(softHyphenTokens.get(i), null)); posFix += softHyphenTokens.get(i).length() - aTokens.get(i).getToken().length(); } } } AnalyzedTokenReadings[] tokenArray = new AnalyzedTokenReadings[tokens.size() + 1]; AnalyzedToken[] startTokenArray = new AnalyzedToken[1]; int toArrayCount = 0; AnalyzedToken sentenceStartToken = new AnalyzedToken("", SENTENCE_START_TAGNAME, null); startTokenArray[0] = sentenceStartToken; tokenArray[toArrayCount++] = new AnalyzedTokenReadings(startTokenArray, 0); int startPos = 0; for (AnalyzedTokenReadings posTag : aTokens) { posTag.setStartPos(startPos); tokenArray[toArrayCount++] = posTag; startPos += posTag.getToken().length(); } // add additional tags int lastToken = toArrayCount - 1; // make SENT_END appear at last not whitespace token for (int i = 0; i < toArrayCount - 1; i++) { if (!tokenArray[lastToken - i].isWhitespace()) { lastToken -= i; break; } } tokenArray[lastToken].setSentEnd(); if (tokenArray.length == lastToken + 1 && tokenArray[lastToken].isLinebreak()) { tokenArray[lastToken].setParagraphEnd(); } return new AnalyzedSentence(tokenArray); } private Map replaceSoftHyphens(List tokens) { Pattern ignoredCharacterRegex = language.getIgnoredCharactersRegex(); Map ignoredCharsTokens = new HashMap<>(); if (ignoredCharacterRegex == null) { return ignoredCharsTokens; } for (int i = 0; i < tokens.size(); i++) { if (ignoredCharacterRegex.matcher(tokens.get(i)).find()) { ignoredCharsTokens.put(i, tokens.get(i)); tokens.set(i, ignoredCharacterRegex.matcher(tokens.get(i)).replaceAll("")); } } return ignoredCharsTokens; } /** * Get all rule categories for the current language. * * @return a map of {@link Category Categories}, keyed by their {@link CategoryId id}. * @since 3.5 */ public Map getCategories() { Map map = new HashMap<>(); for (Rule rule : getAllRules()) { map.put(rule.getCategory().getId(), rule.getCategory()); } return map; } /** * Get all rules for the current language that are built-in or that have been * added using {@link #addRule(Rule)}. Please note that XML rules that are grouped * will appear as multiple rules with the same id. 
To tell them apart, check if * they are of type {@code AbstractPatternRule}, cast them to that type and call * their {@link AbstractPatternRule#getSubId()} method. * @return a List of {@link Rule} objects */ public List getAllRules() { List rules = new ArrayList<>(); rules.addAll(builtinRules); rules.addAll(userRules); return rules; } /** * Get all active (not disabled) rules for the current language that are built-in or that * have been added using e.g. {@link #addRule(Rule)}. See {@link #getAllRules()} for hints * about rule ids. * @return a List of {@link Rule} objects */ public List getAllActiveRules() { List rules = new ArrayList<>(); List rulesActive = new ArrayList<>(); rules.addAll(builtinRules); rules.addAll(userRules); // Some rules have an internal state so they can do checks over sentence // boundaries. These need to be reset so the checks don't suddenly // work on different texts with the same data. However, it could be useful // to keep the state information if we're checking a continuous text. for (Rule rule : rules) { if (!ignoreRule(rule)) { rulesActive.add(rule); } } return rulesActive; } /** * Works like getAllActiveRules but overrides defaults by officeefaults * @return a List of {@link Rule} objects * @since 4.0 */ public List getAllActiveOfficeRules() { List rules = new ArrayList<>(); List rulesActive = new ArrayList<>(); rules.addAll(builtinRules); rules.addAll(userRules); for (Rule rule : rules) { if (!ignoreRule(rule) && !rule.isOfficeDefaultOff()) { rulesActive.add(rule); } else if (rule.isOfficeDefaultOn()) { rulesActive.add(rule); enableRule(rule.getId()); } else if (!ignoreRule(rule) && rule.isOfficeDefaultOff()) { disableRule(rule.getId()); } } return rulesActive; } /** * Get pattern rules by Id and SubId. This returns a list because rules that use {@code ...} * are internally expanded into several rules. * @return a List of {@link Rule} objects * @since 2.3 */ public List getPatternRulesByIdAndSubId(String Id, String subId) { List rules = getAllRules(); List rulesById = new ArrayList<>(); for (Rule rule : rules) { if (rule instanceof AbstractPatternRule) { if (rule.getId().equals(Id) && ((AbstractPatternRule) rule).getSubId().equals(subId)) { rulesById.add((AbstractPatternRule) rule); } } } return rulesById; } protected void printIfVerbose(String s) { if (printStream != null) { printStream.println(s); } } /** * Adds a temporary file to the internal list * (internal method, you should never need to call this as a user of LanguageTool) * @param file the file to be added. */ public static void addTemporaryFile(File file) { temporaryFiles.add(file); } /** * Clean up all temporary files, if there are any. 
*/ public static void removeTemporaryFiles() { for (File file : temporaryFiles) { file.delete(); } } class TextCheckCallable implements Callable> { private final List rules; private final ParagraphHandling paraMode; private final AnnotatedText annotatedText; private final List sentences; private final List analyzedSentences; private final RuleMatchListener listener; private int charCount; private int lineCount; private int columnCount; TextCheckCallable(List rules, List sentences, List analyzedSentences, ParagraphHandling paraMode, AnnotatedText annotatedText, int charCount, int lineCount, int columnCount, RuleMatchListener listener) { this.rules = rules; if (sentences.size() != analyzedSentences.size()) { throw new IllegalArgumentException("sentences and analyzedSentences do not have the same length : " + sentences.size() + " != " + analyzedSentences.size()); } this.sentences = Objects.requireNonNull(sentences); this.analyzedSentences = Objects.requireNonNull(analyzedSentences); this.paraMode = Objects.requireNonNull(paraMode); this.annotatedText = Objects.requireNonNull(annotatedText); this.charCount = charCount; this.lineCount = lineCount; this.columnCount = columnCount; this.listener = listener; } @Override public List call() throws Exception { List ruleMatches = new ArrayList<>(); ruleMatches.addAll(getTextLevelRuleMatches()); ruleMatches.addAll(getOtherRuleMatches()); return ruleMatches; } private List getTextLevelRuleMatches() throws IOException { List ruleMatches = new ArrayList<>(); for (Rule rule : rules) { if (rule instanceof TextLevelRule && !ignoreRule(rule) && paraMode != ParagraphHandling.ONLYNONPARA) { RuleMatch[] matches = ((TextLevelRule) rule).match(analyzedSentences, annotatedText); List adaptedMatches = new ArrayList<>(); for (RuleMatch match : matches) { LineColumnRange range = getLineColumnRange(match); int newFromPos = annotatedText.getOriginalTextPositionFor(match.getFromPos()); int newToPos = annotatedText.getOriginalTextPositionFor(match.getToPos() - 1) + 1; RuleMatch newMatch = new RuleMatch(match.getRule(), match.getSentence(), newFromPos, newToPos, match.getMessage(), match.getShortMessage()); newMatch.setUrl(match.getUrl()); newMatch.setLine(range.from.line); newMatch.setEndLine(range.to.line); if (match.getLine() == 0) { newMatch.setColumn(range.from.column + 1); } else { newMatch.setColumn(range.from.column); } newMatch.setEndColumn(range.to.column); newMatch.setSuggestedReplacements(match.getSuggestedReplacements()); adaptedMatches.add(newMatch); } ruleMatches.addAll(adaptedMatches); if (listener != null) { for (RuleMatch adaptedMatch : adaptedMatches) { listener.matchFound(adaptedMatch); } } } } return ruleMatches; } private List getOtherRuleMatches() { List ruleMatches = new ArrayList<>(); int i = 0; int wordCounter = 0; for (AnalyzedSentence analyzedSentence : analyzedSentences) { String sentence = sentences.get(i++); wordCounter += analyzedSentence.getTokensWithoutWhitespace().length; try { List sentenceMatches = null; InputSentence cacheKey = null; if (cache != null) { cacheKey = new InputSentence(analyzedSentence.getText(), language, motherTongue, disabledRules, disabledRuleCategories, enabledRules, enabledRuleCategories); sentenceMatches = cache.getIfPresent(cacheKey); } if (sentenceMatches == null) { sentenceMatches = checkAnalyzedSentence(paraMode, rules, analyzedSentence); } if (cache != null) { cache.put(cacheKey, sentenceMatches); } List adaptedMatches = new ArrayList<>(); for (RuleMatch elem : sentenceMatches) { RuleMatch thisMatch = 
adjustRuleMatchPos(elem, charCount, columnCount, lineCount, sentence, annotatedText); adaptedMatches.add(thisMatch); if (listener != null) { listener.matchFound(thisMatch); } } ruleMatches.addAll(adaptedMatches); float errorsPerWord = ruleMatches.size() / (float)wordCounter; //System.out.println("errorPerWord " + errorsPerWord + " (matches: " + ruleMatches.size() + " / " + wordCounter + ")"); if (maxErrorsPerWordRate > 0 && errorsPerWord > maxErrorsPerWordRate && wordCounter > 25) { throw new ErrorRateTooHighException("Text checking was stopped due to too many errors (more than " + String.format("%.0f", maxErrorsPerWordRate*100) + "% of words seem to have an error). Are you sure you have set the correct text language? Language set: " + language.getName()); } charCount += sentence.length(); lineCount += countLineBreaks(sentence); // calculate matching column: int lineBreakPos = sentence.lastIndexOf('\n'); if (lineBreakPos == -1) { columnCount += sentence.length(); } else { if (lineBreakPos == 0) { columnCount = sentence.length(); if (!language.getSentenceTokenizer().singleLineBreaksMarksPara()) { columnCount--; } } else { columnCount = sentence.length() - lineBreakPos; } } } catch (ErrorRateTooHighException e) { throw e; } catch (Exception e) { throw new RuntimeException("Could not check sentence (language: " + language + "): '" + StringUtils.abbreviate(analyzedSentence.toTextString(), 200) + "'", e); } } return ruleMatches; } private LineColumnRange getLineColumnRange(RuleMatch match) { LineColumnPosition fromPos = new LineColumnPosition(-1, -1); LineColumnPosition toPos = new LineColumnPosition(-1, -1); LineColumnPosition pos = new LineColumnPosition(0, 0); int charCount = 0; for (AnalyzedSentence analyzedSentence : analyzedSentences) { for (AnalyzedTokenReadings readings : analyzedSentence.getTokens()) { String token = readings.getToken(); if ("\n".equals(token)) { pos.line++; pos.column = 0; } pos.column += token.length(); charCount += token.length(); if (charCount == match.getFromPos()) { fromPos = new LineColumnPosition(pos.line, pos.column); } if (charCount == match.getToPos()) { toPos = new LineColumnPosition(pos.line, pos.column); } } } return new LineColumnRange(fromPos, toPos); } private class LineColumnPosition { int line; int column; private LineColumnPosition(int line, int column) { this.line = line; this.column = column; } } private class LineColumnRange { LineColumnPosition from; LineColumnPosition to; private LineColumnRange(LineColumnPosition from, LineColumnPosition to) { this.from = from; this.to = to; } } } }
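The check(AnnotatedText, ...) variants above adjust match positions so that they refer to the original text including markup. A sketch of how that can be driven, assuming the addText/addMarkup builder methods of the imported AnnotatedTextBuilder behave as their names suggest (the class name MarkupCheckExample and the sample strings are illustrative only):

import org.languagetool.JLanguageTool;
import org.languagetool.language.AmericanEnglish;
import org.languagetool.markup.AnnotatedText;
import org.languagetool.markup.AnnotatedTextBuilder;
import org.languagetool.rules.RuleMatch;

public class MarkupCheckExample {
  public static void main(String[] args) throws Exception {
    JLanguageTool lt = new JLanguageTool(new AmericanEnglish());
    // Build an annotated text so that match positions point into the original
    // string with markup, not into the stripped plain text.
    AnnotatedText text = new AnnotatedTextBuilder()
        .addText("Here is ")
        .addMarkup("<b>")
        .addText("an mistake")
        .addMarkup("</b>")
        .addText(".")
        .build();
    for (RuleMatch match : lt.check(text)) {
      System.out.println(match.getFromPos() + "-" + match.getToPos() + ": " + match.getMessage());
    }
  }
}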



