org.languagetool.rules.ConfusionSetLoader Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation

LanguageTool is open-source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it also detects some grammar problems.

There is a newer version: 6.4

Show newest version

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules;

import org.languagetool.Language;
import org.languagetool.ShortDescriptionProvider;

import java.io.*;
import java.util.*;

/**
 * Loads a confusion set from a plain text file (UTF-8). See {@code confusion_sets.txt}
 * for a description of the file format.
 * @since 2.7
 */
public class ConfusionSetLoader {

  private static final String CHARSET = "utf-8";
  
  private final ShortDescriptionProvider wordDefs;
  private final Language lang;

  public ConfusionSetLoader(Language lang) {
    wordDefs = new ShortDescriptionProvider();
    this.lang = Objects.requireNonNull(lang);
  }

  public Map> loadConfusionPairs(InputStream stream) throws IOException {
    Map> map = new HashMap<>();
    try (
      InputStreamReader reader = new InputStreamReader(stream, CHARSET);
      BufferedReader br = new BufferedReader(reader)
    ) {
      String line;
      while ((line = br.readLine()) != null) {
        if (line.startsWith("#") || line.trim().isEmpty()) {
          continue;
        }
        String[] parts = line.replaceFirst("\\s*#.*", "").split("\\s*(;|->)\\s*");
        if (parts.length != 3) {
          throw new RuntimeException("Unexpected format: '" + line + "' - expected three semicolon-separated values: word1; word2; factor");
        }
        boolean bidirectional = !line.replaceFirst("#.*", "").contains(" -> ");
        List confusionStrings = new ArrayList<>();
        Set loadedForSet = new HashSet<>();
        String prevWord = null;
        for (String part : Arrays.asList(parts).subList(0, parts.length - 1)) {
          String[] subParts = part.split("\\|");
          String word = subParts[0];
          if (bidirectional && prevWord != null && word.compareTo(prevWord) < 0) {
            // Quick hack for reordering lines
            //System.err.println("Delete: " + line);
            //String comment = line.substring(line.indexOf("#"));
            //String newLine = parts[1] + "; " + parts[0] + "; " + parts[2] + "; " + comment;
            //System.err.println("Add: " + newLine);
            throw new RuntimeException("Order words alphabetically per line in the confusion set file: " + line + ": found " + word + " after " + prevWord);
          }
          prevWord = word;
          String description = subParts.length == 2 ? subParts[1] : null;
          if (loadedForSet.contains(word)) {
            throw new RuntimeException("Word appears twice in same confusion set: '" + word + "'");
          }
          if (description == null) {
            description = wordDefs.getShortDescription(word, lang);
          }
          confusionStrings.add(new ConfusionString(word, description));
          loadedForSet.add(word);
        }
        long factor = Long.parseLong(parts[parts.length - 1]);
        if (bidirectional) {
          ConfusionPair confusionSet1 = new ConfusionPair(confusionStrings.get(0), confusionStrings.get(1), factor, false);
          addToMap(map, confusionStrings, confusionSet1);
          ConfusionPair confusionSet2 = new ConfusionPair(confusionStrings.get(1), confusionStrings.get(0), factor, false);
          addToMap(map, confusionStrings, confusionSet2);
        } else {
          ConfusionPair confusionSet = new ConfusionPair(confusionStrings.get(0), confusionStrings.get(1), factor, false);
          addToMap(map, confusionStrings, confusionSet);
        }
      }
    }
    return map;
  }

  private void addToMap(Map> map, List confusionStrings, ConfusionPair confusionSet) {
    for (ConfusionString confusionString : confusionStrings) {
      String key = confusionString.getString();
      List existingEntry = map.get(key);
      if (existingEntry != null) {
        existingEntry.add(confusionSet);
      } else {
        List pairs = new ArrayList<>();
        pairs.add(confusionSet);
        map.put(key, pairs);
      }
    }
  }

}