org.languagetool.rules.ConfusionSetLoader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
Show all versions of languagetool-core Show documentation
LanguageTool is open-source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it also detects some grammar problems.
/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules;
import org.languagetool.Language;
import org.languagetool.ShortDescriptionProvider;
import java.io.*;
import java.util.*;
/**
* Loads a confusion set from a plain text file (UTF-8). See {@code confusion_sets.txt}
* for a description of the file format.
* @since 2.7
*/
public class ConfusionSetLoader {
private static final String CHARSET = "utf-8";
private final ShortDescriptionProvider wordDefs;
private final Language lang;
public ConfusionSetLoader(Language lang) {
wordDefs = new ShortDescriptionProvider();
this.lang = Objects.requireNonNull(lang);
}
public Map> loadConfusionPairs(InputStream stream) throws IOException {
Map> map = new HashMap<>();
try (
InputStreamReader reader = new InputStreamReader(stream, CHARSET);
BufferedReader br = new BufferedReader(reader)
) {
String line;
while ((line = br.readLine()) != null) {
if (line.startsWith("#") || line.trim().isEmpty()) {
continue;
}
String[] parts = line.replaceFirst("\\s*#.*", "").split("\\s*(;|->)\\s*");
if (parts.length != 3) {
throw new RuntimeException("Unexpected format: '" + line + "' - expected three semicolon-separated values: word1; word2; factor");
}
boolean bidirectional = !line.replaceFirst("#.*", "").contains(" -> ");
List confusionStrings = new ArrayList<>();
Set loadedForSet = new HashSet<>();
String prevWord = null;
for (String part : Arrays.asList(parts).subList(0, parts.length - 1)) {
String[] subParts = part.split("\\|");
String word = subParts[0];
if (bidirectional && prevWord != null && word.compareTo(prevWord) < 0) {
// Quick hack for reordering lines
//System.err.println("Delete: " + line);
//String comment = line.substring(line.indexOf("#"));
//String newLine = parts[1] + "; " + parts[0] + "; " + parts[2] + "; " + comment;
//System.err.println("Add: " + newLine);
throw new RuntimeException("Order words alphabetically per line in the confusion set file: " + line + ": found " + word + " after " + prevWord);
}
prevWord = word;
String description = subParts.length == 2 ? subParts[1] : null;
if (loadedForSet.contains(word)) {
throw new RuntimeException("Word appears twice in same confusion set: '" + word + "'");
}
if (description == null) {
description = wordDefs.getShortDescription(word, lang);
}
confusionStrings.add(new ConfusionString(word, description));
loadedForSet.add(word);
}
ConfusionPair confusionSet = new ConfusionPair(confusionStrings.get(0), confusionStrings.get(1), Long.parseLong(parts[parts.length-1]), bidirectional);
for (ConfusionString confusionString : confusionStrings) {
String key = confusionString.getString();
List existingEntry = map.get(key);
if (existingEntry != null) {
existingEntry.add(confusionSet);
} else {
List pairs = new ArrayList<>();
pairs.add(confusionSet);
map.put(key, pairs);
}
if (!bidirectional) {
break; // "A -> B", so only consider that direction
}
}
}
}
return map;
}
}