org.languagetool.rules.ConfusionSetLoader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
Show all versions of languagetool-core Show documentation
LanguageTool is open-source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it also detects some grammar problems.
/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules;
import java.io.*;
import java.util.*;
/**
* Loads a confusion set from a plain text file (UTF-8). See {@code confusion_sets.txt}
* for a description of the file format.
* @since 2.7
*/
public class ConfusionSetLoader {
private static final String CHARSET = "utf-8";
public ConfusionSetLoader() {
}
public Map> loadConfusionSet(InputStream stream) throws IOException {
Map> map = new HashMap<>();
try (
InputStreamReader reader = new InputStreamReader(stream, CHARSET);
BufferedReader br = new BufferedReader(reader)
) {
String line;
while ((line = br.readLine()) != null) {
if (line.startsWith("#") || line.trim().isEmpty()) {
continue;
}
String[] parts = line.replaceFirst("\\s*#.*", "").split(";\\s*");
if (parts.length != 3) {
throw new RuntimeException("Unexpected format: '" + line + "' - expected three semicolon-separated values: word1; word2; factor");
}
List confusionStrings = new ArrayList<>();
Set loadedForSet = new HashSet<>();
String prevWord = null;
for (String part : Arrays.asList(parts).subList(0, parts.length-1)) {
String[] subParts = part.split("\\|");
String word = subParts[0];
if (prevWord != null && word.compareTo(prevWord) < 0) {
throw new RuntimeException("Order words alphabetically per line in the confusion set file: " + line);
}
prevWord = word;
String description = subParts.length == 2 ? subParts[1] : null;
if (loadedForSet.contains(word)) {
throw new RuntimeException("Word appears twice in same confusion set: '" + word + "'");
}
confusionStrings.add(new ConfusionString(word, description));
loadedForSet.add(word);
}
ConfusionSet confusionSet = new ConfusionSet(Long.parseLong(parts[parts.length-1]), confusionStrings);
for (ConfusionString confusionString : confusionStrings) {
String key = confusionString.getString();
List existingEntry = map.get(key);
if (existingEntry != null) {
existingEntry.add(confusionSet);
} else {
List sets = new ArrayList<>();
sets.add(confusionSet);
map.put(key, sets);
}
}
}
}
return map;
}
}