// org.apache.lucene.analysis.hunspell.HunspellDictionary (Maven / Gradle / Ivy artifact listing)
// The newest version!
package org.apache.lucene.analysis.hunspell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.util.Version;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
/**
* In-memory structure for the dictionary (.dic) and affix (.aff)
* data of a hunspell dictionary.
*/
public class HunspellDictionary {
static final HunspellWord NOFLAGS = new HunspellWord();
private static final String ALIAS_KEY = "AF";
private static final String PREFIX_KEY = "PFX";
private static final String SUFFIX_KEY = "SFX";
private static final String FLAG_KEY = "FLAG";
private static final String NUM_FLAG_TYPE = "num";
private static final String UTF8_FLAG_TYPE = "UTF-8";
private static final String LONG_FLAG_TYPE = "long";
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
private static final boolean IGNORE_CASE_DEFAULT = false;
private CharArrayMap> words;
private CharArrayMap> prefixes;
private CharArrayMap> suffixes;
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
private boolean ignoreCase = IGNORE_CASE_DEFAULT;
private final Version version;
private String[] aliases;
private int aliasCount = 0;
/**
 * Builds a HunspellDictionary from one affix stream and one dictionary stream, using the default
 * case handling ({@code IGNORE_CASE_DEFAULT}).
 * Neither stream is closed by this constructor; closing them remains the caller's job.
 *
 * @param affix InputStream over the hunspell affix file (left open)
 * @param dictionary InputStream over the hunspell dictionary file (left open)
 * @param version Lucene Version
 * @throws IOException if reading from either stream fails
 * @throws ParseException if the file content does not meet the expected format
 */
public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
  // Goes through the four-argument single-dictionary constructor, which wraps the stream in a list.
  this(affix, dictionary, version, IGNORE_CASE_DEFAULT);
}
/**
 * Builds a HunspellDictionary from one affix stream and one dictionary stream.
 * Neither stream is closed by this constructor; closing them remains the caller's job.
 *
 * @param affix InputStream over the hunspell affix file (left open)
 * @param dictionary InputStream over the hunspell dictionary file (left open)
 * @param version Lucene Version
 * @param ignoreCase when true, dictionary matching is case insensitive
 * @throws IOException if reading from either stream fails
 * @throws ParseException if the file content does not meet the expected format
 */
public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
  // The single dictionary is handed on as a one-element list.
  this(affix, Arrays.<InputStream>asList(dictionary), version, ignoreCase);
}
/**
 * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
 * and dictionary files.
 * You have to close the provided InputStreams yourself.
 *
 * @param affix InputStream for reading the hunspell affix file (won't be closed).
 * @param dictionaries InputStreams for reading the hunspell dictionary files (won't be closed).
 * @param version Lucene Version
 * @param ignoreCase If true, dictionary matching will be case insensitive
 * @throws IOException Can be thrown while reading from the InputStreams
 * @throws ParseException Can be thrown if the content of the files does not meet expected formats
 */
public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
  this.version = version;
  this.ignoreCase = ignoreCase;

  // The encoding declared at the top of the affix file is used to decode
  // both the remainder of the affix file and every dictionary file.
  String encoding = getDictionaryEncoding(affix);
  CharsetDecoder decoder = getJavaEncoding(encoding);
  readAffixFile(affix, decoder);

  words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
  for (InputStream dictionary : dictionaries) {
    readDictionaryFile(dictionary, decoder);
  }
}
/**
 * Looks up HunspellWords that match the String created from the given char array, offset and length
 *
 * @param word Char array to generate the String from
 * @param offset Offset in the char array that the String starts at
 * @param length Length from the offset that the String is
 * @return List of HunspellWords that match the generated String, or {@code null} if none are found
 */
public List<HunspellWord> lookupWord(char[] word, int offset, int length) {
  return words.get(word, offset, length);
}
/**
 * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
 *
 * @param word Char array to generate the String from
 * @param offset Offset in the char array that the String starts at
 * @param length Length from the offset that the String is
 * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
 */
public List<HunspellAffix> lookupPrefix(char[] word, int offset, int length) {
  return prefixes.get(word, offset, length);
}
/**
 * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length
 *
 * @param word Char array to generate the String from
 * @param offset Offset in the char array that the String starts at
 * @param length Length from the offset that the String is
 * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
 */
public List<HunspellAffix> lookupSuffix(char[] word, int offset, int length) {
  return suffixes.get(word, offset, length);
}
/**
* Reads the affix file through the provided InputStream, building up the prefix and suffix maps
*
* @param affixStream InputStream to read the content of the affix file from
* @param decoder CharsetDecoder to decode the content of the file
* @throws IOException Can be thrown while reading from the InputStream
*/
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
prefixes = new CharArrayMap>(version, 8, ignoreCase);
suffixes = new CharArrayMap>(version, 8, ignoreCase);
BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
String line = null;
while ((line = reader.readLine()) != null) {
if (line.startsWith(ALIAS_KEY)) {
parseAlias(line);
} else if (line.startsWith(PREFIX_KEY)) {
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
} else if (line.startsWith(SUFFIX_KEY)) {
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
} else if (line.startsWith(FLAG_KEY)) {
// Assume that the FLAG line comes before any prefix or suffixes
// Store the strategy so it can be used when parsing the dic file
flagParsingStrategy = getFlagParsingStrategy(line);
}
}
}
/**
* Parses a specific affix rule putting the result into the provided affix map
*
* @param affixes Map where the result of the parsing will be put
* @param header Header line of the affix rule
* @param reader BufferedReader to read the content of the rule from
* @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
* pattern
* @throws IOException Can be thrown while reading the rule
*/
private void parseAffix(CharArrayMap> affixes,
String header,
BufferedReader reader,
String conditionPattern) throws IOException {
String args[] = header.split("\\s+");
boolean crossProduct = args[2].equals("Y");
int numLines = Integer.parseInt(args[3]);
for (int i = 0; i < numLines; i++) {
String line = reader.readLine();
String ruleArgs[] = line.split("\\s+");
HunspellAffix affix = new HunspellAffix();
affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);
String affixArg = ruleArgs[3];
int flagSep = affixArg.lastIndexOf('/');
if (flagSep != -1) {
String flagPart = affixArg.substring(flagSep + 1);
if (aliasCount > 0) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
Arrays.sort(appendFlags);
affix.setAppendFlags(appendFlags);
affix.setAppend(affixArg.substring(0, flagSep));
} else {
affix.setAppend(affixArg);
}
String condition = ruleArgs[4];
affix.setCondition(condition, String.format(conditionPattern, condition));
affix.setCrossProduct(crossProduct);
List list = affixes.get(affix.getAppend());
if (list == null) {
list = new ArrayList();
affixes.put(affix.getAppend(), list);
}
list.add(affix);
}
}
/**
 * Parses the encoding specified in the affix file readable through the provided InputStream.
 * Empty lines, whitespace-only lines and lines starting with {@code #} are skipped.
 *
 * @param affix InputStream for reading the affix file
 * @return Encoding specified in the affix file
 * @throws IOException Can be thrown while reading from the InputStream
 * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the
 *                        format {@code SET <encoding>}, or if the file ends before such a line is found
 */
private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
  final StringBuilder encoding = new StringBuilder();
  for (;;) {
    encoding.setLength(0);
    int ch;
    // Read a single line one byte at a time, so that nothing beyond the
    // inspected lines is consumed from the stream. Each byte is taken as a char as-is.
    while ((ch = affix.read()) >= 0) {
      if (ch == '\n') {
        break;
      }
      if (ch != '\r') {
        encoding.append((char) ch);
      }
    }
    if (
        encoding.length() == 0 || encoding.charAt(0) == '#' ||
        // this test only at the end as ineffective but would allow lines only containing spaces:
        encoding.toString().trim().length() == 0
    ) {
      if (ch < 0) {
        throw new ParseException("Unexpected end of affix file.", 0);
      }
      continue;
    }
    // startsWith is safe for lines shorter than "SET ", unlike substring(0, 4).
    final String line = encoding.toString();
    if (line.startsWith("SET ")) {
      // cleanup the encoding string, too (whitespace)
      return line.substring(4).trim();
    }
    throw new ParseException("The first non-comment line in the affix file must "+
        "be a 'SET charset', was: '" + encoding +"'", 0);
  }
}
/**
 * Retrieves the CharsetDecoder for the given encoding. Hunspell encoding names that are known not to be valid
 * Java charset names ({@code microsoft-cp1251}, {@code TIS620-2533}) are mapped to their Java equivalents first.
 * Note, this isn't perfect as other names such as ISCII-DEVANAGARI are allowed in affix files as well; those are
 * passed to {@link Charset#forName(String)} unchanged.
 *
 * @param encoding Encoding to retrieve the CharsetDecoder for
 * @return CharsetDecoder for the given encoding
 */
private CharsetDecoder getJavaEncoding(String encoding) {
  String javaName = encoding;
  if ("microsoft-cp1251".equalsIgnoreCase(encoding)) {
    javaName = "windows-1251";
  } else if ("TIS620-2533".equalsIgnoreCase(encoding)) {
    javaName = "TIS-620";
  }
  Charset charset = Charset.forName(javaName);
  return charset.newDecoder();
}
/**
 * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file
 *
 * @param flagLine Line containing the flag information, in the form {@code FLAG <type>}
 * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
 * @throws IllegalArgumentException if the line is not a keyword followed by exactly one flag type, or the flag type
 *                                  is unknown
 */
private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
  // Split on whitespace rather than cutting at a fixed offset, so that a bare "FLAG" line,
  // trailing whitespace or repeated separators do not break the parsing.
  String[] parts = flagLine.trim().split("\\s+");
  if (parts.length != 2) {
    throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine);
  }
  String flagType = parts[1];

  if (NUM_FLAG_TYPE.equals(flagType)) {
    return new NumFlagParsingStrategy();
  } else if (UTF8_FLAG_TYPE.equals(flagType)) {
    return new SimpleFlagParsingStrategy();
  } else if (LONG_FLAG_TYPE.equals(flagType)) {
    return new DoubleASCIIFlagParsingStrategy();
  }

  throw new IllegalArgumentException("Unknown flag type: " + flagType);
}
/**
 * Reads the dictionary file through the provided InputStream, building up the words map.
 * The first line must be a number (the entry count); every following line is one entry, optionally followed by
 * {@code /flags} and further whitespace-separated data, which is ignored.
 *
 * @param dictionary InputStream to read the dictionary file through
 * @param decoder CharsetDecoder used to decode the contents of the file
 * @throws IOException Can be thrown while reading from the file
 * @throws NumberFormatException if the first line is missing or is not a number
 */
private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException {
  // The reader is not closed here: closing it would close the dictionary stream, which the caller owns.
  BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
  // TODO: don't create millions of strings.

  String line = reader.readLine(); // first line is number of entries
  // The count itself is not needed; it is parsed only so that a malformed header is still rejected.
  Integer.parseInt(line);

  // TODO: the flags themselves can be double-chars (long) or also numeric
  // either way the trick is to encode them as char... but they must be parsed differently
  while ((line = reader.readLine()) != null) {
    String entry;
    HunspellWord wordForm;

    int flagSep = line.lastIndexOf('/');
    if (flagSep == -1) {
      wordForm = NOFLAGS;
      entry = line;
    } else {
      // note, there can be comments (morph description) after a flag.
      // The flag part ends at the first whitespace character (tab or space) after the slash.
      int end = flagSep + 1;
      while (end < line.length() && !Character.isWhitespace(line.charAt(end))) {
        end++;
      }

      String flagPart = line.substring(flagSep + 1, end);
      if (aliasCount > 0) {
        // With AF aliases in use the flag part is the number of an alias
        flagPart = getAliasValue(Integer.parseInt(flagPart));
      }

      wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart));
      Arrays.sort(wordForm.getFlags());
      entry = line.substring(0, flagSep);
      if (ignoreCase) {
        entry = entry.toLowerCase(Locale.ENGLISH);
      }
    }

    List<HunspellWord> entries = words.get(entry);
    if (entries == null) {
      entries = new ArrayList<HunspellWord>();
      words.put(entry, entries);
    }
    entries.add(wordForm);
  }
}
/**
 * Returns the Lucene Version that was passed to the constructor of this dictionary.
 *
 * @return the Version held by this dictionary
 */
public Version getVersion() {
  return this.version;
}
/**
 * Handles one AF line of the affix file. The first AF line seen carries the number of aliases and
 * allocates the alias table; each later AF line stores its second field in the next free slot.
 *
 * @param line AF line taken from the affix file
 */
private void parseAlias(String line) {
  final String[] tokens = line.split("\\s+");
  if (aliases != null) {
    // table already allocated: this line defines the next alias
    aliases[aliasCount++] = tokens[1];
    return;
  }
  // first line should be the aliases count
  aliases = new String[Integer.parseInt(tokens[1])];
}
/**
 * Resolves a flag alias number to the flag string defined by the corresponding AF line.
 *
 * @param id 1-based alias number
 * @return flag string stored for that alias
 * @throws IllegalArgumentException if the number does not refer to an alias that has been read
 */
private String getAliasValue(int id) {
  // Check against aliasCount rather than the array length: slots beyond the aliases actually
  // read are still null and must not be handed out.
  if (aliases == null || id < 1 || id > aliasCount) {
    throw new IllegalArgumentException("Bad flag alias number:" + id);
  }
  return aliases[id - 1];
}
/**
 * Base type for the different ways flags from the affix and dic files can be turned into chars
 */
private static abstract class FlagParsingStrategy {

  /**
   * Converts the given String into one flag, namely the first flag {@link #parseFlags(String)} yields for it
   *
   * @param rawFlag String holding the flag
   * @return the parsed flag
   */
  char parseFlag(String rawFlag) {
    final char[] parsed = parseFlags(rawFlag);
    return parsed[0];
  }

  /**
   * Converts the given String into all the flags it holds
   *
   * @param rawFlags String holding the flags
   * @return the parsed flags
   */
  abstract char[] parseFlags(String rawFlags);
}
/**
* Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags.
* Can be used with both the ASCII and UTF-8 flag types.
*/
private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
/**
* {@inheritDoc}
*/
public char[] parseFlags(String rawFlags) {
return rawFlags.toCharArray();
}
}
/**
* Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case
* of multiple flags, each number is separated by a comma.
*/
private static class NumFlagParsingStrategy extends FlagParsingStrategy {
/**
* {@inheritDoc}
*/
public char[] parseFlags(String rawFlags) {
String[] rawFlagParts = rawFlags.trim().split(",");
char[] flags = new char[rawFlagParts.length];
for (int i = 0; i < rawFlagParts.length; i++) {
// note, removing the trailing X/leading I for nepali... what is the rule here?!
flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", ""));
}
return flags;
}
}
/**
* Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
* must be combined into a single character.
*
* TODO (rmuir) test
*/
private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {
/**
* {@inheritDoc}
*/
public char[] parseFlags(String rawFlags) {
if (rawFlags.length() == 0) {
return new char[0];
}
StringBuilder builder = new StringBuilder();
for (int i = 0; i < rawFlags.length(); i+=2) {
char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1));
builder.append(cookedFlag);
}
char flags[] = new char[builder.length()];
builder.getChars(0, builder.length(), flags, 0);
return flags;
}
}
/**
 * Tells whether this dictionary was set up to ignore case.
 *
 * @return the value of the ignoreCase setting of this dictionary
 */
public boolean isIgnoreCase() {
  return this.ignoreCase;
}
}