org.apache.lucene.analysis.hunspell.AffixCondition Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
import java.util.regex.PatternSyntaxException;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
/**
* Checks the "condition" part of affix definition, as in
*
* PFX flag stripping prefix [condition [morphological_fields...]]
*/
abstract class AffixCondition {
public static final String ALWAYS_TRUE_KEY = ".*";
public static final AffixCondition ALWAYS_TRUE = new AffixCondition() {
@Override
public boolean acceptsStem(char[] word, int offset, int length) {
return true;
}
};
public static final AffixCondition ALWAYS_FALSE = new AffixCondition() {
@Override
public boolean acceptsStem(char[] word, int offset, int length) {
return false;
}
};
public boolean acceptsStem(String stem) {
return acceptsStem(stem.toCharArray(), 0, stem.length());
}
/**
* @return whether the given word matches this condition as a stem with both "strip" and "affix"
* removed
*/
public abstract boolean acceptsStem(char[] word, int offset, int length);
/**
* @return a key used to deduplicate same condition+strip+kind triples. For trivial conditions
* that need no check, {@link #ALWAYS_TRUE_KEY} is returned.
*/
public static String uniqueKey(AffixKind kind, String strip, String condition) {
if (".".equals(condition)
|| kind == PREFIX && strip.startsWith(condition)
|| kind == SUFFIX && strip.endsWith(condition) && !isRegexp(condition)) {
return ALWAYS_TRUE_KEY;
}
return condition + " " + kind + " " + strip;
}
/**
* Analyzes the given affix kind, strip and condition and returns an object able to efficiently
* check that condition.
*/
public static AffixCondition compile(AffixKind kind, String strip, String condition, String line) {
if (!isRegexp(condition)) {
if (kind == SUFFIX && condition.endsWith(strip)) {
return substringCondition(
kind, condition.substring(0, condition.length() - strip.length()));
}
if (kind == PREFIX && condition.startsWith(strip)) {
return substringCondition(kind, condition.substring(strip.length()));
}
return ALWAYS_FALSE;
}
int lastBracket = condition.lastIndexOf('[');
if (lastBracket >= 0 && condition.indexOf(']', lastBracket + 1) < 0) {
// unclosed [ is tolerated by Hunspell and occurs in some dictionaries
condition = condition + "]";
}
try {
int conditionChars = countCharPatterns(condition);
if (conditionChars <= strip.length()) {
String regex = kind == PREFIX ? ".*" + condition : condition + ".*";
return strip.matches(regex) ? ALWAYS_TRUE : ALWAYS_FALSE;
}
if (kind == PREFIX) {
int split = skipCharPatterns(condition, strip.length());
if (!strip.matches(condition.substring(0, split))) {
return ALWAYS_FALSE;
}
return regexpCondition(kind, condition.substring(split), conditionChars - strip.length());
}
int split = skipCharPatterns(condition, conditionChars - strip.length());
if (!strip.matches(condition.substring(split))) {
return ALWAYS_FALSE;
}
return regexpCondition(kind, condition.substring(0, split), conditionChars - strip.length());
} catch (
@SuppressWarnings("unused")
PatternSyntaxException e) {
return ALWAYS_FALSE;
} catch (Throwable e) {
throw new IllegalArgumentException("On line: " + line, e);
}
}
public static int skipCharPatterns(String condition, int count) {
int pos = 0;
for (int i = 0; i < count; i++) pos = skipCharPattern(condition, pos);
return pos;
}
public static int countCharPatterns(String condition) {
int conditionChars = 0;
for (int i = 0; i < condition.length(); i = skipCharPattern(condition, i)) conditionChars++;
return conditionChars;
}
public static int skipCharPattern(String condition, int pos) {
if (condition.charAt(pos) == '[') {
pos = condition.indexOf(']', pos + 1);
if (pos < 0) {
throw new AssertionError("Malformed condition " + condition);
}
}
return pos + 1;
}
public static boolean isRegexp(String condition) {
return condition.contains("[") || condition.contains(".") || condition.contains("-");
}
public static AffixCondition substringCondition(AffixKind kind, String stemCondition) {
boolean forSuffix = kind == AffixKind.SUFFIX;
int condLength = stemCondition.length();
return new AffixCondition() {
@Override
public boolean acceptsStem(char[] word, int offset, int length) {
if (length < condLength) {
return false;
}
int matchStart = forSuffix ? offset + length - condLength : offset;
for (int i = 0; i < condLength; i++) {
if (stemCondition.charAt(i) != word[matchStart + i]) {
return false;
}
}
return true;
}
};
}
public static AffixCondition regexpCondition(AffixKind kind, String condition, int charCount) {
boolean forSuffix = kind == AffixKind.SUFFIX;
CharacterRunAutomaton automaton =
new CharacterRunAutomaton(new RegExp(escapeDash(condition), RegExp.NONE).toAutomaton());
return new AffixCondition() {
@Override
public boolean acceptsStem(char[] word, int offset, int length) {
return length >= charCount
&& automaton.run(word, forSuffix ? offset + length - charCount : offset, charCount);
}
};
}
// "dash hasn't got special meaning" (we must escape it)
public static String escapeDash(String re) {
if (!re.contains("-")) return re;
// we have to be careful, even though dash doesn't have a special meaning,
// some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
StringBuilder escaped = new StringBuilder();
for (int i = 0; i < re.length(); i++) {
char c = re.charAt(i);
if (c == '-') {
escaped.append("\\-");
} else {
escaped.append(c);
if (c == '\\' && i + 1 < re.length()) {
escaped.append(re.charAt(i + 1));
i++;
}
}
}
return escaped.toString();
}
}