All downloads are FREE. The search and download functionality uses the official Maven repository.

org.apache.lucene.analysis.hunspell.AffixCondition Maven / Gradle / Ivy

There is a newer version: 10.1.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;

import java.util.regex.PatternSyntaxException;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

/**
 * Checks the "condition" part of affix definition, as in
 *
 * 
PFX flag stripping prefix [condition [morphological_fields...]]
*/ interface AffixCondition { String ALWAYS_TRUE_KEY = ".*"; AffixCondition ALWAYS_TRUE = new AffixCondition() { @Override public boolean acceptsStem(String stem) { return true; } @Override public boolean acceptsStem(char[] word, int offset, int length) { return true; } }; AffixCondition ALWAYS_FALSE = new AffixCondition() { @Override public boolean acceptsStem(String stem) { return false; } @Override public boolean acceptsStem(char[] word, int offset, int length) { return false; } }; default boolean acceptsStem(String stem) { return acceptsStem(stem.toCharArray(), 0, stem.length()); } /** * @return whether the given word matches this condition as a stem with both "strip" and "affix" * removed */ boolean acceptsStem(char[] word, int offset, int length); /** * @return a key used to deduplicate same condition+strip+kind triples. For trivial conditions * that need no check, {@link #ALWAYS_TRUE_KEY} is returned. */ static String uniqueKey(AffixKind kind, String strip, String condition) { if (".".equals(condition) || kind == PREFIX && strip.startsWith(condition) || kind == SUFFIX && strip.endsWith(condition) && !isRegexp(condition)) { return ALWAYS_TRUE_KEY; } return condition + " " + kind + " " + strip; } /** * Analyzes the given affix kind, strip and condition and returns an object able to efficiently * check that condition. 
*/ static AffixCondition compile(AffixKind kind, String strip, String condition, String line) { if (!isRegexp(condition)) { if (kind == SUFFIX && condition.endsWith(strip)) { return substringCondition( kind, condition.substring(0, condition.length() - strip.length())); } if (kind == PREFIX && condition.startsWith(strip)) { return substringCondition(kind, condition.substring(strip.length())); } return ALWAYS_FALSE; } int lastBracket = condition.lastIndexOf('['); if (lastBracket >= 0 && condition.indexOf(']', lastBracket + 1) < 0) { // unclosed [ is tolerated by Hunspell and occurs in some dictionaries condition = condition + "]"; } try { int conditionChars = countCharPatterns(condition); if (conditionChars <= strip.length()) { String regex = kind == PREFIX ? ".*" + condition : condition + ".*"; return strip.matches(regex) ? ALWAYS_TRUE : ALWAYS_FALSE; } if (kind == PREFIX) { int split = skipCharPatterns(condition, strip.length()); if (!strip.matches(condition.substring(0, split))) { return ALWAYS_FALSE; } return regexpCondition(kind, condition.substring(split), conditionChars - strip.length()); } int split = skipCharPatterns(condition, conditionChars - strip.length()); if (!strip.matches(condition.substring(split))) { return ALWAYS_FALSE; } return regexpCondition(kind, condition.substring(0, split), conditionChars - strip.length()); } catch ( @SuppressWarnings("unused") PatternSyntaxException e) { return ALWAYS_FALSE; } catch (Throwable e) { throw new IllegalArgumentException("On line: " + line, e); } } private static int skipCharPatterns(String condition, int count) { int pos = 0; for (int i = 0; i < count; i++) pos = skipCharPattern(condition, pos); return pos; } private static int countCharPatterns(String condition) { int conditionChars = 0; for (int i = 0; i < condition.length(); i = skipCharPattern(condition, i)) conditionChars++; return conditionChars; } private static int skipCharPattern(String condition, int pos) { if (condition.charAt(pos) == '[') { pos = 
condition.indexOf(']', pos + 1); if (pos < 0) { throw new AssertionError("Malformed condition " + condition); } } return pos + 1; } private static boolean isRegexp(String condition) { return condition.contains("[") || condition.contains(".") || condition.contains("-"); } private static AffixCondition substringCondition(AffixKind kind, String stemCondition) { boolean forSuffix = kind == AffixKind.SUFFIX; int condLength = stemCondition.length(); return (word, offset, length) -> { if (length < condLength) { return false; } int matchStart = forSuffix ? offset + length - condLength : offset; for (int i = 0; i < condLength; i++) { if (stemCondition.charAt(i) != word[matchStart + i]) { return false; } } return true; }; } private static AffixCondition regexpCondition(AffixKind kind, String condition, int charCount) { boolean forSuffix = kind == AffixKind.SUFFIX; CharacterRunAutomaton automaton = new CharacterRunAutomaton(new RegExp(escapeDash(condition), RegExp.NONE).toAutomaton()); return (word, offset, length) -> length >= charCount && automaton.run(word, forSuffix ? offset + length - charCount : offset, charCount); } // "dash hasn't got special meaning" (we must escape it) private static String escapeDash(String re) { if (!re.contains("-")) return re; // we have to be careful, even though dash doesn't have a special meaning, // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it StringBuilder escaped = new StringBuilder(); for (int i = 0; i < re.length(); i++) { char c = re.charAt(i); if (c == '-') { escaped.append("\\-"); } else { escaped.append(c); if (c == '\\' && i + 1 < re.length()) { escaped.append(re.charAt(i + 1)); i++; } } } return escaped.toString(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy