org.apache.lucene.analysis.pt.RSLPStemmerBase Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analysis-common Show documentation
Show all versions of lucene-analysis-common Show documentation
Apache Lucene (module: common)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.pt;
import static org.apache.lucene.analysis.util.StemmerUtil.*;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.CharArraySet;
/**
* Base class for stemmers that use a set of RSLP-like stemming steps.
*
* RSLP (Removedor de Sufixos da Lingua Portuguesa) is an algorithm designed originally for
* stemming the Portuguese language, described in the paper A Stemming Algorithm for the
* Portuguese Language, Orengo et. al.
*
*
Since this time a plural-only modification (RSLP-S) as well as a modification for the Galician
* language have been implemented. This class parses a configuration file that describes {@link
* Step}s, where each Step contains a set of {@link Rule}s.
*
*
The general rule format is:
*
*
*
* { "suffix", N, "replacement", { "exception1", "exception2", ...}}
*
*
*
* where:
*
*
* suffix
is the suffix to be removed (such as "inho").
* N
is the min stem size, where stem is defined as the candidate stem after
* removing the suffix (but before appending the replacement!)
* replacement
is an optimal string to append after removing the suffix. This can
* be the empty string.
* exceptions
is an optional list of exceptions, patterns that should not be
* stemmed. These patterns can be specified as whole word or suffix (ends-with) patterns,
* depending upon the exceptions format flag in the step header.
*
*
* A step is an ordered list of rules, with a structure in this format:
*
*
*
* { "name", N, B, { "cond1", "cond2", ... } ... rules ... };
*
*
*
* where:
*
*
* name
is a name for the step (such as "Plural").
* N
is the min word size. Words that are less than this length bypass the step
* completely, as an optimization. Note: N can be zero, in this case this implementation will
* automatically calculate the appropriate value from the underlying rules.
* B
is a "boolean" flag specifying how exceptions in the rules are matched. A
* value of 1 indicates whole-word pattern matching, a value of 0 indicates that exceptions
* are actually suffixes and should be matched with ends-with.
* conds
are an optional list of conditions to enter the step at all. If the list
* is non-empty, then a word must end with one of these conditions or it will bypass the step
* completely as an optimization.
*
*
* @see RSLP description
* @lucene.internal
*/
public abstract class RSLPStemmerBase {
/** A basic rule, with no exceptions. */
protected static class Rule {
protected final char[] suffix;
protected final char[] replacement;
protected final int min;
/**
* Create a rule.
*
* @param suffix suffix to remove
* @param min minimum stem length
* @param replacement replacement string
*/
public Rule(String suffix, int min, String replacement) {
this.suffix = suffix.toCharArray();
this.replacement = replacement.toCharArray();
this.min = min;
}
/**
* @return true if the word matches this rule.
*/
public boolean matches(char[] s, int len) {
return (len - suffix.length >= min && endsWith(s, len, suffix));
}
/**
* @return new valid length of the string after firing this rule.
*/
public int replace(char[] s, int len) {
if (replacement.length > 0) {
System.arraycopy(replacement, 0, s, len - suffix.length, replacement.length);
}
return len - suffix.length + replacement.length;
}
}
/** A rule with a set of whole-word exceptions. */
protected static class RuleWithSetExceptions extends Rule {
protected final CharArraySet exceptions;
public RuleWithSetExceptions(String suffix, int min, String replacement, String[] exceptions) {
super(suffix, min, replacement);
for (int i = 0; i < exceptions.length; i++) {
if (!exceptions[i].endsWith(suffix))
throw new RuntimeException(
"useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
}
this.exceptions = new CharArraySet(Arrays.asList(exceptions), false);
}
@Override
public boolean matches(char[] s, int len) {
return super.matches(s, len) && !exceptions.contains(s, 0, len);
}
}
/** A rule with a set of exceptional suffixes. */
protected static class RuleWithSuffixExceptions extends Rule {
// TODO: use a more efficient datastructure: automaton?
protected final char[][] exceptions;
public RuleWithSuffixExceptions(
String suffix, int min, String replacement, String[] exceptions) {
super(suffix, min, replacement);
for (int i = 0; i < exceptions.length; i++) {
if (!exceptions[i].endsWith(suffix))
throw new RuntimeException(
"warning: useless exception '"
+ exceptions[i]
+ "' does not end with '"
+ suffix
+ "'");
}
this.exceptions = new char[exceptions.length][];
for (int i = 0; i < exceptions.length; i++) this.exceptions[i] = exceptions[i].toCharArray();
}
@Override
public boolean matches(char[] s, int len) {
if (!super.matches(s, len)) return false;
for (int i = 0; i < exceptions.length; i++) if (endsWith(s, len, exceptions[i])) return false;
return true;
}
}
/** A step containing a list of rules. */
protected static class Step {
protected final String name;
protected final Rule[] rules;
protected final int min;
protected final char[][] suffixes;
/**
* Create a new step
*
* @param name Step's name.
* @param rules an ordered list of rules.
* @param min minimum word size. if this is 0 it is automatically calculated.
* @param suffixes optional list of conditional suffixes. may be null.
*/
public Step(String name, Rule[] rules, int min, String[] suffixes) {
this.name = name;
this.rules = rules;
if (min == 0) {
min = Integer.MAX_VALUE;
for (Rule r : rules) min = Math.min(min, r.min + r.suffix.length);
}
this.min = min;
if (suffixes == null || suffixes.length == 0) {
this.suffixes = null;
} else {
this.suffixes = new char[suffixes.length][];
for (int i = 0; i < suffixes.length; i++) this.suffixes[i] = suffixes[i].toCharArray();
}
}
/**
* @return new valid length of the string after applying the entire step.
*/
public int apply(char[] s, int len) {
if (len < min) return len;
if (suffixes != null) {
boolean found = false;
for (int i = 0; i < suffixes.length; i++)
if (endsWith(s, len, suffixes[i])) {
found = true;
break;
}
if (!found) return len;
}
for (int i = 0; i < rules.length; i++) {
if (rules[i].matches(s, len)) return rules[i].replace(s, len);
}
return len;
}
}
/**
* Parse a resource file into an RSLP stemmer description.
*
* @return a Map containing the named Steps in this description.
*/
protected static Map parse(
Class extends RSLPStemmerBase> clazz, String resource) {
// TODO: this parser is ugly, but works. use a jflex grammar instead.
try {
InputStream is = clazz.getResourceAsStream(resource);
LineNumberReader r = new LineNumberReader(new InputStreamReader(is, StandardCharsets.UTF_8));
Map steps = new HashMap<>();
String step;
while ((step = readLine(r)) != null) {
Step s = parseStep(r, step);
steps.put(s.name, s);
}
r.close();
return steps;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private static final Pattern headerPattern =
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*(0|1),\\s*\\{(.*)\\},\\s*$");
private static final Pattern stripPattern =
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+)\\s*\\}\\s*(,|(\\}\\s*;))$");
private static final Pattern repPattern =
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\"\\}\\s*(,|(\\}\\s*;))$");
private static final Pattern excPattern =
Pattern.compile(
"^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\",\\s*\\{(.*)\\}\\s*\\}\\s*(,|(\\}\\s*;))$");
private static Step parseStep(LineNumberReader r, String header) throws IOException {
Matcher matcher = headerPattern.matcher(header);
if (!matcher.find()) {
throw new RuntimeException("Illegal Step header specified at line " + r.getLineNumber());
}
assert matcher.groupCount() == 4;
String name = matcher.group(1);
int min = Integer.parseInt(matcher.group(2));
int type = Integer.parseInt(matcher.group(3));
String[] suffixes = parseList(matcher.group(4));
Rule[] rules = parseRules(r, type);
return new Step(name, rules, min, suffixes);
}
private static Rule[] parseRules(LineNumberReader r, int type) throws IOException {
List rules = new ArrayList<>();
String line;
while ((line = readLine(r)) != null) {
Matcher matcher = stripPattern.matcher(line);
if (matcher.matches()) {
rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), ""));
} else {
matcher = repPattern.matcher(line);
if (matcher.matches()) {
rules.add(
new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3)));
} else {
matcher = excPattern.matcher(line);
if (matcher.matches()) {
if (type == 0) {
rules.add(
new RuleWithSuffixExceptions(
matcher.group(1),
Integer.parseInt(matcher.group(2)),
matcher.group(3),
parseList(matcher.group(4))));
} else {
rules.add(
new RuleWithSetExceptions(
matcher.group(1),
Integer.parseInt(matcher.group(2)),
matcher.group(3),
parseList(matcher.group(4))));
}
} else {
throw new RuntimeException("Illegal Step rule specified at line " + r.getLineNumber());
}
}
}
if (line.endsWith(";")) return rules.toArray(new Rule[rules.size()]);
}
return null;
}
private static String[] parseList(String s) {
if (s.length() == 0) return null;
String[] list = s.split(",");
for (int i = 0; i < list.length; i++) list[i] = parseString(list[i].trim());
return list;
}
private static String parseString(String s) {
return s.substring(1, s.length() - 1);
}
private static String readLine(LineNumberReader r) throws IOException {
String line = null;
while ((line = r.readLine()) != null) {
line = line.trim();
if (line.length() > 0 && line.charAt(0) != '#') return line;
}
return line;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy