// opennlp.tools.sentdetect.DefaultSDContextGenerator Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.sentdetect;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import opennlp.tools.util.StringUtil;
/**
* Generate event contexts for maxent decisions for sentence detection.
*
*/
/**
 * Generates event contexts (lists of feature strings) for maxent decisions in
 * sentence detection.
 *
 * <p>For one candidate end-of-sentence (eos) character, {@link #getContext(CharSequence, int)}
 * emits features describing the whitespace around the candidate, the candidate itself,
 * the text before and after it inside the same whitespace-delimited token, and the
 * neighbouring tokens.
 *
 * <p>Each instance reuses the scratch fields {@link #buf} and {@link #collectFeats}
 * between calls and does no locking of its own, so one instance should presumably not
 * be shared between threads without external synchronization.
 */
public class DefaultSDContextGenerator implements SDContextGenerator {

  /**
   * String buffer for generating features.
   */
  protected StringBuffer buf;

  /**
   * List for holding features as they are generated.
   */
  protected List<String> collectFeats;

  private final Set<String> inducedAbbreviations;

  private final char[] eosCharacters;

  /**
   * Creates a new {@code SDContextGenerator} instance with
   * no induced abbreviations.
   *
   * @param eosCharacters the characters treated as end-of-sentence characters; the
   *     prefix and suffix of a candidate are cut at the nearest one of them.
   */
  public DefaultSDContextGenerator(char[] eosCharacters) {
    this(Collections.<String>emptySet(), eosCharacters);
  }

  /**
   * Creates a new {@code SDContextGenerator} instance which uses
   * the set of induced abbreviations.
   *
   * @param inducedAbbreviations a {@code Set} of {@code String}s
   *     representing induced abbreviations in the training data.
   *     Example: "Mr."
   * @param eosCharacters the characters treated as end-of-sentence characters; the
   *     prefix and suffix of a candidate are cut at the nearest one of them.
   */
  public DefaultSDContextGenerator(Set<String> inducedAbbreviations, char[] eosCharacters) {
    this.inducedAbbreviations = inducedAbbreviations;
    this.eosCharacters = eosCharacters;
    buf = new StringBuffer();
    collectFeats = new ArrayList<>();
  }

  /**
   * Returns the text used for an eos character inside the {@code eos=} feature.
   * Line feed and carriage return are replaced by printable placeholders so the
   * feature string never contains a raw line break; any other character is returned
   * unchanged.
   *
   * @param c the eos character.
   * @return {@code "<LF>"} for {@code '\n'}, {@code "<CR>"} for {@code '\r'},
   *     otherwise a one-character string holding {@code c}.
   */
  private static String escapeChar(char c) {
    if (c == '\n') {
      return "<LF>";
    }
    if (c == '\r') {
      return "<CR>";
    }
    return String.valueOf(c);
  }

  /**
   * Tells whether a character is one of the configured eos characters.
   *
   * @param c the character to test.
   * @return {@code true} if {@code c} occurs in {@code eosCharacters}.
   */
  private boolean isEosCharacter(char c) {
    for (char eos : eosCharacters) {
      if (c == eos) {
        return true;
      }
    }
    return false;
  }

  /**
   * Builds the feature context for the candidate eos character at {@code position}.
   *
   * <p>The features are, in order: {@code "sp"} if the character before the candidate
   * is whitespace, {@code "sn"} if the character after it is whitespace,
   * {@code "eos=<char>"}, and then everything added by
   * {@link #collectFeatures(String, String, String, String, Character)}.
   *
   * @param sb the text being examined.
   * @param position the index of the candidate eos character in {@code sb}.
   * @return the generated features as an array.
   *
   * @see opennlp.tools.sentdetect.SDContextGenerator#getContext(CharSequence, int)
   */
  public String[] getContext(CharSequence sb, int position) {
    try {
      int lastIndex = sb.length() - 1;

      // Whitespace directly before ("sp") and after ("sn") the candidate,
      // followed by the candidate character itself.
      if (position > 0 && StringUtil.isWhitespace(sb.charAt(position - 1))) {
        collectFeats.add("sp");
      }
      if (position < lastIndex && StringUtil.isWhitespace(sb.charAt(position + 1))) {
        collectFeats.add("sn");
      }
      collectFeats.add("eos=" + escapeChar(sb.charAt(position)));

      // Prefix: string preceding the eos character in the eos token. It normally
      // starts at the preceding whitespace, but if another eos character lies in
      // between, the prefix starts at the closest such character instead.
      int prefixStart = previousSpaceIndex(sb, position);
      for (int i = position - 1; i > prefixStart; i--) {
        if (isEosCharacter(sb.charAt(i))) {
          prefixStart = i;
          break;
        }
      }
      String prefix = String.valueOf(sb.subSequence(prefixStart, position)).trim();

      // Previous: space delimited token preceding the token containing the eos character.
      int prevStart = previousSpaceIndex(sb, prefixStart);
      String previous = String.valueOf(sb.subSequence(prevStart, prefixStart)).trim();

      // The suffix normally ends at the following whitespace, but is cut at the
      // closest following eos character if one comes first.
      int suffixEnd = nextSpaceIndex(sb, position, lastIndex);
      for (int i = position + 1; i < suffixEnd; i++) {
        if (isEosCharacter(sb.charAt(i))) {
          suffixEnd = i;
          break;
        }
      }

      // Suffix: string following the eos character in the eos token.
      String suffix;
      // Next: space delimited token following the token containing the eos character.
      String next;
      if (position == lastIndex) {
        // Nothing follows a candidate that is the last character of the text.
        suffix = "";
        next = "";
      } else {
        int nextEnd = nextSpaceIndex(sb, suffixEnd + 1, lastIndex + 1);
        suffix = String.valueOf(sb.subSequence(position + 1, suffixEnd)).trim();
        next = String.valueOf(sb.subSequence(suffixEnd + 1, nextEnd)).trim();
      }

      collectFeatures(prefix, suffix, previous, next, sb.charAt(position));

      return collectFeats.toArray(new String[collectFeats.size()]);
    } finally {
      // Always reset the shared list, also when an exception interrupts feature
      // generation, so features of a failed call cannot leak into the next one.
      collectFeats.clear();
    }
  }

  /**
   * Determines some of the features for the sentence detector and adds them to list features.
   *
   * @param prefix String preceding the eos character in the eos token.
   * @param suffix String following the eos character in the eos token.
   * @param previous Space delimited token preceding token containing eos character.
   * @param next Space delimited token following token containing eos character.
   *
   * @deprecated use {@link #collectFeatures(String, String, String, String, Character)} instead.
   */
  @Deprecated
  protected void collectFeatures(String prefix, String suffix, String previous, String next) {
    collectFeatures(prefix, suffix, previous, next, null);
  }

  /**
   * Determines some of the features for the sentence detector and adds them to list features.
   *
   * <p>For each of the four strings a {@code key=value} feature is always added
   * ({@code x=} prefix, {@code v=} previous, {@code s=} suffix, {@code n=} next).
   * For a non-empty string a {@code <key>cap} feature is added when its first character
   * is upper case and a {@code <key>abbrev} feature when it is a known induced
   * abbreviation. The prefix additionally contributes its length as a feature, and its
   * abbreviation lookup uses the prefix with the eos character appended.
   *
   * @param prefix String preceding the eos character in the eos token.
   * @param suffix String following the eos character in the eos token.
   * @param previous Space delimited token preceding token containing eos character.
   * @param next Space delimited token following token containing eos character.
   * @param eosChar the EOS character being analyzed; may be {@code null}, in which case
   *     the prefix abbreviation feature is never added.
   */
  protected void collectFeatures(String prefix, String suffix, String previous,
      String next, Character eosChar) {
    addKeyValueFeature("x=", prefix);
    if (!prefix.isEmpty()) {
      collectFeats.add(Integer.toString(prefix.length()));
      if (isFirstUpper(prefix)) {
        collectFeats.add("xcap");
      }
      if (eosChar != null && inducedAbbreviations.contains(prefix + eosChar)) {
        collectFeats.add("xabbrev");
      }
    }

    addKeyValueFeature("v=", previous);
    if (!previous.isEmpty()) {
      if (isFirstUpper(previous)) {
        collectFeats.add("vcap");
      }
      if (inducedAbbreviations.contains(previous)) {
        collectFeats.add("vabbrev");
      }
    }

    addKeyValueFeature("s=", suffix);
    if (!suffix.isEmpty()) {
      if (isFirstUpper(suffix)) {
        collectFeats.add("scap");
      }
      if (inducedAbbreviations.contains(suffix)) {
        collectFeats.add("sabbrev");
      }
    }

    addKeyValueFeature("n=", next);
    if (!next.isEmpty()) {
      if (isFirstUpper(next)) {
        collectFeats.add("ncap");
      }
      if (inducedAbbreviations.contains(next)) {
        collectFeats.add("nabbrev");
      }
    }
  }

  /**
   * Builds {@code key + value} in the shared buffer, adds it to the feature list and
   * resets the buffer for the next feature.
   *
   * @param key the feature key including the trailing {@code '='}.
   * @param value the feature value.
   */
  private void addKeyValueFeature(String key, String value) {
    buf.append(key);
    buf.append(value);
    collectFeats.add(buf.toString());
    buf.setLength(0);
  }

  /**
   * Tells whether the first character of a string is upper case.
   *
   * @param s a non-empty string; callers check for emptiness first.
   * @return {@code true} if {@code s.charAt(0)} is an upper case character.
   */
  private static boolean isFirstUpper(String s) {
    return Character.isUpperCase(s.charAt(0));
  }

  /**
   * Finds the index of the nearest space before a specified index which is not itself
   * preceded by a space.
   *
   * @param sb The string buffer which contains the text being examined.
   * @param seek The index to begin searching from.
   * @return The index of the first character of the nearest whitespace run before
   *     {@code seek}, or 0 if no whitespace is found at an index greater than 0.
   */
  private static int previousSpaceIndex(CharSequence sb, int seek) {
    seek--;
    // Walk back to the closest whitespace character (or to the start of the text).
    while (seek > 0 && !StringUtil.isWhitespace(sb.charAt(seek))) {
      seek--;
    }
    if (seek > 0 && StringUtil.isWhitespace(sb.charAt(seek))) {
      // Continue to the first character of this run of whitespace.
      while (seek > 0 && StringUtil.isWhitespace(sb.charAt(seek - 1))) {
        seek--;
      }
      return seek;
    }
    return 0;
  }

  /**
   * Finds the index of the nearest space after a specified index.
   *
   * @param sb The string buffer which contains the text being examined.
   * @param seek The index to begin searching from.
   * @param lastIndex The exclusive upper bound of the search; it is also the value
   *     returned when no whitespace is found below it.
   * @return The index of the last character of the nearest whitespace run after
   *     {@code seek}, or {@code lastIndex} if there is none.
   */
  private static int nextSpaceIndex(CharSequence sb, int seek, int lastIndex) {
    seek++;
    while (seek < lastIndex) {
      if (StringUtil.isWhitespace(sb.charAt(seek))) {
        // Skip to the last character of this run of whitespace.
        while (sb.length() > seek + 1 && StringUtil.isWhitespace(sb.charAt(seek + 1))) {
          seek++;
        }
        return seek;
      }
      seek++;
    }
    return lastIndex;
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy