
// uk.org.retep.util.string.SentenceUtils Maven / Gradle / Ivy
/*
* Copyright (c) 1998-2010, Peter T Mount
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of the retep.org.uk nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package uk.org.retep.util.string;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.concurrent.NotThreadSafe;
/**
* A set of common methods for manipulating sentences.
* @author peter
*/
@NotThreadSafe
public class SentenceUtils
{
/** Prefix of every compiled pattern: optional leading whitespace, then the opening of the alternation group. */
private static final String PATTERN_START = "\\s*(";
/** Suffix of every compiled pattern: the end of the alternation group, then optional trailing whitespace. */
private static final String PATTERN_END = ")\\s*";
/** Pattern built from the SENTENCE splitters; stays {@code null} until configurePatterns() has run. */
private Pattern sentencePattern;
/** Pattern built from the WORD splitters; stays {@code null} until configurePatterns() has run. */
private Pattern wordPattern;
/** Registered splitters in insertion order; also the monitor used while the set is modified or iterated. */
private final Set<AbstractSplitter> splitters = new LinkedHashSet<AbstractSplitter>();
/**
 * Creates an instance with no splitters registered.
 * <p>
 * No pattern is compiled here; the patterns are first built when a
 * splitter is added through one of the {@code add...} methods.
 */
public SentenceUtils()
{
    super();
}
/**
 * Creates an instance pre-populated with the given splitters.
 * <p>
 * The splitters are registered one by one without rebuilding the
 * patterns, which are then compiled once after the last splitter has
 * been added.
 *
 * @param splitters Collection of AbstractSplitter's to register
 * @throws NullPointerException if {@code splitters} is null
 */
public SentenceUtils( Collection<? extends AbstractSplitter> splitters )
{
    this();
    for( AbstractSplitter splitter : splitters )
    {
        // Defer the rebuild: one compile at the end is enough
        addSplitter( splitter, false );
    }
    configurePatterns();
}
/**
 * Registers a regular expression as a word splitter.
 * <p>
 * The expression is wrapped in a new WordSplitter and handed to
 * {@link #addSplitter(AbstractSplitter)}.
 *
 * @param pattern regex to register as a WordSplitter
 * @return true if the splitter was added to the set of splitters
 */
public boolean addWordSplitter( String pattern )
{
    final AbstractSplitter wordSplitter = new WordSplitter( pattern );
    return addSplitter( wordSplitter );
}
/**
 * Registers a regular expression as a sentence splitter.
 * <p>
 * The expression is wrapped in a new SentenceSplitter and handed to
 * {@link #addSplitter(AbstractSplitter)}.
 *
 * @param pattern regex to register as a SentenceSplitter
 * @return true if the splitter was added to the set of splitters
 */
public boolean addSentenceSplitter( String pattern )
{
    final AbstractSplitter sentenceSplitter = new SentenceSplitter( pattern );
    return addSplitter( sentenceSplitter );
}
/**
 * Registers a splitter and rebuilds the patterns if it was added.
 *
 * @param splitter the AbstractSplitter to register
 * @return true if the splitter was added to the set of splitters
 */
public boolean addSplitter( AbstractSplitter splitter )
{
    // Public entry point: always rebuild the patterns after a successful add
    final boolean rebuildPatterns = true;
    return addSplitter( splitter, rebuildPatterns );
}
/**
 * Adds a splitter to the set and optionally rebuilds the patterns.
 * <p>
 * A null splitter is rejected before the set is touched. If rebuilding
 * the patterns fails (for example because the splitter's regex does not
 * compile) the splitter is removed again, so one bad splitter does not
 * make every later rebuild fail as well.
 *
 * @param splitter the splitter to add
 * @param rebuild true to rebuild the patterns when the splitter was added
 * @return true if the splitter was added to the set
 * @throws NullPointerException if {@code splitter} is null
 */
private boolean addSplitter( AbstractSplitter splitter, boolean rebuild )
{
    if( splitter == null )
    {
        throw new NullPointerException( "splitter" );
    }

    final boolean added;
    synchronized( splitters )
    {
        added = splitters.add( splitter );
    }

    if( added && rebuild )
    {
        try
        {
            configurePatterns();
        }
        catch( RuntimeException ex )
        {
            // Roll back so the set only ever holds splitters that compile
            synchronized( splitters )
            {
                splitters.remove( splitter );
            }
            throw ex;
        }
    }
    return added;
}
/**
 * Rebuilds the sentence and word patterns from the registered splitters.
 * <p>
 * The regexes of all splitters of one type are joined with {@code |}
 * inside a group that is surrounded by optional whitespace. When a type
 * has no splitters at all its pattern is built so that it never matches,
 * rather than matching the empty string at every position.
 * <p>
 * Both patterns are compiled before either field is assigned, so a
 * failure leaves the previously compiled patterns in place.
 *
 * @throws IllegalStateException if a splitter reports an unknown type
 * @throws java.util.regex.PatternSyntaxException if a combined regex is invalid
 */
private void configurePatterns()
{
    final StringBuilder sentenceSB = new StringBuilder();
    final StringBuilder wordSB = new StringBuilder();
    synchronized( splitters )
    {
        for( AbstractSplitter splitter : splitters )
        {
            switch( splitter.getType() )
            {
                case SENTENCE:
                    sentenceSB.append( splitter.getSplitterPattern() ).append( '|' );
                    break;
                case WORD:
                    wordSB.append( splitter.getSplitterPattern() ).append( '|' );
                    break;
                default:
                    throw new IllegalStateException( "Invalid splitter type" );
            }
        }
    }

    final Pattern newSentencePattern = compileAlternatives( sentenceSB );
    final Pattern newWordPattern = compileAlternatives( wordSB );
    sentencePattern = newSentencePattern;
    wordPattern = newWordPattern;
}

/**
 * Compiles a list of alternatives, each terminated by {@code |}, into a pattern.
 *
 * @param alternatives the alternatives collected so far; modified by this call
 * @return the compiled pattern
 */
private static Pattern compileAlternatives( StringBuilder alternatives )
{
    if( alternatives.length() == 0 )
    {
        // No splitter of this type: an empty negative lookahead never matches
        alternatives.append( "(?!)" );
    }
    else
    {
        // Drop the trailing '|' left behind by the last alternative
        alternatives.setLength( alternatives.length() - 1 );
    }
    return Pattern.compile( PATTERN_START + alternatives + PATTERN_END );
}
/**
 * Builds a Sentence from a string using the current word splitters.
 * <p>
 * The text is padded so that it starts and ends with a space; the
 * Sentence is then created from the padded text, its word boundary
 * mappings and its normalised form.
 *
 * @param sentence String containing the sentence
 * @return the Sentence built from the padded text
 */
public Sentence createSentence( String sentence )
{
    String padded = sentence.startsWith( " " ) ? sentence : " " + sentence;
    if( !padded.endsWith( " " ) )
    {
        padded = padded + " ";
    }

    final Integer[] wordBoundaries = getMappings( padded );
    final String normalised = normaliseString( padded );
    return new Sentence( padded, wordBoundaries, normalised );
}
/**
 * Splits a string into its component sentences.
 * <p>
 * The text is cut at every match of the sentence pattern. Pieces that
 * are empty once trimmed are dropped and every other piece is turned
 * into a Sentence by {@link #createSentence(String)}. The text after the
 * last match is trimmed before it is converted.
 *
 * @param original String to split into one or more sentences
 * @return Array of the Sentences formed from the original string
 * @throws NullPointerException if {@code original} is null or the
 * patterns have not been built yet
 */
public Sentence[] normaliseSentence( String original )
{
    final Matcher matcher = sentencePattern.matcher( original );
    final List<Sentence> sentences = new ArrayList<Sentence>();
    int beginIndex = 0;
    while( matcher.find() )
    {
        final String sentence = original.substring( beginIndex, matcher.start() );
        if( StringUtils.isStringNotEmpty( sentence.trim() ) )
        {
            sentences.add( createSentence( sentence ) );
        }
        beginIndex = matcher.end();
    }

    // Whatever follows the last splitter forms the final sentence. Once
    // trimmed it has no leading or trailing space, so createSentence pads
    // it with exactly one space on each side.
    final String remainder = original.substring( beginIndex ).trim();
    if( StringUtils.isStringNotEmpty( remainder ) )
    {
        sentences.add( createSentence( remainder ) );
    }
    return sentences.toArray( new Sentence[ sentences.size() ] );
}
/**
 * Normalises a string into a single upper case string.
 * <p>
 * The text is cut at every match of the word pattern, each piece is
 * trimmed, pieces that are then empty are dropped, and the remaining
 * words are passed to {@code StringUtils.join} with a space as the
 * separator. The result is converted to upper case using the default
 * locale.
 *
 * @param sentence String to normalise
 * @return Normalised string
 * @throws NullPointerException if {@code sentence} is null or the
 * patterns have not been built yet
 */
public String normaliseString( String sentence )
{
    final Matcher matcher = wordPattern.matcher( sentence );
    final List<String> words = new ArrayList<String>();
    int beginIndex = 0;
    while( matcher.find() )
    {
        final String word = sentence.substring( beginIndex, matcher.start() ).trim();
        if( StringUtils.isStringNotEmpty( word ) )
        {
            words.add( word );
        }
        beginIndex = matcher.end();
    }

    // Text after the last splitter is the final word
    final String lastWord = sentence.substring( beginIndex ).trim();
    if( StringUtils.isStringNotEmpty( lastWord ) )
    {
        words.add( lastWord );
    }
    // NOTE(review): toUpperCase() depends on the default locale - confirm this is intended
    return StringUtils.join( ' ', words ).toUpperCase();
}
/**
 * Forms an array which maps the word boundaries within a string.
 * <p>
 * Each entry is the start index of one match of the word pattern within
 * the input, in the order the matches are found.
 *
 * @param input String to map
 * @return Array of word boundaries
 * @throws NullPointerException if {@code input} is null or the patterns
 * have not been built yet
 */
public Integer[] getMappings( String input )
{
    final List<Integer> mappings = new ArrayList<Integer>();
    final Matcher matcher = wordPattern.matcher( input );
    while( matcher.find() )
    {
        mappings.add( matcher.start() );
    }
    return mappings.toArray( new Integer[ mappings.size() ] );
}
/**
 * The kinds of splitter that can be registered.
 */
public static enum Type
{
    /** A splitter that separates words. */
    WORD,

    /** A splitter that separates sentences. */
    SENTENCE
}
/**
 * Base class for any splitters.
 * <p>
 * Two splitters are equal when they report the same type and hold the
 * same regex, so a set of splitters keeps only one of them.
 */
public static abstract class AbstractSplitter
{
    private final String splitterPattern;

    /**
     * Constructor
     * @param pattern regex pattern
     */
    public AbstractSplitter( String pattern )
    {
        splitterPattern = pattern;
    }

    /**
     * Type of splitter
     * @return Type of splitter
     */
    public abstract Type getType();

    /**
     * The regex for this splitter
     * @return regex
     */
    public final String getSplitterPattern()
    {
        return splitterPattern;
    }

    /**
     * Compares this splitter with another by type and regex.
     * @param obj object to compare with
     * @return true if obj is a splitter with the same type and regex
     */
    @Override
    public boolean equals( Object obj )
    {
        if( this == obj )
        {
            return true;
        }
        if( !( obj instanceof AbstractSplitter ) )
        {
            return false;
        }
        final AbstractSplitter other = (AbstractSplitter) obj;
        if( getType() != other.getType() )
        {
            return false;
        }
        return splitterPattern == null
               ? other.splitterPattern == null
               : splitterPattern.equals( other.splitterPattern );
    }

    /**
     * Hash code consistent with {@link #equals(Object)}.
     * @return hash of the type and regex
     */
    @Override
    public int hashCode()
    {
        final Type type = getType();
        final int typeHash = type == null ? 0 : type.hashCode();
        final int patternHash = splitterPattern == null ? 0 : splitterPattern.hashCode();
        return 31 * typeHash + patternHash;
    }

    /**
     * String representation of this splitter, in the form TYPE[regex]
     * @return string
     */
    @Override
    public final String toString()
    {
        return String.format( "%s[%s]", getType(), getSplitterPattern() );
    }
}
/**
* A splitter that handles separating Words within a Sentance
*/
public class WordSplitter
extends AbstractSplitter
{
/**
* Constructor
* @param pattern regex pattern
*/
public WordSplitter( String pattern )
{
super( pattern );
}
/**
* Type of splitter
* @return Type of splitter
*/
public Type getType()
{
return Type.WORD;
}
}
/**
* A splitter used to separate individual sentances
*/
public class SentenceSplitter
extends AbstractSplitter
{
/**
* Constructor
* @param pattern regex pattern
*/
public SentenceSplitter( String pattern )
{
super( pattern );
}
/**
* Type of splitter
* @return Type of splitter
*/
public Type getType()
{
return Type.SENTENCE;
}
}
public static class WordMapper
{
private int charIndex;
private int listIndex;
private int spaceCount;
private final List mappings = new LinkedList();
private String input;
private String find;
private String replace;
/**
*
* @param input
*/
public WordMapper( String input )
{
char[] chars = input.toCharArray();
for( int i = 0, n = chars.length; i < n; i++ )
{
if( chars[i] == ' ' )
{
mappings.add( i );
}
}
}
private int spaceCount( String string )
{
return spaceCount( string, 0, string.length() );
}
private int spaceCount( String string, int beginIndex, int endIndex )
{
int spaces = 0;
char[] chars = string.toCharArray();
for( int i = beginIndex, n = endIndex; i < n; i++ )
{
if( chars[i] == ' ' )
{
spaces++;
}
}
return spaces;
}
/**
*
* @param input
* @param find
* @param replace
*/
public void prepare( String input, String find, String replace )
{
this.input = input;
this.find = find;
this.replace = replace;
spaceCount = spaceCount( find );
listIndex = 0;
charIndex = 0;
}
/**
*
* @param beginIndex
*/
public void update( int beginIndex )
{
listIndex += spaceCount( input, charIndex, beginIndex );
charIndex = beginIndex;
int n = spaceCount;
for( int j = 0, m = replace.length(); j < m; j++ )
{
if( replace.charAt( j ) == ' ' && --n < 0 )
{
mappings.add( listIndex++, null );
}
}
while( n-- > 0 )
{
mappings.remove( listIndex );
}
}
/**
*
* @return
*/
public Integer[] toArray()
{
return mappings.toArray( new Integer[ mappings.size() ] );
}
}
}