All Downloads are FREE. Search and download functionalities are using the official Maven repository.

uk.org.retep.util.string.SentenceUtils Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 1998-2010, Peter T Mount
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 *   - Neither the name of the retep.org.uk nor the names of its contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package uk.org.retep.util.string;

import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import javax.annotation.concurrent.NotThreadSafe;

/**
 * A set of common methods for manipulating sentences.
 *
 * <p>An instance holds a set of {@link AbstractSplitter}s. Each splitter
 * contributes a regular expression that marks either a sentence boundary
 * ({@link Type#SENTENCE}) or a word boundary ({@link Type#WORD}). Whenever
 * the set changes, the expressions of each type are combined into a single
 * pattern of the form <code>\s*(a|b|...)\s*</code>, which the normalise and
 * mapping methods then use.</p>
 *
 * <p>While no splitter of a given type is registered, the corresponding
 * pattern never matches, so the whole input is treated as a single sentence
 * (or a single word).</p>
 *
 * <p>The class is annotated {@link NotThreadSafe}: the splitter set is
 * guarded by a monitor, but the compiled pattern fields are plain fields
 * that are written and read without synchronisation.</p>
 *
 * @author peter
 */
@NotThreadSafe
public class SentenceUtils {

    /** Prefix of a combined pattern: optional leading whitespace, then the alternation group. */
    private static final String PATTERN_START = "\\s*(";

    /** Suffix of a combined pattern: end of the alternation group, then optional trailing whitespace. */
    private static final String PATTERN_END = ")\\s*";

    /**
     * Pattern that can never match (a negative lookahead for the empty
     * string). Used while no splitter of a type is registered, because an
     * empty alternation would otherwise match at every character position.
     */
    private static final Pattern NO_SPLIT = Pattern.compile( "(?!)" );

    /** Combined pattern of all SENTENCE splitters. */
    private Pattern sentencePattern = NO_SPLIT;

    /** Combined pattern of all WORD splitters. */
    private Pattern wordPattern = NO_SPLIT;

    /** Registered splitters, kept in insertion order. */
    private final Set<AbstractSplitter> splitters = new LinkedHashSet<AbstractSplitter>();

    /**
     * Default constructor. No splitters are registered, so until one is
     * added nothing is split.
     */
    public SentenceUtils() {
    }

    /**
     * Constructor accepting a Collection of AbstractSplitters.
     *
     * @param splitters Collection of AbstractSplitters; must not be null
     * @throws PatternSyntaxException if the combined regular expression of
     *         the supplied splitters cannot be compiled
     */
    public SentenceUtils( Collection<? extends AbstractSplitter> splitters ) {
        this();
        // Add everything first and compile the patterns once at the end.
        for( AbstractSplitter splitter : splitters ) {
            addSplitter( splitter, false );
        }
        configurePatterns();
    }

    /**
     * Add a WordSplitter.
     *
     * @param pattern regex to add as a WordSplitter
     * @return true if the splitter was added to the set
     * @throws PatternSyntaxException if the pattern makes the combined
     *         regular expression invalid; the splitter is then not kept
     */
    public boolean addWordSplitter( String pattern ) {
        return addSplitter( new WordSplitter( pattern ) );
    }

    /**
     * Add a SentenceSplitter.
     *
     * @param pattern regex to add as a SentenceSplitter
     * @return true if the splitter was added to the set
     * @throws PatternSyntaxException if the pattern makes the combined
     *         regular expression invalid; the splitter is then not kept
     */
    public boolean addSentenceSplitter( String pattern ) {
        return addSplitter( new SentenceSplitter( pattern ) );
    }

    /**
     * Add an AbstractSplitter and rebuild the combined patterns.
     *
     * @param splitter splitter to add
     * @return true if the splitter was added to the set, false if the set
     *         already contained it
     * @throws PatternSyntaxException if the splitter makes the combined
     *         regular expression invalid; the splitter is then not kept
     */
    public boolean addSplitter( AbstractSplitter splitter ) {
        return addSplitter( splitter, true );
    }

    /**
     * Adds a splitter to the set and optionally recompiles the patterns.
     *
     * @param splitter splitter to add
     * @param rebuild  true to recompile the combined patterns when the
     *                 splitter was actually added
     * @return true if the set changed
     */
    private boolean addSplitter( AbstractSplitter splitter, boolean rebuild ) {
        boolean added;
        synchronized( splitters ) {
            added = splitters.add( splitter );
        }

        if( added && rebuild ) {
            try {
                configurePatterns();
            }
            catch( PatternSyntaxException ex ) {
                // Roll back, otherwise the broken splitter would make every
                // later rebuild fail as well.
                synchronized( splitters ) {
                    splitters.remove( splitter );
                }
                throw ex;
            }
        }

        return added;
    }

    /**
     * Rebuilds {@link #sentencePattern} and {@link #wordPattern} from the
     * currently registered splitters.
     *
     * @throws IllegalStateException  if a splitter reports a type other
     *         than SENTENCE or WORD
     * @throws PatternSyntaxException if a combined expression is invalid;
     *         in that case neither pattern field is modified
     */
    private void configurePatterns() {
        final List<String> sentenceAlternatives = new ArrayList<String>();
        final List<String> wordAlternatives = new ArrayList<String>();

        synchronized( splitters ) {
            for( AbstractSplitter splitter : splitters ) {
                switch( splitter.getType() ) {
                    case SENTENCE:
                        sentenceAlternatives.add( splitter.getSplitterPattern() );
                        break;

                    case WORD:
                        wordAlternatives.add( splitter.getSplitterPattern() );
                        break;

                    default:
                        throw new IllegalStateException( "Invalid splitter type" );
                }
            }
        }

        // Compile both before assigning either, so a failure leaves the
        // previous patterns intact.
        final Pattern newSentencePattern = compileSplitPattern( sentenceAlternatives );
        final Pattern newWordPattern = compileSplitPattern( wordAlternatives );

        sentencePattern = newSentencePattern;
        wordPattern = newWordPattern;
    }

    /**
     * Combines the given expressions into <code>\s*(a|b|...)\s*</code>.
     *
     * @param alternatives individual splitter expressions, in order
     * @return the compiled pattern, or a never-matching pattern when there
     *         are no alternatives
     */
    private static Pattern compileSplitPattern( List<String> alternatives ) {
        if( alternatives.isEmpty() ) {
            return NO_SPLIT;
        }

        final StringBuilder regex = new StringBuilder( PATTERN_START );
        for( int i = 0, n = alternatives.size(); i < n; i++ ) {
            if( i > 0 ) {
                regex.append( '|' );
            }
            regex.append( alternatives.get( i ) );
        }
        regex.append( PATTERN_END );

        return Pattern.compile( regex.toString() );
    }

    /**
     * Create a Sentence based on the given string using the current
     * WordSplitters. The string is padded so that it starts and ends with
     * a single space character before it is mapped and normalised.
     *
     * @param sentence String containing the sentence; must not be null
     * @return Sentence
     */
    public Sentence createSentence( String sentence ) {
        if( !sentence.startsWith( " " ) ) {
            sentence = " " + sentence;
        }

        if( !sentence.endsWith( " " ) ) {
            sentence += " ";
        }

        return new Sentence( sentence, getMappings( sentence ), normaliseString( sentence ) );
    }

    /**
     * Normalise a string returning an array of component sentences. The
     * input is cut at every match of the sentence pattern; pieces that are
     * empty after trimming are dropped.
     *
     * @param original String to normalise into one or more Sentences
     * @return Array of Sentences formed from the original string
     */
    public Sentence[] normaliseSentence( String original ) {
        final Matcher matcher = sentencePattern.matcher( original );
        final List<Sentence> sentences = new ArrayList<Sentence>();

        int beginIndex = 0;
        while( matcher.find() ) {
            final String sentence = original.substring( beginIndex, matcher.start() );
            if( StringUtils.isStringNotEmpty( sentence.trim() ) ) {
                sentences.add( createSentence( sentence ) );
            }
            beginIndex = matcher.end();
        }

        // Whatever follows the last sentence splitter is a sentence too.
        final String tail = original.substring( beginIndex ).trim();
        if( StringUtils.isStringNotEmpty( tail ) ) {
            // createSentence pads the trimmed text to " tail ".
            sentences.add( createSentence( tail ) );
        }

        return sentences.toArray( new Sentence[ sentences.size() ] );
    }

    /**
     * Normalise a string returning a single string. The input is cut at
     * every match of the word pattern, each piece is trimmed, empty pieces
     * are dropped, and the remainder is joined with single spaces and
     * converted to upper case.
     *
     * @param sentence String to normalise
     * @return Normalised string
     */
    public String normaliseString( String sentence ) {
        final Matcher matcher = wordPattern.matcher( sentence );
        final List<String> words = new ArrayList<String>();

        int beginIndex = 0;
        while( matcher.find() ) {
            final String word = sentence.substring( beginIndex, matcher.start() ).trim();
            if( StringUtils.isStringNotEmpty( word ) ) {
                words.add( word );
            }
            beginIndex = matcher.end();
        }

        // Text after the last word splitter.
        final String lastWord = sentence.substring( beginIndex ).trim();
        if( StringUtils.isStringNotEmpty( lastWord ) ) {
            words.add( lastWord );
        }

        return StringUtils.join( ' ', words ).toUpperCase();
    }

    /**
     * Form an array which maps the word boundaries within a string: the
     * start index of every match of the word pattern, in order.
     *
     * @param input String to map
     * @return Array of word boundaries
     */
    public Integer[] getMappings( String input ) {
        final List<Integer> mappings = new ArrayList<Integer>();
        final Matcher matcher = wordPattern.matcher( input );

        while( matcher.find() ) {
            mappings.add( matcher.start() );
        }

        return mappings.toArray( new Integer[ mappings.size() ] );
    }

    /**
     * Enum defining the type of splitters.
     */
    public static enum Type {

        /**
         * A splitter that defines a word
         */
        WORD,
        /**
         * A splitter that marks a sentence
         */
        SENTENCE
    }

    /**
     * Base class for any splitters.
     */
    public static abstract class AbstractSplitter {

        private final String splitterPattern;

        /**
         * Constructor
         * @param pattern regex pattern
         */
        public AbstractSplitter( String pattern ) {
            splitterPattern = pattern;
        }

        /**
         * Type of splitter
         * @return Type of splitter
         */
        public abstract Type getType();

        /**
         * The regex for this splitter
         * @return regex
         */
        public final String getSplitterPattern() {
            return splitterPattern;
        }

        /**
         * String representation of this splitter, in the form
         * <code>TYPE[pattern]</code>.
         * @return string
         */
        @Override
        public final String toString() {
            return String.format( "%s[%s]", getType(), getSplitterPattern() );
        }
    }

    /**
     * A splitter that handles separating words within a sentence.
     */
    public class WordSplitter extends AbstractSplitter {

        /**
         * Constructor
         * @param pattern regex pattern
         */
        public WordSplitter( String pattern ) {
            super( pattern );
        }

        /**
         * Type of splitter
         * @return {@link Type#WORD}
         */
        @Override
        public Type getType() {
            return Type.WORD;
        }
    }

    /**
     * A splitter used to separate individual sentences.
     */
    public class SentenceSplitter extends AbstractSplitter {

        /**
         * Constructor
         * @param pattern regex pattern
         */
        public SentenceSplitter( String pattern ) {
            super( pattern );
        }

        /**
         * Type of splitter
         * @return {@link Type#SENTENCE}
         */
        @Override
        public Type getType() {
            return Type.SENTENCE;
        }
    }

    /**
     * Keeps a list of the positions of the space characters of a string
     * and adjusts that list as find/replace operations change the number
     * of spaces.
     *
     * <p>Usage, as far as this class shows: construct with the text, call
     * {@link #prepare(String, String, String)} once per find/replace pair,
     * then {@link #update(int)} for each position at which the replacement
     * is applied. NOTE(review): the callers are not visible here, so the
     * intended protocol should be confirmed against them.</p>
     */
    public static class WordMapper {

        /** Position in {@link #input} up to which spaces have been counted. */
        private int charIndex;

        /** Index into {@link #mappings} corresponding to {@link #charIndex}. */
        private int listIndex;

        /** Number of space characters in the current find string. */
        private int spaceCount;

        /** One entry per space; entries inserted by {@link #update(int)} are null. */
        private final List<Integer> mappings = new LinkedList<Integer>();

        private String input;
        /** Stored by {@link #prepare(String, String, String)}; not read within this class. */
        private String find;
        private String replace;

        /**
         * Records the index of every space character in the input.
         *
         * @param input text whose space positions are recorded
         */
        public WordMapper( String input ) {
            for( int i = 0, n = input.length(); i < n; i++ ) {
                if( input.charAt( i ) == ' ' ) {
                    mappings.add( i );
                }
            }
        }

        /**
         * Counts the space characters in the whole string.
         */
        private int spaceCount( String string ) {
            return spaceCount( string, 0, string.length() );
        }

        /**
         * Counts the space characters in <code>[beginIndex, endIndex)</code>.
         */
        private int spaceCount( String string, int beginIndex, int endIndex ) {
            int spaces = 0;
            for( int i = beginIndex; i < endIndex; i++ ) {
                if( string.charAt( i ) == ' ' ) {
                    spaces++;
                }
            }
            return spaces;
        }

        /**
         * Starts a new find/replace pass: stores the arguments, counts the
         * spaces in <code>find</code> and resets both cursors to zero.
         *
         * @param input   text in which the replacements take place
         * @param find    text being replaced
         * @param replace replacement text
         */
        public void prepare( String input, String find, String replace ) {
            this.input = input;
            this.find = find;
            this.replace = replace;
            spaceCount = spaceCount( find );
            listIndex = 0;
            charIndex = 0;
        }

        /**
         * Adjusts the mappings for one replacement at the given position.
         * The list cursor is advanced past the spaces between the previous
         * position and <code>beginIndex</code>. Then, for every space by
         * which <code>replace</code> exceeds <code>find</code>, a null
         * entry is inserted; for every space by which it falls short, an
         * entry is removed.
         *
         * @param beginIndex position in the input passed to
         *        {@link #prepare(String, String, String)}
         */
        public void update( int beginIndex ) {
            listIndex += spaceCount( input, charIndex, beginIndex );
            charIndex = beginIndex;

            // Spaces of the find string not yet accounted for by the
            // replacement; goes negative once the replacement has more.
            int n = spaceCount;
            for( int j = 0, m = replace.length(); j < m; j++ ) {
                if( replace.charAt( j ) == ' ' && --n < 0 ) {
                    mappings.add( listIndex++, null );
                }
            }

            while( n-- > 0 ) {
                // int argument: removes the entry at that index.
                mappings.remove( listIndex );
            }
        }

        /**
         * Returns the current mappings.
         *
         * @return the mappings as an array; may contain null entries
         */
        public Integer[] toArray() {
            return mappings.toArray( new Integer[ mappings.size() ] );
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy