// Source listing for uk.org.retep.util.string.SentenceUtils (artifact: strings).
/*
 * Copyright (c) 1998-2010, Peter T Mount

 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 
 *   Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 *   Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *
 *   Neither the name of the retep.org.uk nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * 
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package uk.org.retep.util.string;

import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.concurrent.NotThreadSafe;

/**
 * A set of common methods for splitting text into sentences and words and
 * for normalising the result.
 *
 * <p>Splitting is driven by regular-expression "splitters". Each splitter is
 * either a {@link Type#SENTENCE} splitter (marks the boundary between two
 * sentences) or a {@link Type#WORD} splitter (marks the boundary between two
 * words). All splitters of one type are combined into a single alternation
 * of the form <code>\s*(a|b|...)\s*</code>.
 *
 * <p>While no splitter of a given type has been registered, the pattern for
 * that type never matches, so the input is treated as one single sentence
 * (or one single word).
 *
 * @author peter
 */
@NotThreadSafe
public class SentenceUtils
{

    private static final String PATTERN_START = "\\s*(";
    private static final String PATTERN_END = ")\\s*";
    /**
     * An empty negative lookahead can never succeed, so this pattern never
     * matches. It stands in for "no splitters registered"; an empty
     * alternation would instead match the empty string at every position.
     */
    private static final Pattern NEVER_MATCHES = Pattern.compile( "(?!)" );
    private Pattern sentencePattern = NEVER_MATCHES;
    private Pattern wordPattern = NEVER_MATCHES;
    private final Set<AbstractSplitter> splitters =
            new LinkedHashSet<AbstractSplitter>();

    /**
     * Default constructor. No splitters are registered, so until one is
     * added every input is treated as a single sentence and a single word.
     */
    public SentenceUtils()
    {
    }

    /**
     * Constructor accepting a Collection of AbstractSplitter's
     * @param splitters Collection of AbstractSplitter's
     * @throws NullPointerException if the collection or one of its elements
     * is null
     * @throws java.util.regex.PatternSyntaxException if the combined regex
     * of the splitters is not valid
     */
    public SentenceUtils( Collection<? extends AbstractSplitter> splitters )
    {
        this();
        for( AbstractSplitter splitter : splitters )
        {
            // Defer the rebuild: the patterns are compiled once, below
            addSplitter( splitter, false );
        }
        configurePatterns();
    }

    /**
     * Add a WordSplitter
     * @param pattern Pattern to add as a WordSplitter
     * @return true if the pattern was accepted
     */
    public boolean addWordSplitter( String pattern )
    {
        return addSplitter( new WordSplitter( pattern ) );
    }

    /**
     * Add a SentenceSplitter
     * @param pattern Pattern to add as a SentenceSplitter
     * @return true if the pattern was accepted
     */
    public boolean addSentenceSplitter( String pattern )
    {
        return addSplitter( new SentenceSplitter( pattern ) );
    }

    /**
     * Add an AbstractSplitter and rebuild the sentence and word patterns.
     * @param splitter splitter to add
     * @return true if the splitter was not already present in the set
     * @throws NullPointerException if splitter is null
     * @throws java.util.regex.PatternSyntaxException if the combined regex
     * is not valid once the splitter is included; in that case the splitter
     * is not retained
     */
    public boolean addSplitter( AbstractSplitter splitter )
    {
        return addSplitter( splitter, true );
    }

    /**
     * Adds a splitter, optionally recompiling the patterns afterwards.
     * @param splitter splitter to add, must not be null
     * @param rebuild true to recompile the patterns if the splitter was added
     * @return true if the set did not already contain the splitter
     */
    private boolean addSplitter( AbstractSplitter splitter, boolean rebuild )
    {
        // Reject null up front: a null stored in the set would make every
        // later rebuild fail.
        if( splitter == null )
        {
            throw new NullPointerException( "splitter" );
        }

        final boolean added;
        synchronized( splitters )
        {
            added = splitters.add( splitter );
        }

        if( added && rebuild )
        {
            try
            {
                configurePatterns();
            }
            catch( RuntimeException ex )
            {
                // Roll back so one bad regex does not poison later rebuilds
                synchronized( splitters )
                {
                    splitters.remove( splitter );
                }
                throw ex;
            }
        }
        return added;
    }

    /**
     * Rebuilds the sentence and word patterns from the registered splitters.
     * Both patterns are compiled before either field is assigned, so a
     * failure leaves the previous patterns untouched.
     */
    private void configurePatterns()
    {
        StringBuilder sentenceSB = new StringBuilder( PATTERN_START );
        StringBuilder wordSB = new StringBuilder( PATTERN_START );
        int sentenceCount = 0;
        int wordCount = 0;
        synchronized( splitters )
        {
            for( AbstractSplitter splitter : splitters )
            {
                switch( splitter.getType() )
                {
                    case SENTENCE:
                        sentenceSB.append( splitter.getSplitterPattern() ).append(
                                '|' );
                        sentenceCount++;
                        break;

                    case WORD:
                        wordSB.append( splitter.getSplitterPattern() ).append(
                                '|' );
                        wordCount++;
                        break;

                    default:
                        throw new IllegalStateException( "Invalid splitter type" );
                }
            }
        }

        final Pattern newSentencePattern = compileAlternation( sentenceSB,
                                                               sentenceCount );
        final Pattern newWordPattern = compileAlternation( wordSB, wordCount );

        sentencePattern = newSentencePattern;
        wordPattern = newWordPattern;
    }

    /**
     * Finishes and compiles one alternation built by configurePatterns().
     * @param sb builder holding PATTERN_START followed by each alternative,
     * every alternative being terminated by '|'
     * @param count number of alternatives appended to sb
     * @return the compiled pattern, or NEVER_MATCHES when count is 0
     */
    private static Pattern compileAlternation( StringBuilder sb, int count )
    {
        if( count == 0 )
        {
            return NEVER_MATCHES;
        }
        // Drop the trailing '|' left by the last alternative
        sb.setLength( sb.length() - 1 );
        sb.append( PATTERN_END );
        return Pattern.compile( sb.toString() );
    }

    /**
     * Create a Sentence based on the given string using the current WordSplitter's.
     * A single space is added to the front and/or the end of the string when
     * it does not already start/end with one.
     * @param sentence String containing the sentence, must not be null
     * @return Sentence
     */
    public Sentence createSentence( String sentence )
    {
        if( !sentence.startsWith( " " ) )
        {
            sentence = " " + sentence;
        }
        if( !sentence.endsWith( " " ) )
        {
            sentence += " ";
        }
        return new Sentence( sentence, getMappings( sentence ),
                             normaliseString( sentence ) );
    }

    /**
     * Normalise a string returning an array of component sentences.
     * The string is cut at every match of the sentence pattern; pieces that
     * are empty after trimming are dropped.
     * @param original String to normalise into one or more Sentences
     * @return Array of Sentence's formed from the original string.
     */
    public Sentence[] normaliseSentence( String original )
    {
        Matcher matcher = sentencePattern.matcher( original );
        List<Sentence> sentences = new ArrayList<Sentence>();
        int beginIndex = 0;
        while( matcher.find() )
        {
            String sentence = original.substring( beginIndex, matcher.start() );
            if( StringUtils.isStringNotEmpty( sentence.trim() ) )
            {
                sentences.add( createSentence( sentence ) );
            }
            beginIndex = matcher.end();
        }

        // Whatever follows the last splitter is the final sentence. A trimmed
        // string never starts or ends with a space, so createSentence() pads
        // it on both sides.
        String remainder = original.substring( beginIndex ).trim();
        if( StringUtils.isStringNotEmpty( remainder ) )
        {
            sentences.add( createSentence( remainder ) );
        }

        return sentences.toArray( new Sentence[ sentences.size() ] );
    }

    /**
     * Normalise a string returning a single string.
     * The string is cut at every match of the word pattern, each piece is
     * trimmed, empty pieces are dropped and the remainder is joined with a
     * single space and converted to upper case.
     * @param sentence String to normalise
     * @return Normalised string
     */
    public String normaliseString( String sentence )
    {
        Matcher matcher = wordPattern.matcher( sentence );
        List<String> words = new ArrayList<String>();
        int beginIndex = 0;
        while( matcher.find() )
        {
            String word = sentence.substring( beginIndex, matcher.start() ).trim();
            if( StringUtils.isStringNotEmpty( word ) )
            {
                words.add( word );
            }
            beginIndex = matcher.end();
        }

        String lastWord = sentence.substring( beginIndex ).trim();
        if( StringUtils.isStringNotEmpty( lastWord ) )
        {
            words.add( lastWord );
        }

        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. the Turkish dotted/dotless i).
        return StringUtils.join( ' ', words ).toUpperCase( Locale.ROOT );
    }

    /**
     * Form an array which maps the word boundaries within a string
     * @param input String to map
     * @return Array holding the start index of every word pattern match
     */
    public Integer[] getMappings( String input )
    {
        List<Integer> mappings = new ArrayList<Integer>( 2 );
        Matcher matcher = wordPattern.matcher( input );
        while( matcher.find() )
        {
            mappings.add( matcher.start() );
        }
        return mappings.toArray( new Integer[ mappings.size() ] );
    }

    /**
     * Enum defining the type of splitters
     */
    public static enum Type
    {

        /**
         * A splitter that defines a word
         */
        WORD,
        /**
         * A splitter that marks a sentence
         */
        SENTENCE
    }

    /**
     * Base class for any splitters
     */
    public static abstract class AbstractSplitter
    {

        private final String splitterPattern;

        /**
         * Constructor
         * @param pattern regex pattern
         */
        public AbstractSplitter( String pattern )
        {
            splitterPattern = pattern;
        }

        /**
         * Type of splitter
         * @return Type of splitter
         */
        public abstract Type getType();

        /**
         * The regex for this splitter
         * @return regex
         */
        public final String getSplitterPattern()
        {
            return splitterPattern;
        }

        /**
         * String representation of this splitter
         * @return string of the form TYPE[pattern]
         */
        @Override
        public final String toString()
        {
            return String.format( "%s[%s]", getType(), getSplitterPattern() );
        }
    }

    /**
     * A splitter that handles separating Words within a Sentence.
     * Kept as a non-static inner class so that existing code creating it
     * through an enclosing instance keeps compiling.
     */
    public class WordSplitter
            extends AbstractSplitter
    {

        /**
         * Constructor
         * @param pattern regex pattern
         */
        public WordSplitter( String pattern )
        {
            super( pattern );
        }

        /**
         * Type of splitter
         * @return always {@link Type#WORD}
         */
        @Override
        public Type getType()
        {
            return Type.WORD;
        }
    }

    /**
     * A splitter used to separate individual sentences.
     * Kept as a non-static inner class so that existing code creating it
     * through an enclosing instance keeps compiling.
     */
    public class SentenceSplitter
            extends AbstractSplitter
    {

        /**
         * Constructor
         * @param pattern regex pattern
         */
        public SentenceSplitter( String pattern )
        {
            super( pattern );
        }

        /**
         * Type of splitter
         * @return always {@link Type#SENTENCE}
         */
        @Override
        public Type getType()
        {
            return Type.SENTENCE;
        }
    }

    /**
     * Keeps a list of the space positions of a string and adjusts that list
     * as substitutions are reported through {@link #prepare} and
     * {@link #update}.
     *
     * <p>NOTE(review): presumably used to keep word-boundary mappings in step
     * while a find/replace pass rewrites the text - confirm against callers.
     */
    public static class WordMapper
    {

        private int charIndex;
        private int listIndex;
        private int spaceCount;
        private final List<Integer> mappings = new LinkedList<Integer>();
        private String input;
        private String find;
        private String replace;

        /**
         * Records the index of every space character in the input.
         * @param input string whose space positions form the initial mappings
         */
        public WordMapper( String input )
        {
            for( int i = 0, n = input.length(); i < n; i++ )
            {
                if( input.charAt( i ) == ' ' )
                {
                    mappings.add( i );
                }
            }
        }

        /**
         * Counts the spaces in a whole string.
         */
        private static int spaceCount( String string )
        {
            return spaceCount( string, 0, string.length() );
        }

        /**
         * Counts the spaces in string between beginIndex (inclusive) and
         * endIndex (exclusive); 0 when endIndex is not after beginIndex.
         */
        private static int spaceCount( String string, int beginIndex,
                                       int endIndex )
        {
            int spaces = 0;
            for( int i = beginIndex; i < endIndex; i++ )
            {
                if( string.charAt( i ) == ' ' )
                {
                    spaces++;
                }
            }
            return spaces;
        }

        /**
         * Starts a new substitution pass and resets the scan position.
         * @param input text being scanned
         * @param find text being replaced; its spaces are counted here
         * @param replace replacement text
         */
        public void prepare( String input, String find, String replace )
        {
            this.input = input;
            this.find = find;
            this.replace = replace;
            spaceCount = spaceCount( find );
            listIndex = 0;
            charIndex = 0;
        }

        /**
         * Reports one substitution at the given position of the prepared
         * input. The list position is first advanced by the number of spaces
         * between the previous position and beginIndex. Then one null entry
         * is inserted for every space the replacement has beyond those in
         * the find text, or one entry is removed for every space it has
         * fewer.
         * @param beginIndex index in the prepared input where the
         * substitution starts
         * @throws IllegalStateException if prepare() has not been called
         * @throws IndexOutOfBoundsException if beginIndex lies beyond the
         * input, or an entry must be inserted or removed outside the
         * mappings
         */
        public void update( int beginIndex )
        {
            if( input == null || replace == null )
            {
                throw new IllegalStateException(
                        "prepare() must be called before update()" );
            }

            listIndex += spaceCount( input, charIndex, beginIndex );
            charIndex = beginIndex;

            // Spaces of the find text not yet paired with a replacement space
            int unmatched = spaceCount;
            for( int j = 0, m = replace.length(); j < m; j++ )
            {
                if( replace.charAt( j ) == ' ' && --unmatched < 0 )
                {
                    // Extra space in the replacement: no original position
                    mappings.add( listIndex++, null );
                }
            }

            // Spaces that exist only in the find text lose their entry
            while( unmatched-- > 0 )
            {
                mappings.remove( listIndex );
            }
        }

        /**
         * Returns the current mappings.
         * @return array of the mappings; entries inserted by update() are null
         */
        public Integer[] toArray()
        {
            return mappings.toArray( new Integer[ mappings.size() ] );
        }
    }
}