com.aliasi.sentences.SentenceModel Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of aliasi-lingpipe Show documentation

This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html. No changes were made to the source code.

There is a newer version: 4.1.2-JL1.0

Show newest version

/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.sentences;

import java.util.Collection;

/**
 * The {@code SentenceModel} interface specifies a means of doing
 * sentence segmentation from arrays of tokens and whitespaces.
 *
 * <p>The sentence model operates over aligned arrays of tokens and
 * whitespaces, as derived from a {@link
 * com.aliasi.tokenizer.Tokenizer}.  The whitespaces array is always
 * one element longer than the tokens array.  There are two methods
 * in the interface.  The standard external interface is {@link
 * #boundaryIndices(String[],String[])}, which returns an array of
 * token indices that are sentence-final.  For instance, with the
 * seven tokens
 * {@code {"John", "ran", ".", "He", "also", "jumped", "!"}}, and
 * the eight whitespaces
 * {@code {"", " ", "", "  ", " ", " ", "", ""}},
 * the return result from the Indo-European model would be
 * {@code {2,6}}, because the token indexed 2 is a period
 * ({@code .}) and the token indexed 6 is an exclamation point
 * ({@code !}).  The return result will often depend on the
 * whitespaces as well as the tokens.
 *
 * <p>The second method is {@link
 * #boundaryIndices(String[],String[],int,int,Collection)}, which adds
 * the boundary indexes as {@code Integer}s to the specified
 * collection for the slice determined by the start and end plus one
 * indices.
 *
 * @author  Bob Carpenter
 * @version 3.0
 * @since   LingPipe1.0
 */
public interface SentenceModel {

    /**
     * Returns an array of indices of sentence-final tokens.
     *
     * @param tokens Array of tokens to annotate.
     * @param whitespaces Array of whitespaces to annotate; must be
     * exactly one element longer than {@code tokens}.
     * @return Array of integers indicating indices of tokens that
     * are sentence final.
     * @throws IllegalArgumentException If the array of whitespaces is
     * not one longer than the array of tokens.
     */
    int[] boundaryIndices(String[] tokens, String[] whitespaces);


    // NOTE(review): the length precondition below was previously
    // written in terms of start+end and start+end+1, which would only
    // make sense if the fourth argument were a length rather than an
    // exclusive end index.  It is restated here to agree with the
    // end-exclusive slice this comment describes -- confirm against
    // the implementing classes.
    /**
     * Adds the sentence final token indices as {@code Integer}
     * instances to the specified collection, only considering tokens
     * between index {@code start} and {@code end-1}
     * inclusive.
     *
     * @param tokens Array of tokens to annotate.
     * @param whitespaces Array of whitespaces to annotate.
     * @param start Index of first token to annotate.
     * @param end Index one beyond the last token to annotate.
     * @param indices Collection into which to write the boundary
     * indices.
     * @throws IllegalArgumentException If the arrays are too short
     * to cover the requested slice, that is, if the array of tokens
     * is not at least as long as {@code end} or the array of
     * whitespaces is not at least as long as {@code end+1}.
     */
    void boundaryIndices(String[] tokens, String[] whitespaces,
                         int start, int end,
                         Collection<Integer> indices);


}