All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.aliasi.suffixarray.TokenSuffixArray Maven / Gradle / Ivy

Go to download

This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.

There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.suffixarray;

import com.aliasi.tokenizer.Tokenization;
import com.aliasi.tokenizer.TokenizerFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

/**
 * A {@code TokenSuffixArray} implements a suffix array of tokens.
 *
 * See {@link CharSuffixArray} for a description of suffix arrays
 * and their applications.
 *
 * 

Constructing a Token Suffix Array

* * A suffix array is constructed from a list of tokens. These may be * provided directly, or as a character array and tokenizer factory. * *

If the maximum length is less than the length of the array, * strings are truncated to be at most this length before comparison. * The result isn't a standard, fully sorted suffix array, but can be * faster to create and will suffice for many applications. The * indexes will be sorted relative to the truncated strings, so they * will be in order up to the specified length. * *

Document Boundary Token

* * The document boundary token is used to separate documents. * When the document boundary token is found when comparing * tokens, it's considered smaller than any other token (no matter * how it would sort as a string) and also as a string terminator. * *

Thus if the tokenization corresponds to multiple documents, * the boundary token should be used to separate them. * *

Tokenization Normalization for Comparison

* * In order to do comparisons that are case insensitive, or ignore * punctuation, the tokenizer should perform the normalization. * *

Using Suffix Arrays

* * Token suffix arrays are used in exactly the same way as character * suffix arrays; see {@link CharSuffixArray} for details and an * example. * *

Thread Safety

* * Once constructed, a tokenized suffix array is thread safe. * * @author Bob Carpenter * @version 4.1.0 * @since 4.0.2 */ public class TokenSuffixArray { private final Tokenization mTokenization; private final int[] mSuffixArray; private final String mDocumentBoundaryToken; private final int mMaxSuffixLength; /** * The default boundary token for documents. */ public static final String DEFAULT_DOCUMENT_BOUNDARY_TOKEN = "\u0000"; /** * Construct at token suffix array with no limit on suffix length * and the default document-boundary token. * * @param tokenization Tokenization on which to base the suffix * array. */ public TokenSuffixArray(Tokenization tokenization) { this(tokenization,Integer.MAX_VALUE); } /** * Construct a suffix array from the specified tokenization, comparing * suffixes using up the specified maximum suffix length using the * default document-boundary token. * * @param tokenization Tokenization on which to base suffix array. * @param maxSuffixLength Maximum length of token sequences to compare. */ public TokenSuffixArray(Tokenization tokenization, int maxSuffixLength) { this(tokenization,maxSuffixLength,DEFAULT_DOCUMENT_BOUNDARY_TOKEN); } /** * Construct a suffix array from the specified tokenization, comparing * suffixes using up the specified maximum suffix length using the * default document-boundary token. * * @param tokenization Tokenization on which to base suffix array. * @param maxSuffixLength Maximum length of token sequences to compare. * @param documentBoundaryToken Token used to separate documents. 
*/ public TokenSuffixArray(Tokenization tokenization, int maxSuffixLength, String documentBoundaryToken) { mTokenization = tokenization; mDocumentBoundaryToken = documentBoundaryToken; mMaxSuffixLength = maxSuffixLength; Integer[] is = new Integer[tokenization.numTokens()]; for (int i = 0; i < is.length; ++i) is[i] = i; Arrays.sort(is,new TokenIndexComparator()); int[] suffixArray = new int[is.length]; for (int i = 0; i < is.length; ++i) suffixArray[i] = is[i]; mSuffixArray = suffixArray; } /** * Returns the token used to separate documents in this suffix * array. * * @return Separator token. */ public String documentBoundaryToken() { return mDocumentBoundaryToken; } /** * Returns the maximum suffix length for this token suffix array. * * @return Maximum length of suffixes. */ public int maxSuffixLength() { return mMaxSuffixLength; } /** * Returns the tokenization underlying this suffix array. * The tokenization may be used to retrieve the processed tokens, * the underlying text, as well as the positions of the tokens * in the text. * * @return The tokenization for this suffix array. */ public Tokenization tokenization() { return mTokenization; } /** * Returns the value of the suffix array at the specified index. * This value is an index into the underlying list of tokens. * * @param idx Suffix array index. * @return Index of the first token of the suffix at the * specified index. */ public int suffixArray(int idx) { return mSuffixArray[idx]; } /** * Returns the number of tokens in the suffix array. * * @return Number of tokens in the suffix array. */ public int suffixArrayLength() { return mSuffixArray.length; } /** * Returns the substring of the original string that's spanned * by the tokens starting at the specified suffix array index * and running the specified maximum number of tokens (or until * the token sequence ends). * * @param idx Index in suffix array of first token. * @param maxTokens Maximum number of tokens to include * in string. 
* @return Substring starting at the specified index and * running the maximum number of tokens or until the end of * the tokenization. */ public String substring(int idx, int maxTokens) { int start = suffixArray(idx); // must be int because numTokens() is int and taking min int end = (int) Math.min((long)start + (long)maxTokens, mTokenization.numTokens()); int text_start = mTokenization.tokenStart(start); int text_end = mTokenization.tokenEnd(end-1); return mTokenization.text().substring(text_start, text_end); } /** * Returns a list of maximal spans of suffix array indexes * which refer to suffixes that share a prefix of at least * the specified minimum match length. * * @param minMatchLength Minimum number of tokens required to * match. * @return The list of pairs of start (inclusive) and end * (exclsuive) positions in the suffix array that match up * to the specified minimum number of tokens. */ public List prefixMatches(int minMatchLength) { List matches = new ArrayList(); for (int i = 0; i < mSuffixArray.length; ) { int j = suffixesMatchTo(i,minMatchLength,mTokenization.tokenList()); if (i + 1 != j) { matches.add(new int[] { i, j }); i = j; } else { ++i; } } return matches; } private int suffixesMatchTo(int i, int minMatchLength, List tokens) { int index1 = mSuffixArray[i]; int j = i+1; for (; j < mSuffixArray.length; ++j) { int index2 = mSuffixArray[j]; if (!matchTokens(index1,index2,minMatchLength,tokens)) break; } return j; } private boolean matchTokens(int index1, int index2, int minMatches, List tokens) { if (index1 + minMatches > mSuffixArray.length) return false; // not enough toks if (index2 + minMatches > mSuffixArray.length) return false; // not enough toks for (int k = 0; k < minMatches; ++k) { String tok1 = tokens.get(index1 + k); if (tok1.equals(mDocumentBoundaryToken)) return false; String tok2 = tokens.get(index2 + k); if (tok2.equals(mDocumentBoundaryToken)) return false; if (!tokens.get(index1 + k).equals(tokens.get(index2 + k))) return false; 
} return true; } class TokenIndexComparator implements Comparator { public int compare(Integer i, Integer j) { List tokens = mTokenization.tokenList(); for (int m = i, n = j, k = 0; k < mMaxSuffixLength; ++m, ++n, ++k) { if (m == tokens.size() || tokens.get(m).equals(mDocumentBoundaryToken)) { if (n == tokens.size() || tokens.get(n).equals(mDocumentBoundaryToken)) return 0; return -1; } if (n == tokens.size() || tokens.get(n).equals(mDocumentBoundaryToken)) return 1; int c = tokens.get(m).compareTo(tokens.get(n)); if (c != 0) return c; } return 0; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy