All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.aliasi.suffixarray.DocumentTokenSuffixArray Maven / Gradle / Ivy

Go to download

This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.

There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.suffixarray;

import com.aliasi.tokenizer.Tokenization;
import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.util.Strings;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

/**
 * A {@code DocumentTokenSuffixArray} implements a suffix array over a
 * collection of named documents. 
 *
 * 

How it Works

* * The basic idea is that the documents are concatenated and then * stored in a token suffix array. This class provides methods for * extracting the document given a position in the suffix array. * *

The documents are concatenated with a specified distinguished * token as a separator. The separator acts as an end-of-document * marker that terminates comparisons. * *

A document suffix array is constructed from a mapping of * identifiers to documents. A tokenizer factory and separator are * also provided. * *

The underlying suffix array may be retrieved using {@link * #suffixArray()} and manipulated as any other token-based suffix * array. The method {@link #textPositionToDocId(int)} provides * the means to map a position in the underlying token array to * the document that spans the positions. * * @author Bob Carpenter * @version 4.1.0 * @since LingPipe 4.0.2 */ public class DocumentTokenSuffixArray { private final TokenSuffixArray mTsa; private final int[] mDocStarts; private final String[] mDocIds; private final Map mDocIdToIndex; /** * Construct a suffix array from the specified identified document * collection using the specified tokenizer factory, limiting comparisons * to the specified maximum suffix length and separating documents with * the specified boundary token. * *

For this class to work properly, the tokenizer factory must * tokenize the document boundary token into a single token when * surrounded by spaces. * * @param idToDocMap Mapping from document identifiers to document * texts. * @param tf Tokenizer factory to use for matching. * @param maxSuffixLength Maximum suffix length (in tokens) for * comparsions. * @param documentBoundaryToken Distinguished token used to separate * documents. * @throws IllegalArgumentException If the tokenizer factory does not * tokenize the document boundary token surrounded by single whitespaces * into a single token consisting of the boundary token. * // raise exception if find boundary in tokens of doc? */ public DocumentTokenSuffixArray(Map idToDocMap, TokenizerFactory tf, int maxSuffixLength, String documentBoundaryToken) { String test = " " + documentBoundaryToken + " "; String[] test_tokens = tf.tokenizer(test.toCharArray(),0,test.length()).tokenize(); if (test_tokens.length != 1 || !test_tokens[0].equals(documentBoundaryToken)) { String msg = "Tokenizer factory must convert boundary token to self." + " Found documentBoundaryToken=|" + documentBoundaryToken + "|" + " tokenizerFactory=" + tf + " result of tokenizing boundary token=|" + Arrays.asList(test_tokens) + "|"; throw new IllegalArgumentException(msg); } mDocIds = idToDocMap.keySet().toArray(Strings.EMPTY_STRING_ARRAY); Arrays.sort(mDocIds); List docStarts = new ArrayList(idToDocMap.size()); mDocIdToIndex = new HashMap(idToDocMap.size()); Set> entrySet = idToDocMap.entrySet(); int token_pos = 0; int total_chars = 0; int count = 0; for (String id : mDocIds) { String text = idToDocMap.get(id); mDocIdToIndex.put(id,count++); docStarts.add(token_pos); token_pos += tokenCount(tf,text) + 1; total_chars += text.length() + documentBoundaryToken.length() + 2; } mDocStarts = new int[docStarts.size()]; for (int i = 0; i < mDocStarts.length; ++i) mDocStarts[i] = docStarts.get(i); // block uses less memory than StringBuilder w. append(). 
char[] cs = new char[total_chars]; int char_pos = 0; for (String id : mDocIds) { String text = idToDocMap.get(id); for (int i = 0; i < text.length(); ++i) cs[char_pos++] = text.charAt(i); cs[char_pos++] = ' '; for (int i = 0; i < documentBoundaryToken.length(); ++i) cs[char_pos++] = documentBoundaryToken.charAt(i); cs[char_pos++] = ' '; } Tokenization tokenization = new Tokenization(cs,0,cs.length,tf); mTsa = new TokenSuffixArray(tokenization,maxSuffixLength, documentBoundaryToken); } /** * Return the token suffix array backing this document suffix * array. * * @return Underlying suffix array. */ public TokenSuffixArray suffixArray() { return mTsa; } /** * Return the identifier of the document that contains * the specified position in the underlying text. * * @param textPosition Position in underlying list of concatenated * documents. * @return Position */ public String textPositionToDocId(int textPosition) { if (textPosition < 0 || textPosition > mTsa.tokenization().text().length()) { String msg = "Position must be >= 0 and <= text.length=" + mTsa.tokenization().text().length() + " Found textPosition=" + textPosition; throw new IndexOutOfBoundsException(msg); } return mDocIds[largestWithoutGoingOver(mDocStarts,textPosition)]; } /** * Return the text of the document with the specified name. * * @param docName Name of document. * @return Text for that document. * @throws NullPointerException If the document name is not known. */ public String documentText(String docName) { String boundaryToken = mTsa.documentBoundaryToken(); String text = mTsa.tokenization().text(); int idx = mDocIdToIndex.get(docName); int start = mDocStarts[idx]; int boundaryEnd = ((idx + 1) == mDocStarts.length) ? text.length() : mDocStarts[idx+1]; int end = boundaryEnd - boundaryToken.length() - 2; return text.substring(start,end); } /** * Returns the number of documents in the collection. * * @return Number of documents in the collection. 
*/ public int numDocuments() { return mDocStarts.length; } /** * Returns an unmodifiable view of the set of document names in * the collection. * * @return The set of document names. */ public Set documentNames() { return Collections.unmodifiableSet(mDocIdToIndex.keySet()); } /** * Returns the starting token position in the underlying token * suffix array of the document with the specified identifier in * the overall set of documents. Returns {@code -1} if the * document is not part of the collection. * * @param docId Document identifier. * @return Position of first token in document in the underlying * token suffix array. */ public int docStartToken(String docId) { int idx = Arrays.binarySearch(mDocIds,docId); return idx < 0 ? -1 : mDocStarts[idx]; } /** * Returns the index of the next token past the last token of the * specified document. Returns {@code -1} if the document is not * part of the collection. * * @param docId Document identifier. * @return Position of first token in document in the underlying * token suffix array. */ public int docEndToken(String docId) { int idx = Arrays.binarySearch(mDocIds,docId); if (idx < 0) return -1; int next_idx = idx + 1; if (next_idx == mDocIds.length) return Math.max(1,mTsa.suffixArrayLength() - 1); return Math.max(1,mDocStarts[idx+1] - 1); } /** * Given an increasing array of values and a specified value, * return the largest index into the array such that the array's * value at the index is smaller than or equal to the specified * value. Returns -1 if there are no entries in the array less * than the specified value. * *

Warning: No test is made that the values are in * increasing order. If they are not, the behavior of this * method is not specified. * * @param vals Array of values, sorted in ascending order. * @param val Specified value to search. */ public static int largestWithoutGoingOver(int[] vals, int val) { int start = 0; int end = vals.length; if (vals.length == 0) return -1; if (val < vals[start]) return -1; if (val >= vals[end-1]) return end - 1; // invariant: vals[start] <= val <= vals[end-1] while (start + 1 < end) { int mid = (start + end) / 2; // invariant: start < mid < end if (val < vals[mid]) end = mid; else if (val > vals[mid]) start = mid; else return mid; } return start; } static int tokenCount(TokenizerFactory tf, String text) { int count = 0; for (String token : tf.tokenizer(text.toCharArray(),0,text.length())) ++count; return count; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy