/*
* LingPipe v. 4.1.0
* Copyright (C) 2003-2011 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.suffixarray;
import com.aliasi.tokenizer.Tokenization;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.util.Strings;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
/**
* A {@code DocumentTokenSuffixArray} implements a suffix array over a
* collection of named documents.
*
* <h3>How it Works</h3>
*
* <p>The basic idea is that the documents are concatenated and then
* stored in a token suffix array. This class provides methods for
* extracting the document given a position in the suffix array.
*
* <p>The documents are concatenated with a specified distinguished
* token as a separator. The separator acts as an end-of-document
* marker that terminates comparisons.
*
* <p>A document suffix array is constructed from a mapping of
* identifiers to documents. A tokenizer factory and separator are
* also provided.
*
* <p>The underlying suffix array may be retrieved using {@link
* #suffixArray()} and manipulated as any other token-based suffix
* array. The method {@link #textPositionToDocId(int)} provides
* the means to map a position in the underlying token array to
* the document that spans the position.
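*
* <p>For example, the following sketch (with hypothetical document
* identifiers and texts) builds a suffix array over a two-document
* collection using the Indo-European tokenizer; the boundary token
* {@code "ENDDOC"} tokenizes to itself, as required:
*
* <blockquote><pre>
* Map&lt;String,String&gt; idToDocMap = new HashMap&lt;String,String&gt;();
* idToDocMap.put("doc1","John ran home.");
* idToDocMap.put("doc2","John ran away.");
*
* DocumentTokenSuffixArray dtsa
*     = new DocumentTokenSuffixArray(idToDocMap,
*                                    IndoEuropeanTokenizerFactory.INSTANCE,
*                                    Integer.MAX_VALUE,
*                                    "ENDDOC");</pre></blockquote>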
*
* @author Bob Carpenter
* @version 4.1.0
* @since LingPipe 4.0.2
*/
public class DocumentTokenSuffixArray {
private final TokenSuffixArray mTsa;
private final int[] mDocStarts;      // first token position of each doc
private final int[] mDocCharStarts;  // first character position of each doc
private final String[] mDocIds;      // document ids, in sorted order
private final Map<String,Integer> mDocIdToIndex;
/**
* Construct a suffix array from the specified identified document
* collection using the specified tokenizer factory, limiting comparisons
* to the specified maximum suffix length and separating documents with
* the specified boundary token.
*
* <p>For this class to work properly, the tokenizer factory must
* tokenize the document boundary token into a single token when
* surrounded by spaces.
*
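* <p>For instance, an all-letter token such as the hypothetical
* {@code "ENDDOC"} meets this requirement for the Indo-European
* tokenizer:
*
* <blockquote><pre>
* TokenizerFactory tf = IndoEuropeanTokenizerFactory.INSTANCE;
* String test = " ENDDOC ";
* String[] toks
*     = tf.tokenizer(test.toCharArray(),0,test.length()).tokenize();
* // toks is { "ENDDOC" }: one token equal to the boundary token</pre></blockquote>
*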
* @param idToDocMap Mapping from document identifiers to document
* texts.
* @param tf Tokenizer factory to use for matching.
* @param maxSuffixLength Maximum suffix length (in tokens) for
* comparisons.
* @param documentBoundaryToken Distinguished token used to separate
* documents.
* @throws IllegalArgumentException If the tokenizer factory does not
* tokenize the document boundary token surrounded by single whitespaces
* into a single token consisting of the boundary token.
*/
public DocumentTokenSuffixArray(Map<String,String> idToDocMap,
TokenizerFactory tf,
int maxSuffixLength,
String documentBoundaryToken) {
// require the boundary token to tokenize to itself when padded with spaces
String test = " " + documentBoundaryToken + " ";
String[] test_tokens = tf.tokenizer(test.toCharArray(),0,test.length()).tokenize();
if (test_tokens.length != 1 || !test_tokens[0].equals(documentBoundaryToken)) {
String msg = "Tokenizer factory must convert boundary token to self."
+ " Found documentBoundaryToken=|" + documentBoundaryToken + "|"
+ " tokenizerFactory=" + tf
+ " result of tokenizing boundary token=|" + Arrays.asList(test_tokens) + "|";
throw new IllegalArgumentException(msg);
}
mDocIds = idToDocMap.keySet().toArray(Strings.EMPTY_STRING_ARRAY);
Arrays.sort(mDocIds);
List<Integer> docStarts = new ArrayList<Integer>(idToDocMap.size());
List<Integer> docCharStarts = new ArrayList<Integer>(idToDocMap.size());
mDocIdToIndex = new HashMap<String,Integer>(idToDocMap.size());
int token_pos = 0;
int total_chars = 0;
int count = 0;
for (String id : mDocIds) {
String text = idToDocMap.get(id);
mDocIdToIndex.put(id,count++);
docStarts.add(token_pos);
docCharStarts.add(total_chars);
token_pos += tokenCount(tf,text) + 1; // +1 for the boundary token
total_chars += text.length() + documentBoundaryToken.length() + 2; // +2 for flanking spaces
}
mDocStarts = new int[docStarts.size()];
for (int i = 0; i < mDocStarts.length; ++i)
mDocStarts[i] = docStarts.get(i);
mDocCharStarts = new int[docCharStarts.size()];
for (int i = 0; i < mDocCharStarts.length; ++i)
mDocCharStarts[i] = docCharStarts.get(i);
// filling a preallocated char[] uses less memory than StringBuilder.append()
char[] cs = new char[total_chars];
int char_pos = 0;
for (String id : mDocIds) {
String text = idToDocMap.get(id);
for (int i = 0; i < text.length(); ++i)
cs[char_pos++] = text.charAt(i);
cs[char_pos++] = ' ';
for (int i = 0; i < documentBoundaryToken.length(); ++i)
cs[char_pos++] = documentBoundaryToken.charAt(i);
cs[char_pos++] = ' ';
}
Tokenization tokenization
= new Tokenization(cs,0,cs.length,tf);
mTsa = new TokenSuffixArray(tokenization,maxSuffixLength,
documentBoundaryToken);
}
/**
* Return the token suffix array backing this document suffix
* array.
*
* @return Underlying suffix array.
*/
public TokenSuffixArray suffixArray() {
return mTsa;
}
/**
* Return the identifier of the document that contains
* the specified token position in the underlying token array.
*
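* <p>For example, with the hypothetical two-document collection
* sketched in the class documentation:
*
* <blockquote><pre>
* int pos = dtsa.docStartToken("doc2");
* String id = dtsa.textPositionToDocId(pos);  // "doc2"</pre></blockquote>
*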
* @param textPosition Token position in the underlying array of
* tokens for the concatenated documents.
* @return Identifier of the document containing the specified
* position.
*/
public String textPositionToDocId(int textPosition) {
if (textPosition < 0 || textPosition > mTsa.tokenization().text().length()) {
String msg = "Position must be >= 0 and <= text.length="
+ mTsa.tokenization().text().length()
+ " Found textPosition=" + textPosition;
throw new IndexOutOfBoundsException(msg);
}
return mDocIds[largestWithoutGoingOver(mDocStarts,textPosition)];
}
/**
* Return the text of the document with the specified name.
*
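* <p>For example, with the hypothetical collection from the class
* documentation:
*
* <blockquote><pre>
* dtsa.documentText("doc1");  // returns "John ran home."</pre></blockquote>
*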
* @param docName Name of document.
* @return Text for that document.
* @throws NullPointerException If the document name is not known.
*/
public String documentText(String docName) {
String boundaryToken = mTsa.documentBoundaryToken();
String text = mTsa.tokenization().text();
int idx = mDocIdToIndex.get(docName);
int start = mDocCharStarts[idx]; // character offset, not token offset
int boundaryEnd
= ((idx + 1) == mDocCharStarts.length)
? text.length()
: mDocCharStarts[idx+1];
int end = boundaryEnd - boundaryToken.length() - 2; // strip trailing " boundaryToken "
return text.substring(start,end);
}
/**
* Returns the number of documents in the collection.
*
* @return Number of documents in the collection.
*/
public int numDocuments() {
return mDocStarts.length;
}
/**
* Returns an unmodifiable view of the set of document names in
* the collection.
*
* @return The set of document names.
*/
public Set<String> documentNames() {
return Collections.unmodifiableSet(mDocIdToIndex.keySet());
}
/**
* Returns the starting token position in the underlying token
* suffix array of the document with the specified identifier in
* the overall set of documents. Returns {@code -1} if the
* document is not part of the collection.
*
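* <p>For example, the start and end positions together bound the
* tokens of a document (see {@link #docEndToken(String)}):
*
* <blockquote><pre>
* for (int t = dtsa.docStartToken(id); t &lt; dtsa.docEndToken(id); ++t) {
*     // token position t falls within document id
* }</pre></blockquote>
*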
* @param docId Document identifier.
* @return Position of first token in document in the underlying
* token suffix array.
*/
public int docStartToken(String docId) {
int idx = Arrays.binarySearch(mDocIds,docId);
return idx < 0 ? -1 : mDocStarts[idx];
}
/**
* Returns the index of the next token past the last token of the
* specified document. Returns {@code -1} if the document is not
* part of the collection.
*
* @param docId Document identifier.
* @return Position of the first token past the end of the
* document in the underlying token suffix array.
*/
public int docEndToken(String docId) {
int idx = Arrays.binarySearch(mDocIds,docId);
if (idx < 0)
return -1;
int next_idx = idx + 1;
if (next_idx == mDocIds.length)
return Math.max(1,mTsa.suffixArrayLength() - 1);
return Math.max(1,mDocStarts[idx+1] - 1);
}
/**
* Given an increasing array of values and a specified value,
* return the largest index into the array such that the array's
* value at the index is smaller than or equal to the specified
* value. Returns -1 if there are no entries in the array less
* than or equal to the specified value.
*
* <p><b>Warning:</b> No test is made that the values are in
* increasing order. If they are not, the behavior of this
* method is not specified.
*
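* <p>For example, given the sorted values {@code {0, 7, 12}}:
*
* <blockquote><pre>
* largestWithoutGoingOver(new int[] { 0, 7, 12 }, 9);   // returns 1
* largestWithoutGoingOver(new int[] { 0, 7, 12 }, 12);  // returns 2
* largestWithoutGoingOver(new int[] { 0, 7, 12 }, -1);  // returns -1</pre></blockquote>
*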
* @param vals Array of values, sorted in ascending order.
* @param val Specified value to search for.
* @return Largest index whose value is less than or equal to
* the specified value, or {@code -1} if there is none.
*/
public static int largestWithoutGoingOver(int[] vals,
int val) {
int start = 0;
int end = vals.length;
if (vals.length == 0)
return -1;
if (val < vals[start])
return -1;
if (val >= vals[end-1])
return end - 1;
// invariant: vals[start] <= val <= vals[end-1]
while (start + 1 < end) {
int mid = (start + end) >>> 1; // unsigned shift avoids int overflow
// invariant: start < mid < end
if (val < vals[mid])
end = mid;
else if (val > vals[mid])
start = mid;
else
return mid;
}
return start;
}
// count the tokens the factory produces for the given text
static int tokenCount(TokenizerFactory tf, String text) {
int count = 0;
for (String token : tf.tokenizer(text.toCharArray(),0,text.length()))
++count;
return count;
}
}