com.aliasi.classify.BigVectorClassifier Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aliasi-lingpipe Show documentation
This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.
There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.classify;

import com.aliasi.matrix.SparseFloatVector;
import com.aliasi.matrix.Vector;

import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.BoundedPriorityQueue;
import com.aliasi.util.Scored;
import com.aliasi.util.ScoredObject;

import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;


/**
 * A {@code BigVectorClassifier} provides an efficient linear
 * classifier implementation for large numbers of categories.
 * Inputs are vector implementations and outputs are scored
 * classifications pruned to the top N.
 *
 * <h3>Computation</h3>
 *
 * This class reverses what's typically a category (row) dominant
 * approach to a feature (column) dominant representation, allowing
 * scaling to large numbers of categories when the columns are sparse.
 *
 * <p>The standard approach in linear classifiers is to multiply a
 * (possibly sparse) input vector by each category's vector
 * representation.  The vector representing a category maps features
 * to values, and may be sparse.
 *
 * <p>This class reverses the representation.  Rather than a map from
 * categories to features to values, it uses a map from features to
 * categories to values.  For a sparse input, it then iterates over
 * the categories for each feature and adds the results.  If the
 * maps from categories to values for features are very sparse, this
 * saves significant time over multiplying the input by each category's
 * vector representation.
 *
 * <p>This class uses a custom heap to efficiently merge the features
 * for each category, and a bounded priority queue for collecting
 * n-best results.
 *
 * <h3>Input Representation</h3>
 *
 * The constructor takes an array of vectors, one for each dimension,
 * or feature of the linear classifier.  Each of these vectors is
 * sparse and has dimensions corresponding to categories with non-zero
 * values for the feature.  It thus corresponds to a term/document
 * matrix in search, with terms being features and documents being
 * categories.
 *
 * <h3>Training</h3>
 *
 * There are no training methods provided as part of this class.
 * It is meant as a general utility for importing large category
 * linear classifiers.
 *
 * <h3>Serialization</h3>
 *
 * Instances may be serialized.  When read back in they will
 * be members of this class.
 *
 * <h3>Thread Safety</h3>
 *
 * This class is read-write threadsafe, where the only write operation
 * sets the maximum number of results.  Thus any number of concurrent
 * classifications may be carried out with a single instance of this
 * class.
 *
 * @author  Bob Carpenter
 * @version 4.0.0
 * @since   LingPipe3.9
 */
public class BigVectorClassifier
    implements ScoredClassifier<Vector>,
               Serializable {

    static final long serialVersionUID = 5149230080619243511L;

    /**
     * Sentinel written into {@code mDocumentIds} immediately after the
     * last posting of every term; marks the end of that term's list.
     */
    private static final int END_OF_POSTINGS = -1;

    // mTermIndexes[t] is the offset in mDocumentIds/mScores at which
    // the postings (category id, weight) of term t begin.
    private final int[] mTermIndexes;
    // Concatenated posting lists; each list ends with END_OF_POSTINGS.
    private final int[] mDocumentIds;
    // Weight parallel to mDocumentIds (sentinel slots are left at 0).
    private final float[] mScores;
    // Category names indexed by category identifier.
    private final String[] mCategories;

    private int mMaxResults;

    /**
     * Construct a big vector classifier with the specified term
     * vectors, maximum number of results, and categories equal to the
     * string representations of the category identifiers.
     *
     * See {@link #BigVectorClassifier(Vector[],String[],int)} for
     * more information.
     *
     * @param termVectors Term vectors for classifier.
     * @param maxResults Maximum number of top results returned.
     * @throws IllegalArgumentException If the maximum number of
     * results is not positive.
     */
    public BigVectorClassifier(Vector[] termVectors,
                               int maxResults) {
        this(termVectors,categoriesFor(termVectors),maxResults);
    }

    /**
     * Construct a big vector classifier with the specified term
     * vectors, categories, and maximum number of results.  The term
     * vectors have category identifiers as their dimensions, and the
     * value at a dimension is the weight of the term for that
     * category.
     *
     * <p>Zero values reported by {@code nonZeroDimensions()} are not
     * pruned.
     *
     * @param termVectors Term vectors for classifier.
     * @param categories Category names indexed by number.
     * @param maxResults Maximum number of top results returned.
     * @throws IllegalArgumentException If the maximum number of
     * results is not positive.
     */
    public BigVectorClassifier(Vector[] termVectors,
                               String[] categories,
                               int maxResults) {
        mCategories = categories;
        mTermIndexes = new int[termVectors.length];
        // one extra slot per term for its end-of-postings sentinel
        int size = termVectors.length;
        for (Vector termVector : termVectors)
            size += termVector.nonZeroDimensions().length;
        mDocumentIds = new int[size];
        mScores = new float[size];
        int pos = 0;
        for (int i = 0; i < termVectors.length; ++i) {
            mTermIndexes[i] = pos;
            Vector termVector = termVectors[i];
            // NOTE(review): the merge in classify() assumes each posting
            // list is in ascending category-id order, i.e. that
            // nonZeroDimensions() returns ascending dimensions -- confirm
            // for the Vector implementations passed in.
            for (int docId : termVector.nonZeroDimensions()) {
                mDocumentIds[pos] = docId;
                mScores[pos] = (float) termVector.value(docId);
                ++pos;
            }
            mDocumentIds[pos] = END_OF_POSTINGS;
            ++pos;
        }
        setMaxResults(maxResults);
    }

    /**
     * Construct a classifier directly from its internal arrays; used
     * when reading a serialized instance back in.
     */
    BigVectorClassifier(int[] termIndexes,
                        int[] documentIds,
                        float[] scores,
                        String[] categories,
                        int maxResults) {
        mTermIndexes = termIndexes;
        mDocumentIds = documentIds;
        mScores = scores;
        setMaxResults(maxResults);
        mCategories = categories;
    }

    /**
     * Return default category names for the specified term vectors:
     * the decimal string of each category identifier from 0 up to and
     * including the largest identifier with a non-zero dimension.
     *
     * @param termVectors Term vectors whose dimensions are category
     * identifiers.
     * @return Array mapping each category identifier to its name.
     */
    static String[] categoriesFor(Vector[] termVectors) {
        int maxId = -1;
        for (Vector termVector : termVectors)
            for (int docId : termVector.nonZeroDimensions())
                maxId = Math.max(maxId,docId);
        // identifiers run 0..maxId inclusive, hence maxId + 1 names
        String[] categories = new String[maxId + 1];
        for (int i = 0; i < categories.length; ++i)
            categories[i] = Integer.toString(i);
        return categories;
    }

    /**
     * Return the maximum number of top results returned
     * by this classifier.
     *
     * @return Maximum number of results from classification.
     */
    public int maxResults() {
        return mMaxResults;
    }

    /**
     * Sets the maximum number of results returned by this
     * classifier.
     *
     * <p>This method is a write method which should be read-write
     * synchronized with calls to {@link #classify(Vector)}.
     *
     * @param maxResults Maximum number of top results returned
     * by this classifier.
     * @throws IllegalArgumentException If the maximum number of
     * results is not positive.
     */
    public void setMaxResults(int maxResults) {
        if (maxResults < 1) {
            String msg = "Max results must be positive."
                + " Found maxResults=" + maxResults;
            throw new IllegalArgumentException(msg);
        }
        mMaxResults = maxResults;
    }

    /**
     * Return a scored classification consisting of the top results
     * for the specified vector input.
     *
     * <p>The maximum size of the returned scored classification is
     * given by {@link #maxResults()} and set with {@link
     * #setMaxResults(int)}.
     *
     * <p>Input dimensions that do not correspond to a term of this
     * classifier, or whose term has no categories, are ignored.
     * Categories are reported by the name supplied at construction
     * time, falling back to the decimal string of the category
     * identifier if no name is available for it.
     *
     * @param x Vector to classify.
     * @return Classification of the vector.
     *
     */
    public ScoredClassification classify(Vector x) {
        int[] nzDims = x.nonZeroDimensions();
        int[] documentIds = mDocumentIds;

        // One heap entry per usable input dimension: a cursor into that
        // term's posting list plus the input value for the term.
        int[] current = new int[nzDims.length];
        float[] vals = new float[nzDims.length];
        int heapSize = 0;
        for (int dim : nzDims) {
            if (dim < 0 || dim >= mTermIndexes.length)
                continue; // term unknown to this classifier
            int start = mTermIndexes[dim];
            if (documentIds[start] == END_OF_POSTINGS)
                continue; // term has no categories; never put a sentinel on the heap
            current[heapSize] = start;
            vals[heapSize] = (float) x.value(dim);
            ++heapSize;
        }
        // bottom-up heap construction from the last internal node
        for (int k = heapSize/2; --k >= 0; )
            heapify(k,heapSize,current,vals,documentIds);

        BoundedPriorityQueue<ScoredDoc> queue
            = new BoundedPriorityQueue<ScoredDoc>(ScoredObject.comparator(),
                                                  mMaxResults);
        while (heapSize > 0) {
            // the heap root is the smallest pending category id; drain
            // every cursor positioned on it to total its score
            int doc = documentIds[current[0]];
            double score = 0.0;
            while (heapSize > 0 && documentIds[current[0]] == doc) {
                score += vals[0] * mScores[current[0]];
                ++current[0];
                if (documentIds[current[0]] == END_OF_POSTINGS) {
                    // list exhausted: replace the root with the last entry
                    --heapSize;
                    if (heapSize > 0) {
                        current[0] = current[heapSize];
                        vals[0] = vals[heapSize];
                    }
                }
                heapify(0,heapSize,current,vals,documentIds);
            }
            queue.offer(new ScoredDoc(doc,score));
        }

        String[] categories = new String[queue.size()];
        double[] scores = new double[queue.size()];
        int pos = 0;
        for (ScoredDoc sd : queue) {
            categories[pos] = categoryName(sd.docId());
            scores[pos] = sd.score();
            ++pos;
        }
        return new ScoredClassification(categories,scores);
    }

    /**
     * Return the name for the specified category identifier, or the
     * decimal string of the identifier if no name is stored for it.
     */
    private String categoryName(int docId) {
        if (mCategories != null
            && docId >= 0
            && docId < mCategories.length
            && mCategories[docId] != null) {
            return mCategories[docId];
        }
        return Integer.toString(docId);
    }

    /**
     * Substitute a {@link Serializer} for this object in the
     * serialization stream.
     */
    Object writeReplace() {
        return new Serializer(this);
    }

    /**
     * Restore the min-heap property, keyed on the category identifier
     * under each cursor, for the subtree rooted at the specified index
     * by sifting that entry down.  The two parallel heap arrays are
     * permuted together.
     *
     * @param i Index of the entry to sift down.
     * @param heapSize Number of live entries in the heap arrays.
     * @param current Cursors into {@code documentIds}, one per entry.
     * @param vals Input values parallel to {@code current}.
     * @param documentIds Concatenated posting lists the cursors index.
     */
    static void heapify(int i, int heapSize,
                        int[] current, float[] vals,
                        int[] documentIds) {
        while (true) {
            int left = 2 * i + 1;
            if (left >= heapSize)
                return;
            // always descend toward the smaller child; swapping with
            // the larger one would leave it above its smaller sibling
            int smallest = left;
            int right = left + 1;
            if (right < heapSize
                && documentIds[current[right]] < documentIds[current[left]]) {
                smallest = right;
            }
            if (documentIds[current[i]] <= documentIds[current[smallest]])
                return;
            swap(smallest,i,current);
            swap(smallest,i,vals);
            i = smallest;
        }
    }

    /**
     * Debugging aid: print the live heap entries to standard output.
     */
    static void printHeap(int heapSize,
                          int[] current, float[] vals,
                          int[] documentIds) {
        System.out.println("\nHeapSize=" + heapSize);
        for (int i = 0; i < heapSize; ++i)
            System.out.println("i=" + i + " curent=" + current[i] + " vals=" + vals[i]
                               + " docId=" + documentIds[current[i]]);
    }

    /** Exchange the elements at the two specified positions. */
    static void swap(int i, int j, int[] xs) {
        int tempXsI = xs[i];
        xs[i] = xs[j];
        xs[j] = tempXsI;
    }

    /** Exchange the elements at the two specified positions. */
    static void swap(int i, int j, float[] xs) {
        float tempXsI = xs[i];
        xs[i] = xs[j];
        xs[j] = tempXsI;
    }

    /**
     * A category identifier paired with its accumulated score.
     */
    static class ScoredDoc implements Scored {
        private final int mDocId;
        private final double mScore;
        public ScoredDoc(int docId, double score) {
            mDocId = docId;
            mScore = score;
        }
        public int docId() {
            return mDocId;
        }
        public double score() {
            return mScore;
        }
        @Override
        public String toString() {
            return mDocId + ":" + mScore;
        }
    }

    /**
     * Serialization proxy: writes the classifier's arrays and maximum
     * result count, and rebuilds a {@code BigVectorClassifier} from
     * them on read.
     */
    static class Serializer extends AbstractExternalizable {
        static final long serialVersionUID = 3954262240692411543L;
        private final BigVectorClassifier mClassifier;
        /** No-argument constructor used during deserialization. */
        public Serializer() {
            this(null);
        }
        public Serializer(BigVectorClassifier classifier) {
            mClassifier = classifier;
        }
        @Override
        public void writeExternal(ObjectOutput objOut) throws IOException {
            // field order here must match the read order in read()
            writeInts(mClassifier.mTermIndexes,objOut);
            writeInts(mClassifier.mDocumentIds,objOut);
            writeFloats(mClassifier.mScores,objOut);
            writeUTFs(mClassifier.mCategories,objOut);
            objOut.writeInt(mClassifier.mMaxResults);
        }
        @Override
        public Object read(ObjectInput objIn)
            throws ClassNotFoundException, IOException {
            int[] termIndexes = readInts(objIn);
            int[] documentIds = readInts(objIn);
            float[] scores = readFloats(objIn);
            String[] categories = readUTFs(objIn);
            int maxResults = objIn.readInt();
            return new BigVectorClassifier(termIndexes,
                                           documentIds,
                                           scores,
                                           categories,
                                           maxResults);
        }
    }

}