gov.sandia.cognition.learning.data.feature.FeatureHashing Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cognitive-foundry Show documentation
A single jar with all the Cognitive Foundry components.
There is a newer version: 4.0.1
/*
 * File:            FeatureHashing.java
 * Authors:         Justin Basilico
 * Project:         Cognitive Foundry
 * 
 * Copyright 2015 Cognitive Foundry. All rights reserved.
 */

package gov.sandia.cognition.learning.data.feature;

import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.hash.HashFunction;
import gov.sandia.cognition.hash.HashFunctionUtil;
import gov.sandia.cognition.hash.Murmur32Hash;
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.math.matrix.VectorEntry;
import gov.sandia.cognition.math.matrix.VectorFactory;
import gov.sandia.cognition.math.matrix.VectorFactoryContainer;
import gov.sandia.cognition.math.matrix.VectorFunction;
import gov.sandia.cognition.math.matrix.VectorOutputEvaluator;
import gov.sandia.cognition.util.AbstractCloneableSerializable;
import gov.sandia.cognition.util.ArgumentChecker;

/**
 * Implements a function that applies vector feature hashing. This is also known
 * as the "hashing trick". It takes in a  vector (typically sparse) and then 
 * applies hashing to the non-zero values of the vector to produce another 
 * vector, which is usually of a much lower dimensionality. It does this by 
 * hashing each index and then using the sign of the hashed value to either
 * increment or decrement the hashed index by the value.
 * 
 * @author  Justin Basilico
 * @since   3.4.2
 */
@PublicationReference(
    title="Feature Hashing for Large Scale Multitask Learning",
    author={"Kilian Weinberger", "Anirban Dasgupta", "Josh Attenberg", "John Langford", "Alex Smola"},
    year=2009,
    type=PublicationType.Conference,
    publication="Proceedings of the 26th Annual International Conference on Machine Learning (ICML)",
    url="http://arxiv.org/pdf/0902.2206.pdf")
public class FeatureHashing
    extends AbstractCloneableSerializable
    implements VectorFunction, VectorOutputEvaluator, 
        VectorFactoryContainer
{

    /** The default output dimensionality is {@value}. */
    public static final int DEFAULT_OUTPUT_DIMENSIONALITY = 100;

    /** The output size of the hash. */
    protected int outputDimensionality;
    
    /** The hashing function to use. */
    protected HashFunction hashFunction;
    
    /** Vector factory to use. */
    protected VectorFactory vectorFactory;
    
    /**
     * Creates a new {@link FeatureHashing}.
     */
    public FeatureHashing()
    {
        this(DEFAULT_OUTPUT_DIMENSIONALITY);
    }
    
    /**
     * Creates a new {@link FeatureHashing} with the given output size.
     * 
     * @param   outputDimensionality 
     *      The output dimensionality. Cannot be negative.
     */
    public FeatureHashing(
        final int outputDimensionality)
    {
        this(outputDimensionality, new Murmur32Hash(), 
            VectorFactory.getSparseDefault());
    }

    /**
     * Creates a new {@link FeatureHashing} with the given parameters.
     * 
     * @param   outputDimensionality 
     *      The output dimensionality. Cannot be negative.
     * @param hashFunction
     *      The hash function to use.
     * @param vectorFactory
     *      The vector factory to use.
     */
    public FeatureHashing(
        final int outputDimensionality,
        final HashFunction hashFunction,
        final VectorFactory vectorFactory)
    {
        super();
        
        this.setOutputDimensionality(outputDimensionality);
        this.setHashFunction(hashFunction);
        this.setVectorFactory(vectorFactory);
    }
    
    @Override
    public Vector evaluate(
        final Vector input)
    {
// TODO: For a sparse vector the insertion order may make a bad runtime for this. Think through other ways to build up the new vector.
// --jbasilico (2015-07-22)
        final Vector output = this.vectorFactory.createVectorCapacity(
            this.outputDimensionality, input.getEntryCount());
        for (final VectorEntry entry : input)
        {
            // Compute the hash for the current index to get the new one by
            // putting it into the output space size through the modulo 
            // operator.
            final int hashed = this.hash(entry.getIndex());
            final int newIndex = hashed % this.outputDimensionality;
            
            // Use the sign bit of the hash to determine if this value will
            // be treated as a positive or negaive update.
            if (hashed >= 0)
            {
                // Treat as positive.
                output.increment(newIndex, entry.getValue());
            }
            else
            {
                // Treat as negative. Also means the index will be negative,
                // so flip its sign to get the proper index.
                output.increment(-newIndex, -entry.getValue());
            }
        }
        return output;
    }
    
    /**
     * Applies the hashing function to the index.
     * 
     * @param   index
     *      The index to hash.
     * @return 
     *      The hashed value.
     */
    protected int hash(
        final int index)
    {
// TODO: Can we avoid creating the byte arrays?
// -- jbasilico (2015-10-30)
        final byte[] inputBytes = HashFunctionUtil.toByteArray(index);
        final byte[] hashedBytes = this.hashFunction.evaluate(inputBytes);
        return HashFunctionUtil.toInteger(hashedBytes);
    }

    @Override
    public int getOutputDimensionality()
    {
        return this.outputDimensionality;
    }

    /**
     * Sets the output dimensionality, which is the size of the output vector
     * that the input is hashed into.
     * 
     * @param   outputDimensionality 
     *      The output dimensionality. Cannot be negative.
     */
    public void setOutputDimensionality(
        final int outputDimensionality)
    {
        ArgumentChecker.assertIsNonNegative("outputDimensionality", outputDimensionality);
        this.outputDimensionality = outputDimensionality;
    }

    /**
     * Gets the hash function to use.
     * 
     * @return 
     *      The hash function to use.
     */
    public HashFunction getHashFunction()
    {
        return this.hashFunction;
    }

    /**
     * Gets the hash function to use.
     * 
     * @param hashFunction 
     *      The hash function to use.
     */
    public void setHashFunction(
        final HashFunction hashFunction)
    {
        this.hashFunction = hashFunction;
    }

    @Override
    public VectorFactory getVectorFactory()
    {
        return vectorFactory;
    }

    /**
     * Sets the vector factory to use.
     * 
     * @param   vectorFactory 
     *      The vector factory.
     */
    public void setVectorFactory(
        final VectorFactory vectorFactory)
    {
        this.vectorFactory = vectorFactory;
    }
    
}