All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.vectorizer.encoders;

import com.google.common.base.Charsets;
import com.google.common.collect.Sets;
import org.apache.mahout.math.MurmurHash;
import org.apache.mahout.math.Vector;

import java.util.Collections;
import java.util.Map;
import java.util.Set;

/**
 * General interface for objects that record features into a feature vector.
 * 

* By convention, sub-classes should provide a constructor that accepts just a field name as well as * setters to customize properties of the conversion such as adding tokenizers or a weight * dictionary. */ public abstract class FeatureVectorEncoder { protected static final int CONTINUOUS_VALUE_HASH_SEED = 1; protected static final int WORD_LIKE_VALUE_HASH_SEED = 100; private static final byte[] EMPTY_ARRAY = new byte[0]; private final String name; private int probes; private Map> traceDictionary; protected FeatureVectorEncoder(String name) { this(name, 1); } protected FeatureVectorEncoder(String name, int probes) { this.name = name; this.probes = probes; } /** * Adds a value expressed in string form to a vector. * * @param originalForm The original form of the value as a string. * @param data The vector to which the value should be added. */ public void addToVector(String originalForm, Vector data) { addToVector(originalForm, 1.0, data); } /** * Adds a value expressed in byte array form to a vector. * * @param originalForm The original form of the value as a byte array. * @param data The vector to which the value should be added. */ public void addToVector(byte[] originalForm, Vector data) { addToVector(originalForm, 1.0, data); } /** * Adds a weighted value expressed in string form to a vector. In some cases it is convenient to * use this method to encode continuous values using the weight as the value. In such cases, the * string value should typically be set to null. * * @param originalForm The original form of the value as a string. * @param weight The weight to be applied to this feature. * @param data The vector to which the value should be added. */ public void addToVector(String originalForm, double weight, Vector data) { addToVector(bytesForString(originalForm), weight, data); } public abstract void addToVector(byte[] originalForm, double weight, Vector data); /** * Provides the unique hash for a particular probe. 
For all encoders except text, this * is all that is needed and the default implementation of hashesForProbe will do the right * thing. For text and similar values, hashesForProbe should be over-ridden and this method * should not be used. * * @param originalForm The original byte array value * @param dataSize The length of the vector being encoded * @param name The name of the variable being encoded * @param probe The probe number * @return The hash of the current probe */ protected abstract int hashForProbe(byte[] originalForm, int dataSize, String name, int probe); /** * Returns all of the hashes for this probe. For most encoders, this is a singleton, but * for text, many hashes are returned, one for each word (unique or not). Most implementations * should only implement hashForProbe for simplicity. * * @param originalForm The original byte array value. * @param dataSize The length of the vector being encoded * @param name The name of the variable being encoded * @param probe The probe number * @return an Iterable of the hashes */ protected Iterable hashesForProbe(byte[] originalForm, int dataSize, String name, int probe) { return Collections.singletonList(hashForProbe(originalForm, dataSize, name, probe)); } protected double getWeight(byte[] originalForm, double w) { return 1.0; } // ******* Utility functions used by most implementations /** * Hash a string and an integer into the range [0..numFeatures-1]. * * @param term The string. * @param probe An integer that modifies the resulting hash. * @param numFeatures The range into which the resulting hash must fit. * @return An integer in the range [0..numFeatures-1] that has good spread for small changes in * term and probe. */ protected int hash(String term, int probe, int numFeatures) { long r = MurmurHash.hash64A(bytesForString(term), probe) % numFeatures; if (r < 0) { r += numFeatures; } return (int) r; } /** * Hash a byte array and an integer into the range [0..numFeatures-1]. * * @param term The bytes. 
* @param probe An integer that modifies the resulting hash. * @param numFeatures The range into which the resulting hash must fit. * @return An integer in the range [0..numFeatures-1] that has good spread for small changes in * term and probe. */ protected static int hash(byte[] term, int probe, int numFeatures) { long r = MurmurHash.hash64A(term, probe) % numFeatures; if (r < 0) { r += numFeatures; } return (int) r; } /** * Hash two strings and an integer into the range [0..numFeatures-1]. * * @param term1 The first string. * @param term2 The second string. * @param probe An integer that modifies the resulting hash. * @param numFeatures The range into which the resulting hash must fit. * @return An integer in the range [0..numFeatures-1] that has good spread for small changes in * term and probe. */ protected static int hash(String term1, String term2, int probe, int numFeatures) { long r = MurmurHash.hash64A(bytesForString(term1), probe); r = MurmurHash.hash64A(bytesForString(term2), (int) r) % numFeatures; if (r < 0) { r += numFeatures; } return (int) r; } /** * Hash two byte arrays and an integer into the range [0..numFeatures-1]. * * @param term1 The first string. * @param term2 The second string. * @param probe An integer that modifies the resulting hash. * @param numFeatures The range into which the resulting hash must fit. * @return An integer in the range [0..numFeatures-1] that has good spread for small changes in * term and probe. */ protected int hash(byte[] term1, byte[] term2, int probe, int numFeatures) { long r = MurmurHash.hash64A(term1, probe); r = MurmurHash.hash64A(term2, (int) r) % numFeatures; if (r < 0) { r += numFeatures; } return (int) r; } /** * Hash four strings and an integer into the range [0..numFeatures-1]. * * @param term1 The first string. * @param term2 The second string. * @param term3 The third string * @param term4 And the fourth. * @param probe An integer that modifies the resulting hash. 
* @param numFeatures The range into which the resulting hash must fit. * @return An integer in the range [0..numFeatures-1] that has good spread for small changes in * term and probe. */ protected int hash(String term1, String term2, String term3, String term4, int probe, int numFeatures) { long r = MurmurHash.hash64A(bytesForString(term1), probe); r = MurmurHash.hash64A(bytesForString(term2), (int) r) % numFeatures; r = MurmurHash.hash64A(bytesForString(term3), (int) r) % numFeatures; r = MurmurHash.hash64A(bytesForString(term4), (int) r) % numFeatures; if (r < 0) { r += numFeatures; } return (int) r; } /** * Converts a value into a form that would help a human understand the internals of how the value * is being interpreted. For text-like things, this is likely to be a list of the terms found * with associated weights (if any). * * @param originalForm The original form of the value as a string. * @return A string that a human can read. */ public abstract String asString(String originalForm); public int getProbes() { return probes; } /** * Sets the number of locations in the feature vector that a value should be in. * * @param probes Number of locations to increment. */ public void setProbes(int probes) { this.probes = probes; } public String getName() { return name; } protected boolean isTraceEnabled() { return traceDictionary != null; } protected void trace(String subName, int n) { if (traceDictionary != null) { String key = name; if (subName != null) { key = name + '=' + subName; } Set trace = traceDictionary.get(key); if (trace == null) { trace = Sets.newHashSet(n); traceDictionary.put(key, trace); } else { trace.add(n); } } } protected void trace(byte[] subName, int n) { trace(new String(subName, Charsets.UTF_8), n); } public void setTraceDictionary(Map> traceDictionary) { this.traceDictionary = traceDictionary; } protected static byte[] bytesForString(String x) { return x == null ? EMPTY_ARRAY : x.getBytes(Charsets.UTF_8); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy