
// org.apache.mahout.classifier.AbstractVectorClassifier (Maven / Gradle / Ivy)
// Show all versions of mahout-mr / Show documentation
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.classifier;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.Vector;
import com.google.common.base.Preconditions;
/**
* Defines the interface for classifiers that take a vector as input. This is
* implemented as an abstract class so that it can implement a number of handy
* convenience methods related to classification of vectors.
*
*
* A classifier takes an input vector and calculates the scores (usually
* probabilities) that the input vector belongs to one of {@code n}
* categories. In {@code AbstractVectorClassifier} each category is denoted
* by an integer {@code c} between {@code 0} and {@code n-1}
* (inclusive).
*
*
* New users should start by looking at {@link #classifyFull} (not {@link #classify}).
*
*/
public abstract class AbstractVectorClassifier {

  /** Minimum allowable log likelihood value. */
  public static final double MIN_LOG_LIKELIHOOD = -100.0;

  /**
   * Returns the number of categories that a target variable can be assigned to.
   * A vector classifier will encode its output as an integer from
   * {@code 0} to {@code numCategories()-1} (inclusive).
   *
   * @return The number of categories.
   */
  public abstract int numCategories();

  /**
   * Compute and return a vector containing {@code n-1} scores, where
   * {@code n} is equal to {@code numCategories()}, given an input
   * vector {@code instance}. Higher scores indicate that the input vector
   * is more likely to belong to that category. The categories are denoted by
   * the integers {@code 0} through {@code n-1} (inclusive), and the
   * scores in the returned vector correspond to categories 1 through
   * {@code n-1} (leaving out category 0). It is assumed that the score for
   * category 0 is one minus the sum of the scores in the returned vector.
   *
   * @param instance A feature vector to be classified.
   * @return A vector of probabilities in 1 of {@code n-1} encoding.
   */
  public abstract Vector classify(Vector instance);

  /**
   * Compute and return a vector of scores before applying the inverse link
   * function. For logistic regression and other generalized linear models, this
   * is just the linear part of the classification.
   *
   * <p>
   * The implementation of this method provided by {@code AbstractVectorClassifier} throws an
   * {@link UnsupportedOperationException}. Your subclass must explicitly override this method to support
   * this operation.
   *
   * @param features A feature vector to be classified.
   * @return A vector of scores. If transformed by the link function, these will become probabilities.
   * @throws UnsupportedOperationException always, unless overridden by the subclass.
   */
  public Vector classifyNoLink(Vector features) {
    throw new UnsupportedOperationException(this.getClass().getName()
        + " doesn't support classification without a link");
  }

  /**
   * Classifies a vector in the special case of a binary classifier where
   * {@link #classify(Vector)} would return a vector with only one element. As
   * such, using this method can avoid the allocation of a vector.
   *
   * @param instance The feature vector to be classified.
   * @return The score for category 1.
   *
   * @see #classify(Vector)
   */
  public abstract double classifyScalar(Vector instance);

  /**
   * Computes and returns a vector containing {@code n} scores, where
   * {@code n} is {@code numCategories()}, given an input vector
   * {@code instance}. Higher scores indicate that the input vector is more
   * likely to belong to the corresponding category. The categories are denoted
   * by the integers {@code 0} through {@code n-1} (inclusive).
   *
   * <p>
   * Using this method it is possible to classify an input vector, for example,
   * by selecting the category with the largest score. If
   * {@code classifier} is an instance of
   * {@code AbstractVectorClassifier} and {@code input} is a
   * {@code Vector} of features describing an element to be classified,
   * then the following code could be used to classify {@code input}.
   * <pre>{@code
   * Vector scores = classifier.classifyFull(input);
   * int assignedCategory = scores.maxValueIndex();
   * }</pre>
   * Here {@code assignedCategory} is the index of the category
   * with the maximum score.
   *
   * <p>
   * If an {@code n-1} encoding is acceptable, and allocation performance
   * is an issue, then the {@link #classify(Vector)} method is probably better
   * to use.
   *
   * @see #classify(Vector)
   * @see #classifyFull(Vector, Vector)
   *
   * @param instance A vector of features to be classified.
   * @return A vector of probabilities, one for each category.
   */
  public Vector classifyFull(Vector instance) {
    return classifyFull(new DenseVector(numCategories()), instance);
  }

  /**
   * Computes and returns a vector containing {@code n} scores, where
   * {@code n} is {@code numCategories()}, given an input vector
   * {@code instance}. Higher scores indicate that the input vector is more
   * likely to belong to the corresponding category. The categories are denoted
   * by the integers {@code 0} through {@code n-1} (inclusive). The
   * main difference between this method and {@link #classifyFull(Vector)} is
   * that this method allows a user to provide a previously allocated
   * {@code Vector r} to store the returned scores.
   *
   * <p>
   * Using this method it is possible to classify an input vector, for example,
   * by selecting the category with the largest score. If
   * {@code classifier} is an instance of
   * {@code AbstractVectorClassifier}, {@code result} is a non-null
   * {@code Vector}, and {@code input} is a {@code Vector} of
   * features describing an element to be classified, then the following code
   * could be used to classify {@code input}.
   * <pre>{@code
   * Vector scores = classifier.classifyFull(result, input); // Notice that scores == result
   * int assignedCategory = scores.maxValueIndex();
   * }</pre>
   * Here {@code assignedCategory} is the index of the category
   * with the maximum score.
   *
   * <p>
   * Entries {@code 1} through {@code n-1} of {@code r} are overwritten with the
   * result of {@link #classify(Vector)} and entry {@code 0} is set to one minus
   * the sum of those scores. Whatever {@code r} held before the call does not
   * influence the result, so the same vector may be reused across calls.
   *
   * @param r Where to put the results.
   * @param instance A vector of features to be classified.
   * @return A vector of scores/probabilities, one for each category.
   */
  public Vector classifyFull(Vector r, Vector instance) {
    Vector scores = classify(instance);
    r.viewPart(1, numCategories() - 1).assign(scores);
    // Sum the freshly computed scores rather than r itself: r may be a reused
    // buffer whose entry 0 still holds a value from an earlier call.
    r.setQuick(0, 1.0 - scores.zSum());
    return r;
  }

  /**
   * Returns n-1 probabilities, one for each of the categories 1 through
   * {@code n-1}, for each row of a matrix, where {@code n} is equal
   * to {@code numCategories()}. The probability of the missing 0-th
   * category is 1 - rowSum(this result).
   *
   * @param data The matrix whose rows are the input vectors to classify
   * @return A matrix of scores, one row per row of the input matrix, one column for each category
   *         except category 0.
   */
  public Matrix classify(Matrix data) {
    int rows = data.numRows();
    Matrix r = new DenseMatrix(rows, numCategories() - 1);
    for (int row = 0; row < rows; row++) {
      r.assignRow(row, classify(data.viewRow(row)));
    }
    return r;
  }

  /**
   * Returns a matrix where the rows of the matrix each contain {@code n} probabilities, one for each category.
   *
   * @param data The matrix whose rows are the input vectors to classify
   * @return A matrix of scores, one row per row of the input matrix, one column for each category.
   */
  public Matrix classifyFull(Matrix data) {
    int rows = data.numRows();
    Matrix r = new DenseMatrix(rows, numCategories());
    for (int row = 0; row < rows; row++) {
      // Write directly into the row view of the result matrix.
      classifyFull(r.viewRow(row), data.viewRow(row));
    }
    return r;
  }

  /**
   * Returns a vector of probabilities of category 1, one for each row
   * of a matrix. This only makes sense if there are exactly two categories, but
   * calling this method in that case can save a number of vector allocations.
   *
   * @param data The matrix whose rows are vectors to classify
   * @return A vector of scores, with one value per row of the input matrix.
   * @throws IllegalArgumentException if {@code numCategories()} is not 2.
   */
  public Vector classifyScalar(Matrix data) {
    Preconditions.checkArgument(numCategories() == 2, "Can only call classifyScalar with two categories");
    int rows = data.numRows();
    Vector r = new DenseVector(rows);
    for (int row = 0; row < rows; row++) {
      r.set(row, classifyScalar(data.viewRow(row)));
    }
    return r;
  }

  /**
   * Returns a measure of how good the classification for a particular example
   * actually is.
   *
   * <p>
   * With exactly two categories the score comes from {@link #classifyScalar(Vector)};
   * otherwise it comes from {@link #classify(Vector)}. Any {@code actual} that is not
   * positive is scored as category 0, whose probability is taken to be one minus the
   * (sum of the) other score(s).
   *
   * @param actual The correct category for the example.
   * @param data The vector to be classified.
   * @return The log likelihood of the correct answer as estimated by the current model. This will always be
   *         {@code <= 0} and larger (closer to 0) indicates better accuracy. In order to simplify code that
   *         maintains running averages, we bound this value at {@link #MIN_LOG_LIKELIHOOD} (-100).
   */
  public double logLikelihood(int actual, Vector data) {
    double logProbability;
    if (numCategories() == 2) {
      double p = classifyScalar(data);
      // log1p(-p) == log(1 - p), the log probability of category 0.
      logProbability = actual > 0 ? Math.log(p) : Math.log1p(-p);
    } else {
      Vector p = classify(data);
      // p holds categories 1..n-1, hence the index shift by one.
      logProbability = actual > 0 ? Math.log(p.get(actual - 1)) : Math.log1p(-p.zSum());
    }
    return Math.max(MIN_LOG_LIKELIHOOD, logProbability);
  }
}