org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.cf.taste.impl.similarity;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.common.Weighting;
import org.apache.mahout.cf.taste.model.DataModel;
import com.google.common.base.Preconditions;
/**
*
* An implementation of the Pearson correlation. For users X and Y, the following values are calculated:
*
*
*
* - sumX2: sum of the square of all X's preference values
* - sumY2: sum of the square of all Y's preference values
* - sumXY: sum of the product of X and Y's preference value for all items for which both X and Y express a
* preference
*
*
*
* The correlation is then:
*
*
* {@code sumXY / sqrt(sumX2 * sumY2)}
*
*
*
* Note that this correlation "centers" its data, shifts the user's preference values so that each of their
* means is 0. This is necessary to achieve expected behavior on all data sets.
*
*
*
* This correlation implementation is equivalent to the cosine similarity since the data it receives
* is assumed to be centered -- mean is 0. The correlation may be interpreted as the cosine of the angle
* between the two vectors defined by the users' preference values.
*
*
*
* For cosine similarity on uncentered data, see {@link UncenteredCosineSimilarity}.
*
*/
public final class PearsonCorrelationSimilarity extends AbstractSimilarity {
/**
* @throws IllegalArgumentException if {@link DataModel} does not have preference values
*/
public PearsonCorrelationSimilarity(DataModel dataModel) throws TasteException {
this(dataModel, Weighting.UNWEIGHTED);
}
/**
* @throws IllegalArgumentException if {@link DataModel} does not have preference values
*/
public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
super(dataModel, weighting, true);
Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
}
@Override
double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
if (n == 0) {
return Double.NaN;
}
// Note that sum of X and sum of Y don't appear here since they are assumed to be 0;
// the data is assumed to be centered.
double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2);
if (denominator == 0.0) {
// One or both parties has -all- the same ratings;
// can't really say much similarity under this measure
return Double.NaN;
}
return sumXY / denominator;
}
}