org.codelibs.elasticsearch.taste.similarity.EuclideanDistanceSimilarity Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.codelibs.elasticsearch.taste.similarity;
import org.codelibs.elasticsearch.taste.common.Weighting;
import org.codelibs.elasticsearch.taste.model.DataModel;
import com.google.common.base.Preconditions;
/**
*
* An implementation of a "similarity" based on the Euclidean "distance" between two users X and Y. Thinking
* of items as dimensions and preferences as points along those dimensions, a distance is computed using all
* items (dimensions) where both users have expressed a preference for that item. This is simply the square
* root of the sum of the squares of differences in position (preference) along each dimension.
*
* The similarity could be computed as 1 / (1 + distance), so the resulting values are in the range (0,1].
* This would weight against pairs that overlap in more dimensions, which should indicate more similarity,
* since more dimensions offer more opportunities to be farther apart. Actually, it is computed as
* sqrt(n) / (1 + distance), where n is the number of dimensions, in order to help correct for this.
* sqrt(n) is chosen since randomly-chosen points have a distance that grows as sqrt(n).
*
* Note that this could cause a similarity to exceed 1; such values are capped at 1.
*
* Note that the distance isn't normalized in any way; it's not valid to compare similarities computed from
* different domains (different rating scales, for example). Within one domain, normalizing doesn't matter much as
* it doesn't change ordering.
*/
public final class EuclideanDistanceSimilarity extends AbstractSimilarity {
/**
* @throws IllegalArgumentException if {@link DataModel} does not have preference values
*/
public EuclideanDistanceSimilarity(final DataModel dataModel) {
this(dataModel, Weighting.UNWEIGHTED);
}
/**
* @throws IllegalArgumentException if {@link DataModel} does not have preference values
*/
public EuclideanDistanceSimilarity(final DataModel dataModel,
final Weighting weighting) {
super(dataModel, weighting, false);
Preconditions.checkArgument(dataModel.hasPreferenceValues(),
"DataModel doesn't have preference values");
}
@Override
double computeResult(final int n, final double sumXY, final double sumX2,
final double sumY2, final double sumXYdiff2) {
return 1.0 / (1.0 + Math.sqrt(sumXYdiff2) / Math.sqrt(n));
}
}