org.apache.lucene.search.similarities.IBSimilarity Maven / Gradle / Ivy
Show all versions of aem-sdk-api Show documentation
/*
* COPIED FROM APACHE LUCENE 4.7.2
*
* Git URL: git@github.com:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
*
* (see https://issues.apache.org/jira/browse/OAK-10786 for details)
*/
package org.apache.lucene.search.similarities;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.similarities.Normalization.NoNormalization;
/**
* Provides a framework for the family of information-based models, as described
* in Stéphane Clinchant and Eric Gaussier. 2010. Information-based
* models for ad hoc IR. In Proceeding of the 33rd international ACM SIGIR
* conference on Research and development in information retrieval (SIGIR '10).
* ACM, New York, NY, USA, 234-241.
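* The retrieval function is of the form RSV(q, d) = ∑
* -x<sup>q</sup><sub>w</sub> log Prob(X<sub>w</sub> ≥
* t<sup>d</sup><sub>w</sub> | λ<sub>w</sub>), where
*
* - x<sup>q</sup><sub>w</sub> is the query boost;
* - X<sub>w</sub> is a random variable that counts the occurrences
* of word w;
* - t<sup>d</sup><sub>w</sub> is the normalized term frequency;
* - λ<sub>w</sub> is a parameter.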
*
*
* The framework described in the paper has many similarities to the DFR
* framework (see {@link DFRSimilarity}). It is possible that the two
* Similarities will be merged at one point.
* To construct an IBSimilarity, you must specify the implementations for
* all three components of the Information-Based model.
*
* - {@link Distribution}: Probabilistic distribution used to
* model term occurrence
*
* - {@link DistributionLL}: Log-logistic
* - {@link DistributionSPL}: Smoothed power-law
*
*
* - {@link Lambda}: λw parameter of the
* probability distribution
*
* - {@link LambdaDF}: <code>N<sub>w</sub>/N</code> or average
* number of documents where w occurs
* - {@link LambdaTTF}: <code>F<sub>w</sub>/N</code> or
* average number of occurrences of w in the collection
*
*
* - {@link Normalization}: Term frequency normalization
*
* - Any supported DFR normalization (listed in
* {@link DFRSimilarity})
*
*
*
* @see DFRSimilarity
* @lucene.experimental
*/
public class IBSimilarity extends SimilarityBase {
  /** The probabilistic distribution used to model term occurrence. */
  protected final Distribution distribution;
  /** The lambda (λ<sub>w</sub>) parameter. */
  protected final Lambda lambda;
  /** The term frequency normalization. */
  protected final Normalization normalization;

  /**
   * Creates IBSimilarity from the three components.
   * <p>
   * Note that {@code null} values are not allowed: if you want no
   * normalization, instead pass {@link NoNormalization}.
   *
   * @param distribution probabilistic distribution modeling term occurrence
   * @param lambda distribution's λ<sub>w</sub> parameter
   * @param normalization term frequency normalization
   * @throws NullPointerException if any of the three components is
   *         {@code null}
   */
  public IBSimilarity(Distribution distribution,
                      Lambda lambda,
                      Normalization normalization) {
    // Fail fast: a null component would otherwise only surface later as an
    // anonymous NullPointerException inside score(), explain() or toString().
    this.distribution = requireComponent(distribution, "distribution");
    this.lambda = requireComponent(lambda, "lambda");
    this.normalization = requireComponent(normalization, "normalization");
  }

  /**
   * Returns {@code component} unchanged, or throws a
   * {@link NullPointerException} naming the offending constructor argument.
   */
  private static <T> T requireComponent(T component, String name) {
    if (component == null) {
      throw new NullPointerException(name + " must not be null");
    }
    return component;
  }

  /**
   * Scores a single document: the total boost multiplied by the distribution's
   * score for the normalized term frequency and the lambda value.
   *
   * @param stats the statistics passed on to all three components
   * @param freq the raw term frequency, normalized via
   *        {@link Normalization#tfn} before scoring
   * @param docLen the document length used by the normalization
   * @return the boosted distribution score
   */
  @Override
  protected float score(BasicStats stats, float freq, float docLen) {
    return stats.getTotalBoost() *
        distribution.score(
            stats,
            normalization.tfn(stats, freq, docLen),
            lambda.lambda(stats));
  }

  /**
   * Appends the details of the score computation to {@code expl}: the boost
   * (only when it differs from 1), followed by the explanations of the
   * normalization, the lambda and the distribution.
   *
   * @param expl the explanation to extend with details
   * @param stats the statistics passed on to all three components
   * @param doc the document id (not used by this implementation)
   * @param freq the raw term frequency
   * @param docLen the document length
   */
  @Override
  protected void explain(
      Explanation expl, BasicStats stats, int doc, float freq, float docLen) {
    // A boost of exactly 1 does not change the score, so it is not listed.
    if (stats.getTotalBoost() != 1.0f) {
      expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
    }
    Explanation normExpl = normalization.explain(stats, freq, docLen);
    Explanation lambdaExpl = lambda.explain(stats);
    expl.addDetail(normExpl);
    expl.addDetail(lambdaExpl);
    // The distribution is explained using the values reported by the two
    // explanations above, i.e. the normalized tf and the lambda.
    expl.addDetail(distribution.explain(
        stats, normExpl.getValue(), lambdaExpl.getValue()));
  }

  /**
   * The name of IB methods follow the pattern
   * {@code IB <distribution>-<lambda><normalization>}. The name of the
   * distribution is the same as in the original paper; for the names of lambda
   * parameters, refer to the javadoc of the {@link Lambda} classes.
   */
  @Override
  public String toString() {
    return "IB " + distribution.toString() + "-" + lambda.toString()
        + normalization.toString();
  }

  /**
   * Returns the distribution
   */
  public Distribution getDistribution() {
    return distribution;
  }

  /**
   * Returns the distribution's lambda parameter
   */
  public Lambda getLambda() {
    return lambda;
  }

  /**
   * Returns the term frequency normalization
   */
  public Normalization getNormalization() {
    return normalization;
  }
}