// org.apache.lucene.search.similarities.DFISimilarity (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
/**
* Implements the Divergence from Independence (DFI) model based on Chi-square statistics
* (i.e., standardized Chi-squared distance from independence in term frequency tf).
*
* <p>DFI is both parameter-free and non-parametric:
*
* <ul>
*   <li>parameter-free: it does not require any parameter tuning or training.
*   <li>non-parametric: it does not make any assumptions about word frequency distributions on document collections.
* </ul>
*
* <p>It is highly recommended <b>not</b> to remove stopwords (very common terms: the, of, and, to, a, in, for, is, on, that, etc.) with this similarity.
*
* <p>For more information see: <i>A nonparametric term weighting method for information retrieval based on measuring the divergence from independence</i>
*
* @lucene.experimental
* @see org.apache.lucene.search.similarities.IndependenceStandardized
* @see org.apache.lucene.search.similarities.IndependenceSaturated
* @see org.apache.lucene.search.similarities.IndependenceChiSquared
*/
public class DFISimilarity extends SimilarityBase {
  /** Measure applied to the observed and expected frequencies inside {@link #score}. */
  private final Independence independence;

  /**
   * Create DFI with the specified divergence from independence measure
   *
   * @param independenceMeasure measure of divergence from independence; must not be {@code null}
   * @throws NullPointerException if {@code independenceMeasure} is {@code null}
   */
  public DFISimilarity(Independence independenceMeasure) {
    // Fail fast: a null measure would otherwise be stored silently and only surface
    // later, when score() first dereferences it.
    if (independenceMeasure == null) {
      throw new NullPointerException("independenceMeasure must not be null");
    }
    this.independence = independenceMeasure;
  }

  /**
   * Scores a document by how far the observed frequency exceeds the frequency expected from
   * the statistics in {@code stats}.
   *
   * <p>The expected frequency is computed as {@code (totalTermFreq + 1) * docLen /
   * (numberOfFieldTokens + 1)}. The score is {@code boost * log2(measure + 1)}, where
   * {@code measure} is the value returned by the configured {@link Independence} for the
   * observed and expected frequencies.
   *
   * @param stats source of the total term frequency, number of field tokens and boost
   * @param freq observed frequency
   * @param docLen document length used to scale the expected frequency
   * @return {@code 0} when {@code freq} does not exceed the expected frequency, otherwise the
   *     boosted, log-scaled divergence measure
   */
  @Override
  protected float score(BasicStats stats, float freq, float docLen) {
    // NOTE(review): both counts are incremented by one, presumably so the ratio stays
    // well-defined when the field has no tokens -- confirm against the referenced paper.
    final float expected = (stats.getTotalTermFreq() + 1) * docLen / (stats.getNumberOfFieldTokens() + 1);

    // if the observed frequency is less than or equal to the expected value, then return zero.
    if (freq <= expected) {
      return 0;
    }

    final float measure = independence.score(freq, expected);

    return stats.getBoost() * (float) log2(measure + 1);
  }

  /**
   * Returns the measure of independence
   *
   * @return the measure supplied to the constructor
   */
  public Independence getIndependence() {
    return independence;
  }

  /**
   * Returns {@code "DFI("} followed by the string form of the independence measure and a
   * closing parenthesis.
   */
  @Override
  public String toString() {
    return "DFI(" + independence + ")";
  }
}