All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.search.similarities.IndriDirichletSimilarity Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.similarities;

import java.util.List;
import java.util.Locale;
import org.apache.lucene.search.Explanation;

/**
 * Bayesian smoothing using Dirichlet priors as implemented in the Indri Search engine
 * (http://www.lemurproject.org/indri.php). Indri Dirichelet Smoothing!
 *
 * 
 * tf_E + mu*P(t|D) P(t|E)= documentLength + documentMu
 * mu*P(t|C) + tf_D where P(t|D)= doclen + mu
 * 
* *

A larger value for mu, produces more smoothing. Smoothing is most important for short * documents where the probabilities are more granular. */ public class IndriDirichletSimilarity extends LMSimilarity { /** The μ parameter. */ private final float mu; /** Instantiates the similarity with the provided μ parameter. */ public IndriDirichletSimilarity(CollectionModel collectionModel, float mu) { super(collectionModel); this.mu = mu; } /** Instantiates the similarity with the provided μ parameter. */ public IndriDirichletSimilarity(float mu) { this.mu = mu; } /** Instantiates the similarity with the default μ value of 2000. */ public IndriDirichletSimilarity(CollectionModel collectionModel) { this(collectionModel, 2000); } /** Instantiates the similarity with the default μ value of 2000. */ public IndriDirichletSimilarity() { this(new IndriCollectionModel(), 2000); } @Override protected double score(BasicStats stats, double freq, double docLen) { double collectionProbability = ((LMStats) stats).getCollectionProbability(); double score = (freq + (mu * collectionProbability)) / (docLen + mu); return (Math.log(score)); } @Override protected void explain(List subs, BasicStats stats, double freq, double docLen) { if (stats.getBoost() != 1.0f) { subs.add(Explanation.match(stats.getBoost(), "boost")); } subs.add(Explanation.match(mu, "mu")); double collectionProbability = ((LMStats) stats).getCollectionProbability(); Explanation weightExpl = Explanation.match( (float) Math.log((freq + (mu * collectionProbability)) / (docLen + mu)), "term weight"); subs.add(weightExpl); subs.add(Explanation.match((float) Math.log(mu / (docLen + mu)), "document norm")); super.explain(subs, stats, freq, docLen); } /** Returns the μ parameter. 
*/ public float getMu() { return mu; } public String getName() { return String.format(Locale.ROOT, "IndriDirichlet(%f)", getMu()); } /** * Models {@code p(w|C)} as the number of occurrences of the term in the collection, divided by * the total number of tokens {@code + 1}. */ public static class IndriCollectionModel implements CollectionModel { /** Sole constructor: parameter-free */ public IndriCollectionModel() {} @Override public double computeProbability(BasicStats stats) { return ((double) stats.getTotalTermFreq()) / ((double) stats.getNumberOfFieldTokens()); } @Override public String getName() { return null; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy