All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.similarities.DefaultSimilarity Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * COPIED FROM APACHE LUCENE 4.7.2
 *
 * Git URL: [email protected]:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
 *
 * (see https://issues.apache.org/jira/browse/OAK-10786 for details)
 */

package org.apache.lucene.search.similarities;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;

/**
 * Expert: Default scoring implementation which {@link #encodeNormValue(float)
 * encodes} norm values as a single byte before being stored. At search time,
 * the norm byte value is read from the index
 * {@link org.apache.lucene.store.Directory directory} and
 * {@link #decodeNormValue(long) decoded} back to a float norm value.
 * This encoding/decoding, while reducing index size, comes with the price of
 * precision loss - it is not guaranteed that decode(encode(x)) = x. For
 * instance, decode(encode(0.89)) = 0.75.
 * 

* Compression of norm values to a single byte saves memory at search time, * because once a field is referenced at search time, its norms - for all * documents - are maintained in memory. *

* The rationale supporting such lossy compression of norm values is that given * the difficulty (and inaccuracy) of users to express their true information * need by a query, only big differences matter.
*  
* Last, note that search time is too late to modify this norm part of * scoring, e.g. by using a different {@link Similarity} for search. */ public class DefaultSimilarity extends TFIDFSimilarity { /** Cache of decoded bytes. */ private static final float[] NORM_TABLE = new float[256]; static { for (int i = 0; i < 256; i++) { NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); } } /** Sole constructor: parameter-free */ public DefaultSimilarity() {} /** Implemented as overlap / maxOverlap. */ @Override public float coord(int overlap, int maxOverlap) { return overlap / (float)maxOverlap; } /** Implemented as 1/sqrt(sumOfSquaredWeights). */ @Override public float queryNorm(float sumOfSquaredWeights) { return (float)(1.0 / Math.sqrt(sumOfSquaredWeights)); } /** * Encodes a normalization factor for storage in an index. *

* The encoding uses a three-bit mantissa, a five-bit exponent, and the * zero-exponent point at 15, thus representing values from around 7x10^9 to * 2x10^-9 with about one significant decimal digit of accuracy. Zero is also * represented. Negative numbers are rounded up to zero. Values too large to * represent are rounded down to the largest representable value. Positive * values too small to represent are rounded up to the smallest positive * representable value. * * @see org.apache.lucene.document.Field#setBoost(float) * @see org.apache.lucene.util.SmallFloat */ @Override public final long encodeNormValue(float f) { return SmallFloat.floatToByte315(f); } /** * Decodes the norm value, assuming it is a single byte. * * @see #encodeNormValue(float) */ @Override public final float decodeNormValue(long norm) { return NORM_TABLE[(int) (norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127 } /** Implemented as * state.getBoost()*lengthNorm(numTerms), where * numTerms is {@link FieldInvertState#getLength()} if {@link * #setDiscountOverlaps} is false, else it's {@link * FieldInvertState#getLength()} - {@link * FieldInvertState#getNumOverlap()}. * * @lucene.experimental */ @Override public float lengthNorm(FieldInvertState state) { final int numTerms; if (discountOverlaps) numTerms = state.getLength() - state.getNumOverlap(); else numTerms = state.getLength(); return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms))); } /** Implemented as sqrt(freq). */ @Override public float tf(float freq) { return (float)Math.sqrt(freq); } /** Implemented as 1 / (distance + 1). */ @Override public float sloppyFreq(int distance) { return 1.0f / (distance + 1); } /** The default implementation returns 1 */ @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 1; } /** Implemented as log(numDocs/(docFreq+1)) + 1. */ @Override public float idf(long docFreq, long numDocs) { return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); } /** * True if overlap tokens (tokens with a position of increment of zero) are * discounted from the document's length. */ protected boolean discountOverlaps = true; /** Determines whether overlap tokens (Tokens with * 0 position increment) are ignored when computing * norm. By default this is true, meaning overlap * tokens do not count when computing norms. * * @lucene.experimental * * @see #computeNorm */ public void setDiscountOverlaps(boolean v) { discountOverlaps = v; } /** * Returns true if overlap tokens are discounted from the document's length. * @see #setDiscountOverlaps */ public boolean getDiscountOverlaps() { return discountOverlaps; } @Override public String toString() { return "DefaultSimilarity"; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy