
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document;
import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.DoubleValuesSource;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
/**
* {@link Field} that can be used to store static scoring factors into documents. This is mostly
* inspired by the work of Nick Craswell, Stephen Robertson, Hugo Zaragoza and Michael Taylor:
* Relevance weighting for query independent evidence. Proceedings of the 28th annual international
* ACM SIGIR conference on Research and development in information retrieval. August 15-19, 2005,
* Salvador, Brazil.
*
* <p>Feature values are internally encoded as term frequencies. Putting feature queries as {@link
* org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses of a {@link BooleanQuery} makes it
* possible to combine query-dependent scores (eg. BM25) with query-independent scores using a
* linear combination. The fact that feature values are stored as frequencies also allows search
* logic to efficiently skip documents that can't be competitive when total hit counts are not
* requested. This makes it a compelling option compared to storing such factors eg. in a
* doc-value field.
*
* <p>This field may only store factors that are positively correlated with the final score, like
* pagerank. In case of factors that are inversely correlated with the score, like url length, the
* inverse of the scoring factor should be stored, ie. {@code 1/urlLength}.
*
* <p>This field only considers the top 9 significant bits for storage efficiency, which makes it
* possible to store them on 16 bits internally. In practice this limitation means that values are
* stored with a relative precision of 2<sup>-8</sup> = 0.00390625.
*
* <p>Given a scoring factor {@code S > 0} and its weight {@code w > 0}, there are four ways that S
* can be turned into a score:
*
* <ul>
*   <li>{@link #newLogQuery w * log(a + S)}, with a ≥ 1. This function usually makes sense
*       because the distribution of scoring factors often follows a power law. This is typically
*       the case for pagerank for instance. However the paper suggested that the {@code satu} and
*       {@code sigm} functions give even better results.
*   <li>{@link #newSaturationQuery satu(S) = w * S / (S + k)}, with k > 0. This function is
*       similar to the one used by {@link BM25Similarity} in order to incorporate term frequency
*       into the final score and produces values between 0 and 1. A value of 0.5 is obtained when
*       S and k are equal.
*   <li>{@link #newSigmoidQuery sigm(S) = w * S<sup>a</sup> / (S<sup>a</sup> + k<sup>a</sup>)},
*       with k > 0, a > 0. This function provided even better results than the two above but
*       is also harder to tune due to the fact it has 2 parameters. Like with {@code satu}, values
*       are in the 0..1 range and 0.5 is obtained when S and k are equal.
*   <li>{@link #newLinearQuery w * S}. Expert: This function doesn't apply any transformation to
*       an indexed feature value, and the indexed value itself, multiplied by weight, determines
*       the score. Thus, there is an expectation that a feature value is encoded in the index in a
*       way that makes sense for scoring.
* </ul>
*
* <p>The constants in the above formulas typically need training in order to compute optimal
* values. If you don't know where to start, the {@link #newSaturationQuery(String, String)} method
* uses {@code 1f} as a weight and tries to guess a sensible value for the {@code pivot} parameter
* of the saturation function based on index statistics, which shouldn't perform too badly. Here is
* an example, assuming that documents have a {@link FeatureField} called 'features' with values
* for the 'pagerank' feature.
*
* <pre class="prettyprint">
* Query query = new BooleanQuery.Builder()
*     .add(new TermQuery(new Term("body", "apache")), Occur.SHOULD)
*     .add(new TermQuery(new Term("body", "lucene")), Occur.SHOULD)
*     .build();
* Query boost = FeatureField.newSaturationQuery("features", "pagerank");
* Query boostedQuery = new BooleanQuery.Builder()
*     .add(query, Occur.MUST)
*     .add(boost, Occur.SHOULD)
*     .build();
* TopDocs topDocs = searcher.search(boostedQuery, 10);
* </pre>
*
* @lucene.experimental
*/
public final class FeatureField extends Field {
private static final FieldType FIELD_TYPE = new FieldType();
private static final FieldType FIELD_TYPE_STORE_TERM_VECTORS = new FieldType();
static {
FIELD_TYPE.setTokenized(false);
FIELD_TYPE.setOmitNorms(true);
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
FIELD_TYPE_STORE_TERM_VECTORS.setTokenized(false);
FIELD_TYPE_STORE_TERM_VECTORS.setOmitNorms(true);
FIELD_TYPE_STORE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
FIELD_TYPE_STORE_TERM_VECTORS.setStoreTermVectors(true);
}
private float featureValue;
/**
* Create a feature.
*
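* <p>A minimal indexing sketch (hypothetical field and feature names, assuming an already-open
* {@code IndexWriter} named {@code writer}):
*
* <pre class="prettyprint">
* Document doc = new Document();
* doc.add(new TextField("body", "apache lucene", Field.Store.NO));
* doc.add(new FeatureField("features", "pagerank", 3.5f));
* writer.addDocument(doc);
* </pre>
*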
* @param fieldName The name of the field to store the information into. All features may be
* stored in the same field.
* @param featureName The name of the feature, eg. 'pagerank'. It will be indexed as a term.
* @param featureValue The value of the feature, must be a positive, finite, normal float.
*/
public FeatureField(String fieldName, String featureName, float featureValue) {
this(fieldName, featureName, featureValue, false);
}
/**
* Create a feature.
*
* @param fieldName The name of the field to store the information into. All features may be
* stored in the same field.
* @param featureName The name of the feature, eg. 'pagerank'. It will be indexed as a term.
* @param featureValue The value of the feature, must be a positive, finite, normal float.
* @param storeTermVectors Whether term vectors should be stored.
*/
public FeatureField(
String fieldName, String featureName, float featureValue, boolean storeTermVectors) {
super(fieldName, featureName, storeTermVectors ? FIELD_TYPE_STORE_TERM_VECTORS : FIELD_TYPE);
setFeatureValue(featureValue);
}
/** Update the feature value of this field. */
public void setFeatureValue(float featureValue) {
if (Float.isFinite(featureValue) == false) {
throw new IllegalArgumentException(
"featureValue must be finite, got: "
+ featureValue
+ " for feature "
+ fieldsData
+ " on field "
+ name);
}
if (featureValue < Float.MIN_NORMAL) {
throw new IllegalArgumentException(
"featureValue must be a positive normal float, got: "
+ featureValue
+ " for feature "
+ fieldsData
+ " on field "
+ name
+ " which is less than the minimum positive normal float: "
+ Float.MIN_NORMAL);
}
this.featureValue = featureValue;
}
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
FeatureTokenStream stream;
if (reuse instanceof FeatureTokenStream) {
stream = (FeatureTokenStream) reuse;
} else {
stream = new FeatureTokenStream();
}
int freqBits = Float.floatToIntBits(featureValue);
stream.setValues((String) fieldsData, freqBits >>> 15);
return stream;
}
/**
* This is useful if you have multiple features sharing a name and you want to take action to
* deduplicate them.
*
* @return the feature value of this field.
*/
public float getFeatureValue() {
return featureValue;
}
private static final class FeatureTokenStream extends TokenStream {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final TermFrequencyAttribute freqAttribute = addAttribute(TermFrequencyAttribute.class);
private boolean used = true;
private String value = null;
private int freq = 0;
private FeatureTokenStream() {}
/** Sets the values */
void setValues(String value, int freq) {
this.value = value;
this.freq = freq;
}
@Override
public boolean incrementToken() {
if (used) {
return false;
}
clearAttributes();
termAttribute.append(value);
freqAttribute.setTermFrequency(freq);
used = true;
return true;
}
@Override
public void reset() {
used = false;
}
@Override
public void close() {
value = null;
}
}
static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15;
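// Worked example of the round-trip encoding (illustrative values, not part of the API):
// Float.floatToIntBits(3.5f) == 0x40600000; tokenStream() shifts this right by 15 bits and
// indexes the term frequency 0x80C0, which fits in 16 bits. decodeFeatureValue(0x80C0) shifts
// back (0x80C0 << 15 == 0x40600000) and recovers exactly 3.5f, since 3.5 only needs 3 of the
// 9 significant bits that are preserved.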
static float decodeFeatureValue(float freq) {
if (freq > MAX_FREQ) {
// This is never used in practice but callers of the SimScorer API might
// occasionally call it on eg. Float.MAX_VALUE to compute the max score
// so we need to be consistent.
return Float.MAX_VALUE;
}
int tf = (int) freq; // lossless
int featureBits = tf << 15;
return Float.intBitsToFloat(featureBits);
}
abstract static class FeatureFunction {
abstract SimScorer scorer(float w);
abstract Explanation explain(String field, String feature, float w, int freq);
FeatureFunction rewrite(IndexSearcher indexSearcher) throws IOException {
return this;
}
}
static final class LinearFunction extends FeatureFunction {
@Override
SimScorer scorer(float w) {
return new SimScorer() {
@Override
public float score(float freq, long norm) {
return (w * decodeFeatureValue(freq));
}
};
}
@Override
Explanation explain(String field, String feature, float w, int freq) {
float featureValue = decodeFeatureValue(freq);
float score = scorer(w).score(freq, 1L);
return Explanation.match(
score,
"Linear function on the "
+ field
+ " field for the "
+ feature
+ " feature, computed as w * S from:",
Explanation.match(w, "w, weight of this function"),
Explanation.match(featureValue, "S, feature value"));
}
@Override
public String toString() {
return "LinearFunction";
}
@Override
public int hashCode() {
return getClass().hashCode();
}
@Override
public boolean equals(Object obj) {
if (obj == null || getClass() != obj.getClass()) {
return false;
}
return true;
}
}
static final class LogFunction extends FeatureFunction {
private final float scalingFactor;
LogFunction(float a) {
this.scalingFactor = a;
}
@Override
public boolean equals(Object obj) {
if (obj == null || getClass() != obj.getClass()) {
return false;
}
LogFunction that = (LogFunction) obj;
return scalingFactor == that.scalingFactor;
}
@Override
public int hashCode() {
return Float.hashCode(scalingFactor);
}
@Override
public String toString() {
return "LogFunction(scalingFactor=" + scalingFactor + ")";
}
@Override
SimScorer scorer(float weight) {
return new SimScorer() {
@Override
public float score(float freq, long norm) {
return (float) (weight * Math.log(scalingFactor + decodeFeatureValue(freq)));
}
};
}
@Override
Explanation explain(String field, String feature, float w, int freq) {
float featureValue = decodeFeatureValue(freq);
float score = scorer(w).score(freq, 1L);
return Explanation.match(
score,
"Log function on the "
+ field
+ " field for the "
+ feature
+ " feature, computed as w * log(a + S) from:",
Explanation.match(w, "w, weight of this function"),
Explanation.match(scalingFactor, "a, scaling factor"),
Explanation.match(featureValue, "S, feature value"));
}
}
static final class SaturationFunction extends FeatureFunction {
private final String field, feature;
private final Float pivot;
SaturationFunction(String field, String feature, Float pivot) {
this.field = field;
this.feature = feature;
this.pivot = pivot;
}
@Override
public FeatureFunction rewrite(IndexSearcher indexSearcher) throws IOException {
if (pivot != null) {
return super.rewrite(indexSearcher);
}
float newPivot = computePivotFeatureValue(indexSearcher, field, feature);
return new SaturationFunction(field, feature, newPivot);
}
@Override
public boolean equals(Object obj) {
if (obj == null || getClass() != obj.getClass()) {
return false;
}
SaturationFunction that = (SaturationFunction) obj;
return Objects.equals(field, that.field)
&& Objects.equals(feature, that.feature)
&& Objects.equals(pivot, that.pivot);
}
@Override
public int hashCode() {
return Objects.hash(field, feature, pivot);
}
@Override
public String toString() {
return "SaturationFunction(pivot=" + pivot + ")";
}
@Override
SimScorer scorer(float weight) {
if (pivot == null) {
throw new IllegalStateException("Rewrite first");
}
final float pivot = this.pivot; // unbox
return new SimScorer() {
@Override
public float score(float freq, long norm) {
float f = decodeFeatureValue(freq);
// should be f / (f + k) but we rewrite it to
// 1 - k / (f + k) to make sure it doesn't decrease
// with f in spite of rounding
return weight * (1 - pivot / (f + pivot));
}
};
}
@Override
Explanation explain(String field, String feature, float weight, int freq) {
float featureValue = decodeFeatureValue(freq);
float score = scorer(weight).score(freq, 1L);
return Explanation.match(
score,
"Saturation function on the "
+ field
+ " field for the "
+ feature
+ " feature, computed as w * S / (S + k) from:",
Explanation.match(weight, "w, weight of this function"),
Explanation.match(
pivot, "k, pivot feature value that would give a score contribution equal to w/2"),
Explanation.match(featureValue, "S, feature value"));
}
}
static final class SigmoidFunction extends FeatureFunction {
private final float pivot, a;
private final double pivotPa;
SigmoidFunction(float pivot, float a) {
this.pivot = pivot;
this.a = a;
this.pivotPa = Math.pow(pivot, a);
}
@Override
public boolean equals(Object obj) {
if (obj == null || getClass() != obj.getClass()) {
return false;
}
SigmoidFunction that = (SigmoidFunction) obj;
return pivot == that.pivot && a == that.a;
}
@Override
public int hashCode() {
int h = Float.hashCode(pivot);
h = 31 * h + Float.hashCode(a);
return h;
}
@Override
public String toString() {
return "SigmoidFunction(pivot=" + pivot + ", a=" + a + ")";
}
@Override
SimScorer scorer(float weight) {
return new SimScorer() {
@Override
public float score(float freq, long norm) {
float f = decodeFeatureValue(freq);
// should be f^a / (f^a + k^a) but we rewrite it to
// 1 - k^a / (f^a + k^a) to make sure it doesn't decrease
// with f in spite of rounding
return (float) (weight * (1 - pivotPa / (Math.pow(f, a) + pivotPa)));
}
};
}
@Override
Explanation explain(String field, String feature, float weight, int freq) {
float featureValue = decodeFeatureValue(freq);
float score = scorer(weight).score(freq, 1L);
return Explanation.match(
score,
"Sigmoid function on the "
+ field
+ " field for the "
+ feature
+ " feature, computed as w * S^a / (S^a + k^a) from:",
Explanation.match(weight, "w, weight of this function"),
Explanation.match(
pivot, "k, pivot feature value that would give a score contribution equal to w/2"),
Explanation.match(
a,
"a, exponent, higher values make the function grow slower before k and faster after k"),
Explanation.match(featureValue, "S, feature value"));
}
}
/**
* Given that IDFs are logs, similarities that incorporate term freq and document length in sane
* (ie. saturated) ways should have their score bounded by a log. So we reject weights that are
* too high as it would mean that this clause would completely dominate ranking, removing the need
* for query-dependent scores.
*/
private static final float MAX_WEIGHT = Long.SIZE;
/**
* Return a new {@link Query} that will score documents as {@code weight * S} where S is the value
* of the static feature.
*
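* <p>A usage sketch (hypothetical field and feature names):
*
* <pre class="prettyprint">
* Query boost = FeatureField.newLinearQuery("features", "pagerank", 0.3f);
* </pre>
*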
* @param fieldName field that stores features
* @param featureName name of the feature
* @param weight weight to give to this feature, must be in (0,64]
* @throws IllegalArgumentException if weight is not in (0,64]
*/
public static Query newLinearQuery(String fieldName, String featureName, float weight) {
if (weight <= 0 || weight > MAX_WEIGHT) {
throw new IllegalArgumentException(
"weight must be in (0, " + MAX_WEIGHT + "], got: " + weight);
}
Query q = new FeatureQuery(fieldName, featureName, new LinearFunction());
if (weight != 1f) {
q = new BoostQuery(q, weight);
}
return q;
}
/**
* Return a new {@link Query} that will score documents as {@code weight * Math.log(scalingFactor
* + S)} where S is the value of the static feature.
*
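* <p>A usage sketch (hypothetical field and feature names, weight 1 and scaling factor 4):
*
* <pre class="prettyprint">
* Query boost = FeatureField.newLogQuery("features", "pagerank", 1f, 4f);
* </pre>
*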
* @param fieldName field that stores features
* @param featureName name of the feature
* @param weight weight to give to this feature, must be in (0,64]
* @param scalingFactor scaling factor applied before taking the logarithm, must be in [1,
* +Infinity)
* @throws IllegalArgumentException if weight is not in (0,64] or scalingFactor is not in [1,
* +Infinity)
*/
public static Query newLogQuery(
String fieldName, String featureName, float weight, float scalingFactor) {
if (weight <= 0 || weight > MAX_WEIGHT) {
throw new IllegalArgumentException(
"weight must be in (0, " + MAX_WEIGHT + "], got: " + weight);
}
if (scalingFactor < 1 || Float.isFinite(scalingFactor) == false) {
throw new IllegalArgumentException("scalingFactor must be >= 1, got: " + scalingFactor);
}
Query q = new FeatureQuery(fieldName, featureName, new LogFunction(scalingFactor));
if (weight != 1f) {
q = new BoostQuery(q, weight);
}
return q;
}
/**
* Return a new {@link Query} that will score documents as {@code weight * S / (S + pivot)} where
* S is the value of the static feature.
*
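* <p>A usage sketch (hypothetical field and feature names, with an explicit pivot of 10):
*
* <pre class="prettyprint">
* Query boost = FeatureField.newSaturationQuery("features", "pagerank", 1f, 10f);
* </pre>
*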
* @param fieldName field that stores features
* @param featureName name of the feature
* @param weight weight to give to this feature, must be in (0,64]
* @param pivot feature value that would give a score contribution equal to weight/2, must be in
* (0, +Infinity)
* @throws IllegalArgumentException if weight is not in (0,64] or pivot is not in (0, +Infinity)
*/
public static Query newSaturationQuery(
String fieldName, String featureName, float weight, float pivot) {
return newSaturationQuery(fieldName, featureName, weight, Float.valueOf(pivot));
}
/**
* Same as {@link #newSaturationQuery(String, String, float, float)} but {@code 1f} is used as a
* weight and a reasonably good default pivot value is computed based on index statistics and is
* approximately equal to the geometric mean of all values that exist in the index.
*
* @param fieldName field that stores features
* @param featureName name of the feature
* @throws IllegalArgumentException if weight is not in (0,64] or pivot is not in (0, +Infinity)
*/
public static Query newSaturationQuery(String fieldName, String featureName) {
return newSaturationQuery(fieldName, featureName, 1f, null);
}
private static Query newSaturationQuery(
String fieldName, String featureName, float weight, Float pivot) {
if (weight <= 0 || weight > MAX_WEIGHT) {
throw new IllegalArgumentException(
"weight must be in (0, " + MAX_WEIGHT + "], got: " + weight);
}
if (pivot != null && (pivot <= 0 || Float.isFinite(pivot) == false)) {
throw new IllegalArgumentException("pivot must be > 0, got: " + pivot);
}
Query q =
new FeatureQuery(
fieldName, featureName, new SaturationFunction(fieldName, featureName, pivot));
if (weight != 1f) {
q = new BoostQuery(q, weight);
}
return q;
}
/**
* Return a new {@link Query} that will score documents as {@code weight * S^a / (S^a + pivot^a)}
* where S is the value of the static feature.
*
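* <p>A usage sketch (hypothetical field and feature names, pivot 10 and exponent 2):
*
* <pre class="prettyprint">
* Query boost = FeatureField.newSigmoidQuery("features", "pagerank", 1f, 10f, 2f);
* </pre>
*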
* @param fieldName field that stores features
* @param featureName name of the feature
* @param weight weight to give to this feature, must be in (0,64]
* @param pivot feature value that would give a score contribution equal to weight/2, must be in
* (0, +Infinity)
* @param exp exponent, higher values make the function grow slower before 'pivot' and faster
* after 'pivot', must be in (0, +Infinity)
* @throws IllegalArgumentException if weight is not in (0,64] or either pivot or exp is not in
* (0, +Infinity)
*/
public static Query newSigmoidQuery(
String fieldName, String featureName, float weight, float pivot, float exp) {
if (weight <= 0 || weight > MAX_WEIGHT) {
throw new IllegalArgumentException(
"weight must be in (0, " + MAX_WEIGHT + "], got: " + weight);
}
if (pivot <= 0 || Float.isFinite(pivot) == false) {
throw new IllegalArgumentException("pivot must be > 0, got: " + pivot);
}
if (exp <= 0 || Float.isFinite(exp) == false) {
throw new IllegalArgumentException("exp must be > 0, got: " + exp);
}
Query q = new FeatureQuery(fieldName, featureName, new SigmoidFunction(pivot, exp));
if (weight != 1f) {
q = new BoostQuery(q, weight);
}
return q;
}
/**
* Compute a feature value that may be used as the {@code pivot} parameter of the {@link
* #newSaturationQuery(String, String, float, float)} and {@link #newSigmoidQuery(String, String,
* float, float, float)} factory methods. The implementation takes the average of the int bits of
* the float representation in practice before converting it back to a float. Given that floats
* store the exponent in the higher bits, it means that the result will be an approximation of the
* geometric mean of all feature values.
*
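* <p>For instance (illustrative values): feature values 2.0 and 8.0 are indexed as term
* frequencies 0x8000 and 0x8200; the average frequency 0x8100 decodes back to 4.0, the geometric
* mean of 2.0 and 8.0.
*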
* @param searcher the {@link IndexSearcher} to perform the search
* @param featureField the field that stores features
* @param featureName the name of the feature
*/
static float computePivotFeatureValue(
IndexSearcher searcher, String featureField, String featureName) throws IOException {
Term term = new Term(featureField, featureName);
TermStates states = TermStates.build(searcher, term, true);
if (states.docFreq() == 0) {
// avoid division by 0
// The return value doesn't matter much here, the term doesn't exist,
// it will never be used for scoring. Just make sure to return a legal
// value.
return 1;
}
float avgFreq = (float) ((double) states.totalTermFreq() / states.docFreq());
return decodeFeatureValue(avgFreq);
}
/**
* Creates a SortField for sorting by the value of a feature.
*
* <p>This sort orders documents by descending value of a feature. The value returned in {@link
* FieldDoc} for the hits contains a Float instance with the feature value.
*
* <p>If a document is missing the field, then it is treated as having a value of {@code 0.0f}.
*
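* <p>A usage sketch (hypothetical field and feature names):
*
* <pre class="prettyprint">
* Sort sort = new Sort(FeatureField.newFeatureSort("features", "pagerank"));
* TopDocs topDocs = searcher.search(query, 10, sort);
* </pre>
*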
* @param field field name. Must not be null.
* @param featureName feature name. Must not be null.
* @return SortField ordering documents by the value of the feature
* @throws NullPointerException if {@code field} or {@code featureName} is null.
*/
public static SortField newFeatureSort(String field, String featureName) {
return new FeatureSortField(field, featureName);
}
/**
* Creates a {@link DoubleValuesSource} instance which can be used to read the values of a feature
* from a {@link FeatureField} for documents.
*
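* <p>A usage sketch (hypothetical field and feature names); the returned source can be passed to
* any API that accepts a {@link DoubleValuesSource}:
*
* <pre class="prettyprint">
* DoubleValuesSource pagerank = FeatureField.newDoubleValues("features", "pagerank");
* </pre>
*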
* @param field field name. Must not be null.
* @param featureName feature name. Must not be null.
* @return a {@link DoubleValuesSource} which can be used to access the values of the feature for
* documents
* @throws NullPointerException if {@code field} or {@code featureName} is null.
*/
public static DoubleValuesSource newDoubleValues(String field, String featureName) {
return new FeatureDoubleValuesSource(field, featureName);
}
}