org.apache.solr.search.facet.RelatednessAgg Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Apache Solr (module: core)
There is a newer version: 9.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search.facet;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.IntFunction;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.QParser;
import org.apache.solr.search.facet.SlotAcc.SweepableSlotAcc;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An aggregation function designed to be nested under other (possibly deeply nested) facets for the
 * purposes of computing the "relatedness" of facet buckets relative to "foreground" and
 * "background" sets -- primarily for the purpose of building "Semantic Knowledge Graphs"
 *
 * @see "The Semantic Knowledge Graph: A compact, auto-generated model for real-time traversal
 *     and ranking of any relationship within a domain"
 */
public class RelatednessAgg extends AggValueSource {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  // end user values
  private static final String RELATEDNESS = "relatedness";
  private static final String FG_POP = "foreground_popularity";
  private static final String BG_POP = "background_popularity";
  // name of the boolean local param read by setOpts to enable/disable sweep collection
  public static final String SWEEP_COLLECTION = "sweep_collection";

  // needed for distrib calculation
  private static final String FG_SIZE = "foreground_size";
  private static final String FG_COUNT = "foreground_count";
  private static final String BG_SIZE = "background_size";
  private static final String BG_COUNT = "background_count";

  // foreground & background queries; the constructor rejects null for either one
  protected final Query fgQ;
  protected final Query bgQ;
  // minimum popularity threshold; only overwritten by setOpts (from the "min_popularity" local
  // param) when the request is not a shard request
  protected double min_pop = 0.0D;
  // boxed on purpose: stays null until setOpts has been called
  private Boolean useSweep;

  // the aggregation's function name, passed to the super constructor
  public static final String NAME = RELATEDNESS;
  // value used by setOpts when the SWEEP_COLLECTION local param is absent
  private static final boolean DEFAULT_SWEEP_COLLECTION = true;

  public RelatednessAgg(Query fgQ, Query bgQ) {
    super(NAME);
    // NOTE: ideally we don't want to assume any defaults *yet* if fgQ/bgQ are null
    // keep them null until it's time to created a SlotAcc, at which point we might inherit values
    // from an ancestor facet context w/same key -- see comments in createSlotAcc
    this.fgQ = fgQ;
    this.bgQ = bgQ;

    // TODO: defaults not supported yet -- see comments in createSlotAcc
    if (null == fgQ || null == bgQ) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          NAME
              + " aggregate function requires both foreground & background "
              + "to be real (non-null) queries");
    }
  }

  /**
   * Configures the optional settings of this aggregation from the local params of the parser.
   *
   * <p>Without local params only the sweep default is applied. Otherwise the sweep flag is read
   * from {@link #SWEEP_COLLECTION}, and <code>min_popularity</code> is read unless the request is
   * a shard request.
   *
   * @param parser the parser supplying the request params and the local params
   */
  public void setOpts(QParser parser) {
    final boolean shardRequest = parser.getReq().getParams().getBool(ShardParams.IS_SHARD, false);
    final SolrParams localParams = parser.getLocalParams();

    if (localParams == null) {
      this.useSweep = DEFAULT_SWEEP_COLLECTION;
      return;
    }

    this.useSweep = localParams.getBool(SWEEP_COLLECTION, DEFAULT_SWEEP_COLLECTION);
    if (shardRequest) {
      return; // ignore min_pop if this is a shard request
    }
    this.min_pop = localParams.getDouble("min_popularity", 0.0D);
  }

  /**
   * Describes this aggregation as its name followed by the foreground query, background query,
   * minimum popularity and sweep flag.
   */
  @Override
  public String description() {
    // TODO: need better output processing when we start supporting null fgQ/bgQ in constructor
    final StringBuilder sb = new StringBuilder();
    sb.append(name);
    sb.append("(fgQ=").append(fgQ);
    sb.append(",bgQ=").append(bgQ);
    sb.append(",min_pop=").append(min_pop);
    sb.append(",useSweep=").append(useSweep);
    sb.append(")");
    return sb.toString();
  }

  /**
   * Two aggregations are equal when their foreground query, background query and minimum
   * popularity are equal. The sweep flag is not part of the comparison.
   *
   * <p>The minimum popularity is compared with {@link Double#compare(double, double)} rather than
   * {@code ==}, so the result agrees with {@link #hashCode()}: {@code 0.0} and {@code -0.0} hash
   * differently and are therefore not equal, and a {@code NaN} value is equal to itself.
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    // NOTE(review): hashCode() also mixes in getClass(), so a subclass instance that compares
    // equal here would still hash differently -- confirm whether subclasses are expected
    if (!(o instanceof RelatednessAgg)) {
      return false;
    }
    RelatednessAgg that = (RelatednessAgg) o;
    return Objects.equals(fgQ, that.fgQ)
        && Objects.equals(bgQ, that.bgQ)
        && Double.compare(min_pop, that.min_pop) == 0;
  }

  /** Hashes the runtime class, both queries and the minimum popularity. */
  @Override
  public int hashCode() {
    final Object[] parts = {getClass(), fgQ, bgQ, min_pop};
    return Arrays.hashCode(parts);
  }

  /**
   * Not supported by this aggregation.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public FunctionValues getValues(Map context, LeafReaderContext readerContext)
      throws IOException {
    final String msg = "NOT IMPLEMENTED " + name + " " + this;
    throw new UnsupportedOperationException(msg);
  }

  /**
   * Builds the accumulator for this aggregation.
   *
   * <p>The foreground set is the intersection of the foreground query with the filter of the
   * given context and of each ancestor context, walking up until a context without a filter is
   * reached. The background set is resolved from the background query alone.
   *
   * @param fcontext the facet context this aggregation is evaluated in
   * @param numDocs not used by this implementation
   * @param numSlots the initial number of slots of the accumulator
   * @return a new {@link SKGSlotAcc} over the resolved foreground and background sets
   */
  @Override
  public SlotAcc createSlotAcc(FacetContext fcontext, long numDocs, int numSlots)
      throws IOException {
    // TODO: Ideally this is where we should check fgQ/bgQ for 'null' and apply defaults...
    //
    // we want to walk up the fcontext and inherit the queries from any ancestor SKGAgg
    // with the same "key" that we have in our own context -- and as a last resort use
    // "$q" for the foreground and "*:*" for the bgQ (if no ancestors)
    // (Hmmm... or maybe we should use the "Domain" of our FacetRequest as the default bg?)
    //
    // How do we find out what key we have in the current context?
    // loop over all the stats in the current context until we find one that's '==' to this???

    final List<Query> fgFilters = new ArrayList<>(3);
    fgFilters.add(fgQ);
    for (FacetContext ctx = fcontext; ctx != null; ctx = ctx.parent) {
      if (null != ctx.filter) {
        fgFilters.add(ctx.filter);
      } else {
        // sanity check...
        // the only way the filter on the current context should be null is...
        assert
        // 1) it's the actual top most context,
        //    (ie: the func is directly used w/o being nested under a facet)
        (null == ctx.parent && fcontext == ctx)
            ||
            // 2) it's a child of the top most context
            //    (ie: the context of a top level facet)
            (null == ctx.parent.parent && null == ctx.parent.filter);
        // either way, no reason to keep looping up the (0 or 1) remaining ancestors
        // (which is why #1 can assert '&& fcontext == ctx')
        break;
      }
    }

    DocSet fgSet = fcontext.searcher.getDocSet(fgFilters);
    DocSet bgSet = fcontext.searcher.getDocSet(bgQ);
    return new SKGSlotAcc(this, fcontext, numSlots, fgSet, bgSet);
  }

  /**
   * Creates the merger for this aggregation.
   *
   * @param prototype not used by this implementation
   * @return a new {@code Merger} bound to this aggregation
   */
  @Override
  public FacetMerger createFacetMerger(Object prototype) {
    final Merger merger = new Merger(this);
    return merger;
  }

  /**
   * Read-only accumulator used when relatedness is computed by sweep collection.
   *
   * <p>It never collects documents itself (both {@code collect} methods throw). Instead it derives
   * relatedness lazily, per slot, from the foreground and background counts exposed by the two
   * {@link ReadOnlyCountSlotAcc} instances handed to the constructor, and caches the result.
   */
  private static final class SweepSKGSlotAcc extends SlotAcc {

    private static final int NO_ALL_BUCKETS = -2;
    private static final int ALL_BUCKETS_UNINITIALIZED = -1;

    private final int minCount; // pre-calculate for a given min_popularity
    private final long fgSize;
    private final long bgSize;
    private final ReadOnlyCountSlotAcc fgCount;
    private final ReadOnlyCountSlotAcc bgCount;

    /** Per-slot cache of computed relatedness; NaN marks a slot that is not computed yet. */
    private double[] relatedness;

    // we can't get the allBuckets info from the slotContext in collect(), b/c the whole point of
    // sweep collection is that the "collect" methods aren't called. So this is the compromise: note
    // in construction either that we're using a processor w/ NO_ALL_BUCKETS or that we don't know
    // the bucket yet (ALL_BUCKETS_UNINITIALIZED) and fill it in getValue where we can check
    // against the processor
    private int allBucketsSlot;

    /**
     * @param minPopularity minimum popularity; converted once into a minimum count relative to
     *     {@code bgSize}
     * @param fcontext the facet context this accumulator belongs to
     * @param numSlots the initial number of slots
     * @param fgSize size of the foreground set
     * @param bgSize size of the background set
     * @param fgCount per-slot counts against the foreground set
     * @param bgCount per-slot counts against the background set
     */
    public SweepSKGSlotAcc(
        double minPopularity,
        FacetContext fcontext,
        int numSlots,
        long fgSize,
        long bgSize,
        ReadOnlyCountSlotAcc fgCount,
        ReadOnlyCountSlotAcc bgCount) {
      super(fcontext);
      this.minCount = (int) Math.ceil(minPopularity * bgSize);
      this.fgSize = fgSize;
      this.bgSize = bgSize;
      this.fgCount = fgCount;
      this.bgCount = bgCount;
      this.relatedness = new double[numSlots];
      Arrays.fill(this.relatedness, Double.NaN);

      // any processor that can (currently) result in the use of SweepSKGSlotAcc *should* be a
      // FacetFieldProcessor -- but don't assume that will always be true...
      this.allBucketsSlot = NO_ALL_BUCKETS;
      if (fcontext.processor instanceof FacetFieldProcessor
          // NOTE: if this instanceof/cast changes, getValue needs updated as well
          && ((FacetFieldProcessor) fcontext.processor).freq.allBuckets) {
        this.allBucketsSlot = ALL_BUCKETS_UNINITIALIZED;
      }
    }

    /**
     * Not supported: this accumulator only reads counts gathered by sweeping.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void collect(int perSegDocId, int slot, IntFunction<SlotContext> slotContext)
        throws IOException {
      throw new UnsupportedOperationException(
          "collect() not supported, this SlotAcc impl only usable for sweeping");
    }

    /**
     * Not supported: this accumulator only reads counts gathered by sweeping.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public int collect(DocSet docs, int slot, IntFunction<SlotContext> slotContext)
        throws IOException {
      throw new UnsupportedOperationException(
          "collect() not supported, this SlotAcc impl only usable for sweeping");
    }

    /** Returns the relatedness of the slot, computing and caching it on first use. */
    private double getRelatedness(int slot) {
      final double cachedRelatedness = relatedness[slot];
      if (!Double.isNaN(cachedRelatedness)) {
        return cachedRelatedness;
      }
      final long fg_count = fgCount.getCount(slot);
      final long bg_count = bgCount.getCount(slot);
      // if min_pop is configured, and either (fg|bg) popularity is lower than that value
      // then relatedness is -Infinity so it sorts below any "valid" relatedness scores
      if (minCount > 0 && (fg_count < minCount || bg_count < minCount)) {
        return relatedness[slot] = Double.NEGATIVE_INFINITY;
      }
      return relatedness[slot] = computeRelatedness(fg_count, fgSize, bg_count, bgSize);
    }

    /** Orders slots by relatedness, then foreground count, then background count. */
    @Override
    public int compare(int slotA, int slotB) {
      int r = Double.compare(getRelatedness(slotA), getRelatedness(slotB));
      if (0 == r) {
        r = Long.compare(fgCount.getCount(slotA), fgCount.getCount(slotB));
      }
      if (0 == r) {
        r = Long.compare(bgCount.getCount(slotA), bgCount.getCount(slotB));
      }
      return r;
    }

    /**
     * Returns the externalized bucket data of the slot. The allBuckets slot (if the processor has
     * one) yields an implied, empty bucket; every other slot yields its counts, sizes and
     * relatedness.
     */
    @Override
    public Object getValue(int slotNum) {
      final BucketData slotVal;
      if (NO_ALL_BUCKETS != allBucketsSlot) {
        // there's no reason why a processor should be resizing SlotAccs in the middle of getValue,
        // but we're going to be vigilant against that possibility just in case...
        if (ALL_BUCKETS_UNINITIALIZED == allBucketsSlot || allBucketsSlot == slotNum) {
          assert fcontext.processor instanceof FacetFieldProcessor
              : "code changed, non FacetFieldProcessor sweeping w/allBuckets?!?";
          allBucketsSlot = ((FacetFieldProcessor) fcontext.processor).allBucketsAcc.collectAccSlot;
        }
      }
      if (slotNum == allBucketsSlot) {
        slotVal = new BucketData(null);
      } else {
        slotVal =
            new BucketData(
                fgCount.getCount(slotNum),
                fgSize,
                bgCount.getCount(slotNum),
                bgSize,
                getRelatedness(slotNum));
      }
      return slotVal.externalize(fcontext.isShard());
    }

    /** Clears the relatedness cache and forgets the allBuckets slot (if one is in use). */
    @Override
    public void reset() throws IOException {
      Arrays.fill(relatedness, Double.NaN);
      if (allBucketsSlot != NO_ALL_BUCKETS) {
        allBucketsSlot = ALL_BUCKETS_UNINITIALIZED;
      }
    }

    /** Resizes the relatedness cache, marking new slots as not yet computed. */
    @Override
    public void resize(Resizer resizer) {
      relatedness = resizer.resize(relatedness, Double.NaN);
    }

    /** Releases the relatedness cache. */
    @Override
    public void close() throws IOException {
      relatedness = null;
    }
  }

  // NOTE(review): not referenced in the visible part of this file -- presumably the key under
  // which a bucket's "implied" flag is written/read when bucket data is externalized; confirm
  private static final String IMPLIED_KEY = "implied";

  /**
   * Accumulator that computes relatedness per slot by intersecting the slot's doc set with the
   * foreground and background sets. May replace itself with a {@link SweepSKGSlotAcc} when sweep
   * collection is in use.
   */
  private static final class SKGSlotAcc extends SlotAcc implements SweepableSlotAcc<SlotAcc> {
    private final RelatednessAgg agg;
    /** Per-slot bucket data; null marks a slot that has not been processed. */
    private BucketData[] slotvalues;

    private final DocSet fgSet;
    private final DocSet bgSet;
    private final long fgSize;
    private final long bgSize;

    /**
     * @param agg the aggregation this accumulator computes
     * @param fcontext the facet context this accumulator belongs to
     * @param numSlots the initial number of slots
     * @param fgSet the foreground doc set
     * @param bgSet the background doc set
     */
    public SKGSlotAcc(
        final RelatednessAgg agg,
        final FacetContext fcontext,
        final int numSlots,
        final DocSet fgSet,
        final DocSet bgSet)
        throws IOException {
      super(fcontext);
      this.agg = agg;
      this.fgSet = fgSet;
      this.bgSet = bgSet;
      // cache the set sizes for frequent re-use on every slot
      this.fgSize = fgSet.size();
      this.bgSize = bgSet.size();
      // TODO: avoid initializing array until we know we're not doing sweep collection?
      this.slotvalues = new BucketData[numSlots];
      reset();
    }

    /**
     * If called, may register SweepingAccs for fg and bg set based on whether user indicated
     * sweeping should be used (default)
     *
     * @return null if any SweepingAccs were registered since no other collection is needed for
     *     relatedness, otherwise this accumulator
     */
    @Override
    public SKGSlotAcc registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc) {
      // useSweep stays null until setOpts has been called on the agg; fall back to the default
      // instead of failing on unboxing
      final Boolean requested = this.agg.useSweep;
      final boolean sweep = (null == requested) ? DEFAULT_SWEEP_COLLECTION : requested;
      if (!sweep) {
        return this;
      }
      final ReadOnlyCountSlotAcc fgCount =
          baseSweepingAcc.add(key + "!fg", fgSet, slotvalues.length);
      final ReadOnlyCountSlotAcc bgCount =
          baseSweepingAcc.add(key + "!bg", bgSet, slotvalues.length);
      SweepSKGSlotAcc readOnlyReplacement =
          new SweepSKGSlotAcc(
              agg.min_pop, fcontext, slotvalues.length, fgSize, bgSize, fgCount, bgCount);
      readOnlyReplacement.key = key;
      baseSweepingAcc.registerMapping(this, readOnlyReplacement);
      return null;
    }

    /**
     * Creates the bucket data of the slot and, unless the slot is the allBuckets slot, fills in
     * the set sizes and the intersection counts against the foreground and background sets.
     */
    private void processSlot(int slot, IntFunction<SlotContext> slotContext) throws IOException {

      assert null != slotContext;

      final BucketData slotVal = new BucketData(agg);
      slotvalues[slot] = slotVal;

      final SlotContext ctx = slotContext.apply(slot);
      if (ctx.isAllBuckets()) {
        // relatedness is meaningless for allBuckets (see SOLR-14467)
        // our current (implied & empty) BucketData is all we need
        //
        // NOTE: it might be tempting to use 'slotvalues[slot] = null' in this case
        // since getValue() will also ultimately generate an implied bucket in that case,
        // but by using a non-null bucket we let collect(int,...) know it doesn't need to keep
        // calling processSlot over and over.
        return;
      }

      final Query slotQ = ctx.getSlotQuery();
      final DocSet slotSet;
      if (null == slotQ) {
        // extremely special edge case...
        // the only way this should be possible is if our relatedness() function is used as a "top
        // level" stat w/o being nested under any facet, in which case it should be a FacetQuery
        // w/no parent...
        assert fcontext.processor.freq instanceof FacetQuery : fcontext.processor.freq;
        assert null == fcontext.parent;
        assert null == fcontext.filter;
        // ...and in which case we should just use the current base
        slotSet = fcontext.base;
      } else {
        slotSet = fcontext.searcher.getDocSet(slotQ);
      }

      slotVal.incSizes(fgSize, bgSize);
      slotVal.incCounts(fgSet.intersectionSize(slotSet), bgSet.intersectionSize(slotSet));
    }

    /** Processes the slot the first time it is seen; the individual doc is ignored. */
    @Override
    public void collect(int perSegDocId, int slot, IntFunction<SlotContext> slotContext)
        throws IOException {
      // NOTE: we don't actually care about the individual docs being collected
      // (the only reason we even bother implementing this method is because it's needed for sorting
      // buckets by a function)

      // so we only worry about ensuring that every "slot" / bucket is processed the first time
      // we're asked about it...
      if (null == slotvalues[slot]) {
        processSlot(slot, slotContext);
      }
    }

    /**
     * Processes the slot exactly once; the collected doc set itself is ignored.
     *
     * @return the size of {@code docs}
     */
    @Override
    public int collect(DocSet docs, int slot, IntFunction<SlotContext> slotContext)
        throws IOException {
      // NOTE: we don't actually care about the doc set being collected for the bucket
      // so we only worry about ensuring that every "slot" / bucket is processed exactly once

      // if we're doing bulk collection, we better not be getting asked to re-use slots
      assert null == slotvalues[slot];
      processSlot(slot, slotContext);

      // we don't do any filtering, we collect the whole docset, so return that as our collected
      // count (as a stat, we're actually required to return this by assertions in
      // FacetFieldProcessor.processStats)
      return docs.size();
    }

    /** Orders two collected slots by their bucket data. */
    @Override
    public int compare(int slotA, int slotB) {
      final BucketData a = slotvalues[slotA];
      final BucketData b = slotvalues[slotB];

      // we initialize & reset() (unused) slotvalues elements to null
      // but we should never be asked to compare a slot that hasn't been collected...
      assert null != a;
      assert null != b;
      return a.compareTo(b);
    }

    /**
     * Returns the externalized bucket data of the slot, substituting an implied bucket that only
     * carries the fg/bg sizes when the slot was never collected.
     */
    @Override
    public Object getValue(int slotNum) {
      BucketData slotVal = slotvalues[slotNum];
      if (null == slotVal) {
        // since we haven't collected any docs for this slot, use an (implied) slot w/no counts,
        // just the known fg/bg sizes. (this is most likely a refinement request for a bucket we
        // don't have)
        slotVal = new BucketData(agg);
        slotVal.incSizes(fgSize, bgSize);
      }

      return slotVal.externalize(fcontext.isShard());
    }

    /** Marks every slot as not processed. */
    @Override
    public void reset() {
      Arrays.fill(slotvalues, null);
    }

    /** Resizes the slot array, marking new slots as not processed. */
    @Override
    public void resize(Resizer resizer) {
      slotvalues = resizer.resize(slotvalues, null);
    }

    /** Releases the slot array. */
    @Override
    public void close() throws IOException {
      slotvalues = null;
    }
  }

  /**
   * Encapsulates all data needed for a single bucket/slot
   *
   * @see SKGSlotAcc
   * @see Merger
   */
  private static class BucketData implements Comparable {
    // the owning aggregation; only assigned by the single-argument constructor, so it stays null
    // for buckets built with the five-argument constructor
    private RelatednessAgg agg;
    // sizes of the foreground/background sets, accumulated via incSizes
    private long fg_size = 0;
    private long bg_size = 0;
    // counts of this bucket within the foreground/background sets, accumulated via incCounts
    private long fg_count = 0;
    private long bg_count = 0;

    /**
     * Buckets are implied until/unless counts are explicitly incremented (even if those counts are
     * 0) An implied bucket means we have no real data for it -- it may be useful for a per-Shard
     * request to return "size" info of a bucket that doesn't exist on the current shard, or it may
     * represent the allBuckets bucket.
     *
     * @see #incCounts
     */
    private boolean implied;

    /**
     * NaN indicates that all derived values need (re)-computed
     *
     * @see #computeDerivedValues
     * @see #getRelatedness
     */
    private double relatedness = Double.NaN;

    /**
     * Derived value: foreground count divided by the background size.
     *
     * @see #computeDerivedValues
     * @see #getForegroundPopularity
     */
    private double fg_pop;

    /**
     * Derived value: background count divided by the background size.
     *
     * @see #computeDerivedValues
     * @see #getBackgroundPopularity
     */
    private double bg_pop;

    /**
     * Creates an implied bucket (no counts, no sizes) owned by the given aggregation.
     *
     * @param agg the owning aggregation
     */
    public BucketData(final RelatednessAgg agg) {
      this.implied = true;
      this.agg = agg;
    }

    public BucketData(
        long fg_count, long fg_size, long bg_count, long bg_size, double relatedness) {
      this.fg_count = fg_count;
      this.fg_size = fg_size;
      this.fg_pop = (double) fg_count / bg_size; // yes, BACKGROUND size is intentional
      this.bg_count = bg_count;
      this.bg_size = bg_size;
      this.bg_pop = (double) bg_count / bg_size;
      this.relatedness = relatedness;
    }

    /**
     * Adds to the foreground and background counts of this bucket. The bucket stops being implied
     * and any cached derived values are invalidated.
     *
     * @param fgInc amount added to the foreground count
     * @param bgInc amount added to the background count
     */
    public void incCounts(final long fgInc, final long bgInc) {
      this.fg_count = this.fg_count + fgInc;
      this.bg_count = this.bg_count + bgInc;
      // NaN forces the derived values to be recomputed on next access
      this.relatedness = Double.NaN;
      this.implied = false;
    }

    /**
     * Adds to the foreground and background sizes of this bucket and invalidates any cached
     * derived values.
     *
     * @param fgInc amount added to the foreground size
     * @param bgInc amount added to the background size
     */
    public void incSizes(final long fgInc, final long bgInc) {
      this.fg_size = this.fg_size + fgInc;
      this.bg_size = this.bg_size + bgInc;
      // NaN forces the derived values to be recomputed on next access
      this.relatedness = Double.NaN;
    }

    /** Hashes the runtime class, the implied flag, all counts and sizes, and the aggregation. */
    @Override
    public int hashCode() {
      final Object[] parts = {
        this.getClass(), implied, fg_count, bg_count, fg_size, bg_size, agg
      };
      return Arrays.hashCode(parts);
    }

    /**
     * Two buckets are equal when their implied flag, counts, sizes and owning aggregation are
     * equal.
     */
    @Override
    public boolean equals(Object other) {
      if (!(other instanceof BucketData)) {
        return false;
      }
      final BucketData o = (BucketData) other;
      // we will most certainly be compared to other buckets of the same Agg instance, so the
      // cheap primitive comparisons come first
      if (this.implied != o.implied) {
        return false;
      }
      if (this.fg_count != o.fg_count || this.bg_count != o.bg_count) {
        return false;
      }
      if (this.fg_size != o.fg_size || this.bg_size != o.bg_size) {
        return false;
      }
      return Objects.equals(this.agg, o.agg);
    }

    /**
     * Computes and caches the popularity values and the relatedness of this bucket, unless a
     * (non-NaN) relatedness is already cached.
     */
    private void computeDerivedValues() {
      final boolean alreadyComputed = !Double.isNaN(this.relatedness);
      if (alreadyComputed) {
        return;
      }

      // yes, dividing the FOREGROUND count by the BACKGROUND size is intentional
      final double bgTotal = (double) bg_size;
      this.fg_pop = fg_count / bgTotal;
      this.bg_pop = bg_count / bgTotal;

      // if min_pop is configured, and either (fg|bg) popularity is lower than that value
      // then relatedness is -Infinity so it sorts below any "valid" relatedness scores
      final double minPop = agg.min_pop;
      final boolean belowMinPop = 0.0D < minPop && (fg_pop < minPop || bg_pop < minPop);

      if (belowMinPop) {
        this.relatedness = Double.NEGATIVE_INFINITY;
      } else {
        this.relatedness =
            computeRelatedness(this.fg_count, this.fg_size, this.bg_count, this.bg_size);
      }
    }

    /** Returns the relatedness of this bucket, computing the derived values first if needed. */
    private double getRelatedness() {
      this.computeDerivedValues();
      return relatedness;
    }

    /** Returns the foreground popularity, computing the derived values first if needed. */
    private double getForegroundPopularity() {
      this.computeDerivedValues();
      return fg_pop;
    }

    /** Returns the background popularity, computing the derived values first if needed. */
    private double getBackgroundPopularity() {
      this.computeDerivedValues();
      return bg_pop;
    }

    /** Orders buckets by relatedness, then by foreground count, then by background count. */
    @Override
    public int compareTo(BucketData that) {
      // TODO: add support for a "sort_val" option...
      //
      // default should be "relatedness" but also support "foreground" and "background" ...
      // either of those should sort by the corresponding ratio
      // To do this, we should probably precompute the ratios in incCounts

      final int byRelatedness = Double.compare(this.getRelatedness(), that.getRelatedness());
      if (0 != byRelatedness) {
        return byRelatedness;
      }
      final int byForeground = Long.compare(this.fg_count, that.fg_count);
      if (0 != byForeground) {
        return byForeground;
      }
      return Long.compare(this.bg_count, that.bg_count);
    }

    /**
     * @see SlotAcc#getValue
     * @see Merger#getMergedResult
     */
    public SimpleOrderedMap