Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
org.apache.solr.search.facet.RelatednessAgg Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.IntFunction;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.QParser;
import org.apache.solr.search.facet.SlotAcc.SweepableSlotAcc;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* An aggregation function designed to be nested under other (possibly deeply nested) facets for the
* purposes of computing the "relatedness" of facet buckets relative to "foreground" and
* "background" sets -- primarily for the purpose of building "Semantic Knowledge Graphs"
*
* @see <a href="https://arxiv.org/pdf/1609.00464.pdf">The Semantic Knowledge Graph: A compact,
* auto-generated model for real-time traversal and ranking of any relationship within a
* domain</a>
*/
public class RelatednessAgg extends AggValueSource {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
// end user values
// (keys written by BucketData.externalize when the response is NOT a shard response)
private static final String RELATEDNESS = "relatedness";
private static final String FG_POP = "foreground_popularity";
private static final String BG_POP = "background_popularity";
// name of the local param read in setOpts to enable/disable sweep collection
public static final String SWEEP_COLLECTION = "sweep_collection";
// needed for distrib calculation
// (keys written by BucketData.externalize for shard responses and consumed by Merger.merge)
private static final String FG_SIZE = "foreground_size";
private static final String FG_COUNT = "foreground_count";
private static final String BG_SIZE = "background_size";
private static final String BG_COUNT = "background_count";
// foreground / background queries; the constructor rejects null for either one
protected final Query fgQ;
protected final Query bgQ;
// minimum popularity threshold; only changed by setOpts (from the "min_popularity" local param,
// and never for shard requests)
protected double min_pop = 0.0D;
// remains null until setOpts has been called
private Boolean useSweep;
// function name handed to the super constructor
public static final String NAME = RELATEDNESS;
// value used for useSweep when the "sweep_collection" local param is not specified
private static final boolean DEFAULT_SWEEP_COLLECTION = true;
public RelatednessAgg(Query fgQ, Query bgQ) {
super(NAME);
// NOTE: ideally we don't want to assume any defaults *yet* if fgQ/bgQ are null
// keep them null until it's time to created a SlotAcc, at which point we might inherit values
// from an ancestor facet context w/same key -- see comments in createSlotAcc
this.fgQ = fgQ;
this.bgQ = bgQ;
// TODO: defaults not supported yet -- see comments in createSlotAcc
if (null == fgQ || null == bgQ) {
throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,
NAME
+ " aggregate function requires both foreground & background "
+ "to be real (non-null) queries");
}
}
/**
 * Configures the optional behaviour of this aggregation from the parser's local params.
 *
 * <p>Without local params only the sweep flag is set (to its default). Otherwise the {@link
 * #SWEEP_COLLECTION} flag is read, and <code>min_popularity</code> is read unless the request is
 * a shard request.
 *
 * @param parser the parser supplying the request params and the local params
 */
public void setOpts(QParser parser) {
  final boolean shardRequest =
      parser.getReq().getParams().getBool(ShardParams.IS_SHARD, false);
  final SolrParams localParams = parser.getLocalParams();
  if (localParams == null) {
    this.useSweep = DEFAULT_SWEEP_COLLECTION;
    return;
  }
  this.useSweep = localParams.getBool(SWEEP_COLLECTION, DEFAULT_SWEEP_COLLECTION);
  if (shardRequest) {
    // ignore min_pop if this is a shard request
    return;
  }
  this.min_pop = localParams.getDouble("min_popularity", 0.0D);
}
/**
 * Renders the function name followed by the foreground query, background query, minimum
 * popularity and sweep flag of this aggregation.
 */
@Override
public String description() {
  // TODO: need better output processing when we start supporting null fgQ/bgQ in constructor
  final StringBuilder text = new StringBuilder();
  text.append(name);
  text.append("(fgQ=").append(fgQ);
  text.append(",bgQ=").append(bgQ);
  text.append(",min_pop=").append(min_pop);
  text.append(",useSweep=").append(useSweep);
  text.append(")");
  return text.toString();
}
/**
 * Two aggregations are equal when they are of exactly the same class and agree on foreground
 * query, background query and minimum popularity.
 *
 * <p>The minimum popularity is compared with {@link Double#compare(double, double)} rather than
 * <code>==</code>: with <code>==</code> an instance whose value is NaN would not even equal
 * itself, and 0.0 would equal -0.0 although {@link #hashCode()} (which boxes the value) yields
 * different hashes for them. The exact class is compared because {@link #hashCode()} includes
 * {@code getClass()}.
 *
 * @param o the object to compare against, may be null
 * @return true if {@code o} is an equivalent aggregation
 */
@Override
public boolean equals(Object o) {
  if (this == o) {
    return true;
  }
  if (null == o || getClass() != o.getClass()) {
    return false;
  }
  final RelatednessAgg that = (RelatednessAgg) o;
  return Objects.equals(fgQ, that.fgQ)
      && Objects.equals(bgQ, that.bgQ)
      && 0 == Double.compare(min_pop, that.min_pop);
}
/** Hash built from the concrete class, both queries and the minimum popularity. */
@Override
public int hashCode() {
  final Object[] parts = {getClass(), fgQ, bgQ, min_pop};
  return Arrays.hashCode(parts);
}
/**
 * Not supported by this aggregation.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public FunctionValues getValues(Map context, LeafReaderContext readerContext)
    throws IOException {
  final String message = "NOT IMPLEMENTED " + name + " " + this;
  throw new UnsupportedOperationException(message);
}
/**
 * Creates the accumulator used to compute relatedness for the buckets of the given context.
 *
 * <p>The foreground set is the intersection of the foreground query with the filter of the
 * current facet context and of every ancestor context (walking up until a context without a
 * filter is found); the background set is simply the result of the background query.
 *
 * @param fcontext the facet context this aggregation is evaluated in
 * @param numDocs not used by this implementation
 * @param numSlots the initial number of slots of the accumulator
 * @return a new {@code SKGSlotAcc} bound to the computed foreground and background sets
 * @throws IOException declared by the overridden method and by the searcher/accumulator calls
 */
@Override
public SlotAcc createSlotAcc(FacetContext fcontext, long numDocs, int numSlots)
    throws IOException {
  // TODO: Ideally this is where we should check fgQ/bgQ for 'null' and apply defaults...
  //
  // we want to walk up the fcontext and inherit the queries from any ancestor SKGAgg
  // with the same "key" that we have in our own context -- and as a last resort use
  // "$q" for the foreground and "*:*" for the bgQ (if no ancestors)
  // (Hmmm... or maybe we should use the "Domain" of our FacetRequest as the default bg?)
  //
  // How do we find out what key we have in the current context?
  // loop over all the stats in the current context until we find one that's '==' to this???
  final List<Query> fgFilters = new ArrayList<>(3);
  fgFilters.add(fgQ);
  for (FacetContext ctx = fcontext; ctx != null; ctx = ctx.parent) {
    if (null == ctx.filter) {
      // sanity check...
      // the only way the filter on the current context should be null is...
      assert ( // 1) it's the actual top most context,
      // (ie: the func is directly used w/o being nested under a facet)
      (null == ctx.parent && fcontext == ctx)
          ||
          // 2) it's a child of the top most context
          // (ie: the context of a top level facet)
          (null == ctx.parent.parent && null == ctx.parent.filter));
      // either way, no reason to keep looping up the (0 or 1) remaining ancestors
      // (which is why #1 can assert '&& fcontext == ctx')
      break;
    }
    fgFilters.add(ctx.filter);
  }
  final DocSet fgSet = fcontext.searcher.getDocSet(fgFilters);
  final DocSet bgSet = fcontext.searcher.getDocSet(bgQ);
  return new SKGSlotAcc(this, fcontext, numSlots, fgSet, bgSet);
}
/**
 * Creates the merger that combines the per-shard results of this aggregation.
 *
 * @param prototype not used by this implementation
 * @return a new {@code Merger} bound to this aggregation
 */
@Override
public FacetMerger createFacetMerger(Object prototype) {
  final Merger merger = new Merger(this);
  return merger;
}
/**
 * Read-only accumulator used when sweep collection is active: it never collects documents
 * itself, but derives relatedness lazily from the foreground/background count accumulators it is
 * given.
 *
 * <p>Relatedness values are cached per slot; {@code NaN} marks a slot whose value has not been
 * computed yet.
 */
private static final class SweepSKGSlotAcc extends SlotAcc {
  private static final int NO_ALL_BUCKETS = -2;
  private static final int ALL_BUCKETS_UNINITIALIZED = -1;

  private final int minCount; // pre-calculate for a given min_popularity
  private final long fgSize;
  private final long bgSize;
  private final ReadOnlyCountSlotAcc fgCount;
  private final ReadOnlyCountSlotAcc bgCount;
  private double[] relatedness;

  // we can't get the allBuckets info from the slotContext in collect(), b/c the whole point of
  // sweep collection is that the "collect" methods aren't called. So this is the compromise: note
  // in construction either that we're using a processor w/ NO_ALL_BUCKETS or that we don't know
  // the bucket yet (ALL_BUCKETS_UNINITIALIZED) and fill it in getValue where we can check
  // against the processor
  private int allBucketsSlot;

  /**
   * @param minPopularity minimum popularity; converted once into an absolute minimum count
   *     relative to {@code bgSize}
   * @param fcontext the facet context this accumulator belongs to
   * @param numSlots initial number of slots
   * @param fgSize size of the foreground set
   * @param bgSize size of the background set
   * @param fgCount source of the per-slot foreground counts
   * @param bgCount source of the per-slot background counts
   */
  public SweepSKGSlotAcc(
      double minPopularity,
      FacetContext fcontext,
      int numSlots,
      long fgSize,
      long bgSize,
      ReadOnlyCountSlotAcc fgCount,
      ReadOnlyCountSlotAcc bgCount) {
    super(fcontext);
    this.minCount = (int) Math.ceil(minPopularity * bgSize);
    this.fgSize = fgSize;
    this.bgSize = bgSize;
    this.fgCount = fgCount;
    this.bgCount = bgCount;
    relatedness = new double[numSlots];
    Arrays.fill(relatedness, 0, numSlots, Double.NaN);
    // any processor that can (currently) result in the use of SweepSKGSlotAcc *should* be a
    // FacetFieldProcessor -- but don't assume that will always be true...
    this.allBucketsSlot = NO_ALL_BUCKETS;
    if (fcontext.processor instanceof FacetFieldProcessor
        // NOTE: if this instanceof/cast changes, getValue needs to be updated as well
        && ((FacetFieldProcessor) fcontext.processor).freq.allBuckets) {
      this.allBucketsSlot = ALL_BUCKETS_UNINITIALIZED;
    }
  }

  /**
   * Not supported: this accumulator is only usable for sweeping.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public void collect(int perSegDocId, int slot, IntFunction<SlotContext> slotContext)
      throws IOException {
    throw new UnsupportedOperationException(
        "collect() not supported, this SlotAcc impl only usable for sweeping");
  }

  /**
   * Not supported: this accumulator is only usable for sweeping.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public int collect(DocSet docs, int slot, IntFunction<SlotContext> slotContext)
      throws IOException {
    throw new UnsupportedOperationException(
        "collect() not supported, this SlotAcc impl only usable for sweeping");
  }

  /** Returns the (cached) relatedness of the slot, computing it on first access. */
  private double getRelatedness(int slot) {
    final double cachedRelatedness = relatedness[slot];
    if (!Double.isNaN(cachedRelatedness)) {
      return cachedRelatedness;
    }
    final long fg_count = fgCount.getCount(slot);
    final long bg_count = bgCount.getCount(slot);
    if (minCount > 0) {
      // if min_pop is configured, and either (fg|bg) popularity is lower than that value
      // then "this.relatedness=-Infinity" so it sorts below any "valid" relatedness scores
      if (fg_count < minCount || bg_count < minCount) {
        return relatedness[slot] = Double.NEGATIVE_INFINITY;
      }
    }
    return relatedness[slot] = computeRelatedness(fg_count, fgSize, bg_count, bgSize);
  }

  /** Orders slots by relatedness, then by foreground count, then by background count. */
  @Override
  public int compare(int slotA, int slotB) {
    int r = Double.compare(getRelatedness(slotA), getRelatedness(slotB));
    if (0 == r) {
      r = Long.compare(fgCount.getCount(slotA), fgCount.getCount(slotB));
    }
    if (0 == r) {
      r = Long.compare(bgCount.getCount(slotA), bgCount.getCount(slotB));
    }
    return r;
  }

  /**
   * Returns the externalized bucket data of the slot; the allBuckets slot (if any) is reported
   * as an implied bucket without counts.
   */
  @Override
  public Object getValue(int slotNum) {
    final BucketData slotVal;
    if (NO_ALL_BUCKETS != allBucketsSlot) {
      // there's no reason why a processor should be resizing SlotAccs in the middle of getValue,
      // but we're going to be vigilant against that possibility just in case...
      if (ALL_BUCKETS_UNINITIALIZED == allBucketsSlot || allBucketsSlot == slotNum) {
        assert fcontext.processor instanceof FacetFieldProcessor
            : "code changed, non FacetFieldProcessor sweeping w/allBuckets?!?";
        allBucketsSlot = ((FacetFieldProcessor) fcontext.processor).allBucketsAcc.collectAccSlot;
      }
    }
    if (slotNum == allBucketsSlot) {
      slotVal = new BucketData(null);
    } else {
      slotVal =
          new BucketData(
              fgCount.getCount(slotNum),
              fgSize,
              bgCount.getCount(slotNum),
              bgSize,
              getRelatedness(slotNum));
    }
    return slotVal.externalize(fcontext.isShard());
  }

  /** Forgets all cached relatedness values and, if applicable, the known allBuckets slot. */
  @Override
  public void reset() throws IOException {
    Arrays.fill(relatedness, Double.NaN);
    if (allBucketsSlot != NO_ALL_BUCKETS) {
      allBucketsSlot = ALL_BUCKETS_UNINITIALIZED;
    }
  }

  /** Resizes the relatedness cache, using {@code NaN} as the default value. */
  @Override
  public void resize(Resizer resizer) {
    relatedness = resizer.resize(relatedness, Double.NaN);
  }

  /** Drops the reference to the relatedness cache. */
  @Override
  public void close() throws IOException {
    relatedness = null;
  }
}
// key of the flag that BucketData.externalize adds to a shard response for an implied bucket
// (instead of the counts); Merger.merge removes and reads it
private static final String IMPLIED_KEY = "implied";
/**
 * Accumulator that lazily computes one {@link BucketData} per slot by intersecting the slot's
 * documents with the foreground and background sets. May be replaced by a {@code
 * SweepSKGSlotAcc} via {@link #registerSweepingAccs}.
 */
private static final class SKGSlotAcc extends SlotAcc implements SweepableSlotAcc<SlotAcc> {
  private final RelatednessAgg agg;
  private BucketData[] slotvalues;
  private final DocSet fgSet;
  private final DocSet bgSet;
  private final long fgSize;
  private final long bgSize;

  /**
   * @param agg the aggregation this accumulator computes values for
   * @param fcontext the facet context this accumulator belongs to
   * @param numSlots initial number of slots
   * @param fgSet the foreground set
   * @param bgSet the background set
   */
  public SKGSlotAcc(
      final RelatednessAgg agg,
      final FacetContext fcontext,
      final int numSlots,
      final DocSet fgSet,
      final DocSet bgSet)
      throws IOException {
    super(fcontext);
    this.agg = agg;
    this.fgSet = fgSet;
    this.bgSet = bgSet;
    // cache the set sizes for frequent re-use on every slot
    this.fgSize = fgSet.size();
    this.bgSize = bgSet.size();
    // TODO: avoid initializing array until we know we're not doing sweep collection?
    this.slotvalues = new BucketData[numSlots];
    reset();
  }

  /**
   * If called, may register SweepingAccs for fg and bg set based on whether user indicated
   * sweeping should be used (default)
   *
   * @return null if any SweepingAccs were registered since no other collection is needed for
   *     relatedness, otherwise this accumulator
   */
  @Override
  public SKGSlotAcc registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc) {
    // useSweep stays null when setOpts was never called on the agg; fall back to the default
    // instead of failing with a NullPointerException on unboxing
    final boolean sweep = (null == agg.useSweep) ? DEFAULT_SWEEP_COLLECTION : agg.useSweep;
    if (!sweep) {
      return this;
    }
    final ReadOnlyCountSlotAcc fgCount =
        baseSweepingAcc.add(key + "!fg", fgSet, slotvalues.length);
    final ReadOnlyCountSlotAcc bgCount =
        baseSweepingAcc.add(key + "!bg", bgSet, slotvalues.length);
    final SweepSKGSlotAcc readOnlyReplacement =
        new SweepSKGSlotAcc(
            agg.min_pop, fcontext, slotvalues.length, fgSize, bgSize, fgCount, bgCount);
    readOnlyReplacement.key = key;
    baseSweepingAcc.registerMapping(this, readOnlyReplacement);
    return null;
  }

  /** Creates and fills the bucket data of a slot that has not been processed yet. */
  private void processSlot(int slot, IntFunction<SlotContext> slotContext) throws IOException {
    assert null != slotContext;
    final BucketData slotVal = new BucketData(agg);
    slotvalues[slot] = slotVal;
    final SlotContext ctx = slotContext.apply(slot);
    if (ctx.isAllBuckets()) {
      // relatedness is meaningless for allBuckets (see SOLR-14467)
      // our current (implied & empty) BucketData is all we need
      //
      // NOTE: it might be tempting to use 'slotvalues[slot] = null' in this case
      // since getValue() will also ultimately generate an implied bucket in that case,
      // but by using a non-null bucket we let collect(int,...) know it doesn't need to keep
      // calling processSlot over and over.
      return;
    }
    final Query slotQ = ctx.getSlotQuery();
    final DocSet slotSet;
    if (null == slotQ) {
      // extremely special edge case...
      // the only way this should be possible is if our relatedness() function is used as a "top
      // level" stat w/o being nested under any facet, in which case it should be a FacetQuery
      // w/no parent...
      assert fcontext.processor.freq instanceof FacetQuery : fcontext.processor.freq;
      assert null == fcontext.parent;
      assert null == fcontext.filter;
      // ...and in which case we should just use the current base
      slotSet = fcontext.base;
    } else {
      slotSet = fcontext.searcher.getDocSet(slotQ);
    }
    slotVal.incSizes(fgSize, bgSize);
    slotVal.incCounts(fgSet.intersectionSize(slotSet), bgSet.intersectionSize(slotSet));
  }

  /** Ensures the slot has been processed; the individual document is irrelevant. */
  @Override
  public void collect(int perSegDocId, int slot, IntFunction<SlotContext> slotContext)
      throws IOException {
    // NOTE: we don't actually care about the individual docs being collected
    // (the only reason we even bother implementing this method is because it's needed for sorting
    // buckets by a function)
    // so we only worry about ensuring that every "slot" / bucket is processed the first time
    // we're asked about it...
    if (null == slotvalues[slot]) {
      processSlot(slot, slotContext);
    }
  }

  /**
   * Processes the slot (which must not have been processed before).
   *
   * @return the size of {@code docs}
   */
  @Override
  public int collect(DocSet docs, int slot, IntFunction<SlotContext> slotContext)
      throws IOException {
    // NOTE: we don't actually care about the doc set being collected for the bucket
    // so we only worry about ensuring that every "slot" / bucket is processed exactly once
    // if we're doing bulk collection, we better not be getting asked to re-use slots
    assert null == slotvalues[slot];
    processSlot(slot, slotContext);
    // we don't do any filtering, we collect the whole docset, so return that as our collected
    // count (as a stat, we're actually required to return this by assertions in
    // FacetFieldProcessor.processStats)
    return docs.size();
  }

  /** Compares the bucket data of two (already collected) slots. */
  @Override
  public int compare(int slotA, int slotB) {
    final BucketData a = slotvalues[slotA];
    final BucketData b = slotvalues[slotB];
    // we initialize & reset() (unused) slotvalues elements to null
    // but we should never be asked to compare a slot that hasn't been collected...
    assert null != a;
    assert null != b;
    return a.compareTo(b);
  }

  /** Returns the externalized bucket data of the slot, using an implied bucket if needed. */
  @Override
  public Object getValue(int slotNum) {
    BucketData slotVal = slotvalues[slotNum];
    if (null == slotVal) {
      // since we haven't collected any docs for this slot, use an (implied) slot w/no counts,
      // just the known fg/bg sizes. (this is most likely a refinement request for a bucket we
      // don't have)
      slotVal = new BucketData(agg);
      slotVal.incSizes(fgSize, bgSize);
    }
    return slotVal.externalize(fcontext.isShard());
  }

  /** Marks every slot as not yet processed. */
  @Override
  public void reset() {
    Arrays.fill(slotvalues, null);
  }

  /** Resizes the per-slot storage, using null for new slots. */
  @Override
  public void resize(Resizer resizer) {
    slotvalues = resizer.resize(slotvalues, null);
  }

  /** Drops the reference to the per-slot storage. */
  @Override
  public void close() throws IOException {
    slotvalues = null;
  }
}
/**
 * Encapsulates all data needed for a single bucket/slot
 *
 * @see SKGSlotAcc
 * @see Merger
 */
private static class BucketData implements Comparable<BucketData> {
  /** May be null (e.g. for buckets whose relatedness was pre-computed by the caller). */
  private final RelatednessAgg agg;

  private long fg_size = 0;
  private long bg_size = 0;
  private long fg_count = 0;
  private long bg_count = 0;

  /**
   * Buckets are implied until/unless counts are explicitly incremented (even if those counts are
   * 0) An implied bucket means we have no real data for it -- it may be useful for a per-Shard
   * request to return "size" info of a bucket that doesn't exist on the current shard, or it may
   * represent the <code>allBuckets</code> bucket.
   *
   * @see #incCounts
   */
  private boolean implied;

  /**
   * NaN indicates that all derived values need (re)-computed
   *
   * @see #computeDerivedValues
   * @see #getRelatedness
   */
  private double relatedness = Double.NaN;

  /**
   * @see #computeDerivedValues
   * @see #getForegroundPopularity
   */
  private double fg_pop;

  /**
   * @see #computeDerivedValues
   * @see #getBackgroundPopularity
   */
  private double bg_pop;

  /**
   * Creates an implied bucket without any sizes or counts.
   *
   * @param agg the aggregation supplying the minimum popularity, may be null
   */
  public BucketData(final RelatednessAgg agg) {
    this.agg = agg;
    this.implied = true;
  }

  /**
   * Creates a non-implied bucket from known counts, sizes and an already computed relatedness;
   * the popularities are derived immediately.
   */
  public BucketData(
      long fg_count, long fg_size, long bg_count, long bg_size, double relatedness) {
    this.agg = null;
    this.fg_count = fg_count;
    this.fg_size = fg_size;
    this.fg_pop = (double) fg_count / bg_size; // yes, BACKGROUND size is intentional
    this.bg_count = bg_count;
    this.bg_size = bg_size;
    this.bg_pop = (double) bg_count / bg_size;
    this.relatedness = relatedness;
  }

  /**
   * Increment both the foreground and background <em>counts</em> for the current bucket,
   * resetting any derived values that may be cached. The bucket is no longer implied afterwards.
   */
  public void incCounts(final long fgInc, final long bgInc) {
    this.implied = false;
    this.relatedness = Double.NaN;
    fg_count += fgInc;
    bg_count += bgInc;
  }

  /**
   * Increment both the foreground and background <em>sizes</em> for the current bucket,
   * resetting any derived values that may be cached
   */
  public void incSizes(final long fgInc, final long bgInc) {
    this.relatedness = Double.NaN;
    fg_size += fgInc;
    bg_size += bgInc;
  }

  @Override
  public int hashCode() {
    return Objects.hash(this.getClass(), implied, fg_count, bg_count, fg_size, bg_size, agg);
  }

  @Override
  public boolean equals(Object other) {
    if (!(other instanceof BucketData)) {
      return false;
    }
    BucketData that = (BucketData) other;
    // we will most certainly be compared to other buckets of the same Agg instance, so compare
    // counts first
    return this.implied == that.implied
        && this.fg_count == that.fg_count
        && this.bg_count == that.bg_count
        && this.fg_size == that.fg_size
        && this.bg_size == that.bg_size
        && Objects.equals(this.agg, that.agg);
  }

  /**
   * Computes (and caches) the derived relatedness and popularity scores for this bucket if
   * needed
   */
  private void computeDerivedValues() {
    if (!Double.isNaN(this.relatedness)) {
      return; // values already computed;
    }
    this.fg_pop = (double) fg_count / bg_size; // yes, BACKGROUND size is intentional
    this.bg_pop = (double) bg_count / bg_size;
    // a bucket without an agg has no minimum popularity to enforce
    final double minPop = (null == agg) ? 0.0D : agg.min_pop;
    if (0.0D < minPop) {
      // if min_pop is configured, and either (fg|bg) popularity is lower than that value
      // then "this.relatedness=-Infinity" so it sorts below any "valid" relatedness scores
      if (fg_pop < minPop || bg_pop < minPop) {
        this.relatedness = Double.NEGATIVE_INFINITY;
        return;
      }
    }
    this.relatedness =
        computeRelatedness(
            this.fg_count, this.fg_size,
            this.bg_count, this.bg_size);
  }

  private double getRelatedness() {
    computeDerivedValues();
    return this.relatedness;
  }

  private double getForegroundPopularity() {
    computeDerivedValues();
    return this.fg_pop;
  }

  private double getBackgroundPopularity() {
    computeDerivedValues();
    return this.bg_pop;
  }

  /** Orders buckets by relatedness, then by foreground count, then by background count. */
  @Override
  public int compareTo(BucketData that) {
    // TODO: add support for a "sort_val" option...
    //
    // default should be "relatedness" but also support "foreground" and "background" ...
    // either of those should sort by the corresponding ratio
    // To do this, we should probably precompute the ratios in incCounts
    int r = Double.compare(this.getRelatedness(), that.getRelatedness());
    if (0 == r) {
      r = Long.compare(this.fg_count, that.fg_count);
    }
    if (0 == r) {
      r = Long.compare(this.bg_count, that.bg_count);
    }
    return r;
  }

  /**
   * Converts this bucket into its response form.
   *
   * @param isShardRequest if true the raw sizes (and either the counts or the implied flag) are
   *     returned, otherwise the derived relatedness and popularities
   * @return the response data, or null for an implied bucket of a non-shard request
   * @see SlotAcc#getValue
   * @see Merger#getMergedResult
   */
  public SimpleOrderedMap<Object> externalize(final boolean isShardRequest) {
    SimpleOrderedMap<Object> result = new SimpleOrderedMap<>();
    // if counts are non-zero, then this bucket must not be implied
    assert 0 == fg_count || !implied : "Implied bucket has non-zero fg_count";
    assert 0 == bg_count || !implied : "Implied bucket has non-zero bg_count";
    if (isShardRequest) {
      // shard responses must include size info, but don't need the derived stats
      //
      // NOTE: sizes will be the same for every slot...
      // TODO: it would be nice to put them directly in the parent facet, instead of every bucket,
      // in order to reduce the size of the response.
      result.add(FG_SIZE, fg_size);
      result.add(BG_SIZE, bg_size);
      if (implied) {
        // for an implied bucket on this shard, we don't need to bother returning the (empty)
        // counts, just the flag explaining that this bucket is (locally) implied...
        result.add(IMPLIED_KEY, Boolean.TRUE);
      } else {
        result.add(FG_COUNT, fg_count);
        result.add(BG_COUNT, bg_count);
      }
    } else {
      if (implied) {
        // When returning results to an external client, any bucket still 'implied' shouldn't
        // return any results at all.
        // (practically speaking this should only happen for the 'allBuckets' bucket)
        return null;
      }
      // there's no need to bother computing these when returning results *to* a shard coordinator
      // only useful to external clients
      result.add(RELATEDNESS, this.getRelatedness());
      result.add(FG_POP, roundTo5Digits(this.getForegroundPopularity()));
      result.add(BG_POP, roundTo5Digits(this.getBackgroundPopularity()));
    }
    return result;
  }
}
/** Merges in the per shard {@link BucketData} output into a unified {@link BucketData} */
private static final class Merger extends FacetModule.FacetSortableMerger {
  private final BucketData mergedData;

  /**
   * @param agg the aggregation whose per-shard results are merged
   */
  public Merger(final RelatednessAgg agg) {
    this.mergedData = new BucketData(agg);
  }

  /**
   * Adds the sizes (always) and the counts (unless the shard bucket is flagged as implied) of
   * one shard response to the merged bucket. The consumed entries are removed from the shard
   * data.
   *
   * @param facetResult the shard's data for this aggregation
   * @param mcontext not used by this implementation
   */
  @Override
  public void merge(Object facetResult, Context mcontext) {
    @SuppressWarnings({"unchecked"})
    final NamedList<Object> shardData = (NamedList<Object>) facetResult;
    final boolean shardImplied =
        Objects.requireNonNullElse((Boolean) shardData.remove(IMPLIED_KEY), false);
    // regardless of whether this shard is implied, we want to know its size info...
    mergedData.incSizes((Long) shardData.remove(FG_SIZE), (Long) shardData.remove(BG_SIZE));
    if (shardImplied) {
      // if this shard is implied, we shouldn't have even gotten counts...
      assert null == shardData.remove(FG_COUNT);
      assert null == shardData.remove(BG_COUNT);
      return;
    }
    // only merge in counts from non-implied shard buckets...
    mergedData.incCounts((Long) shardData.remove(FG_COUNT), (Long) shardData.remove(BG_COUNT));
  }

  /** Compares the merged bucket data; the sort direction hint is deliberately ignored. */
  @Override
  public int compareTo(
      FacetModule.FacetSortableMerger other, FacetRequest.SortDirection direction) {
    // NOTE: regardless of the SortDirection hint, we want normal comparison of the BucketData
    assert other instanceof Merger;
    Merger that = (Merger) other;
    return mergedData.compareTo(that.mergedData);
  }

  /** Returns the merged bucket in its non-shard (client facing) form. */
  @Override
  public Object getMergedResult() {
    return mergedData.externalize(false);
  }
}
/**
* This is an approximated Z-Score, as described in the "Scoring Semantic Relationships" section of
* "The Semantic Knowledge Graph: A compact,
* auto-generated model for real-time traversal and ranking of any relationship within a
* domain "
*
* See Also:
*
*
*/
// NOTE: javadoc linter freaks out if we try doing those links as '@see