org.apache.lucene.search.join.ToParentBlockJoinCollector Maven / Gradle / Ivy
package org.apache.lucene.search.join;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexWriter; // javadocs
import org.apache.lucene.search.*;
import org.apache.lucene.search.Scorer.ChildScorer;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.util.ArrayUtil;
import java.io.IOException;
import java.util.*;
/** Collects parent document hits for a Query containing one or more
* BlockJoinQuery clauses, sorted by the
* specified parent Sort. Note that this cannot perform
* arbitrary joins; rather, it requires that all joined
* documents are indexed as a doc block (using {@link
* IndexWriter#addDocuments} or {@link
* IndexWriter#updateDocuments}). Ie, the join is computed
* at index time.
*
* The parent Sort must only use
* fields from the parent documents; sorting by field in
* the child documents is not supported.
*
* You should only use this
* collector if one or more of the clauses in the query is
* a {@link ToParentBlockJoinQuery}. This collector will find those query
* clauses and record the matching child documents for the
* top scoring parent documents.
*
* Multiple joins (star join) and nested joins and a mix
* of the two are allowed, as long as in all cases the
* documents corresponding to a single row of each joined
* parent table were indexed as a doc block.
*
* For the simple star join you can retrieve the
* {@link TopGroups} instance containing each {@link ToParentBlockJoinQuery}'s
* matching child documents for the top parent groups,
* using {@link #getTopGroups}. Ie,
* a single query, which will contain two or more
* {@link ToParentBlockJoinQuery}'s as clauses representing the star join,
* can then retrieve two or more {@link TopGroups} instances.
*
* For nested joins, the query will run correctly (ie,
* match the right parent and child documents), however,
* because TopGroups is currently unable to support nesting
* (each group is not able to hold another TopGroups), you
* are only able to retrieve the TopGroups of the first
* join. The TopGroups of the nested joins will not be
* correct.
*
* See {@link org.apache.lucene.search.join} for a code
* sample.
*
* @lucene.experimental
*/
public class ToParentBlockJoinCollector extends Collector {
// Sort applied to the parent hits; also reported in the returned TopGroups.
private final Sort sort;

// Maps each ToParentBlockJoinQuery instance to its "slot" in
// joinScorers and in OneGroup's cached docs/scores/counts:
private final Map<Query,Integer> joinQueryID = new HashMap<>();

// Maximum number of parent hits kept in the queue.
private final int numParentHits;

// Priority queue of the currently competitive parent hits.
private final FieldValueHitQueue<OneGroup> queue;

// Comparators, reverse multipliers and last comparator index, all taken
// from the queue in the constructor.
private final FieldComparator[] comparators;
private final int[] reverseMul;
private final int compEnd;

private final boolean trackMaxScore;
private final boolean trackScores;

// Doc base of the segment currently being collected (see setNextReader).
private int docBase;

// One entry per enrolled join query, indexed by the slot in joinQueryID;
// entries are reset to null each time a new top-level scorer is set.
private ToParentBlockJoinQuery.BlockJoinScorer[] joinScorers = new ToParentBlockJoinQuery.BlockJoinScorer[0];

private AtomicReaderContext currentReaderContext;

// Score-caching wrapper around the scorer handed to setScorer.
private Scorer scorer;

// True once numParentHits entries have been added to the queue.
private boolean queueFull;

// Current top of the queue, i.e. the weakest hit still being kept.
private OneGroup bottom;

private int totalHitCount;

// Stays NaN unless max-score tracking was requested on construction.
private float maxScore = Float.NaN;
/**
 * Creates a ToParentBlockJoinCollector.
 *
 * <p>If you pass {@code trackScores=true}, all ToParentBlockJoinQuery
 * instances must not use ScoreMode.None.
 *
 * @param sort sort order for the parent hits; must not be null
 * @param numParentHits maximum number of parent hits to keep
 * @param trackScores whether scores are recorded for the collected hits
 * @param trackMaxScore whether the maximum parent score is tracked; when
 *        false, {@link #getMaxScore} returns {@code Float.NaN}
 * @throws IOException declared by the signature
 */
public ToParentBlockJoinCollector(Sort sort, int numParentHits, boolean trackScores, boolean trackMaxScore) throws IOException {
  // TODO: allow null sort to be specialized to relevance
  // only collector
  this.sort = sort;
  this.trackMaxScore = trackMaxScore;
  if (trackMaxScore) {
    // Seed the running maximum below every possible score.
    // Float.MIN_VALUE is the smallest *positive* float, so seeding with it
    // would hide a true maximum of 0 or less.
    maxScore = Float.NEGATIVE_INFINITY;
  }
  this.trackScores = trackScores;
  this.numParentHits = numParentHits;
  queue = FieldValueHitQueue.create(sort.getSort(), numParentHits);
  comparators = queue.getComparators();
  reverseMul = queue.getReverseMul();
  // Index of the last comparator, used as the final tie-break position.
  compEnd = comparators.length - 1;
}
private static final class OneGroup extends FieldValueHitQueue.Entry {
public OneGroup(int comparatorSlot, int parentDoc, float parentScore, int numJoins, boolean doScores) {
super(comparatorSlot, parentDoc, parentScore);
//System.out.println("make OneGroup parentDoc=" + parentDoc);
docs = new int[numJoins][];
for(int joinID=0;joinID 0) {
// Definitely competitive.
break;
} else if (i == compEnd) {
// Here c=0. If we're at the last comparator, this doc is not
// competitive, since docs are visited in doc Id order, which means
// this doc cannot compete with any other document in the queue.
//System.out.println(" skip");
return;
}
}
//System.out.println(" competes! doc=" + (docBase + parentDoc));
// This hit is competitive - replace bottom element in queue & adjustTop
for (int i = 0; i < comparators.length; i++) {
comparators[i].copy(bottom.slot, parentDoc);
}
if (!trackMaxScore && trackScores) {
score = scorer.score();
}
bottom.doc = docBase + parentDoc;
bottom.readerContext = currentReaderContext;
bottom.score = score;
copyGroups(bottom);
bottom = queue.updateTop();
for (int i = 0; i < comparators.length; i++) {
comparators[i].setBottom(bottom.slot);
}
} else {
// Startup transient: queue is not yet full:
final int comparatorSlot = totalHitCount - 1;
// Copy hit into queue
for (int i = 0; i < comparators.length; i++) {
comparators[i].copy(comparatorSlot, parentDoc);
}
//System.out.println(" startup: new OG doc=" + (docBase+parentDoc));
if (!trackMaxScore && trackScores) {
score = scorer.score();
}
final OneGroup og = new OneGroup(comparatorSlot, docBase+parentDoc, score, joinScorers.length, trackScores);
og.readerContext = currentReaderContext;
copyGroups(og);
bottom = queue.add(og);
queueFull = totalHitCount == numParentHits;
if (queueFull) {
// End of startup transient: queue just filled up:
for (int i = 0; i < comparators.length; i++) {
comparators[i].setBottom(bottom.slot);
}
}
}
}
// Pulls out child doc and scores for all join queries:
private void copyGroups(OneGroup og) {
// While rare, it's possible top arrays could be too
// short if join query had null scorer on first
// segment(s) but then became non-null on later segments
final int numSubScorers = joinScorers.length;
if (og.docs.length < numSubScorers) {
// While rare, this could happen if join query had
// null scorer on first segment(s) but then became
// non-null on later segments
og.docs = ArrayUtil.grow(og.docs);
}
if (og.counts.length < numSubScorers) {
og.counts = ArrayUtil.grow(og.counts);
}
if (trackScores && og.scores.length < numSubScorers) {
og.scores = ArrayUtil.grow(og.scores);
}
//System.out.println("\ncopyGroups parentDoc=" + og.doc);
for(int scorerIDX = 0;scorerIDX < numSubScorers;scorerIDX++) {
final ToParentBlockJoinQuery.BlockJoinScorer joinScorer = joinScorers[scorerIDX];
//System.out.println(" scorer=" + joinScorer);
if (joinScorer != null && docBase + joinScorer.getParentDoc() == og.doc) {
og.counts[scorerIDX] = joinScorer.getChildCount();
//System.out.println(" count=" + og.counts[scorerIDX]);
og.docs[scorerIDX] = joinScorer.swapChildDocs(og.docs[scorerIDX]);
assert og.docs[scorerIDX].length >= og.counts[scorerIDX]: "length=" + og.docs[scorerIDX].length + " vs count=" + og.counts[scorerIDX];
//System.out.println(" len=" + og.docs[scorerIDX].length);
/*
for(int idx=0;idx= og.counts[scorerIDX]: "length=" + og.scores[scorerIDX].length + " vs count=" + og.counts[scorerIDX];
}
} else {
og.counts[scorerIDX] = 0;
}
}
}
/**
 * Switches collection to a new segment: remembers its context and doc
 * base, and installs the comparator each sort comparator returns for that
 * segment into the queue.
 *
 * @param context the segment about to be collected
 * @throws IOException if a comparator fails to switch segments
 */
@Override
public void setNextReader(AtomicReaderContext context) throws IOException {
  docBase = context.docBase;
  currentReaderContext = context;

  int slot = 0;
  while (slot < comparators.length) {
    final FieldComparator<?> perSegment = comparators[slot].setNextReader(context);
    queue.setComparator(slot, perSegment);
    slot++;
  }
}
/**
 * Always returns {@code false}: this collector does not accept parent
 * docs out of doc ID order.
 */
@Override
public boolean acceptsDocsOutOfOrder() {
  return false;
}
/**
 * Registers a join scorer under the slot assigned to its query, assigning
 * a new slot at the end of joinScorers the first time the query is seen.
 * The scorer is also told to track its pending child hits.
 */
private void enroll(ToParentBlockJoinQuery query, ToParentBlockJoinQuery.BlockJoinScorer scorer) {
  scorer.trackPendingChildHits();

  final Integer knownSlot = joinQueryID.get(query);
  if (knownSlot != null) {
    // Query already has a slot: just swap in the current scorer.
    joinScorers[knownSlot] = scorer;
    return;
  }

  // First sighting of this query: extend the array by one and use the
  // new last position as its slot.
  final int freshSlot = joinScorers.length;
  joinQueryID.put(query, freshSlot);
  joinScorers = Arrays.copyOf(joinScorers, freshSlot + 1);
  joinScorers[freshSlot] = scorer;
}
/**
 * Installs the scorer for the current segment and re-discovers the join
 * scorers beneath it.
 *
 * <p>All joinScorers entries are cleared first, then the scorer tree is
 * walked breadth-first and every BlockJoinScorer found is enrolled under
 * its query's slot.
 *
 * @param scorer top-level scorer for the segment being collected
 */
@Override
public void setScorer(Scorer scorer) {
  // Since we invoke .score(), and the comparators likely
  // do as well, cache it so it's only "really" computed
  // once:
  this.scorer = new ScoreCachingWrappingScorer(scorer);
  for (int compIDX = 0; compIDX < comparators.length; compIDX++) {
    comparators[compIDX].setScorer(this.scorer);
  }
  Arrays.fill(joinScorers, null);

  // Named "pending" so it does not shadow the hit queue field "queue".
  final Queue<Scorer> pending = new LinkedList<>();
  pending.add(scorer);
  Scorer current;
  while ((current = pending.poll()) != null) {
    if (current instanceof ToParentBlockJoinQuery.BlockJoinScorer) {
      enroll((ToParentBlockJoinQuery) current.getWeight().getQuery(), (ToParentBlockJoinQuery.BlockJoinScorer) current);
    }

    for (ChildScorer sub : current.getChildren()) {
      pending.add(sub.child);
    }
  }
}
// Queue contents in final order; null until sortQueue() has run.
private OneGroup[] sortedGroups;

/**
 * Drains the hit queue into sortedGroups. Entries are popped one at a
 * time and written from the last array index down to the first, so the
 * first entry popped ends up last.
 */
private void sortQueue() {
  final int size = queue.size();
  sortedGroups = new OneGroup[size];
  for (int next = size; next > 0; ) {
    sortedGroups[--next] = queue.pop();
  }
}
/** Returns the TopGroups for the specified
 * BlockJoinQuery. The groupValue of each GroupDocs will
 * be the parent docID for that group.
 * The number of documents within each group is calculated as minimum of maxDocsPerGroup
 * and number of matched child documents for that group.
 * Returns null if no groups matched, or if offset is at or beyond the
 * number of collected parent groups.
 *
 * @param query Search query
 * @param withinGroupSort Sort criteria within groups
 * @param offset Parent docs offset
 * @param maxDocsPerGroup Upper bound of documents per group number
 * @param withinGroupOffset Offset within each group of child docs
 * @param fillSortFields Specifies whether to add sort fields or not
 * @return TopGroups for specified query
 * @throws IOException if there is a low-level I/O error
 */
public TopGroups<Integer> getTopGroups(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset,
    int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields)
    throws IOException {

  final Integer _slot = joinQueryID.get(query);
  if (_slot == null && totalHitCount == 0) {
    return null;
  }

  if (sortedGroups == null) {
    if (offset >= queue.size()) {
      return null;
    }
    sortQueue();
  } else if (offset >= sortedGroups.length) {
    // Same bound as the first-call check above, so the result for a given
    // offset does not depend on whether the queue was already sorted.
    return null;
  }

  // A query that was never enrolled is passed on as slot -1.
  return accumulateGroups(_slot == null ? -1 : _slot.intValue(), offset, maxDocsPerGroup, withinGroupOffset, withinGroupSort, fillSortFields);
}
/**
* Accumulates groups for the BlockJoinQuery specified by its slot.
*
* @param slot Search query's slot
* @param offset Parent docs offset
* @param maxDocsPerGroup Upper bound of documents per group number
* @param withinGroupOffset Offset within each group of child docs
* @param withinGroupSort Sort criteria within groups
* @param fillSortFields Specifies whether to add sort fields or not
* @return TopGroups for the query specified by slot
* @throws IOException if there is a low-level I/O error
*/
@SuppressWarnings({"unchecked","rawtypes"})
private TopGroups accumulateGroups(int slot, int offset, int maxDocsPerGroup,
int withinGroupOffset, Sort withinGroupSort, boolean fillSortFields) throws IOException {
final GroupDocs[] groups = new GroupDocs[sortedGroups.length - offset];
final FakeScorer fakeScorer = new FakeScorer();
int totalGroupedHitCount = 0;
//System.out.println("slot=" + slot);
for(int groupIDX=offset;groupIDX= og.counts.length) {
numChildDocs = 0;
} else {
numChildDocs = og.counts[slot];
}
// Number of documents in group should be bounded to prevent redundant memory allocation
final int numDocsInGroup = Math.max(1, Math.min(numChildDocs, maxDocsPerGroup));
//System.out.println("parent doc=" + og.doc + " numChildDocs=" + numChildDocs + " maxDocsPG=" + maxDocsPerGroup);
// At this point we hold all docs w/ in each group,
// unsorted; we now sort them:
final TopDocsCollector> collector;
if (withinGroupSort == null) {
//System.out.println("sort by score");
// Sort by score
if (!trackScores) {
throw new IllegalArgumentException("cannot sort by relevance within group: trackScores=false");
}
collector = TopScoreDocCollector.create(numDocsInGroup, true);
} else {
// Sort by fields
collector = TopFieldCollector.create(withinGroupSort, numDocsInGroup, fillSortFields, trackScores, trackMaxScore, true);
}
collector.setScorer(fakeScorer);
collector.setNextReader(og.readerContext);
for(int docIDX=0;docIDX(og.score,
topDocs.getMaxScore(),
numChildDocs,
topDocs.scoreDocs,
og.doc,
groupSortValues);
}
return new TopGroups<>(new TopGroups<>(sort.getSort(),
withinGroupSort == null ? null : withinGroupSort.getSort(),
0, totalGroupedHitCount, groups, maxScore),
totalHitCount);
}
/** Returns the TopGroups for the specified BlockJoinQuery.
 * The groupValue of each GroupDocs will be the parent docID for that group.
 * The number of documents within each group
 * equals the total number of matched child documents for that group.
 * Returns null if no groups matched.
 *
 * <p>This delegates to {@link #getTopGroups} with
 * {@code Integer.MAX_VALUE} as the per-group document limit.
 *
 * @param query Search query
 * @param withinGroupSort Sort criteria within groups
 * @param offset Parent docs offset
 * @param withinGroupOffset Offset within each group of child docs
 * @param fillSortFields Specifies whether to add sort fields or not
 * @return TopGroups for specified query
 * @throws IOException if there is a low-level I/O error
 */
public TopGroups<Integer> getTopGroupsWithAllChildDocs(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset,
    int withinGroupOffset, boolean fillSortFields)
    throws IOException {
  return getTopGroups(query, withinGroupSort, offset, Integer.MAX_VALUE, withinGroupOffset, fillSortFields);
}
/**
 * Returns the highest score across all collected parent hits, as long as
 * {@code trackMaxScore=true} was passed
 * {@link #ToParentBlockJoinCollector(Sort, int, boolean, boolean) on
 * construction}. Else, this returns {@code Float.NaN}.
 *
 * @return the tracked maximum parent score, or {@code Float.NaN} when
 *         max-score tracking is disabled
 */
public float getMaxScore() {
  return maxScore;
}
}