org.apache.solr.search.join.GraphTermsCollector
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search.join;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.DocValuesTermsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocSet;

/**
 * A graph hit collector.  This accumulates the edges for a given graph traversal.
 * On each call to {@link #collect(int)}, the collector skips edge extraction for
 * nodes that it has already traversed.
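 * <p>
 * Illustrative usage sketch only ({@code searcher}, {@code rootQuery},
 * {@code visited}, {@code matchField} and {@code useAutomaton} are assumed
 * names, not part of this file); a graph query driver loops roughly like:
 * <pre>
 *   Query frontier = rootQuery;
 *   while (frontier != null) {
 *     GraphTermsCollector c = new GraphTermsCollector(collectField, visited, leafNodes);
 *     searcher.search(frontier, c);
 *     frontier = c.getResultQuery(matchField, useAutomaton); // null when no new edges
 *     // 'visited' grows by the docs just collected before the next hop
 *   }
 * </pre>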
 * @lucene.internal
 */
abstract class GraphEdgeCollector extends SimpleCollector {
  // For graph traversal: the set of documents already visited, which can therefore be skipped during value collection.
  DocSet skipSet;
  // known leaf nodes
  DocSet leafNodes;

  int numHits = 0;  // number of documents visited
  BitSet bits;  // if not null, used to collect documents visited

  int base;

  SchemaField collectField;

  // skipSet and leafNodes may be null
  GraphEdgeCollector(SchemaField collectField, DocSet skipSet, DocSet leafNodes) {
    this.collectField = collectField;
    this.skipSet = skipSet;
    this.leafNodes = leafNodes;
  }

  // Set to use to collect docs being visited
  // TODO: this should be replaced with a more general delegating collector
  public void setCollectDocs(FixedBitSet target) {
    this.bits = target;
  }

  // the number of docs visited
  public int getNumHits() { return numHits; }
  
  @Override
  public void collect(int segDoc) throws IOException {
    int doc = segDoc + base;
    if (skipSet != null && skipSet.exists(doc)) {
      // If skipSet is the set of all nodes visited so far, then reaching this doc
      // again represents a cycle; we could track cycles here in the future if needed.
      return;
    }

    if (bits != null) bits.set(doc);
    // increment the hit count so we know how many docs we traversed this time.
    numHits++;

    // Optimization to not look up edges for a document that is a leaf node (i.e. has no outgoing edges)
    if (leafNodes == null || !leafNodes.exists(doc)) {
      addEdgeIdsToResult(segDoc);
    } 
    // Note: tracking links in for each result would be a huge memory hog... so not implementing at this time.
  }
  
  abstract void addEdgeIdsToResult(int doc) throws IOException;
  
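  // Note: addDocToResult is not referenced elsewhere in this file; collect()
  // marks visited docs and counts hits directly.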
  private void addDocToResult(int docWithBase) {
    // this document is part of the traversal. mark it in our bitmap.
    bits.set(docWithBase);
    // increment the hit count so we know how many docs we traversed this time.
    numHits++;
  }
  
  @Override
  public void doSetNextReader(LeafReaderContext context) throws IOException {
    base = context.docBase;
  }

  public abstract Query getResultQuery(SchemaField matchField, boolean useAutomaton);

  @Override
  public ScoreMode scoreMode() {
    return ScoreMode.COMPLETE_NO_SCORES;
  }
  
}

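/**
 * Collects the edge ids for each visited document from {@code collectField}'s
 * sorted-set doc values, and turns the accumulated terms into the next-hop query.
 */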
class GraphTermsCollector extends GraphEdgeCollector {
  // all the collected terms
  private BytesRefHash collectorTerms;
  private SortedSetDocValues docTermOrds;

  GraphTermsCollector(SchemaField collectField, DocSet skipSet, DocSet leafNodes) {
    super(collectField, skipSet, leafNodes);
    this.collectorTerms = new BytesRefHash();
  }

  @Override
  public void doSetNextReader(LeafReaderContext context) throws IOException {
    super.doSetNextReader(context);
    // Grab the updated doc values.
    docTermOrds = DocValues.getSortedSet(context.reader(), collectField.getName());
  }

  @Override
  void addEdgeIdsToResult(int doc) throws IOException {
    // set the doc to pull the edges ids for.
    if (doc > docTermOrds.docID()) {
      docTermOrds.advance(doc);
    }
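    // (docTermOrds is a forward-only iterator; because docs arrive in increasing
    //  order, the advance()/docID() pair here behaves like advanceExact(doc) in
    //  Lucene 7+.)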
    if (doc == docTermOrds.docID()) {
      long ord;
      while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        // add the edge id (the term for this ord) to the collector terms;
        // BytesRefHash copies the bytes, so the reused lookupOrd ref is safe.
        collectorTerms.add(docTermOrds.lookupOrd(ord));
      }
      }
    }
  }

  @Override
  public Query getResultQuery(SchemaField matchField, boolean useAutomaton) {
    if (collectorTerms == null || collectorTerms.size() == 0) {
      // return null if there are no terms (edges) to traverse.
      return null;
    } else {
      // Create a query
      Query q = null;

      // TODO: see if we should dynamically select this based on the frontier size.
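      // (A hypothetical heuristic, not implemented here: pick the automaton only
      //  when collectorTerms.size() exceeds some threshold, since building the
      //  automaton has an upfront cost that only pays off for large frontiers.)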
      if (useAutomaton) {
        // build an automaton based query for the frontier.
        Automaton autn = buildAutomaton(collectorTerms);
        AutomatonQuery autnQuery = new AutomatonQuery(new Term(matchField.getName()), autn);
        q = autnQuery;
      } else {
        List<BytesRef> termList = new ArrayList<>(collectorTerms.size());
        for (int i = 0 ; i < collectorTerms.size(); i++) {
          BytesRef ref = new BytesRef();
          collectorTerms.get(i, ref);
          termList.add(ref);
        }
        q = (matchField.hasDocValues() && !matchField.indexed())
            ? new DocValuesTermsQuery(matchField.getName(), termList)
            : new TermInSetQuery(matchField.getName(), termList);
      }

      return q;
    }
  }


  /** Build an automaton to represent the frontier query */
  private Automaton buildAutomaton(BytesRefHash termBytesHash) {
    // need to pass a sorted set of terms to the automaton builder (maybe a better way to avoid this?)
    final TreeSet<BytesRef> terms = new TreeSet<>();
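    // (A possible alternative, assuming BytesRefHash.sort() is available in this
    //  Lucene version: it returns the ids in sorted term order, which would avoid
    //  the intermediate TreeSet copy.)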
    for (int i = 0 ; i < termBytesHash.size(); i++) {
      BytesRef ref = new BytesRef();
      termBytesHash.get(i, ref);
      terms.add(ref);
    }
    final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
    return a;
  }
}
