org.apache.solr.search.GraphTermsQParserPlugin

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.FutureArrays;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.NumberType;
import org.apache.solr.schema.SchemaField;

/**
 *  The GraphTermsQuery builds a disjunction query from a list of terms. The terms are first filtered
 *  by the maxDocFreq parameter, which lets graph traversals skip high-frequency nodes. This is often
 *  desirable from a performance standpoint.
 *
 *   Syntax: {!graphTerms f=field maxDocFreq=10000}term1,term2,term3
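 *
 *   Example request (illustrative; the field name and values are hypothetical):
 *   q={!graphTerms f=node_id maxDocFreq=10000}node1,node2,node3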
 */
public class GraphTermsQParserPlugin extends QParserPlugin {
  public static final String NAME = "graphTerms";

  @Override
  public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
    return new QParser(qstr, localParams, params, req) {
      @Override
      public Query parse() throws SyntaxError {
        String fname = localParams.get(QueryParsing.F);
        FieldType ft = req.getSchema().getFieldTypeNoEx(fname);
        int maxDocFreq = localParams.getInt("maxDocFreq", Integer.MAX_VALUE);
        String qstr = localParams.get(QueryParsing.V);//never null

        if (qstr.length() == 0) {
          return new MatchNoDocsQuery();
        }

        final String[] splitVals = qstr.split(",");

        SchemaField sf = req.getSchema().getField(fname);

        // if we don't limit by maxDocFreq, then simply use a normal set query
        if (maxDocFreq == Integer.MAX_VALUE) {
          return sf.getType().getSetQuery(this, sf, Arrays.asList(splitVals));
        }

        if (sf.getType().isPointField()) {
          PointSetQuery setQ = null;
          if (sf.getType().getNumberType() == NumberType.INTEGER) {
            int[] vals = new int[splitVals.length];
            for (int i = 0; i < splitVals.length; i++) {
              vals[i] = Integer.parseInt(splitVals[i].trim());
            }
            Arrays.sort(vals);
            setQ = PointSetQuery.newSetQuery(sf.getName(), vals);
          } else if (sf.getType().getNumberType() == NumberType.LONG || sf.getType().getNumberType() == NumberType.DATE) {
            long[] vals = new long[splitVals.length];
            for (int i = 0; i < splitVals.length; i++) {
              vals[i] = Long.parseLong(splitVals[i].trim());
            }
            Arrays.sort(vals);
            setQ = PointSetQuery.newSetQuery(sf.getName(), vals);
          } else if (sf.getType().getNumberType() == NumberType.FLOAT) {
            float[] vals = new float[splitVals.length];
            for (int i = 0; i < splitVals.length; i++) {
              vals[i] = Float.parseFloat(splitVals[i].trim());
            }
            Arrays.sort(vals);
            setQ = PointSetQuery.newSetQuery(sf.getName(), vals);
          } else if (sf.getType().getNumberType() == NumberType.DOUBLE) {
            double[] vals = new double[splitVals.length];
            for (int i = 0; i < splitVals.length; i++) {
              vals[i] = Double.parseDouble(splitVals[i].trim());
            }
            setQ = PointSetQuery.newSetQuery(sf.getName(), vals);
          }
          setQ.setMaxDocFreq(maxDocFreq);
          return setQ;
        }

        // Terms-indexed field: convert each readable value to its indexed form and
        // defer the docFreq filtering to GraphTermsQuery at weight creation time.
        Term[] terms = new Term[splitVals.length];
        BytesRefBuilder term = new BytesRefBuilder();
        for (int i = 0; i < splitVals.length; i++) {
          String stringVal = splitVals[i].trim();
          if (ft != null) {
            ft.readableToIndexed(stringVal, term);
          } else {
            term.copyChars(stringVal);
          }
          terms[i] = new Term(fname, term.toBytesRef());
        }
        Arrays.sort(terms);

        return new ConstantScoreQuery(new GraphTermsQuery(fname, terms, maxDocFreq));
      }
    };
  }

  private class GraphTermsQuery extends Query {

    private final String field;
    private final Term[] queryTerms;
    private final int maxDocFreq;
    // identity-based equality keeps this query out of value-based caches
    private final Object id = new Object();

    public GraphTermsQuery(String field, Term[] terms, int maxDocFreq) {
      this.field = field;
      this.queryTerms = terms;
      this.maxDocFreq = maxDocFreq;
    }

    @Override
    public int hashCode() {
      return 31 * classHash() + id.hashCode();
    }

    @Override
    public boolean equals(Object other) {
      return sameClassAs(other) && id == getClass().cast(other).id;
    }

    @Override
    public void visit(QueryVisitor visitor) {
      visitor.visitLeaf(this);
    }

    @Override
    public String toString(String defaultField) {
      return Arrays.stream(this.queryTerms).map(Term::toString).collect(Collectors.joining(","));
    }

    @Override
    public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
      // Collect TermStates for every query term across all segments, then keep only
      // the terms whose index-wide docFreq does not exceed maxDocFreq.
      List<TermStates> finalContexts = new ArrayList<>();
      List<Term> finalTerms = new ArrayList<>();
      {
        List<LeafReaderContext> contexts = searcher.getTopReaderContext().leaves();
        TermStates[] termStates = new TermStates[this.queryTerms.length];
        collectTermStates(searcher.getIndexReader(), contexts, termStates, this.queryTerms);
        for (int i = 0; i < termStates.length; i++) {
          TermStates ts = termStates[i];
          if (ts != null && ts.docFreq() <= this.maxDocFreq) {
            finalContexts.add(ts);
            finalTerms.add(queryTerms[i]);
          }
        }
      }

      return new ConstantScoreWeight(this, boost) {
        @Override
        public void extractTerms(Set<Term> terms) {
          // no-op
          // This query is for abuse cases when the number of terms is too high to
          // run efficiently as a BooleanQuery. So likewise we hide its terms in
          // order to protect highlighters
        }

        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
          final LeafReader reader = context.reader();
          Terms terms = reader.terms(field);
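          // the field has no indexed terms in this segment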
          if (terms == null) {
            return null;
          }
          TermsEnum termsEnum = terms.iterator();
          PostingsEnum docs = null;
          DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc(), terms);
          for (int i = 0; i < finalContexts.size(); i++) {
            TermStates ts = finalContexts.get(i);
            TermState termState = ts.get(context);
            if (termState != null) {
              Term term = finalTerms.get(i);
              termsEnum.seekExact(term.bytes(), termState);
              docs = termsEnum.postings(docs, PostingsEnum.NONE);
              builder.add(docs);
            }
          }
          DocIdSet docIdSet = builder.build();
          DocIdSetIterator disi = docIdSet.iterator();
          if (disi == null) {
            return null;
          }
          return new ConstantScoreScorer(this, score(), scoreMode, disi);
        }

        @Override
        public boolean isCacheable(LeafReaderContext ctx) {
          return true;
        }
      };
    }

    private void collectTermStates(IndexReader reader,
                                   List<LeafReaderContext> leaves,
                                   TermStates[] contextArray,
                                   Term[] queryTerms) throws IOException {
      TermsEnum termsEnum = null;
      for (LeafReaderContext context : leaves) {

        Terms terms = context.reader().terms(this.field);
        if (terms == null) {
          // field does not exist
          continue;
        }

        termsEnum = terms.iterator();

        if (termsEnum == TermsEnum.EMPTY) continue;

        for (int i = 0; i < queryTerms.length; i++) {
          Term term = queryTerms[i];
          TermStates termStates = contextArray[i];

          if (termsEnum.seekExact(term.bytes())) {
            if (termStates == null) {
              contextArray[i] = new TermStates(reader.getContext(),
                  termsEnum.termState(), context.ord, termsEnum.docFreq(),
                  termsEnum.totalTermFreq());
            } else {
              termStates.register(termsEnum.termState(), context.ord,
                  termsEnum.docFreq(), termsEnum.totalTermFreq());
            }
          }
        }
      }
    }
  }
}



// modified version of PointInSetQuery
abstract class PointSetQuery extends Query implements DocSetProducer, Accountable {
  protected static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(PointSetQuery.class);

  // A little bit overkill for us, since all of our "terms" are always in the same field:
  final PrefixCodedTerms sortedPackedPoints;
  final int sortedPackedPointsHashCode;
  final String field;
  final int bytesPerDim;
  final int numDims;
  int maxDocFreq = Integer.MAX_VALUE;
  final long ramBytesUsed; // cache

  /**
   * Iterator of encoded point values.
   */
  // TODO: if we want to stream, maybe we should use jdk stream class?
  public static abstract class Stream implements BytesRefIterator {
    @Override
    public abstract BytesRef next();
  };

  public void setMaxDocFreq(int maxDocFreq) {
    this.maxDocFreq = maxDocFreq;
  }

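  // NOTE: the float/long/int overloads below expect the caller to pass values that are already
  // sorted (the parser sorts before calling); the double overload defensively clones and sorts.
  // Out-of-order values are rejected by the PrefixCodedTerms.Builder in the constructor.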
  public static PointSetQuery newSetQuery(String field, float... sortedValues) {

    final BytesRef encoded = new BytesRef(new byte[Float.BYTES]);

    return new PointSetQuery(field, 1, Float.BYTES,
        new PointSetQuery.Stream() {

          int upto;

          @Override
          public BytesRef next() {
            if (upto == sortedValues.length) {
              return null;
            } else {
              FloatPoint.encodeDimension(sortedValues[upto], encoded.bytes, 0);
              upto++;
              return encoded;
            }
          }
        }) {
      @Override
      protected String toString(byte[] value) {
        assert value.length == Float.BYTES;
        return Float.toString(FloatPoint.decodeDimension(value, 0));
      }
    };
  }

  public static PointSetQuery newSetQuery(String field, long... sortedValues) {
    final BytesRef encoded = new BytesRef(new byte[Long.BYTES]);

    return new PointSetQuery(field, 1, Long.BYTES,
        new PointSetQuery.Stream() {

          int upto;

          @Override
          public BytesRef next() {
            if (upto == sortedValues.length) {
              return null;
            } else {
              LongPoint.encodeDimension(sortedValues[upto], encoded.bytes, 0);
              upto++;
              return encoded;
            }
          }
        }) {
      @Override
      protected String toString(byte[] value) {
        assert value.length == Long.BYTES;
        return Long.toString(LongPoint.decodeDimension(value, 0));
      }
    };
  }

  public static PointSetQuery newSetQuery(String field, int... sortedValues) {
    final BytesRef encoded = new BytesRef(new byte[Integer.BYTES]);

    return new PointSetQuery(field, 1, Integer.BYTES,
        new PointSetQuery.Stream() {

          int upto;

          @Override
          public BytesRef next() {
            if (upto == sortedValues.length) {
              return null;
            } else {
              IntPoint.encodeDimension(sortedValues[upto], encoded.bytes, 0);
              upto++;
              return encoded;
            }
          }
        }) {
      @Override
      protected String toString(byte[] value) {
        assert value.length == Integer.BYTES;
        return Integer.toString(IntPoint.decodeDimension(value, 0));
      }
    };
  }

  public static PointSetQuery newSetQuery(String field, double... values) {

    // Don't unexpectedly change the user's incoming values array:
    double[] sortedValues = values.clone();
    Arrays.sort(sortedValues);

    final BytesRef encoded = new BytesRef(new byte[Double.BYTES]);

    return new PointSetQuery(field, 1, Double.BYTES,
        new PointSetQuery.Stream() {

          int upto;

          @Override
          public BytesRef next() {
            if (upto == sortedValues.length) {
              return null;
            } else {
              DoublePoint.encodeDimension(sortedValues[upto], encoded.bytes, 0);
              upto++;
              return encoded;
            }
          }
        }) {
      @Override
      protected String toString(byte[] value) {
        assert value.length == Double.BYTES;
        return Double.toString(DoublePoint.decodeDimension(value, 0));
      }
    };
  }

  public PointSetQuery(String field, int numDims, int bytesPerDim, Stream packedPoints) {
    this.field = field;
    this.bytesPerDim = bytesPerDim;
    this.numDims = numDims;

    // In the 1D case this works well (the more points, the more common prefixes they share, typically), but in
    // the > 1 D case, where we are only looking at the first dimension's prefix bytes, it can at worst not hurt:
    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
    BytesRefBuilder previous = null;
    BytesRef current;
    while ((current = packedPoints.next()) != null) {
      if (current.length != numDims * bytesPerDim) {
        throw new IllegalArgumentException("packed point length should be " + (numDims * bytesPerDim) + " but got " + current.length + "; field=\"" + field + "\" numDims=" + numDims + " bytesPerDim=" + bytesPerDim);
      }
      if (previous == null) {
        previous = new BytesRefBuilder();
      } else {
        int cmp = previous.get().compareTo(current);
        if (cmp == 0) {
          continue; // deduplicate
        } else if (cmp > 0) {
          throw new IllegalArgumentException("values are out of order: saw " + previous + " before " + current);
        }
      }
      builder.add(field, current);
      previous.copyBytes(current);
    }
    sortedPackedPoints = builder.finish();
    sortedPackedPointsHashCode = sortedPackedPoints.hashCode();
    ramBytesUsed = BASE_RAM_BYTES +
        RamUsageEstimator.sizeOfObject(sortedPackedPoints);
  }

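  /**
   * Returns a bitset of live documents for the searcher, or null when the index has no
   * deletions (meaning every document is live and no filtering is required).
   */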
  private FixedBitSet getLiveDocs(IndexSearcher searcher) throws IOException {
    if (!searcher.getIndexReader().hasDeletions()) {
      return null;
    }
    if (searcher instanceof SolrIndexSearcher) {
      return ((SolrIndexSearcher) searcher).getLiveDocSet().getBits();
    } else {
      // TODO Does this ever happen?  In Solr should always be SolrIndexSearcher?
      //smallSetSize==0 thus will always produce a BitDocSet (FixedBitSet)
      DocSetCollector docSetCollector = new DocSetCollector(0, searcher.getIndexReader().maxDoc());
      searcher.search(new MatchAllDocsQuery(), docSetCollector);
      return ((BitDocSet) docSetCollector.getDocSet()).getBits();
    }
  }

  @Override
  public DocSet createDocSet(SolrIndexSearcher searcher) throws IOException {
    return getDocSet(searcher);
  }

  @Override
  public long ramBytesUsed() {
    return ramBytesUsed;
  }

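  /**
   * Builds the DocSet for the point set: each encoded point is intersected against every
   * segment's point tree, and any point whose cumulative hit count exceeds maxDocFreq is
   * skipped entirely.
   */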
  public DocSet getDocSet(IndexSearcher searcher) throws IOException {
    IndexReaderContext top = ReaderUtil.getTopLevelContext(searcher.getTopReaderContext());
    List<LeafReaderContext> segs = top.leaves();
    DocSetBuilder builder = new DocSetBuilder(top.reader().maxDoc(), Math.min(64,(top.reader().maxDoc()>>>10)+4));
    PointValues[] segPoints = new PointValues[segs.size()];
    for (int i = 0; i < segPoints.length; i++) {
      segPoints[i] = segs.get(i).reader().getPointValues(field);
    }

    // size the visitor's id buffer: a point can match at most maxDocFreq docs before we bail out
    int maxCollect = Math.min(maxDocFreq, top.reader().maxDoc());

    CutoffPointVisitor visitor = new CutoffPointVisitor(maxCollect);
    PrefixCodedTerms.TermIterator iterator = sortedPackedPoints.iterator();
    outer:
    for (BytesRef point = iterator.next(); point != null; point = iterator.next()) {
      visitor.setPoint(point);
      for (int i = 0; i < segs.size(); i++) {
        if (segPoints[i] == null) continue;
        visitor.setBase(segs.get(i).docBase);
        segPoints[i].intersect(visitor);
        if (visitor.getCount() > maxDocFreq) {
          continue outer;
        }
      }
      int collected = visitor.getCount();
      int[] ids = visitor.getGlobalIds();
      for (int i = 0; i < collected; i++) {
        builder.add(ids[i]);
      }
    }

    FixedBitSet liveDocs = getLiveDocs(searcher);
    return builder.buildUniqueInOrder(liveDocs);
  }

  @Override
  public final Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
    return new ConstantScoreWeight(this, boost) {
      Filter filter;

      @Override
      public Scorer scorer(LeafReaderContext context) throws IOException {
        if (filter == null) {
          DocSet set = getDocSet(searcher);
          filter = set.getTopFilter();
        }
        // The set only includes live docs; convert it to a per-segment DocIdSet.
        DocIdSet readerSet = filter.getDocIdSet(context, null);
        if (readerSet == null) {
          return null;
        }
        DocIdSetIterator readerSetIterator = readerSet.iterator();
        if (readerSetIterator == null) {
          return null;
        }
        return new ConstantScoreScorer(this, score(), scoreMode, readerSetIterator);
      }

      @Override
      public boolean isCacheable(LeafReaderContext ctx) {
        return false;
      }
    };
  }

  /** Collects global doc ids for one point at a time, so the caller can enforce the maxDocFreq cutoff. */
  private class CutoffPointVisitor implements PointValues.IntersectVisitor {
    private int[] ids;
    private int count;
    private int base;
    private final byte[] pointBytes;

    public CutoffPointVisitor(int size) {
      this.ids = new int[size];
      this.pointBytes = new byte[bytesPerDim * numDims];
    }

    private void addDoc(int docID) {
      if (count == ids.length) {
        ids = ArrayUtil.grow(ids);
      }
      ids[count++] = docID + base;
    }

    public void setPoint(BytesRef point) {
      // the query constructor verified the packed length up front
      System.arraycopy(point.bytes, point.offset, pointBytes, 0, pointBytes.length);
      count = 0;
    }

    public void setBase(int base) {
      this.base = base;
    }

    public int getCount() {
      return count;
    }

    public int[] getGlobalIds() {
      return ids;
    }

    @Override
    public void visit(int docID) {
      addDoc(docID);
    }

    @Override
    public void visit(int docID, byte[] packedValue) {
      if (Arrays.equals(packedValue, pointBytes)) {
        addDoc(docID);
      }
    }

    @Override
    public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
      boolean crosses = false;

      for (int dim = 0; dim < numDims; dim++) {
        int offset = dim * bytesPerDim;

        int cmpMin = FutureArrays.compareUnsigned(minPackedValue, offset, offset + bytesPerDim, pointBytes, offset, offset + bytesPerDim);
        if (cmpMin > 0) {
          return PointValues.Relation.CELL_OUTSIDE_QUERY;
        }

        int cmpMax = FutureArrays.compareUnsigned(maxPackedValue, offset, offset + bytesPerDim, pointBytes, offset, offset + bytesPerDim);
        if (cmpMax < 0) {
          return PointValues.Relation.CELL_OUTSIDE_QUERY;
        }

        if (cmpMin != 0 || cmpMax != 0) {
          crosses = true;
        }
      }

      if (crosses) {
        return PointValues.Relation.CELL_CROSSES_QUERY;
      } else {
        // NOTE: we only hit this if we are on a cell whose min and max values are exactly equal to our point,
        // which can easily happen if many docs share this one value
        return PointValues.Relation.CELL_INSIDE_QUERY;
      }
    }
  }

  public String getField() {
    return field;
  }

  public int getNumDims() {
    return numDims;
  }

  public int getBytesPerDim() {
    return bytesPerDim;
  }

  @Override
  public final int hashCode() {
    int hash = classHash();
    hash = 31 * hash + field.hashCode();
    hash = 31 * hash + sortedPackedPointsHashCode;
    hash = 31 * hash + numDims;
    hash = 31 * hash + bytesPerDim;
    hash = 31 * hash + maxDocFreq;
    return hash;
  }

  @Override
  public final boolean equals(Object other) {
    return sameClassAs(other) &&
        equalsTo(getClass().cast(other));
  }

  private boolean equalsTo(PointSetQuery other) {
    return other.field.equals(field) &&
        other.numDims == numDims &&
        other.bytesPerDim == bytesPerDim &&
        other.sortedPackedPointsHashCode == sortedPackedPointsHashCode &&
        other.sortedPackedPoints.equals(sortedPackedPoints) &&
        other.maxDocFreq == maxDocFreq;
  }

  @Override
  public final String toString(String field) {
    final StringBuilder sb = new StringBuilder();
    if (this.field.equals(field) == false) {
      sb.append(this.field);
      sb.append(':');
    }

    sb.append("{");

    PrefixCodedTerms.TermIterator iterator = sortedPackedPoints.iterator();
    byte[] pointBytes = new byte[numDims * bytesPerDim];
    boolean first = true;
    for (BytesRef point = iterator.next(); point != null; point = iterator.next()) {
      if (first == false) {
        sb.append(" ");
      }
      first = false;
      System.arraycopy(point.bytes, point.offset, pointBytes, 0, pointBytes.length);
      sb.append(toString(pointBytes));
    }
    sb.append("}");
    return sb.toString();
  }

  @Override
  public void visit(QueryVisitor visitor) {
    visitor.visitLeaf(this);
  }

  protected abstract String toString(byte[] value);
}



