org.apache.solr.search.SolrDocumentFetcher Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Apache Solr (module: core)
There is a newer version: 9.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search;

import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.InvertableType;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StoredValue;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.misc.document.LazyDocument;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentBase;
import org.apache.solr.common.SolrException;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.response.DocsStreamer;
import org.apache.solr.response.ResultContext;
import org.apache.solr.schema.AbstractEnumField;
import org.apache.solr.schema.BoolField;
import org.apache.solr.schema.LatLonPointSpatialField;
import org.apache.solr.schema.NumberType;
import org.apache.solr.schema.SchemaField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A helper class of {@link org.apache.solr.search.SolrIndexSearcher} for stored Document related
 * matters including DocValue substitutions.
 */
public class SolrDocumentFetcher {

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /** The searcher whose index reader, field infos and schema this fetcher reads from. */
  private final SolrIndexSearcher searcher;

  /** Number of leaves in the searcher's top reader context, captured at construction. */
  private final int nLeaves;

  /** Copied from {@code SolrConfig.enableLazyFieldLoading} at construction. */
  private final boolean enableLazyFieldLoading;

  /** Document cache; null when caching is disabled or no document cache is configured. */
  private final SolrCache documentCache;

  /** Names of the index fields whose schema field is stored. */
  private final Set allStored;

  /** Names of the index fields accepted by {@code canSubstituteDvForStored}. */
  private final Set dvsCanSubstituteStored;

  /** Contains the names/patterns of all docValues=true,stored=false fields in the schema. */
  private final Set allNonStoredDVs;

  /**
   * Contains the names/patterns of all docValues=true,stored=false,useDocValuesAsStored=true fields
   * in the schema.
   */
  private final Set nonStoredDVsUsedAsStored;

  /**
   * Contains the names/patterns of all docValues=true,stored=false fields, excluding those that are
   * copyField targets in the schema.
   */
  private final Set nonStoredDVsWithoutCopyTargets;

  /**
   * Byte-length limit below which {@code LargeLazyField} keeps a value it has read; read from the
   * {@code solr.largeField.cacheThreshold} system property, defaulting to 512 KiB.
   */
  private static int largeValueLengthCacheThreshold =
      Integer.getInteger("solr.largeField.cacheThreshold", 512 * 1024); // internal setting

  /** Names of the stored fields that the schema flags as large. */
  private final Set largeFields;

  private Collection storedHighlightFieldNames; // lazy populated; use getter

  private Collection indexedFieldNames; // lazy populated; use getter

  @SuppressWarnings({"unchecked"})
  SolrDocumentFetcher(SolrIndexSearcher searcher, SolrConfig solrConfig, boolean cachingEnabled) {
    this.searcher = searcher;
    this.nLeaves = searcher.getTopReaderContext().leaves().size();
    this.enableLazyFieldLoading = solrConfig.enableLazyFieldLoading;
    // A document cache exists only when caching is enabled and a cache config is present.
    this.documentCache =
        (cachingEnabled && solrConfig.documentCacheConfig != null)
            ? solrConfig.documentCacheConfig.newInstance()
            : null;

    final Set substitutable = new HashSet<>();
    final Set storedNames = new HashSet<>();
    final Set largeStoredNames = new HashSet<>();
    final Set nonStoredDvNames = new HashSet<>();
    final Set nonStoredDvAsStoredNames = new HashSet<>();
    final Set nonStoredDvNonCopyTargetNames = new HashSet<>();

    // Walk the index's own field infos rather than the Solr IndexSchema, because only the former
    // can find materialized dynamic fields.
    for (FieldInfo fieldInfo : searcher.getFieldInfos()) {
      final String name = fieldInfo.name;
      final SchemaField schemaField = searcher.getSchema().getFieldOrNull(name);
      if (schemaField == null) {
        continue; // in the index but unknown to the schema
      }

      if (canSubstituteDvForStored(fieldInfo, schemaField)) {
        substitutable.add(name);
      }

      if (schemaField.stored()) {
        storedNames.add(name);
        if (schemaField.isLarge()) {
          largeStoredNames.add(schemaField.getName());
        }
      } else if (schemaField.hasDocValues()) {
        // docValues=true, stored=false
        nonStoredDvNames.add(name);
        if (schemaField.useDocValuesAsStored()) {
          nonStoredDvAsStoredNames.add(name);
        }
        if (!searcher.getSchema().isCopyFieldTarget(schemaField)) {
          nonStoredDvNonCopyTargetNames.add(name);
        }
      }
    }

    // Expose every classification as a read-only view.
    this.allStored = Collections.unmodifiableSet(storedNames);
    this.dvsCanSubstituteStored = Collections.unmodifiableSet(substitutable);
    this.largeFields = Collections.unmodifiableSet(largeStoredNames);
    this.allNonStoredDVs = Collections.unmodifiableSet(nonStoredDvNames);
    this.nonStoredDVsUsedAsStored = Collections.unmodifiableSet(nonStoredDvAsStoredNames);
    this.nonStoredDVsWithoutCopyTargets =
        Collections.unmodifiableSet(nonStoredDvNonCopyTargetNames);
  }

  // Does this field have both stored=true and docValues=true and is otherwise
  // eligible for getting the field's value from DV?
  private boolean canSubstituteDvForStored(FieldInfo fieldInfo, SchemaField schemaField) {
    // Only single-valued fields that are both docValues-backed and stored can qualify.
    final boolean storedWithDocValues = schemaField.hasDocValues() && schemaField.stored();
    if (!storedWithDocValues || schemaField.multiValued()) {
      return false;
    }

    final DocValuesType dvType = fieldInfo.getDocValuesType();
    final NumberType numberType = schemaField.getType().getNumberType();
    final boolean numericDocValues =
        dvType == DocValuesType.SORTED_NUMERIC || dvType == DocValuesType.NUMERIC;

    // A numeric docValue can not be decoded without knowing its numberType.
    return numberType != null || !numericDocValues;
  }

  /** Returns the lazy-field-loading flag that was read from the SolrConfig at construction. */
  public boolean isLazyFieldLoadingEnabled() {
    return enableLazyFieldLoading;
  }

  /**
   * Returns the document cache, or null when caching was disabled at construction or no document
   * cache is configured.
   */
  public SolrCache getDocumentCache() {
    return documentCache;
  }

  /**
   * Returns a collection of the names of all stored, highlightable fields that the index reader
   * knows about.
   */
  public Collection getStoredHighlightFieldNames() {
    synchronized (this) {
      if (storedHighlightFieldNames != null) {
        return storedHighlightFieldNames;
      }

      // First call: assign the list, then fill it from the index's field infos.
      storedHighlightFieldNames = new ArrayList<>();
      for (FieldInfo fieldInfo : searcher.getFieldInfos()) {
        final String fieldName = fieldInfo.name;
        try {
          final SchemaField schemaField = searcher.getSchema().getField(fieldName);
          if (!schemaField.stored()) {
            continue;
          }
          // Only stored text and string fields are reported.
          final org.apache.solr.schema.FieldType type = schemaField.getType();
          if (type instanceof org.apache.solr.schema.TextField
              || type instanceof org.apache.solr.schema.StrField) {
            storedHighlightFieldNames.add(fieldName);
          }
        } catch (RuntimeException e) {
          // getField() throws a SolrException, but it arrives as a RuntimeException
          log.warn("Field [{}] found in index, but not defined in schema.", fieldName);
        }
      }
      return storedHighlightFieldNames;
    }
  }

  /** Returns a collection of the names of all indexed fields which the index reader knows about. */
  public Collection getIndexedFieldNames() {
    synchronized (this) {
      if (indexedFieldNames != null) {
        return indexedFieldNames;
      }

      // First call: assign the list, then add every field that has index options.
      indexedFieldNames = new ArrayList<>();
      for (FieldInfo fieldInfo : searcher.getFieldInfos()) {
        if (fieldInfo.getIndexOptions() == IndexOptions.NONE) {
          continue; // not indexed
        }
        indexedFieldNames.add(fieldInfo.name);
      }
      return indexedFieldNames;
    }
  }

  /**
   * @see SolrIndexSearcher#doc(int)
   */
  public Document doc(int docId) throws IOException {
    // A null field set asks for all fields; the cast picks the (int, Set) overload over the
    // (int, StoredFieldVisitor) one.
    return doc(docId, (Set) null);
  }

  /**
   * Retrieve the {@link Document} instance corresponding to the document id.
   *
   * NOTE: the document will have all fields accessible, but if a field filter is
   * provided, only the provided fields will be loaded (the remainder will be available lazily).
   *
   * @see SolrIndexSearcher#doc(int, Set)
   */
  public Document doc(int i, Set fields) throws IOException {
    if (documentCache == null) {
      // No cache: read straight from the index.
      return docNC(i, fields);
    }

    // When lazy loading is off, the cached entry is loaded with every field (null filter).
    final Set eagerFields = enableLazyFieldLoading ? fields : null;
    final Document cached = documentCache.computeIfAbsent(i, docId -> docNC(docId, eagerFields));
    if (cached != null) {
      return cached;
    }
    // failed to retrieve due to an earlier exception, try again?
    return docNC(i, fields);
  }

  /**
   * Reads document {@code i} directly from the searcher's index reader, without consulting the
   * document cache.
   *
   * @param i the Lucene doc ID
   * @param fields the fields to load eagerly, or null for all fields
   * @return the document assembled by the stored-field visitor
   */
  private Document docNC(int i, Set fields) throws IOException {
    final DirectoryReader indexReader = searcher.getIndexReader();
    final SolrDocumentStoredFieldVisitor fieldVisitor =
        new SolrDocumentStoredFieldVisitor(fields, indexReader, i);
    indexReader.document(i, fieldVisitor);
    return fieldVisitor.getDocument();
  }

  /**
   * This is an optimized version for populating a SolrDocument that:
   *
   * <ol>
   *   <li>fetches all fields from docValues if possible. If no decompression of the stored data
   *       is necessary, we can avoid a disk seek and decompression cycle. This step is only used
   *       if all requested fields are {@code docValues=true stored=false multiValued=false}. This
   *       last restriction exists because multiValued docValues fields do not faithfully reflect
   *       the input order in all cases. The values are returned and no decompression is
   *       necessary.
   *   <li>if 1 is impossible, tries to fetch all requested fields from the stored values. If the
   *       stored data has to be decompressed anyway, it's more efficient to just get all field
   *       values from the stored values. If we got all the requested fields, return.
   *   <li>adds fields where docValues=true stored=false, which thus could not be fetched in
   *       step 2.
   * </ol>
   *
   * <p>This method is designed to be as simple as possible to use, just call it. e.g. {@code
   * SolrDocument sdoc = docFetcher.solrDoc(id, solrReturnFields);} then process the resulting
   * SolrDocument as usual. Subsequent calls with the same solrReturnFields will re-use the
   * optimizer created the first time.
   *
   * <p>NOTE: DO NOT re-use the same SolrReturnFields object if the fields requested change.
   *
   * @param luceneDocId The Lucene doc ID
   * @param solrReturnFields the structure holding the fields to be returned. The first time this
   *     method is called for a particular document list, it will be modified by adding a
   *     RetrieveFieldsOptimizer for use in future calls.
   * @return The SolrDocument with values requested.
   */
  public SolrDocument solrDoc(int luceneDocId, SolrReturnFields solrReturnFields) {
    // The lambda is the factory handed to getFetchOptimizer for building the optimizer.
    return solrReturnFields
        .getFetchOptimizer(() -> new RetrieveFieldsOptimizer(solrReturnFields))
        .getSolrDoc(luceneDocId);
  }

  /**
   * {@link StoredFieldVisitor} which loads the specified fields eagerly (or all if null). If {@link
   * #enableLazyFieldLoading} then the rest get special lazy field entries. Designated "large"
   * fields will always get a special field entry.
   */
  private class SolrDocumentStoredFieldVisitor extends DocumentStoredFieldVisitor {
    /** The document being filled; the same instance {@code getDocument()} returns. */
    private final Document doc;

    /**
     * Produces lazy placeholders for fields that are not loaded eagerly; null when no field filter
     * was given or lazy field loading is disabled. (Arguably a better name than LazyDocument; at
     * least how we use it here.)
     */
    private final LazyDocument lazyFieldProducer;

    private final int docId;

    /** True when a document cache exists and at least one "large" field is known. */
    private final boolean addLargeFieldsLazily;

    SolrDocumentStoredFieldVisitor(Set toLoad, IndexReader reader, int docId) {
      super(toLoad);
      this.docId = docId;
      this.doc = getDocument();
      final boolean loadRemainderLazily = toLoad != null && enableLazyFieldLoading;
      this.lazyFieldProducer = loadRemainderLazily ? new LazyDocument(reader, docId) : null;
      this.addLargeFieldsLazily = (documentCache != null && !largeFields.isEmpty());
      // TODO can we return Status.STOP after a val is loaded and we know there are no other fields
      // of interest?
      //    When: toLoad is one single-valued field, no lazyFieldProducer
    }

    @Override
    public void stringField(FieldInfo fieldInfo, String value) throws IOException {
      final Predicate readAsBytes = ResultContext.READASBYTES.get();
      final boolean requestedAsBytes = readAsBytes != null && readAsBytes.test(fieldInfo.name);
      if (!requestedAsBytes) {
        super.stringField(fieldInfo, value);
        return;
      }

      // Build the stored text field here, copying the index-time settings from the FieldInfo.
      final FieldType storedTextType = new FieldType(TextField.TYPE_STORED);
      storedTextType.setStoreTermVectors(fieldInfo.hasVectors());
      storedTextType.setOmitNorms(fieldInfo.omitsNorms());
      storedTextType.setIndexOptions(fieldInfo.getIndexOptions());
      Objects.requireNonNull(value, "String value should not be null");
      doc.add(new StoredField(fieldInfo.name, value, storedTextType));
    }

    @Override
    public Status needsField(FieldInfo fieldInfo) throws IOException {
      final Status status = super.needsField(fieldInfo);
      assert status != Status.STOP : "Status.STOP not supported or expected";

      final boolean lazyAvailable = lazyFieldProducer != null;

      // "Large" fields are never loaded here; they get a LargeLazyField placeholder instead.
      if (addLargeFieldsLazily && largeFields.contains(fieldInfo.name)) {
        if (lazyAvailable || status == Status.YES) {
          doc.add(new LargeLazyField(fieldInfo.name, docId));
        }
        return Status.NO;
      }

      // Fields that were not requested get a lazy placeholder when lazy loading is active.
      if (lazyAvailable && status == Status.NO) {
        doc.add(lazyFieldProducer.getField(fieldInfo));
      }
      return status;
    }
  }

  /**
   * @see SolrIndexSearcher#doc(int, StoredFieldVisitor)
   */
  public void doc(int docId, StoredFieldVisitor visitor) throws IOException {
    if (documentCache == null) {
      // No cache: let the index reader drive the visitor directly.
      searcher.getIndexReader().document(docId, visitor);
      return;
    }
    // get cached document or retrieve it including all fields (and cache it), then replay it
    visitFromCached(doc(docId), visitor);
  }

  /** Executes a stored field visitor against a hit from the document cache */
  private void visitFromCached(Document document, StoredFieldVisitor visitor) throws IOException {
    for (IndexableField field : document) {
      final FieldInfo fieldInfo = searcher.getFieldInfos().fieldInfo(field.name());
      final StoredFieldVisitor.Status status = visitor.needsField(fieldInfo);
      if (status == StoredFieldVisitor.Status.STOP) {
        return;
      }
      if (status == StoredFieldVisitor.Status.NO) {
        continue;
      }

      // Binary values take precedence over numeric ones, which take precedence over strings.
      final BytesRef bytes = field.binaryValue();
      if (bytes != null) {
        visitor.binaryField(fieldInfo, toByteArrayUnwrapIfPossible(bytes));
        continue;
      }

      final Number number = field.numericValue();
      if (number == null) {
        // must be String
        final String text =
            field instanceof LargeLazyField
                // optimization to avoid premature string conversion
                ? toStringUnwrapIfPossible(((LargeLazyField) field).readBytes())
                : field.stringValue();
        visitor.stringField(fieldInfo, text);
      } else if (number instanceof Double) {
        visitor.doubleField(fieldInfo, number.doubleValue());
      } else if (number instanceof Integer) {
        visitor.intField(fieldInfo, number.intValue());
      } else if (number instanceof Float) {
        visitor.floatField(fieldInfo, number.floatValue());
      } else if (number instanceof Long) {
        visitor.longField(fieldInfo, number.longValue());
      } else {
        throw new AssertionError();
      }
    }
  }

  /**
   * Returns the bytes referenced by {@code bytesRef}: the backing array itself when the reference
   * spans it entirely, otherwise a copy of just the referenced range.
   */
  private byte[] toByteArrayUnwrapIfPossible(BytesRef bytesRef) {
    final boolean spansWholeArray =
        bytesRef.offset == 0 && bytesRef.bytes.length == bytesRef.length;
    if (spansWholeArray) {
      return bytesRef.bytes; // no copy needed
    }
    final int start = bytesRef.offset;
    return Arrays.copyOfRange(bytesRef.bytes, start, start + bytesRef.length);
  }

  /**
   * Decodes the bytes referenced by {@code bytesRef} as a UTF-8 string.
   *
   * <p>The third argument of {@code new String(byte[], int, int, Charset)} is a length, not an end
   * index, so {@code bytesRef.length} is passed as-is. Passing {@code offset + length} would decode
   * past the referenced range (or throw) for any reference with a non-zero offset. The same call
   * also covers a reference that spans the whole backing array, so no separate branch is needed.
   *
   * @param bytesRef the byte range to decode
   * @return the decoded string
   */
  private String toStringUnwrapIfPossible(BytesRef bytesRef) {
    return new String(
        bytesRef.bytes, bytesRef.offset, bytesRef.length, StandardCharsets.UTF_8);
  }

  /**
   * Unlike LazyDocument.LazyField, we (a) don't cache large values, and (b) provide access to the
   * byte[].
   */
  class LargeLazyField implements IndexableField {

    // Name of the stored field this placeholder stands for.
    final String name;
    // Lucene doc ID used when the value is read from the index reader.
    final int docId;
    // synchronize on 'this' to access:
    BytesRef cachedBytes; // we only conditionally populate this if it's big enough

    private LargeLazyField(String name, int docId) {
      this.name = name;
      this.docId = docId;
    }

    @Override
    public String toString() {
      return fieldType().toString() + "<" + name() + ">"; // mimic Field.java
    }

    @Override
    public String name() {
      return name;
    }

    /** Looks the field up by name in the searcher's schema on every call. */
    @Override
    public IndexableFieldType fieldType() {
      return searcher.getSchema().getField(name());
    }

    /** Analyzes the string value; calling this reads the value if it is not cached. */
    @Override
    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
      // or we could throw unsupported exception?
      return analyzer.tokenStream(name(), stringValue());
    }

    /** (for tests) */
    synchronized boolean hasBeenLoaded() {
      return cachedBytes != null;
    }

    /**
     * Returns the value decoded from its UTF-8 bytes. An IOException raised while reading is
     * rethrown wrapped in a RuntimeException.
     */
    @Override
    public synchronized String stringValue() {
      try {
        return readBytes().utf8ToString();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    /**
     * Returns the UTF-8 bytes of the field's value: the cached bytes when present, otherwise bytes
     * read from the index reader. A value that was just read is kept in {@code cachedBytes} only if
     * its length is below {@code largeValueLengthCacheThreshold}.
     *
     * @throws UnsupportedOperationException if the stored value is binary
     */
    synchronized BytesRef readBytes() throws IOException {
      if (cachedBytes != null) {
        return cachedBytes;
      } else {
        BytesRef bytesRef = new BytesRef();
        searcher
            .getIndexReader()
            .document(
                docId,
                new StoredFieldVisitor() {
                  // Set once the string value has been captured, so later fields are skipped.
                  boolean done = false;

                  @Override
                  public Status needsField(FieldInfo fieldInfo) throws IOException {
                    if (done) {
                      return Status.STOP;
                    }
                    return fieldInfo.name.equals(name()) ? Status.YES : Status.NO;
                  }

                  @Override
                  public void stringField(FieldInfo fieldInfo, String value) throws IOException {
                    Objects.requireNonNull(value, "String value should not be null");
                    // The fresh array is used whole, so the BytesRef offset stays at 0.
                    bytesRef.bytes = value.getBytes(StandardCharsets.UTF_8);
                    bytesRef.length = bytesRef.bytes.length;
                    done = true;
                  }

                  @Override
                  public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
                    throw new UnsupportedOperationException(
                        "'large' binary fields are not (yet) supported");
                  }
                });
        if (bytesRef.length < largeValueLengthCacheThreshold) {
          return cachedBytes = bytesRef;
        } else {
          return bytesRef;
        }
      }
    }

    // Always null: the value is exposed only through stringValue()/readBytes().
    @Override
    public BytesRef binaryValue() {
      return null;
    }

    // Always null.
    @Override
    public Reader readerValue() {
      return null;
    }

    // Always null.
    @Override
    public Number numericValue() {
      return null;
    }

    /** Wraps the string value; calling this reads the value if it is not cached. */
    @Override
    public StoredValue storedValue() {
      return new StoredValue(stringValue());
    }

    // Always null.
    @Override
    public InvertableType invertableType() {
      return null;
    }
  }

  /**
   * This will fetch and add the docValues fields to a given SolrDocument/SolrInputDocument
   *
   * @param doc A SolrDocument or SolrInputDocument instance where docValues will be added
   * @param docid The lucene docid of the document to be populated
   * @param fields The fields with docValues to populate the document with. DocValues fields which
   *     do not exist or not decodable will be ignored.
   */
  public void decorateDocValueFields(
      SolrDocumentBase doc,
      int docid,
      Set fields,
      DocValuesIteratorCache reuseDvIters)
      throws IOException {
    // Resolve the global doc ID to its segment and the segment-local doc ID.
    final List leafContexts = searcher.getLeafContexts();
    final int subIndex = ReaderUtil.subIndex(docid, leafContexts);
    final int localId = docid - leafContexts.get(subIndex).docBase;
    final LeafReader leafReader = leafContexts.get(subIndex).reader();

    for (String fieldName : fields) {
      final DocValuesIteratorCache.FieldDocValuesSupplier supplier =
          reuseDvIters.getSupplier(fieldName);
      if (supplier == null) {
        continue; // no docValues supplier for this field
      }
      final Object decoded = decodeDVField(localId, leafReader, subIndex, supplier);
      if (decoded == null) {
        continue; // nothing decodable for this document
      }
      doc.setField(fieldName, decoded);
    }
  }

  /**
   * Decodes the value of a DV field for a document.
   *
   * @return null if the DV field does not exist or cannot be decoded
   */
  private Object decodeDVField(
      int localId,
      LeafReader leafReader,
      int readerOrd,
      DocValuesIteratorCache.FieldDocValuesSupplier e)
      throws IOException {

    final DocValuesType dvType = e.type;
    switch (dvType) {
      case NUMERIC:
        final NumericDocValues ndv = e.getNumericDocValues(localId, leafReader, readerOrd);
        if (ndv == null) {
          return null;
        }
        long val = ndv.longValue();
        return decodeNumberFromDV(e.schemaField, val, false);
      case BINARY:
        BinaryDocValues bdv = e.getBinaryDocValues(localId, leafReader, readerOrd);
        if (bdv != null) {
          return BytesRef.deepCopyOf(bdv.binaryValue());
        }
        return null;
      case SORTED:
        SortedDocValues sdv = e.getSortedDocValues(localId, leafReader, readerOrd);
        if (sdv != null) {
          final BytesRef bRef = sdv.lookupOrd(sdv.ordValue());
          // Special handling for Boolean fields since they're stored as 'T' and 'F'.
          if (e.schemaField.getType() instanceof BoolField) {
            return e.schemaField.getType().toObject(e.schemaField, bRef);
          } else {
            return bRef.utf8ToString();
          }
        }
        return null;
      case SORTED_NUMERIC:
        final SortedNumericDocValues numericDv =
            e.getSortedNumericDocValues(localId, leafReader, readerOrd);
        if (numericDv != null) {
          final int docValueCount = numericDv.docValueCount();
          final List