org.apache.solr.search.SolrDocumentFetcher Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Apache Solr (module: core)
There is a newer version: 9.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search;

import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Predicate;
import java.util.function.Supplier;

import org.apache.commons.collections.CollectionUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.LazyDocument;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentBase;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.ByteArrayUtf8CharSequence;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.response.DocsStreamer;
import org.apache.solr.response.ResultContext;
import org.apache.solr.schema.AbstractEnumField;
import org.apache.solr.schema.BoolField;
import org.apache.solr.schema.LatLonPointSpatialField;
import org.apache.solr.schema.NumberType;
import org.apache.solr.schema.SchemaField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A helper class of {@link org.apache.solr.search.SolrIndexSearcher} for stored Document related matters
 * including DocValue substitutions.
 */
public class SolrDocumentFetcher {

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  private final SolrIndexSearcher searcher;

  private final boolean enableLazyFieldLoading;

  private final SolrCache documentCache;

  private final Set allStored;

  private final Set dvsCanSubstituteStored;

  /** Contains the names/patterns of all docValues=true,stored=false fields in the schema. */
  private final Set allNonStoredDVs;

  /** Contains the names/patterns of all docValues=true,stored=false,useDocValuesAsStored=true fields in the schema. */
  private final Set nonStoredDVsUsedAsStored;

  /** Contains the names/patterns of all docValues=true,stored=false fields, excluding those that are copyField targets in the schema. */
  private final Set nonStoredDVsWithoutCopyTargets;

  private static int largeValueLengthCacheThreshold = Integer.getInteger("solr.largeField.cacheThreshold", 512 * 1024); // internal setting

  private final Set largeFields;

  private Collection storedHighlightFieldNames; // lazy populated; use getter

  @SuppressWarnings({"unchecked"})
  SolrDocumentFetcher(SolrIndexSearcher searcher, SolrConfig solrConfig, boolean cachingEnabled) {
    this.searcher = searcher;
    this.enableLazyFieldLoading = solrConfig.enableLazyFieldLoading;
    if (cachingEnabled) {
      documentCache = solrConfig.documentCacheConfig == null ? null : solrConfig.documentCacheConfig.newInstance();
    } else {
      documentCache = null;
    }

    final Set nonStoredDVsUsedAsStored = new HashSet<>();
    final Set allNonStoredDVs = new HashSet<>();
    final Set nonStoredDVsWithoutCopyTargets = new HashSet<>();
    final Set storedLargeFields = new HashSet<>();
    final Set dvsCanSubstituteStored = new HashSet<>();
    final Set allStoreds = new HashSet<>();

    for (FieldInfo fieldInfo : searcher.getFieldInfos()) { // can find materialized dynamic fields, unlike using the Solr IndexSchema.
      final SchemaField schemaField = searcher.getSchema().getFieldOrNull(fieldInfo.name);
      if (schemaField == null) {
        continue;
      }
      if (canSubstituteDvForStored(fieldInfo, schemaField)) {
        dvsCanSubstituteStored.add(fieldInfo.name);
      }
      if (schemaField.stored()) {
        allStoreds.add(fieldInfo.name);
      }
      if (!schemaField.stored() && schemaField.hasDocValues()) {
        if (schemaField.useDocValuesAsStored()) {
          nonStoredDVsUsedAsStored.add(fieldInfo.name);
        }
        allNonStoredDVs.add(fieldInfo.name);
        if (!searcher.getSchema().isCopyFieldTarget(schemaField)) {
          nonStoredDVsWithoutCopyTargets.add(fieldInfo.name);
        }
      }
      if (schemaField.stored() && schemaField.isLarge()) {
        storedLargeFields.add(schemaField.getName());
      }
    }

    this.nonStoredDVsUsedAsStored = Collections.unmodifiableSet(nonStoredDVsUsedAsStored);
    this.allNonStoredDVs = Collections.unmodifiableSet(allNonStoredDVs);
    this.nonStoredDVsWithoutCopyTargets = Collections.unmodifiableSet(nonStoredDVsWithoutCopyTargets);
    this.largeFields = Collections.unmodifiableSet(storedLargeFields);
    this.dvsCanSubstituteStored = Collections.unmodifiableSet(dvsCanSubstituteStored);
    this.allStored = Collections.unmodifiableSet(allStoreds);
  }

  // Does this field have both stored=true and docValues=true and is otherwise
  // eligible for getting the field's value from DV?
  private boolean canSubstituteDvForStored(FieldInfo fieldInfo, SchemaField schemaField) {
    if (!schemaField.hasDocValues() || !schemaField.stored()) return false;
    if (schemaField.multiValued()) return false;
    DocValuesType docValuesType = fieldInfo.getDocValuesType();
    NumberType numberType = schemaField.getType().getNumberType();
    // can not decode a numeric without knowing its numberType
    if (numberType == null && (docValuesType == DocValuesType.SORTED_NUMERIC || docValuesType == DocValuesType.NUMERIC)) {
      return false;
    }
    return true;
  }

  public boolean isLazyFieldLoadingEnabled() {
    return enableLazyFieldLoading;
  }

  public SolrCache getDocumentCache() {
    return documentCache;
  }

  /**
   * Returns a collection of the names of all stored fields which can be highlighted the index reader knows about.
   */
  public Collection getStoredHighlightFieldNames() {
    synchronized (this) {
      if (storedHighlightFieldNames == null) {
        storedHighlightFieldNames = new LinkedList<>();
        for (FieldInfo fieldInfo : searcher.getFieldInfos()) {
          final String fieldName = fieldInfo.name;
          try {
            SchemaField field = searcher.getSchema().getField(fieldName);
            if (field.stored() && ((field.getType() instanceof org.apache.solr.schema.TextField)
                || (field.getType() instanceof org.apache.solr.schema.StrField))) {
              storedHighlightFieldNames.add(fieldName);
            }
          } catch (RuntimeException e) { // getField() throws a SolrException, but it arrives as a RuntimeException
            log.warn("Field [{}] found in index, but not defined in schema.", fieldName);
          }
        }
      }
      return storedHighlightFieldNames;
    }
  }

  /** @see SolrIndexSearcher#doc(int) */
  public Document doc(int docId) throws IOException {
    return doc(docId, (Set) null);
  }

  /**
   * Retrieve the {@link Document} instance corresponding to the document id.
   * 
   * NOTE: the document will have all fields accessible, but if a field filter is provided, only the provided
   * fields will be loaded (the remainder will be available lazily).
   *
   * @see SolrIndexSearcher#doc(int, Set)
   */
  public Document doc(int i, Set fields) throws IOException {
    Document d;
    if (documentCache != null) {
      final Set getFields = enableLazyFieldLoading ? fields : null;
      AtomicReference exceptionRef = new AtomicReference<>();
      d = documentCache.computeIfAbsent(i, docId -> {
        try {
          return docNC(docId, getFields);
        } catch (IOException e) {
          exceptionRef.set(e);
          return null;
        }
      });
      if (exceptionRef.get() != null) {
        throw exceptionRef.get();
      }
      if (d == null) {
        // failed to retrieve due to an earlier exception, try again?
        return docNC(i, fields);
      } else {
        return d;
      }
    } else {
      return docNC(i, fields);
    }
  }

  private Document docNC(int i, Set fields) throws IOException {
    final DirectoryReader reader = searcher.getIndexReader();
    final SolrDocumentStoredFieldVisitor visitor = new SolrDocumentStoredFieldVisitor(fields, reader, i);
    reader.document(i, visitor);
    return visitor.getDocument();
  }

  /**
   * This is an optimized version for populating a SolrDocument that:
   *
   * 1. fetches all fields from docValues if possible. If no decompression of the stored
   * data is necessary, we can avoid a disk seek and decompression cycle.
   * This step is only used if all requested fields are
   * {code docValues=true stored=false multiValued=false}.
   * This last restriction because multiValued docValues fields do not faithfully reflect
   * the input order in all cases. the values are returned and no decompression is necessary.
   *
   * 2. if 1 is impossible, try to fetch all requested fields from the stored values. If
   * the stored data has to be decompressed anyway, it's more efficient to
   * just get all field values from the stored values. If we got all the requested fields, return.
   *
   * 3. add fields where docValues=true stored=false thus could not be fetched in step 2
   *
   * @param luceneDocId       The Lucene doc ID
   * @param solrReturnFields  the structure holding the fields to be returned.
   *                          The first time this method is called for a particular
   *                          document list, it will be modified by adding a
   *                          RetrieveFieldsOptimizer for use in future calls.
   *
   * @return The SolrDocument with values requested.
   * 

   * This method is designed to be as simple as possible to use, just call it. e.g.
   * {code SolrDocument sdoc = docFetcher.solrDoc(id, solrReturnFields);}
   * then process the resulting SolrDocument as usual. Subsequent calls with the same
   * solrReturnFields will re-use the optimizer created the first time.
   *
   * NOTE: DO NOT re-use the same SolrReturnFields object if the fields requested change.
   */

  public SolrDocument solrDoc(int luceneDocId, SolrReturnFields solrReturnFields) {
    Supplier rfoSupplier = () -> new RetrieveFieldsOptimizer(solrReturnFields);
    return solrReturnFields.getFetchOptimizer(rfoSupplier).getSolrDoc(luceneDocId);
  }

  /** {@link StoredFieldVisitor} which loads the specified fields eagerly (or all if null).
   * If {@link #enableLazyFieldLoading} then the rest get special lazy field entries.  Designated "large"
   * fields will always get a special field entry. */
  private class SolrDocumentStoredFieldVisitor extends DocumentStoredFieldVisitor {
    private final Document doc;
    private final LazyDocument lazyFieldProducer; // arguably a better name than LazyDocument; at least how we use it here
    private final int docId;
    private final boolean addLargeFieldsLazily;

    SolrDocumentStoredFieldVisitor(Set toLoad, IndexReader reader, int docId) {
      super(toLoad);
      this.docId = docId;
      this.doc = getDocument();
      this.lazyFieldProducer = toLoad != null && enableLazyFieldLoading ? new LazyDocument(reader, docId) : null;
      this.addLargeFieldsLazily = (documentCache != null && !largeFields.isEmpty());
      //TODO can we return Status.STOP after a val is loaded and we know there are no other fields of interest?
      //    When: toLoad is one single-valued field, no lazyFieldProducer
    }


    @Override
    public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
      Predicate readAsBytes = ResultContext.READASBYTES.get();
      if (readAsBytes != null && readAsBytes.test(fieldInfo.name)) {
        final FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.setStoreTermVectors(fieldInfo.hasVectors());
        ft.setOmitNorms(fieldInfo.omitsNorms());
        ft.setIndexOptions(fieldInfo.getIndexOptions());
        doc.add(new StoredField(fieldInfo.name, new ByteArrayUtf8CharSequence(value, 0, value.length), ft));
      } else {
        super.stringField(fieldInfo, value);
      }

    }

    @Override
    public Status needsField(FieldInfo fieldInfo) throws IOException {
      Status status = super.needsField(fieldInfo);
      assert status != Status.STOP : "Status.STOP not supported or expected";
      if (addLargeFieldsLazily && largeFields.contains(fieldInfo.name)) { // load "large" fields using this lazy mechanism
        if (lazyFieldProducer != null || status == Status.YES) {
          doc.add(new LargeLazyField(fieldInfo.name, docId));
        }
        return Status.NO;
      }
      if (status == Status.NO && lazyFieldProducer != null) { // lazy
        doc.add(lazyFieldProducer.getField(fieldInfo));
      }
      return status;
    }
  }

  /** @see SolrIndexSearcher#doc(int, StoredFieldVisitor) */
  public void doc(int docId, StoredFieldVisitor visitor) throws IOException {
    if (documentCache != null) {
      // get cached document or retrieve it including all fields (and cache it)
      Document cached = doc(docId);
      visitFromCached(cached, visitor);
    } else {
      searcher.getIndexReader().document(docId, visitor);
    }
  }

  /** Executes a stored field visitor against a hit from the document cache */
  private void visitFromCached(Document document, StoredFieldVisitor visitor) throws IOException {
    for (IndexableField f : document) {
      final FieldInfo info = searcher.getFieldInfos().fieldInfo(f.name());
      final StoredFieldVisitor.Status needsField = visitor.needsField(info);
      if (needsField == StoredFieldVisitor.Status.STOP) return;
      if (needsField == StoredFieldVisitor.Status.NO) continue;
      BytesRef binaryValue = f.binaryValue();
      if (binaryValue != null) {
        visitor.binaryField(info, toByteArrayUnwrapIfPossible(binaryValue));
        continue;
      }
      Number numericValue = f.numericValue();
      if (numericValue != null) {
        if (numericValue instanceof Double) {
          visitor.doubleField(info, numericValue.doubleValue());
        } else if (numericValue instanceof Integer) {
          visitor.intField(info, numericValue.intValue());
        } else if (numericValue instanceof Float) {
          visitor.floatField(info, numericValue.floatValue());
        } else if (numericValue instanceof Long) {
          visitor.longField(info, numericValue.longValue());
        } else {
          throw new AssertionError();
        }
        continue;
      }
      // must be String
      if (f instanceof LargeLazyField) { // optimization to avoid premature string conversion
        visitor.stringField(info, toByteArrayUnwrapIfPossible(((LargeLazyField) f).readBytes()));
      } else {
        visitor.stringField(info, f.stringValue().getBytes(StandardCharsets.UTF_8));
      }
    }
  }

  private byte[] toByteArrayUnwrapIfPossible(BytesRef bytesRef) {
    if (bytesRef.offset == 0 && bytesRef.bytes.length == bytesRef.length) {
      return bytesRef.bytes;
    } else {
      return Arrays.copyOfRange(bytesRef.bytes, bytesRef.offset, bytesRef.offset + bytesRef.length);
    }
  }

  /** Unlike LazyDocument.LazyField, we (a) don't cache large values, and (b) provide access to the byte[]. */
  class LargeLazyField implements IndexableField {

    final String name;
    final int docId;
    // synchronize on 'this' to access:
    BytesRef cachedBytes; // we only conditionally populate this if it's big enough

    private LargeLazyField(String name, int docId) {
      this.name = name;
      this.docId = docId;
    }

    @Override
    public String toString() {
      return fieldType().toString() + "<" + name() + ">"; // mimic Field.java
    }

    @Override
    public String name() {
      return name;
    }

    @Override
    public IndexableFieldType fieldType() {
      return searcher.getSchema().getField(name());
    }

    @Override
    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
      return analyzer.tokenStream(name(), stringValue()); // or we could throw unsupported exception?
    }

    /** (for tests) */
    synchronized boolean hasBeenLoaded() {
      return cachedBytes != null;
    }

    @Override
    public synchronized String stringValue() {
      try {
        return readBytes().utf8ToString();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    synchronized BytesRef readBytes() throws IOException {
      if (cachedBytes != null) {
        return cachedBytes;
      } else {
        BytesRef bytesRef = new BytesRef();
        searcher.getIndexReader().document(docId, new StoredFieldVisitor() {
          boolean done = false;

          @Override
          public Status needsField(FieldInfo fieldInfo) throws IOException {
            if (done) {
              return Status.STOP;
            }
            return fieldInfo.name.equals(name()) ? Status.YES : Status.NO;
          }

          @Override
          public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
            bytesRef.bytes = value;
            bytesRef.length = value.length;
            done = true;
          }

          @Override
          public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
            throw new UnsupportedOperationException("'large' binary fields are not (yet) supported");
          }
        });
        if (bytesRef.length < largeValueLengthCacheThreshold) {
          return cachedBytes = bytesRef;
        } else {
          return bytesRef;
        }
      }
    }

    @Override
    public BytesRef binaryValue() {
      return null;
    }

    @Override
    public Reader readerValue() {
      return null;
    }

    @Override
    public Number numericValue() {
      return null;
    }
  }

  /**
   * This will fetch and add the docValues fields to a given SolrDocument/SolrInputDocument
   *
   * @param doc
   *          A SolrDocument or SolrInputDocument instance where docValues will be added
   * @param docid
   *          The lucene docid of the document to be populated
   * @param fields
   *          The fields with docValues to populate the document with.
   *          DocValues fields which do not exist or not decodable will be ignored.
   */
  public void decorateDocValueFields(@SuppressWarnings("rawtypes") SolrDocumentBase doc, int docid, Set fields)
      throws IOException {
    final List leafContexts = searcher.getLeafContexts();
    final int subIndex = ReaderUtil.subIndex(docid, leafContexts);
    final int localId = docid - leafContexts.get(subIndex).docBase;
    final LeafReader leafReader = leafContexts.get(subIndex).reader();
    for (String fieldName : fields) {
      Object fieldValue = decodeDVField(localId, leafReader, fieldName);
      if (fieldValue != null) {
        doc.setField(fieldName, fieldValue);
      }
    }
  }

  /**
   * Decode value from DV field for a document
   * @return null if DV field is not exist or can not decodable
   */
  private Object decodeDVField(int localId, LeafReader leafReader, String fieldName) throws IOException {
    final SchemaField schemaField = searcher.getSchema().getFieldOrNull(fieldName);
    FieldInfo fi = searcher.getFieldInfos().fieldInfo(fieldName);
    if (schemaField == null || !schemaField.hasDocValues() || fi == null) {
      return null; // Searcher doesn't have info about this field, hence ignore it.
    }

    final DocValuesType dvType = fi.getDocValuesType();
    switch (dvType) {
      case NUMERIC:
        final NumericDocValues ndv = leafReader.getNumericDocValues(fieldName);
        if (ndv == null) {
          return null;
        }
        if (!ndv.advanceExact(localId)) {
          return null;
        }
        Long val = ndv.longValue();
        return decodeNumberFromDV(schemaField, val, false);
      case BINARY:
        BinaryDocValues bdv = leafReader.getBinaryDocValues(fieldName);
        if (bdv != null && bdv.advanceExact(localId)) {
          return BytesRef.deepCopyOf(bdv.binaryValue());
        }
        return null;
      case SORTED:
        SortedDocValues sdv = leafReader.getSortedDocValues(fieldName);
        if (sdv != null && sdv.advanceExact(localId)) {
          final BytesRef bRef = sdv.binaryValue();
          // Special handling for Boolean fields since they're stored as 'T' and 'F'.
          if (schemaField.getType() instanceof BoolField) {
            return schemaField.getType().toObject(schemaField, bRef);
          } else {
            return bRef.utf8ToString();
          }
        }
        return null;
      case SORTED_NUMERIC:
        final SortedNumericDocValues numericDv = leafReader.getSortedNumericDocValues(fieldName);
        if (numericDv != null && numericDv.advance(localId) == localId) {
          final int docValueCount = numericDv.docValueCount();
          final List