All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.index.ParallelLeafReader Maven / Gradle / Ivy

There is a newer version: 6.4.2_1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.search.Sort;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Version;

/**
 * An {@link LeafReader} which reads multiple, parallel indexes. Each index added must have the same
 * number of documents, but typically each contains different fields. Deletions are taken from the
 * first reader. Each document contains the union of the fields of all documents with the same
 * document number. When searching, matches for a query term are from the first index added that has
 * the field.
 *
 * 

This is useful, e.g., with collections that have large fields which change rarely and small * fields that change more frequently. The smaller fields may be re-indexed in a new index and both * indexes may be searched together. * *

Warning: It is up to you to make sure all indexes are created and modified * the same way. For example, if you add documents to one index, you need to add the same documents * in the same order to the other indexes. Failure to do so will result in undefined * behavior. */ public class ParallelLeafReader extends LeafReader { private final FieldInfos fieldInfos; private final LeafReader[] parallelReaders, storedFieldsReaders; private final Set completeReaderSet = Collections.newSetFromMap(new IdentityHashMap()); private final boolean closeSubReaders; private final int maxDoc, numDocs; private final boolean hasDeletions; private final LeafMetaData metaData; private final SortedMap tvFieldToReader = new TreeMap<>(); private final SortedMap fieldToReader = new TreeMap<>(); // TODO needn't sort? private final Map termsFieldToReader = new HashMap<>(); /** * Create a ParallelLeafReader based on the provided readers; auto-closes the given readers on * {@link #close()}. */ public ParallelLeafReader(LeafReader... readers) throws IOException { this(true, readers); } /** Create a ParallelLeafReader based on the provided readers. */ public ParallelLeafReader(boolean closeSubReaders, LeafReader... readers) throws IOException { this(closeSubReaders, readers, readers); } /** * Expert: create a ParallelLeafReader based on the provided readers and storedFieldReaders; when * a document is loaded, only storedFieldsReaders will be used. */ public ParallelLeafReader( boolean closeSubReaders, LeafReader[] readers, LeafReader[] storedFieldsReaders) throws IOException { this.closeSubReaders = closeSubReaders; if (readers.length == 0 && storedFieldsReaders.length > 0) throw new IllegalArgumentException( "There must be at least one main reader if storedFieldsReaders are used."); this.parallelReaders = readers.clone(); this.storedFieldsReaders = storedFieldsReaders.clone(); if (parallelReaders.length > 0) { final LeafReader first = parallelReaders[0]; this.maxDoc = first.maxDoc(); this.numDocs = first.numDocs(); this.hasDeletions = first.hasDeletions(); } else { this.maxDoc = this.numDocs = 0; this.hasDeletions = false; } Collections.addAll(completeReaderSet, this.parallelReaders); Collections.addAll(completeReaderSet, this.storedFieldsReaders); // check compatibility: for (LeafReader reader : completeReaderSet) { if (reader.maxDoc() != maxDoc) { throw new IllegalArgumentException( "All readers must have same maxDoc: " + maxDoc + "!=" + reader.maxDoc()); } } final String softDeletesField = completeReaderSet.stream() .map(r -> r.getFieldInfos().getSoftDeletesField()) .filter(Objects::nonNull) .findAny() .orElse(null); final String parentField = completeReaderSet.stream() .map(r -> r.getFieldInfos().getParentField()) .filter(Objects::nonNull) .findAny() .orElse(null); // TODO: make this read-only in a cleaner way? FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(softDeletesField, parentField)); Sort indexSort = null; int createdVersionMajor = -1; // build FieldInfos and fieldToReader map: for (final LeafReader reader : this.parallelReaders) { LeafMetaData leafMetaData = reader.getMetaData(); Sort leafIndexSort = leafMetaData.sort(); if (indexSort == null) { indexSort = leafIndexSort; } else if (leafIndexSort != null && indexSort.equals(leafIndexSort) == false) { throw new IllegalArgumentException( "cannot combine LeafReaders that have different index sorts: saw both sort=" + indexSort + " and " + leafIndexSort); } if (createdVersionMajor == -1) { createdVersionMajor = leafMetaData.createdVersionMajor(); } else if (createdVersionMajor != leafMetaData.createdVersionMajor()) { throw new IllegalArgumentException( "cannot combine LeafReaders that have different creation versions: saw both version=" + createdVersionMajor + " and " + leafMetaData.createdVersionMajor()); } final FieldInfos readerFieldInfos = reader.getFieldInfos(); for (FieldInfo fieldInfo : readerFieldInfos) { // NOTE: first reader having a given field "wins": if (!fieldToReader.containsKey(fieldInfo.name)) { builder.add(fieldInfo, fieldInfo.getDocValuesGen()); fieldToReader.put(fieldInfo.name, reader); // only add these if the reader responsible for that field name is the current: // TODO consider populating 1st leaf with vectors even if the field name has been seen on // a previous leaf if (fieldInfo.hasTermVectors()) { tvFieldToReader.put(fieldInfo.name, reader); } // TODO consider populating 1st leaf with terms even if the field name has been seen on a // previous leaf if (fieldInfo.getIndexOptions() != IndexOptions.NONE) { termsFieldToReader.put(fieldInfo.name, reader); } } } } if (createdVersionMajor == -1) { // empty reader createdVersionMajor = Version.LATEST.major; } Version minVersion = Version.LATEST; boolean hasBlocks = false; for (final LeafReader reader : this.parallelReaders) { Version leafVersion = reader.getMetaData().minVersion(); hasBlocks |= reader.getMetaData().hasBlocks(); if (leafVersion == null) { minVersion = null; break; } else if (minVersion.onOrAfter(leafVersion)) { minVersion = leafVersion; } } fieldInfos = builder.finish(); this.metaData = new LeafMetaData(createdVersionMajor, minVersion, indexSort, hasBlocks); // do this finally so any Exceptions occurred before don't affect refcounts: for (LeafReader reader : completeReaderSet) { if (!closeSubReaders) { reader.incRef(); } reader.registerParentReader(this); } } @Override public String toString() { final StringBuilder buffer = new StringBuilder("ParallelLeafReader("); for (final Iterator iter = completeReaderSet.iterator(); iter.hasNext(); ) { buffer.append(iter.next()); if (iter.hasNext()) buffer.append(", "); } return buffer.append(')').toString(); } // Single instance of this, per ParallelReader instance private static final class ParallelFields extends Fields { final Map fields = new TreeMap<>(); ParallelFields() {} void addField(String fieldName, Terms terms) { fields.put(fieldName, terms); } @Override public Iterator iterator() { return Collections.unmodifiableSet(fields.keySet()).iterator(); } @Override public Terms terms(String field) { return fields.get(field); } @Override public int size() { return fields.size(); } } /** * {@inheritDoc} * *

NOTE: the returned field numbers will likely not correspond to the actual field numbers in * the underlying readers, and codec metadata ({@link FieldInfo#getAttribute(String)} will be * unavailable. */ @Override public FieldInfos getFieldInfos() { return fieldInfos; } @Override public Bits getLiveDocs() { ensureOpen(); return hasDeletions ? parallelReaders[0].getLiveDocs() : null; } @Override public Terms terms(String field) throws IOException { ensureOpen(); LeafReader leafReader = termsFieldToReader.get(field); return leafReader == null ? null : leafReader.terms(field); } @Override public int numDocs() { // Don't call ensureOpen() here (it could affect performance) return numDocs; } @Override public int maxDoc() { // Don't call ensureOpen() here (it could affect performance) return maxDoc; } @Override public StoredFields storedFields() throws IOException { ensureOpen(); StoredFields[] fields = new StoredFields[storedFieldsReaders.length]; for (int i = 0; i < fields.length; i++) { fields[i] = storedFieldsReaders[i].storedFields(); } return new StoredFields() { @Override public void prefetch(int docID) throws IOException { for (StoredFields reader : fields) { reader.prefetch(docID); } } @Override public void document(int docID, StoredFieldVisitor visitor) throws IOException { for (StoredFields reader : fields) { reader.document(docID, visitor); } } }; } @Override public CacheHelper getCoreCacheHelper() { // ParallelReader instances can be short-lived, which would make caching trappy // so we do not cache on them, unless they wrap a single reader in which // case we delegate if (parallelReaders.length == 1 && storedFieldsReaders.length == 1 && parallelReaders[0] == storedFieldsReaders[0]) { return parallelReaders[0].getCoreCacheHelper(); } return null; } @Override public CacheHelper getReaderCacheHelper() { // ParallelReader instances can be short-lived, which would make caching trappy // so we do not cache on them, unless they wrap a single reader in which // case we delegate if (parallelReaders.length == 1 && storedFieldsReaders.length == 1 && parallelReaders[0] == storedFieldsReaders[0]) { return parallelReaders[0].getReaderCacheHelper(); } return null; } @Override public TermVectors termVectors() throws IOException { ensureOpen(); Map readerToTermVectors = new IdentityHashMap<>(); for (LeafReader reader : parallelReaders) { if (reader.getFieldInfos().hasTermVectors()) { TermVectors termVectors = reader.termVectors(); readerToTermVectors.put(reader, termVectors); } } return new TermVectors() { @Override public void prefetch(int docID) throws IOException { // Prefetch all vectors. Note that this may be wasteful if the consumer doesn't need to read // all the fields but we have no way to know what fields the consumer needs. for (TermVectors termVectors : readerToTermVectors.values()) { termVectors.prefetch(docID); } } @Override public Fields get(int docID) throws IOException { ParallelFields fields = null; // Step 2: Fetch all term vectors once per reader for (Map.Entry entry : readerToTermVectors.entrySet()) { TermVectors termVectors = entry.getValue(); Fields docFields = termVectors.get(docID); // Fetch all fields at once if (docFields != null) { if (fields == null) { fields = new ParallelFields(); } // Step 3: Aggregate only required fields for (String fieldName : docFields) { Terms vector = docFields.terms(fieldName); if (vector != null) { fields.addField(fieldName, vector); } } } } return fields; } }; } @Override protected synchronized void doClose() throws IOException { IOException ioe = null; for (LeafReader reader : completeReaderSet) { try { if (closeSubReaders) { reader.close(); } else { reader.decRef(); } } catch (IOException e) { if (ioe == null) ioe = e; } } // throw the first exception if (ioe != null) throw ioe; } @Override public NumericDocValues getNumericDocValues(String field) throws IOException { ensureOpen(); LeafReader reader = fieldToReader.get(field); return reader == null ? null : reader.getNumericDocValues(field); } @Override public BinaryDocValues getBinaryDocValues(String field) throws IOException { ensureOpen(); LeafReader reader = fieldToReader.get(field); return reader == null ? null : reader.getBinaryDocValues(field); } @Override public SortedDocValues getSortedDocValues(String field) throws IOException { ensureOpen(); LeafReader reader = fieldToReader.get(field); return reader == null ? null : reader.getSortedDocValues(field); } @Override public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException { ensureOpen(); LeafReader reader = fieldToReader.get(field); return reader == null ? null : reader.getSortedNumericDocValues(field); } @Override public SortedSetDocValues getSortedSetDocValues(String field) throws IOException { ensureOpen(); LeafReader reader = fieldToReader.get(field); return reader == null ? null : reader.getSortedSetDocValues(field); } @Override public DocValuesSkipper getDocValuesSkipper(String field) throws IOException { ensureOpen(); LeafReader reader = fieldToReader.get(field); return reader == null ? null : reader.getDocValuesSkipper(field); } @Override public NumericDocValues getNormValues(String field) throws IOException { ensureOpen(); LeafReader reader = fieldToReader.get(field); NumericDocValues values = reader == null ? null : reader.getNormValues(field); return values; } @Override public PointValues getPointValues(String fieldName) throws IOException { ensureOpen(); LeafReader reader = fieldToReader.get(fieldName); return reader == null ? null : reader.getPointValues(fieldName); } @Override public FloatVectorValues getFloatVectorValues(String fieldName) throws IOException { ensureOpen(); LeafReader reader = fieldToReader.get(fieldName); return reader == null ? null : reader.getFloatVectorValues(fieldName); } @Override public ByteVectorValues getByteVectorValues(String fieldName) throws IOException { ensureOpen(); LeafReader reader = fieldToReader.get(fieldName); return reader == null ? null : reader.getByteVectorValues(fieldName); } @Override public void searchNearestVectors( String fieldName, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { ensureOpen(); LeafReader reader = fieldToReader.get(fieldName); if (reader != null) { reader.searchNearestVectors(fieldName, target, knnCollector, acceptDocs); } } @Override public void searchNearestVectors( String fieldName, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { ensureOpen(); LeafReader reader = fieldToReader.get(fieldName); if (reader != null) { reader.searchNearestVectors(fieldName, target, knnCollector, acceptDocs); } } @Override public void checkIntegrity() throws IOException { ensureOpen(); for (LeafReader reader : completeReaderSet) { reader.checkIntegrity(); } } /** Returns the {@link LeafReader}s that were passed on init. */ public LeafReader[] getParallelReaders() { ensureOpen(); return parallelReaders; } @Override public LeafMetaData getMetaData() { return metaData; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy