
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.search.Sort;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Version;
/**
* A {@link LeafReader} which reads multiple, parallel indexes. Each index added must have the same
* number of documents, but typically each contains different fields. Deletions are taken from the
* first reader. Each document contains the union of the fields of all documents with the same
* document number. When searching, matches for a query term are from the first index added that has
* the field.
*
* <p>This is useful, e.g., with collections that have large fields which change rarely and small
* fields that change more frequently. The smaller fields may be re-indexed in a new index and both
* indexes may be searched together.
*
* <p><em>Warning:</em> It is up to you to make sure all indexes are created and modified the same
* way. For example, if you add documents to one index, you need to add the same documents in the
* same order to the other indexes. Failure to do so will result in undefined behavior.
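*
* <p>A minimal usage sketch (assumes {@code dir1} and {@code dir2} each hold a single-segment
* index whose documents were added in the same order; the names are illustrative only):
*
* <pre>{@code
* DirectoryReader big = DirectoryReader.open(dir1);   // large, rarely changing fields
* DirectoryReader small = DirectoryReader.open(dir2); // small, frequently re-indexed fields
* LeafReader parallel =
*     new ParallelLeafReader(
*         big.leaves().get(0).reader(), small.leaves().get(0).reader());
* // 'parallel' exposes the union of the fields of both readers
* }</pre>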
*/
public class ParallelLeafReader extends LeafReader {
private final FieldInfos fieldInfos;
private final LeafReader[] parallelReaders, storedFieldsReaders;
private final Set<LeafReader> completeReaderSet =
Collections.newSetFromMap(new IdentityHashMap<LeafReader, Boolean>());
private final boolean closeSubReaders;
private final int maxDoc, numDocs;
private final boolean hasDeletions;
private final LeafMetaData metaData;
private final SortedMap<String, LeafReader> tvFieldToReader = new TreeMap<>();
private final SortedMap<String, LeafReader> fieldToReader = new TreeMap<>(); // TODO needn't sort?
private final Map<String, LeafReader> termsFieldToReader = new HashMap<>();
/**
* Create a ParallelLeafReader based on the provided readers; auto-closes the given readers on
* {@link #close()}.
*/
public ParallelLeafReader(LeafReader... readers) throws IOException {
this(true, readers);
}
/** Create a ParallelLeafReader based on the provided readers. */
public ParallelLeafReader(boolean closeSubReaders, LeafReader... readers) throws IOException {
this(closeSubReaders, readers, readers);
}
/**
* Expert: create a ParallelLeafReader based on the provided readers and storedFieldsReaders; when
* a document is loaded, only storedFieldsReaders will be used.
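*
* <p>A hypothetical sketch ({@code mainReader} and {@code auxReader} are illustrative names):
* postings and doc values are served from both readers, but stored documents are loaded only
* from the first:
*
* <pre>{@code
* LeafReader[] all = new LeafReader[] {mainReader, auxReader};
* LeafReader[] storedOnly = new LeafReader[] {mainReader};
* LeafReader reader = new ParallelLeafReader(false, all, storedOnly);
* }</pre>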
*/
public ParallelLeafReader(
boolean closeSubReaders, LeafReader[] readers, LeafReader[] storedFieldsReaders)
throws IOException {
this.closeSubReaders = closeSubReaders;
if (readers.length == 0 && storedFieldsReaders.length > 0)
throw new IllegalArgumentException(
"There must be at least one main reader if storedFieldsReaders are used.");
this.parallelReaders = readers.clone();
this.storedFieldsReaders = storedFieldsReaders.clone();
if (parallelReaders.length > 0) {
final LeafReader first = parallelReaders[0];
this.maxDoc = first.maxDoc();
this.numDocs = first.numDocs();
this.hasDeletions = first.hasDeletions();
} else {
this.maxDoc = this.numDocs = 0;
this.hasDeletions = false;
}
Collections.addAll(completeReaderSet, this.parallelReaders);
Collections.addAll(completeReaderSet, this.storedFieldsReaders);
// check compatibility:
for (LeafReader reader : completeReaderSet) {
if (reader.maxDoc() != maxDoc) {
throw new IllegalArgumentException(
"All readers must have same maxDoc: " + maxDoc + "!=" + reader.maxDoc());
}
}
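// pick up the soft-deletes and parent-document field names, if any reader declares them: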
final String softDeletesField =
completeReaderSet.stream()
.map(r -> r.getFieldInfos().getSoftDeletesField())
.filter(Objects::nonNull)
.findAny()
.orElse(null);
final String parentField =
completeReaderSet.stream()
.map(r -> r.getFieldInfos().getParentField())
.filter(Objects::nonNull)
.findAny()
.orElse(null);
// TODO: make this read-only in a cleaner way?
FieldInfos.Builder builder =
new FieldInfos.Builder(new FieldInfos.FieldNumbers(softDeletesField, parentField));
Sort indexSort = null;
int createdVersionMajor = -1;
// build FieldInfos and fieldToReader map:
for (final LeafReader reader : this.parallelReaders) {
LeafMetaData leafMetaData = reader.getMetaData();
Sort leafIndexSort = leafMetaData.sort();
if (indexSort == null) {
indexSort = leafIndexSort;
} else if (leafIndexSort != null && indexSort.equals(leafIndexSort) == false) {
throw new IllegalArgumentException(
"cannot combine LeafReaders that have different index sorts: saw both sort="
+ indexSort
+ " and "
+ leafIndexSort);
}
if (createdVersionMajor == -1) {
createdVersionMajor = leafMetaData.createdVersionMajor();
} else if (createdVersionMajor != leafMetaData.createdVersionMajor()) {
throw new IllegalArgumentException(
"cannot combine LeafReaders that have different creation versions: saw both version="
+ createdVersionMajor
+ " and "
+ leafMetaData.createdVersionMajor());
}
final FieldInfos readerFieldInfos = reader.getFieldInfos();
for (FieldInfo fieldInfo : readerFieldInfos) {
// NOTE: first reader having a given field "wins":
if (!fieldToReader.containsKey(fieldInfo.name)) {
builder.add(fieldInfo, fieldInfo.getDocValuesGen());
fieldToReader.put(fieldInfo.name, reader);
// only add these if the reader responsible for that field name is the current:
// TODO consider populating 1st leaf with vectors even if the field name has been seen on
// a previous leaf
if (fieldInfo.hasTermVectors()) {
tvFieldToReader.put(fieldInfo.name, reader);
}
// TODO consider populating 1st leaf with terms even if the field name has been seen on a
// previous leaf
if (fieldInfo.getIndexOptions() != IndexOptions.NONE) {
termsFieldToReader.put(fieldInfo.name, reader);
}
}
}
}
if (createdVersionMajor == -1) {
// empty reader
createdVersionMajor = Version.LATEST.major;
}
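// compute the minimum index version across all leaves (null if unknown for any leaf) and whether any leaf contains document blocks: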
Version minVersion = Version.LATEST;
boolean hasBlocks = false;
for (final LeafReader reader : this.parallelReaders) {
Version leafVersion = reader.getMetaData().minVersion();
hasBlocks |= reader.getMetaData().hasBlocks();
if (leafVersion == null) {
minVersion = null;
break;
} else if (minVersion.onOrAfter(leafVersion)) {
minVersion = leafVersion;
}
}
fieldInfos = builder.finish();
this.metaData = new LeafMetaData(createdVersionMajor, minVersion, indexSort, hasBlocks);
// do this last so any exceptions that occurred earlier don't affect refcounts:
for (LeafReader reader : completeReaderSet) {
if (!closeSubReaders) {
reader.incRef();
}
reader.registerParentReader(this);
}
}
@Override
public String toString() {
final StringBuilder buffer = new StringBuilder("ParallelLeafReader(");
for (final Iterator<LeafReader> iter = completeReaderSet.iterator(); iter.hasNext(); ) {
buffer.append(iter.next());
if (iter.hasNext()) buffer.append(", ");
}
return buffer.append(')').toString();
}
// Single instance of this, per ParallelReader instance
private static final class ParallelFields extends Fields {
final Map<String, Terms> fields = new TreeMap<>();
ParallelFields() {}
void addField(String fieldName, Terms terms) {
fields.put(fieldName, terms);
}
@Override
public Iterator<String> iterator() {
return Collections.unmodifiableSet(fields.keySet()).iterator();
}
@Override
public Terms terms(String field) {
return fields.get(field);
}
@Override
public int size() {
return fields.size();
}
}
/**
* {@inheritDoc}
*
* <p>NOTE: the returned field numbers will likely not correspond to the actual field numbers in
* the underlying readers, and codec metadata ({@link FieldInfo#getAttribute(String)}) will be
* unavailable.
*/
@Override
public FieldInfos getFieldInfos() {
return fieldInfos;
}
@Override
public Bits getLiveDocs() {
ensureOpen();
return hasDeletions ? parallelReaders[0].getLiveDocs() : null;
}
@Override
public Terms terms(String field) throws IOException {
ensureOpen();
LeafReader leafReader = termsFieldToReader.get(field);
return leafReader == null ? null : leafReader.terms(field);
}
@Override
public int numDocs() {
// Don't call ensureOpen() here (it could affect performance)
return numDocs;
}
@Override
public int maxDoc() {
// Don't call ensureOpen() here (it could affect performance)
return maxDoc;
}
@Override
public StoredFields storedFields() throws IOException {
ensureOpen();
StoredFields[] fields = new StoredFields[storedFieldsReaders.length];
for (int i = 0; i < fields.length; i++) {
fields[i] = storedFieldsReaders[i].storedFields();
}
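// the returned view loads each document from every storedFieldsReader, so the visitor sees the union of their stored fields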
return new StoredFields() {
@Override
public void prefetch(int docID) throws IOException {
for (StoredFields reader : fields) {
reader.prefetch(docID);
}
}
@Override
public void document(int docID, StoredFieldVisitor visitor) throws IOException {
for (StoredFields reader : fields) {
reader.document(docID, visitor);
}
}
};
}
@Override
public CacheHelper getCoreCacheHelper() {
// ParallelReader instances can be short-lived, which would make caching trappy
// so we do not cache on them, unless they wrap a single reader in which
// case we delegate
if (parallelReaders.length == 1
&& storedFieldsReaders.length == 1
&& parallelReaders[0] == storedFieldsReaders[0]) {
return parallelReaders[0].getCoreCacheHelper();
}
return null;
}
@Override
public CacheHelper getReaderCacheHelper() {
// ParallelReader instances can be short-lived, which would make caching trappy
// so we do not cache on them, unless they wrap a single reader in which
// case we delegate
if (parallelReaders.length == 1
&& storedFieldsReaders.length == 1
&& parallelReaders[0] == storedFieldsReaders[0]) {
return parallelReaders[0].getReaderCacheHelper();
}
return null;
}
@Override
public TermVectors termVectors() throws IOException {
ensureOpen();
Map<LeafReader, TermVectors> readerToTermVectors = new IdentityHashMap<>();
for (LeafReader reader : parallelReaders) {
if (reader.getFieldInfos().hasTermVectors()) {
TermVectors termVectors = reader.termVectors();
readerToTermVectors.put(reader, termVectors);
}
}
return new TermVectors() {
@Override
public void prefetch(int docID) throws IOException {
// Prefetch all vectors. Note that this may be wasteful if the consumer doesn't need to read
// all the fields but we have no way to know what fields the consumer needs.
for (TermVectors termVectors : readerToTermVectors.values()) {
termVectors.prefetch(docID);
}
}
@Override
public Fields get(int docID) throws IOException {
ParallelFields fields = null;
// fetch all term vectors for this document once per reader
for (Map.Entry<LeafReader, TermVectors> entry : readerToTermVectors.entrySet()) {
TermVectors termVectors = entry.getValue();
Fields docFields = termVectors.get(docID); // Fetch all fields at once
if (docFields != null) {
if (fields == null) {
fields = new ParallelFields();
}
// aggregate this reader's fields into the combined result
for (String fieldName : docFields) {
Terms vector = docFields.terms(fieldName);
if (vector != null) {
fields.addField(fieldName, vector);
}
}
}
}
return fields;
}
};
}
@Override
protected synchronized void doClose() throws IOException {
IOException ioe = null;
for (LeafReader reader : completeReaderSet) {
try {
if (closeSubReaders) {
reader.close();
} else {
reader.decRef();
}
} catch (IOException e) {
if (ioe == null) ioe = e;
}
}
// throw the first exception
if (ioe != null) throw ioe;
}
@Override
public NumericDocValues getNumericDocValues(String field) throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(field);
return reader == null ? null : reader.getNumericDocValues(field);
}
@Override
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(field);
return reader == null ? null : reader.getBinaryDocValues(field);
}
@Override
public SortedDocValues getSortedDocValues(String field) throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(field);
return reader == null ? null : reader.getSortedDocValues(field);
}
@Override
public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(field);
return reader == null ? null : reader.getSortedNumericDocValues(field);
}
@Override
public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(field);
return reader == null ? null : reader.getSortedSetDocValues(field);
}
@Override
public DocValuesSkipper getDocValuesSkipper(String field) throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(field);
return reader == null ? null : reader.getDocValuesSkipper(field);
}
@Override
public NumericDocValues getNormValues(String field) throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(field);
NumericDocValues values = reader == null ? null : reader.getNormValues(field);
return values;
}
@Override
public PointValues getPointValues(String fieldName) throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(fieldName);
return reader == null ? null : reader.getPointValues(fieldName);
}
@Override
public FloatVectorValues getFloatVectorValues(String fieldName) throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(fieldName);
return reader == null ? null : reader.getFloatVectorValues(fieldName);
}
@Override
public ByteVectorValues getByteVectorValues(String fieldName) throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(fieldName);
return reader == null ? null : reader.getByteVectorValues(fieldName);
}
@Override
public void searchNearestVectors(
String fieldName, float[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(fieldName);
if (reader != null) {
reader.searchNearestVectors(fieldName, target, knnCollector, acceptDocs);
}
}
@Override
public void searchNearestVectors(
String fieldName, byte[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
ensureOpen();
LeafReader reader = fieldToReader.get(fieldName);
if (reader != null) {
reader.searchNearestVectors(fieldName, target, knnCollector, acceptDocs);
}
}
@Override
public void checkIntegrity() throws IOException {
ensureOpen();
for (LeafReader reader : completeReaderSet) {
reader.checkIntegrity();
}
}
/** Returns the {@link LeafReader}s that were passed on init. */
public LeafReader[] getParallelReaders() {
ensureOpen();
return parallelReaders;
}
@Override
public LeafMetaData getMetaData() {
return metaData;
}
}