org.apache.lucene.misc.index.IndexRearranger Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-misc Show documentation
Miscellaneous Lucene extensions
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.misc.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FilterCodecReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.NamedThreadFactory;

/**
 * Copy and rearrange index according to document selectors, from input dir to output dir. Length of
 * documentSelectors determines how many segments there will be.
 *
 * Rearranging works in 3 steps: 1. Assume all docs in the original index are live and create the
 * rearranged index using the segment selectors. 2. Go through the rearranged index and apply
 * deletes requested by the deletes selector. 3. Reorder the segments to match the order of the
 * selectors and check the validity of the rearranged index.
 *
 * <p>NB: You can't produce segments that only contain deletes. If you select all documents in a
 * segment for deletion, the entire segment will be discarded.
 *
 * <p>Example use case: You are testing search performance after a change to indexing. You can
 * index
 * the same content using the old and new indexers and then rearrange one of them to the shape of
 * the other. Using rearrange will give more accurate measurements, since you will not be
 * introducing noise from index geometry.
 *
 * TODO: another possible (faster) approach to do this is to manipulate FlushPolicy and
 * MergePolicy at indexing time to create small desired segments first and merge them accordingly
 * for details please see: https://markmail.org/message/lbtdntclpnocmfuf
 *
 * @lucene.experimental
 */
public class IndexRearranger {
  protected final Directory input, output;
  protected final IndexWriterConfig config;

  // Each of these selectors will produce a segment in the rearranged index.
  // The segments will appear in the index in the order of the selectors that produced them.
  protected final List<DocumentSelector> segmentSelectors;

  // Documents selected here will be marked for deletion in the rearranged index, but not merged
  // away.
  protected final DocumentSelector deletedDocsSelector;

  /**
   * All args constructor
   *
   * @param input input dir
   * @param output output dir
   * @param config index writer config
   * @param segmentSelectors specify which documents are desired in the rearranged index segments;
   *     each selector corresponds to one segment
   * @param deletedDocsSelector specify which documents are to be marked for deletion in the
   *     rearranged index; this selector should be thread-safe
   */
  public IndexRearranger(
      Directory input,
      Directory output,
      IndexWriterConfig config,
      List<DocumentSelector> segmentSelectors,
      DocumentSelector deletedDocsSelector) {
    this.input = input;
    this.output = output;
    this.config = config;
    this.segmentSelectors = segmentSelectors;
    this.deletedDocsSelector = deletedDocsSelector;
  }

  /** Constructor with no deletes to apply */
  public IndexRearranger(
      Directory input,
      Directory output,
      IndexWriterConfig config,
      List<DocumentSelector> segmentSelectors) {
    this(input, output, config, segmentSelectors, null);
  }

  /**
   * Runs the rearrange: builds one segment per segment selector in the output directory, reorders
   * the segments to follow the selector order, then applies the requested deletes.
   *
   * <p>Only the analyzer of the supplied config is carried over to the writers used here; merging
   * is disabled on both of them.
   *
   * @throws Exception if creating the segments, reordering them, or applying deletes fails
   */
  public void execute() throws Exception {
    // A fixed thread pool cannot be created with zero threads, so keep at least one worker even
    // when no segment selectors were supplied.
    int availableProcessors = Runtime.getRuntime().availableProcessors();
    int threadCount = Math.max(1, Math.min(availableProcessors, segmentSelectors.size()));
    ExecutorService executor =
        Executors.newFixedThreadPool(threadCount, new NamedThreadFactory("rearranger"));

    try {
      IndexWriterConfig createSegmentsConfig = new IndexWriterConfig(config.getAnalyzer());
      IndexWriterConfig applyDeletesConfig = new IndexWriterConfig(config.getAnalyzer());

      // Do not merge - each addIndexes call creates one segment
      createSegmentsConfig.setMergePolicy(NoMergePolicy.INSTANCE);
      applyDeletesConfig.setMergePolicy(NoMergePolicy.INSTANCE);

      try (IndexWriter writer = new IndexWriter(output, createSegmentsConfig);
          IndexReader reader = DirectoryReader.open(input)) {
        createRearrangedIndex(writer, reader, segmentSelectors, executor);
      }
      finalizeRearrange(output, segmentSelectors);

      try (IndexWriter writer = new IndexWriter(output, applyDeletesConfig);
          IndexReader reader = DirectoryReader.open(writer)) {
        applyDeletes(writer, reader, deletedDocsSelector, executor);
      }
    } finally {
      // Always release the worker threads, including when one of the steps above has thrown.
      executor.shutdown();
    }
  }

  /**
   * Place segments in the order of their respective selectors and ensure the rearrange was
   * performed correctly.
   *
   * @param output directory holding the freshly rearranged index
   * @param segmentSelectors selectors that produced the segments, in the desired segment order
   * @throws IOException if the index cannot be read or the reordered commit cannot be written
   * @throws IllegalStateException if a selector matches documents in more than one segment
   */
  private static void finalizeRearrange(Directory output, List<DocumentSelector> segmentSelectors)
      throws IOException {
    List<SegmentCommitInfo> ordered = new ArrayList<>();
    try (IndexReader reader = DirectoryReader.open(output)) {
      for (DocumentSelector ds : segmentSelectors) {
        int foundLeaf = -1;
        for (LeafReaderContext context : reader.leaves()) {
          SegmentReader sr = (SegmentReader) context.reader();
          int docFound = ds.getFilteredDocs(sr).nextSetBit(0);
          if (docFound != DocIdSetIterator.NO_MORE_DOCS) {
            // Each document can be mapped to one segment at most
            if (foundLeaf != -1) {
              throw new IllegalStateException(
                  "Document selector "
                      + ds
                      + " has matched more than 1 segments. Matched segments order: "
                      + foundLeaf
                      + ", "
                      + context.ord);
            }
            foundLeaf = context.ord;
            ordered.add(sr.getSegmentInfo());
          }
        }
        assert foundLeaf != -1 : "Document selector " + ds + " has not matched any segment";
      }
    }
    // Rewrite the latest commit so that it lists the segments in selector order.
    SegmentInfos sis = SegmentInfos.readLatestCommit(output);
    sis.clear();
    sis.addAll(ordered);
    sis.commit(output);
  }

  /**
   * Create the rearranged index as described by the segment selectors. Assume all documents in the
   * original index are live.
   *
   * <p>One task per selector is submitted to the executor; this method returns once all of them
   * have completed.
   *
   * @throws ExecutionException if any of the submitted tasks has failed
   * @throws InterruptedException if the calling thread is interrupted while waiting
   */
  private static void createRearrangedIndex(
      IndexWriter writer,
      IndexReader reader,
      List<DocumentSelector> selectors,
      ExecutorService executor)
      throws ExecutionException, InterruptedException {

    List<Future<Void>> futures = new ArrayList<>();
    for (DocumentSelector selector : selectors) {
      Callable<Void> addSegment =
          () -> {
            addOneSegment(writer, reader, selector);
            return null;
          };
      futures.add(executor.submit(addSegment));
    }

    for (Future<Void> future : futures) {
      future.get();
    }
  }

  /**
   * Adds the documents picked by one selector to the writer with a single addIndexes call, wrapping
   * every leaf of the source reader so that only the selected documents appear live.
   */
  private static void addOneSegment(
      IndexWriter writer, IndexReader reader, DocumentSelector selector) throws IOException {
    CodecReader[] readers = new CodecReader[reader.leaves().size()];
    for (LeafReaderContext context : reader.leaves()) {
      readers[context.ord] =
          new DocSelectorFilteredCodecReader((CodecReader) context.reader(), selector);
    }
    writer.addIndexes(readers);
  }

  /**
   * Marks the documents picked by the selector as deleted, processing every segment of the reader
   * as a separate task on the executor. Does nothing when the selector is null.
   *
   * @throws ExecutionException if any of the submitted tasks has failed
   * @throws InterruptedException if the calling thread is interrupted while waiting
   */
  private static void applyDeletes(
      IndexWriter writer, IndexReader reader, DocumentSelector selector, ExecutorService executor)
      throws ExecutionException, InterruptedException {
    if (selector == null) {
      // There are no deletes to be applied
      return;
    }

    List<Future<Void>> futures = new ArrayList<>();
    for (LeafReaderContext context : reader.leaves()) {
      Callable<Void> applyDeletesToSegment =
          () -> {
            applyDeletesToOneSegment(writer, (CodecReader) context.reader(), selector);
            return null;
          };
      futures.add(executor.submit(applyDeletesToSegment));
    }

    for (Future<Void> future : futures) {
      future.get();
    }
  }

  /**
   * Deletes, one by one, every document of the segment that the selector has picked.
   *
   * @throws IllegalStateException if the writer refuses one of the deletes
   */
  private static void applyDeletesToOneSegment(
      IndexWriter writer, CodecReader segmentReader, DocumentSelector selector) throws IOException {
    Bits deletedDocs = selector.getFilteredDocs(segmentReader);
    for (int docid = 0; docid < segmentReader.maxDoc(); ++docid) {
      if (deletedDocs.get(docid)) {
        if (writer.tryDeleteDocument(segmentReader, docid) == -1) {
          throw new IllegalStateException(
              "tryDeleteDocument has failed. This should never happen, since merging is disabled.");
        }
      }
    }
  }

  /**
   * View over a CodecReader that reports the documents picked by a selector as its live documents.
   */
  private static class DocSelectorFilteredCodecReader extends FilterCodecReader {

    private final BitSet filteredLiveDocs;
    private final int numDocs;

    public DocSelectorFilteredCodecReader(CodecReader in, DocumentSelector selector)
        throws IOException {
      super(in);
      filteredLiveDocs = selector.getFilteredDocs(in);
      numDocs = filteredLiveDocs.cardinality();
    }

    @Override
    public int numDocs() {
      return numDocs;
    }

    @Override
    public Bits getLiveDocs() {
      return filteredLiveDocs;
    }

    @Override
    public CacheHelper getCoreCacheHelper() {
      return in.getCoreCacheHelper();
    }

    @Override
    public CacheHelper getReaderCacheHelper() {
      // No reader-level cache helper is exposed for this filtered view.
      return null;
    }
  }

  /** Select document within a CodecReader */
  public interface DocumentSelector {
    /**
     * Returns the documents of the given reader that this selector picks, as a bit set indexed by
     * doc id.
     *
     * @param reader segment to select documents from
     * @return bit set with one set bit per selected document
     * @throws IOException if reading from the segment fails
     */
    BitSet getFilteredDocs(CodecReader reader) throws IOException;
  }
}