/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.misc.index;

import java.io.Closeable;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.Executor;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Sorter;
import org.apache.lucene.index.Sorter.DocMap;
import org.apache.lucene.index.SortingCodecReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TaskExecutor;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntroSelector;
import org.apache.lucene.util.IntroSorter;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Implementation of "recursive graph bisection", also called "bipartite graph partitioning" and
 * often abbreviated BP, an approach to doc ID assignment that aims at reducing the sum of the log
 * gap between consecutive postings. While originally targeted at reducing the size of postings,
 * this algorithm has been observed to also speed up queries significantly by clustering documents
 * that have similar sets of terms together.
 *
 * <p>This algorithm was initially described by Dhulipala et al. in "Compressing graphs and
 * inverted indexes with recursive graph bisection". This implementation takes advantage of some
 * optimizations suggested by Mackenzie et al. in "Tradeoff Options for Bipartite Graph
 * Partitioning".
 *
 * <p>Typical usage would look like this:
 *
 * <pre class="prettyprint">
 * LeafReader reader;   // reader to reorder
 * Path targetPath;     // where the reordered index will be written
 *
 * Directory targetDir = FSDirectory.open(targetPath);
 * BPIndexReorderer reorderer = new BPIndexReorderer();
 * reorderer.setFields(Collections.singleton("body"));
 * // Pass an Executor instead of null to perform the reordering concurrently
 * CodecReader reorderedReaderView =
 *     reorderer.reorder(SlowCodecReaderWrapper.wrap(reader), targetDir, null);
 * try (IndexWriter w =
 *     new IndexWriter(targetDir, new IndexWriterConfig().setOpenMode(OpenMode.CREATE))) {
 *   w.addIndexes(reorderedReaderView);
 * }
 * DirectoryReader reorderedReader = DirectoryReader.open(targetDir);
 * </pre>
 *
 * <p>Note: This is a slow operation that consumes O(maxDoc + numTerms * numThreads) memory.
 */
public final class BPIndexReorderer {

  /** Exception that is thrown when not enough RAM is available. */
  public static class NotEnoughRAMException extends RuntimeException {
    private NotEnoughRAMException(String message) {
      super(message);
    }
  }

  /** Block size for terms in the forward index */
  private static final int TERM_IDS_BLOCK_SIZE = 17;

  /** Minimum problem size that will result in tasks being split. */
  private static final int FORK_THRESHOLD = 8192;

  /** Minimum required document frequency for terms to be considered: 4,096. */
  public static final int DEFAULT_MIN_DOC_FREQ = 4096;

  /**
   * Minimum size of partitions. The algorithm will stop recursing when reaching partitions below
   * this number of documents: 32.
   */
  public static final int DEFAULT_MIN_PARTITION_SIZE = 32;

  /**
   * Default maximum number of iterations per recursion level: 20. Higher numbers of iterations
   * typically don't help significantly.
   */
  public static final int DEFAULT_MAX_ITERS = 20;

  private int minDocFreq;
  private float maxDocFreq;
  private int minPartitionSize;
  private int maxIters;
  private double ramBudgetMB;
  private Set<String> fields;

  /** Constructor. */
  public BPIndexReorderer() {
    setMinDocFreq(DEFAULT_MIN_DOC_FREQ);
    setMaxDocFreq(1f);
    setMinPartitionSize(DEFAULT_MIN_PARTITION_SIZE);
    setMaxIters(DEFAULT_MAX_ITERS);
    // 10% of the available heap size by default
    setRAMBudgetMB(Runtime.getRuntime().totalMemory() / 1024d / 1024d / 10d);
    setFields(null);
  }

  /** Set the minimum document frequency for terms to be considered, 4096 by default. */
  public void setMinDocFreq(int minDocFreq) {
    if (minDocFreq < 1) {
      throw new IllegalArgumentException("minDocFreq must be at least 1, got " + minDocFreq);
    }
    this.minDocFreq = minDocFreq;
  }

  /**
   * Set the maximum document frequency for terms to be considered, as a ratio of {@code maxDoc}.
   * This is useful because very frequent terms (stop words) add significant overhead to the
   * reordering logic while not being very relevant for ordering. This value must be in (0, 1].
   * Default value is 1.
   */
  public void setMaxDocFreq(float maxDocFreq) {
    if (maxDocFreq > 0 == false || maxDocFreq <= 1 == false) {
      throw new IllegalArgumentException("maxDocFreq must be in (0, 1], got " + maxDocFreq);
    }
    this.maxDocFreq = maxDocFreq;
  }

  /** Set the minimum partition size, when the algorithm stops recursing, 32 by default. */
  public void setMinPartitionSize(int minPartitionSize) {
    if (minPartitionSize < 1) {
      throw new IllegalArgumentException(
          "minPartitionSize must be at least 1, got " + minPartitionSize);
    }
    this.minPartitionSize = minPartitionSize;
  }

  /**
   * Set the maximum number of iterations on each recursion level, 20 by default. Experiments
   * suggest that values above 20 do not help much. However, values below 20 can be used to trade
   * effectiveness for faster reordering.
   */
  public void setMaxIters(int maxIters) {
    if (maxIters < 1) {
      throw new IllegalArgumentException("maxIters must be at least 1, got " + maxIters);
    }
    this.maxIters = maxIters;
  }

  /**
   * Set the amount of RAM that graph partitioning is allowed to use. More RAM allows running
   * faster. If not enough RAM is provided, a {@link NotEnoughRAMException} will be thrown. This is
   * 10% of the total heap size by default.
   */
  public void setRAMBudgetMB(double ramBudgetMB) {
    this.ramBudgetMB = ramBudgetMB;
  }

  /**
   * Sets the fields to use to perform partitioning. A {@code null} value indicates that all
   * indexed fields should be used.
   */
  public void setFields(Set<String> fields) {
    this.fields = fields == null ? null : Set.copyOf(fields);
  }

  private static class PerThreadState {

    final ForwardIndex forwardIndex;
    final int[] leftDocFreqs;
    final int[] rightDocFreqs;

    PerThreadState(int numTerms, ForwardIndex forwardIndex) {
      this.forwardIndex = forwardIndex;
      this.leftDocFreqs = new int[numTerms];
      this.rightDocFreqs = new int[numTerms];
    }
  }

  private abstract class BaseRecursiveAction implements Callable<Void> {

    protected final TaskExecutor executor;
    protected final int depth;

    BaseRecursiveAction(TaskExecutor executor, int depth) {
      this.executor = executor;
      this.depth = depth;
    }

    protected final boolean shouldFork(int problemSize, int totalProblemSize) {
      if (executor == null) {
        return false;
      }
      if (problemSize == totalProblemSize) {
        // Sometimes fork regardless of the problem size to make sure that unit tests also exercise
        // forking
        return true;
      }
      return problemSize > FORK_THRESHOLD;
    }

    @Override
    public abstract Void call();

    protected final void invokeAll(BaseRecursiveAction... actions) {
      assert executor != null : "Only call invokeAll if shouldFork returned true";
      try {
        executor.invokeAll(Arrays.asList(actions));
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
    }
  }
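
  // An IndexReorderingTask reorders one contiguous slice of the doc ID array: it splits the slice
  // in two halves, runs up to maxIters shuffle rounds that swap docs across the midpoint while
  // doing so still improves the expected gap cost, then recurses into both halves until slices
  // become smaller than minPartitionSize.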

  private class IndexReorderingTask extends BaseRecursiveAction {

    private final IntsRef docIDs;
    private final float[] biases;
    private final CloseableThreadLocal<PerThreadState> threadLocal;
    private final BitSet parents;

    IndexReorderingTask(
        IntsRef docIDs,
        float[] biases,
        CloseableThreadLocal<PerThreadState> threadLocal,
        BitSet parents,
        TaskExecutor executor,
        int depth) {
      super(executor, depth);
      this.docIDs = docIDs;
      this.biases = biases;
      this.threadLocal = threadLocal;
      this.parents = parents;
    }

    private static void computeDocFreqs(IntsRef docs, ForwardIndex forwardIndex, int[] docFreqs) {
      try {
        Arrays.fill(docFreqs, 0);
        for (int i = docs.offset, end = docs.offset + docs.length; i < end; ++i) {
          final int doc = docs.ints[i];
          forwardIndex.seek(doc);
          for (IntsRef terms = forwardIndex.nextTerms();
              terms.length != 0;
              terms = forwardIndex.nextTerms()) {
            for (int j = 0; j < terms.length; ++j) {
              final int termID = terms.ints[terms.offset + j];
              ++docFreqs[termID];
            }
          }
        }
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
    }

    @Override
    public Void call() {
      if (depth > 0) {
        Arrays.sort(docIDs.ints, docIDs.offset, docIDs.offset + docIDs.length);
      } else {
        assert sorted(docIDs);
      }
      assert assertParentStructure();

      int halfLength = docIDs.length / 2;
      if (halfLength < minPartitionSize) {
        return null;
      }

      IntsRef left = new IntsRef(docIDs.ints, docIDs.offset, halfLength);
      IntsRef right =
          new IntsRef(docIDs.ints, docIDs.offset + halfLength, docIDs.length - halfLength);

      PerThreadState state = threadLocal.get();
      ForwardIndex forwardIndex = state.forwardIndex;
      int[] leftDocFreqs = state.leftDocFreqs;
      int[] rightDocFreqs = state.rightDocFreqs;

      computeDocFreqs(left, forwardIndex, leftDocFreqs);
      computeDocFreqs(right, forwardIndex, rightDocFreqs);

      for (int iter = 0; iter < maxIters; ++iter) {
        boolean moved;
        try {
          moved =
              shuffle(
                  forwardIndex,
                  docIDs,
                  right.offset,
                  leftDocFreqs,
                  rightDocFreqs,
                  biases,
                  parents,
                  iter);
        } catch (IOException e) {
          throw new UncheckedIOException(e);
        }
        if (moved == false) {
          break;
        }
      }

      if (parents != null) {
        // Make sure we split just after a parent doc
        int lastLeftDocID = docIDs.ints[right.offset - 1];
        int split = right.offset + parents.nextSetBit(lastLeftDocID) - lastLeftDocID;

        if (split == docIDs.offset + docIDs.length) {
          // No good split on the right side, look on the left side then.
          split = right.offset - (lastLeftDocID - parents.prevSetBit(lastLeftDocID));
          if (split == docIDs.offset) {
            // No good split on the left side either: this slice has a single parent document, no
            // reordering is possible. Stop recursing.
            return null;
          }
        }

        assert parents.get(docIDs.ints[split - 1]);

        left = new IntsRef(docIDs.ints, docIDs.offset, split - docIDs.offset);
        right = new IntsRef(docIDs.ints, split, docIDs.offset + docIDs.length - split);
      }

      // It is fine for all tasks to share the same docs / biases array since they all work on
      // different slices of the array at a given point in time.
      IndexReorderingTask leftTask =
          new IndexReorderingTask(left, biases, threadLocal, parents, executor, depth + 1);
      IndexReorderingTask rightTask =
          new IndexReorderingTask(right, biases, threadLocal, parents, executor, depth + 1);

      if (shouldFork(docIDs.length, docIDs.ints.length)) {
        invokeAll(leftTask, rightTask);
      } else {
        leftTask.call();
        rightTask.call();
      }

      return null;
    }

    // used for asserts
    private boolean assertParentStructure() {
      if (parents == null) {
        return true;
      }
      int i = docIDs.offset;
      final int end = docIDs.offset + docIDs.length;
      while (i < end) {
        final int firstChild = docIDs.ints[i];
        final int parent = parents.nextSetBit(firstChild);
        final int numChildren = parent - firstChild;
        assert i + numChildren < end;
        for (int j = 1; j <= numChildren; ++j) {
          assert docIDs.ints[i + j] == firstChild + j : "Parent structure has not been preserved";
        }
        i += numChildren + 1;
      }
      assert i == end : "Last doc ID must be a parent doc";
      return true;
    }
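
    // When the index has a parent field (block-join style indexing), each parent doc and its
    // children must remain contiguous, so call() aligns the split point with a parent boundary
    // and shuffle() aggregates biases per block rather than per doc.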

    /**
     * Shuffle doc IDs across both partitions so that each partition has lower gaps between
     * consecutive postings.
     */
    private boolean shuffle(
        ForwardIndex forwardIndex,
        IntsRef docIDs,
        int midPoint,
        int[] leftDocFreqs,
        int[] rightDocFreqs,
        float[] biases,
        BitSet parents,
        int iter)
        throws IOException {

      // Computing biases is typically a bottleneck, because each iteration needs to iterate over
      // all postings to recompute biases, and the total number of postings is usually one order of
      // magnitude or more larger than the number of docs. So we try to parallelize it.
      new ComputeBiasTask(
              docIDs.ints,
              biases,
              docIDs.offset,
              docIDs.offset + docIDs.length,
              leftDocFreqs,
              rightDocFreqs,
              threadLocal,
              executor,
              depth)
          .call();

      if (parents != null) {
        for (int i = docIDs.offset, end = docIDs.offset + docIDs.length; i < end; ) {
          final int firstChild = docIDs.ints[i];
          final int numChildren = parents.nextSetBit(firstChild) - firstChild;
          assert parents.get(docIDs.ints[i + numChildren]);
          double cumulativeBias = 0;
          for (int j = 0; j <= numChildren; ++j) {
            cumulativeBias += biases[i + j];
          }
          // Give all docs from the same block the same bias, which is the sum of biases of all
          // documents in the block. This helps ensure that the follow-up sort() call preserves the
          // block structure.
          Arrays.fill(biases, i, i + numChildren + 1, (float) cumulativeBias);
          i += numChildren + 1;
        }
      }

      float maxLeftBias = Float.NEGATIVE_INFINITY;
      for (int i = docIDs.offset; i < midPoint; ++i) {
        maxLeftBias = Math.max(maxLeftBias, biases[i]);
      }
      float minRightBias = Float.POSITIVE_INFINITY;
      for (int i = midPoint, end = docIDs.offset + docIDs.length; i < end; ++i) {
        minRightBias = Math.min(minRightBias, biases[i]);
      }
      float gain = maxLeftBias - minRightBias;
      // This uses the simulated annealing proposed by Mackenzie et al in "Tradeoff Options for
      // Bipartite Graph Partitioning" by comparing the gain of swapping the doc from the left side
      // that is most attracted to the right and the doc from the right side that is most attracted
      // to the left against `iter` rather than zero.
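      // For example, if the most right-attracted doc on the left has bias 3.5 and the most
      // left-attracted doc on the right has bias -0.5, then gain = 4.0: a swap is still accepted
      // at iterations 0-3, but from iteration 4 onwards such a gain no longer clears the
      // threshold and shuffling stops for this slice.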
      if (gain <= iter) {
        return false;
      }

      class Selector extends IntroSelector {

        int pivotDoc;
        float pivotBias;

        @Override
        public void setPivot(int i) {
          pivotDoc = docIDs.ints[i];
          pivotBias = biases[i];
        }

        @Override
        public int comparePivot(int j) {
          int cmp = Float.compare(pivotBias, biases[j]);
          if (cmp == 0) {
            // Tie break on the doc ID to preserve doc ID ordering as much as possible
            cmp = pivotDoc - docIDs.ints[j];
          }
          return cmp;
        }

        @Override
        public void swap(int i, int j) {
          float tmpBias = biases[i];
          biases[i] = biases[j];
          biases[j] = tmpBias;

          if (i < midPoint == j < midPoint) {
            int tmpDoc = docIDs.ints[i];
            docIDs.ints[i] = docIDs.ints[j];
            docIDs.ints[j] = tmpDoc;
          } else {
            // If we're swapping docs across the left and right sides, we need to keep doc freqs
            // up-to-date.
            int left = Math.min(i, j);
            int right = Math.max(i, j);
            try {
              swapDocsAndFreqs(docIDs.ints, left, right, forwardIndex, leftDocFreqs, rightDocFreqs);
            } catch (IOException e) {
              throw new UncheckedIOException(e);
            }
          }
        }
      }

      Selector selector = new Selector();
      if (parents == null) {
        selector.select(docIDs.offset, docIDs.offset + docIDs.length, midPoint);
      } else {
        // When we have parents, we need to do a full sort to make sure we're not breaking the
        // parent structure.
        new IntroSorter() {
          @Override
          protected void setPivot(int i) {
            selector.setPivot(i);
          }

          @Override
          protected int comparePivot(int j) {
            return selector.comparePivot(j);
          }

          @Override
          protected void swap(int i, int j) {
            selector.swap(i, j);
          }
        }.sort(docIDs.offset, docIDs.offset + docIDs.length);
      }

      return true;
    }

    private static void swapDocsAndFreqs(
        int[] docs,
        int left,
        int right,
        ForwardIndex forwardIndex,
        int[] leftDocFreqs,
        int[] rightDocFreqs)
        throws IOException {
      assert left < right;

      int leftDoc = docs[left];
      int rightDoc = docs[right];

      // Now update the cache, this makes things much faster than invalidating it and having to
      // recompute doc freqs on the next iteration.
      forwardIndex.seek(leftDoc);
      for (IntsRef terms = forwardIndex.nextTerms();
          terms.length != 0;
          terms = forwardIndex.nextTerms()) {
        for (int i = 0; i < terms.length; ++i) {
          final int termID = terms.ints[terms.offset + i];
          --leftDocFreqs[termID];
          ++rightDocFreqs[termID];
        }
      }

      forwardIndex.seek(rightDoc);
      for (IntsRef terms = forwardIndex.nextTerms();
          terms.length != 0;
          terms = forwardIndex.nextTerms()) {
        for (int i = 0; i < terms.length; ++i) {
          final int termID = terms.ints[terms.offset + i];
          ++leftDocFreqs[termID];
          --rightDocFreqs[termID];
        }
      }

      docs[left] = rightDoc;
      docs[right] = leftDoc;
    }
  }

  private class ComputeBiasTask extends BaseRecursiveAction {

    private final int[] docs;
    private final float[] biases;
    private final int from;
    private final int to;
    private final int[] fromDocFreqs;
    private final int[] toDocFreqs;
    private final CloseableThreadLocal<PerThreadState> threadLocal;

    ComputeBiasTask(
        int[] docs,
        float[] biases,
        int from,
        int to,
        int[] fromDocFreqs,
        int[] toDocFreqs,
        CloseableThreadLocal<PerThreadState> threadLocal,
        TaskExecutor executor,
        int depth) {
      super(executor, depth);
      this.docs = docs;
      this.biases = biases;
      this.from = from;
      this.to = to;
      this.fromDocFreqs = fromDocFreqs;
      this.toDocFreqs = toDocFreqs;
      this.threadLocal = threadLocal;
    }

    @Override
    public Void call() {
      final int problemSize = to - from;
      if (problemSize > 1 && shouldFork(problemSize, docs.length)) {
        final int mid = (from + to) >>> 1;
        invokeAll(
            new ComputeBiasTask(
                docs, biases, from, mid, fromDocFreqs, toDocFreqs, threadLocal, executor, depth),
            new ComputeBiasTask(
                docs, biases, mid, to, fromDocFreqs, toDocFreqs, threadLocal, executor, depth));
      } else {
        ForwardIndex forwardIndex = threadLocal.get().forwardIndex;
        try {
          for (int i = from; i < to; ++i) {
            biases[i] = computeBias(docs[i], forwardIndex, fromDocFreqs, toDocFreqs);
          }
        } catch (IOException e) {
          throw new UncheckedIOException(e);
        }
      }
      return null;
    }
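
    // The bias of a doc is the sum over its terms of log2(toDocFreq) - log2(fromDocFreq): terms
    // whose postings concentrate on the "from" side pull the doc toward "from" (negative
    // contribution), while terms that are more frequent on the "to" side push it toward "to"
    // (positive contribution).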

    /**
     * Compute a float that is negative when a document is attracted to the left and positive
     * otherwise.
     */
    private static float computeBias(
        int docID, ForwardIndex forwardIndex, int[] fromDocFreqs, int[] toDocFreqs)
        throws IOException {
      forwardIndex.seek(docID);
      double bias = 0;
      for (IntsRef terms = forwardIndex.nextTerms();
          terms.length != 0;
          terms = forwardIndex.nextTerms()) {
        for (int i = 0; i < terms.length; ++i) {
          final int termID = terms.ints[terms.offset + i];
          final int fromDocFreq = fromDocFreqs[termID];
          final int toDocFreq = toDocFreqs[termID];
          assert fromDocFreq >= 0;
          assert toDocFreq >= 0;
          bias +=
              (toDocFreq == 0 ? 0 : fastLog2(toDocFreq))
                  - (fromDocFreq == 0 ? 0 : fastLog2(fromDocFreq));
        }
      }
      return (float) bias;
    }
  }

  /**
   * A forward index. Like term vectors, but only for a subset of terms, and it produces term IDs
   * instead of whole terms.
   */
  private static final class ForwardIndex implements Cloneable, Closeable {

    private final RandomAccessInput startOffsets;
    private final IndexInput startOffsetsInput, terms;
    private final int maxTerm;

    private long endOffset = -1;
    private final int[] buffer = new int[TERM_IDS_BLOCK_SIZE];
    private final IntsRef bufferRef = new IntsRef(buffer, 0, 0);

    ForwardIndex(IndexInput startOffsetsInput, IndexInput terms, int maxTerm) {
      this.startOffsetsInput = startOffsetsInput;
      try {
        this.startOffsets =
            startOffsetsInput.randomAccessSlice(
                0, startOffsetsInput.length() - CodecUtil.footerLength());
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
      this.terms = terms;
      this.maxTerm = maxTerm;
    }

    void seek(int docID) throws IOException {
      final long startOffset = startOffsets.readLong((long) docID * Long.BYTES);
      endOffset = startOffsets.readLong((docID + 1L) * Long.BYTES);
      terms.seek(startOffset);
    }

    IntsRef nextTerms() throws IOException {
      if (terms.getFilePointer() >= endOffset) {
        assert terms.getFilePointer() == endOffset;
        bufferRef.length = 0;
      } else {
        bufferRef.length = readMonotonicInts(terms, buffer);
      }
      return bufferRef;
    }

    @Override
    public ForwardIndex clone() {
      return new ForwardIndex(startOffsetsInput.clone(), terms.clone(), maxTerm);
    }

    @Override
    public void close() throws IOException {
      IOUtils.close(startOffsetsInput, terms);
    }
  }
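
  // On disk, the forward index is a pair of files: a "start-offsets" file holding one long per
  // doc ID (plus a sentinel entry for maxDoc) that points into a "term-ids" file, which stores
  // each document's sorted term IDs as blocks encoded by writeMonotonicInts. seek() reads two
  // consecutive offsets to delimit a document, and nextTerms() decodes one block at a time.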

  private int writePostings(
      CodecReader reader,
      Set<String> fields,
      Directory tempDir,
      DataOutput postingsOut,
      int parallelism)
      throws IOException {
    final int maxNumTerms =
        (int)
            ((ramBudgetMB * 1024 * 1024 - docRAMRequirements(reader.maxDoc()))
                / parallelism
                / termRAMRequirementsPerThreadPerTerm());
    final int maxDocFreq = (int) ((double) this.maxDocFreq * reader.maxDoc());

    int numTerms = 0;
    for (String field : fields) {
      Terms terms = reader.terms(field);
      if (terms == null) {
        continue;
      }
      if (terms.size() != -1) {
        // Skip ID-like fields that have many terms where none is of interest
        final long maxPossibleDocFreq = 1 + terms.getSumDocFreq() - terms.size();
        if (maxPossibleDocFreq < minDocFreq) {
          continue;
        }
      }
      TermsEnum iterator = terms.iterator();
      PostingsEnum postings = null;
      for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
        final int docFreq = iterator.docFreq();
        if (docFreq < minDocFreq || docFreq > maxDocFreq) {
          continue;
        }
        if (numTerms >= ArrayUtil.MAX_ARRAY_LENGTH) {
          throw new IllegalArgumentException(
              "Cannot perform recursive graph bisection on more than "
                  + ArrayUtil.MAX_ARRAY_LENGTH
                  + " terms, the maximum array length");
        } else if (numTerms >= maxNumTerms) {
          throw new NotEnoughRAMException(
              "Too many terms are matching given the RAM budget of "
                  + ramBudgetMB
                  + "MB. Consider raising the RAM budget, raising the minimum doc frequency, or"
                  + " decreasing concurrency");
        }
        final int termID = numTerms++;
        postings = iterator.postings(postings, PostingsEnum.NONE);
        for (int doc = postings.nextDoc();
            doc != DocIdSetIterator.NO_MORE_DOCS;
            doc = postings.nextDoc()) {
          postingsOut.writeLong(Integer.toUnsignedLong(termID) << 32 | Integer.toUnsignedLong(doc));
        }
      }
    }
    return numTerms;
  }
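
  // Each posting is packed into a single long with the term ID in the upper 32 bits and the doc
  // ID in the lower 32 bits. writePostings emits these longs in term-major order; the LSB radix
  // sort in ForwardIndexSorter later reorders them by doc ID only, and its stability preserves
  // the term ID order within each document, which is what buildForwardIndex relies on.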

  private ForwardIndex buildForwardIndex(
      Directory tempDir, String postingsFileName, int maxDoc, int maxTerm) throws IOException {
    String termIDsFileName;
    String startOffsetsFileName;
    try (IndexOutput termIDs = tempDir.createTempOutput("term-ids", "", IOContext.DEFAULT);
        IndexOutput startOffsets =
            tempDir.createTempOutput("start-offsets", "", IOContext.DEFAULT)) {
      termIDsFileName = termIDs.getName();
      startOffsetsFileName = startOffsets.getName();
      int[] buffer = new int[TERM_IDS_BLOCK_SIZE];
      new ForwardIndexSorter(tempDir)
          .sortAndConsume(
              postingsFileName,
              maxDoc,
              new LongConsumer() {
                int prevDoc = -1;
                int bufferLen = 0;

                @Override
                public void accept(long value) throws IOException {
                  int doc = (int) value;
                  int termID = (int) (value >>> 32);
                  if (doc != prevDoc) {
                    if (bufferLen != 0) {
                      writeMonotonicInts(buffer, bufferLen, termIDs);
                      bufferLen = 0;
                    }
                    assert doc > prevDoc;
                    for (int d = prevDoc + 1; d <= doc; ++d) {
                      startOffsets.writeLong(termIDs.getFilePointer());
                    }
                    prevDoc = doc;
                  }
                  assert termID < maxTerm : termID + " " + maxTerm;
                  if (bufferLen == buffer.length) {
                    writeMonotonicInts(buffer, bufferLen, termIDs);
                    bufferLen = 0;
                  }
                  buffer[bufferLen++] = termID;
                }

                @Override
                public void onFinish() throws IOException {
                  if (bufferLen != 0) {
                    writeMonotonicInts(buffer, bufferLen, termIDs);
                  }
                  for (int d = prevDoc + 1; d <= maxDoc; ++d) {
                    startOffsets.writeLong(termIDs.getFilePointer());
                  }
                  CodecUtil.writeFooter(termIDs);
                  CodecUtil.writeFooter(startOffsets);
                }
              });
    }
    IndexInput termIDsInput = tempDir.openInput(termIDsFileName, IOContext.DEFAULT);
    IndexInput startOffsets = tempDir.openInput(startOffsetsFileName, IOContext.DEFAULT);
    return new ForwardIndex(startOffsets, termIDsInput, maxTerm);
  }

  /**
   * Expert: Compute the {@link DocMap} that holds the new doc ID numbering. This is exposed to
   * enable integration into {@link BPReorderingMergePolicy}; {@link #reorder(CodecReader,
   * Directory, Executor)} should be preferred in general.
   */
  public Sorter.DocMap computeDocMap(CodecReader reader, Directory tempDir, Executor executor)
      throws IOException {
    if (docRAMRequirements(reader.maxDoc()) >= ramBudgetMB * 1024 * 1024) {
      throw new NotEnoughRAMException(
          "At least "
              + Math.ceil(docRAMRequirements(reader.maxDoc()) / 1024. / 1024.)
              + "MB of RAM are required to hold metadata about documents in RAM, but current RAM budget is "
              + ramBudgetMB
              + "MB");
    }

    Set<String> fields = this.fields;
    if (fields == null) {
      fields = new HashSet<>();
      for (FieldInfo fi : reader.getFieldInfos()) {
        if (fi.getIndexOptions() != IndexOptions.NONE) {
          fields.add(fi.name);
        }
      }
    }

    TaskExecutor taskExecutor = executor == null ? null : new TaskExecutor(executor);
    int[] newToOld = computePermutation(reader, fields, tempDir, taskExecutor);
    int[] oldToNew = new int[newToOld.length];
    for (int i = 0; i < newToOld.length; ++i) {
      oldToNew[newToOld[i]] = i;
    }
    return new Sorter.DocMap() {

      @Override
      public int size() {
        return newToOld.length;
      }

      @Override
      public int oldToNew(int docID) {
        return oldToNew[docID];
      }

      @Override
      public int newToOld(int docID) {
        return newToOld[docID];
      }
    };
  }

  /**
   * Reorder the given {@link CodecReader} into a reader that tries to minimize the log gap between
   * consecutive documents in postings, which usually helps improve space efficiency and query
   * evaluation efficiency. Note that the returned {@link CodecReader} is slow and should typically
   * be used in a call to {@link IndexWriter#addIndexes(CodecReader...)}.
   *
   * <p>The provided {@link Executor} can be used to perform reordering concurrently. A value of
   * {@code null} indicates that reordering should be performed in the current thread.
   *
   * <p>NOTE: The provided {@link Executor} must not reject tasks.
   *
   * @throws NotEnoughRAMException if not enough RAM is provided
   */
  public CodecReader reorder(CodecReader reader, Directory tempDir, Executor executor)
      throws IOException {
    Sorter.DocMap docMap = computeDocMap(reader, tempDir, executor);
    return SortingCodecReader.wrap(reader, docMap, null);
  }

  /**
   * Compute a permutation of the doc ID space that reduces log gaps between consecutive postings.
   */
  private int[] computePermutation(
      CodecReader reader, Set<String> fields, Directory dir, TaskExecutor executor)
      throws IOException {
    TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(dir);

    final int parallelism;
    if (executor == null) {
      parallelism = 1;
    } else {
      // Assume as many threads as processors
      parallelism = Runtime.getRuntime().availableProcessors();
    }

    final int maxDoc = reader.maxDoc();
    ForwardIndex forwardIndex = null;
    IndexOutput postingsOutput = null;
    boolean success = false;
    try {
      postingsOutput = trackingDir.createTempOutput("postings", "", IOContext.DEFAULT);
      int numTerms = writePostings(reader, fields, trackingDir, postingsOutput, parallelism);
      CodecUtil.writeFooter(postingsOutput);
      postingsOutput.close();
      final ForwardIndex finalForwardIndex =
          forwardIndex = buildForwardIndex(trackingDir, postingsOutput.getName(), maxDoc, numTerms);
      trackingDir.deleteFile(postingsOutput.getName());
      postingsOutput = null;

      int[] sortedDocs = new int[maxDoc];
      for (int i = 0; i < maxDoc; ++i) {
        sortedDocs[i] = i;
      }

      BitSet parents = null;
      String parentField = reader.getFieldInfos().getParentField();
      if (parentField != null) {
        parents = BitSet.of(DocValues.getNumeric(reader, parentField), maxDoc);
      }

      try (CloseableThreadLocal<PerThreadState> threadLocal =
          new CloseableThreadLocal<>() {
            @Override
            protected PerThreadState initialValue() {
              return new PerThreadState(numTerms, finalForwardIndex.clone());
            }
          }) {
        IntsRef docs = new IntsRef(sortedDocs, 0, sortedDocs.length);
        new IndexReorderingTask(docs, new float[maxDoc], threadLocal, parents, executor, 0).call();
      }

      success = true;

      return sortedDocs;
    } finally {
      if (success) {
        IOUtils.close(forwardIndex);
        IOUtils.deleteFiles(trackingDir, trackingDir.getCreatedFiles());
      } else {
        IOUtils.closeWhileHandlingException(postingsOutput, forwardIndex);
        IOUtils.deleteFilesIgnoringExceptions(trackingDir, trackingDir.getCreatedFiles());
      }
    }
  }

  /** Returns true if, and only if, the given {@link IntsRef} is sorted. */
  private static boolean sorted(IntsRef intsRef) {
    for (int i = 1; i < intsRef.length; ++i) {
      if (intsRef.ints[intsRef.offset + i - 1] > intsRef.ints[intsRef.offset + i]) {
        return false;
      }
    }
    return true;
  }

  private static long docRAMRequirements(int maxDoc) {
    // We need one int per doc for the doc map, plus one float to store the bias associated with
    // this doc.
    return 2L * Integer.BYTES * maxDoc;
  }

  private static long termRAMRequirementsPerThreadPerTerm() {
    // Each thread needs two ints per term, one for left document frequencies, and one for right
    // document frequencies
    return 2L * Integer.BYTES;
  }

  private static final float[] LOG2_TABLE = new float[256];

  static {
    LOG2_TABLE[0] = 1f;
    // float that has the biased exponent of 1f and zeros for sign and mantissa bits
    final int one = Float.floatToIntBits(1f);
    for (int i = 0; i < 256; ++i) {
      float f = Float.intBitsToFloat(one | (i << (23 - 8)));
      LOG2_TABLE[i] = (float) (Math.log(f) / Math.log(2));
    }
  }
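
  // LOG2_TABLE[i] holds log2(1 + i / 256): the static block builds the float whose exponent is
  // that of 1f and whose top 8 mantissa bits equal i. For instance, LOG2_TABLE[128] = log2(1.5)
  // ~= 0.585, so fastLog2(6) = 2 + LOG2_TABLE[128] ~= 2.585, close to the exact 2.58496.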

  /** An approximate log() function in base 2 which trades accuracy for much better performance. */
  static float fastLog2(int i) {
    assert i > 0 : "Cannot compute log of i=" + i;
    // floorLog2 would be the exponent in the float representation of i
    int floorLog2 = 31 - Integer.numberOfLeadingZeros(i);
    // tableIndex would be the first 8 mantissa bits in the float representation of i, excluding
    // the implicit bit
    // the left shift clears the high bit, which is implicit in the float representation
    // the right shift moves the 8 higher mantissa bits to the lower 8 bits
    int tableIndex = i << (32 - floorLog2) >>> (32 - 8);
    // i = 1.tableIndex * 2 ^ floorLog2
    // log(i) = log2(1.tableIndex) + floorLog2
    return floorLog2 + LOG2_TABLE[tableIndex];
  }

  /**
   * Simple bit packing that focuses on the common / efficient case when term IDs can be encoded on
   * 16 bits.
   */
  static void writeMonotonicInts(int[] ints, int len, DataOutput out) throws IOException {
    assert len > 0;
    assert len <= TERM_IDS_BLOCK_SIZE;

    if (len >= 3 && ints[len - 1] - ints[0] <= 0xFFFF) {
      for (int i = 1; i < len; ++i) {
        ints[i] -= ints[0];
      }
      final int numPacked = (len - 1) / 2;
      final int encodedLen = 1 + len / 2;
      for (int i = 0; i < numPacked; ++i) {
        ints[1 + i] |= ints[encodedLen + i] << 16;
      }
      out.writeByte((byte) ((len << 1) | 1));
      for (int i = 0; i < encodedLen; ++i) {
        out.writeInt(ints[i]);
      }
    } else {
      out.writeByte((byte) (len << 1));
      for (int i = 0; i < len; ++i) {
        out.writeInt(ints[i]);
      }
    }
  }

  /**
   * Decoding logic for {@link #writeMonotonicInts(int[], int, DataOutput)}. It should get
   * auto-vectorized.
   */
  static int readMonotonicInts(DataInput in, int[] ints) throws IOException {
    int token = in.readByte() & 0xFF;
    int len = token >>> 1;
    boolean packed = (token & 1) != 0;

    if (packed) {
      final int encodedLen = 1 + len / 2;
      in.readInts(ints, 0, encodedLen);
      final int numPacked = (len - 1) / 2;
      for (int i = 0; i < numPacked; ++i) {
        ints[encodedLen + i] = ints[1 + i] >>> 16;
        ints[1 + i] &= 0xFFFF;
      }
      for (int i = 1; i < len; ++i) {
        ints[i] += ints[0];
      }
    } else {
      in.readInts(ints, 0, len);
    }
    return len;
  }
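
  // Worked example for writeMonotonicInts: ints = [3, 5, 9, 12, 17], len = 5. The range
  // 17 - 3 = 14 fits in 16 bits, so deltas from ints[0] are computed: [3, 2, 6, 9, 14]. With
  // numPacked = 2 and encodedLen = 3, the two tail deltas move into the upper 16 bits of ints[1]
  // and ints[2]: [3, 9 << 16 | 2, 14 << 16 | 6]. A token byte (5 << 1) | 1 = 0x0B is written
  // followed by these 3 ints: 13 bytes instead of 21 for the unpacked encoding.
  // readMonotonicInts reverses each step: split the packed ints, then add back ints[0].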

  /**
   * Use a LSB radix sorter to sort the (docID, termID) entries. We only need to compare doc IDs
   * because the LSB radix sort is stable and termIDs are already sorted.
   *
   * <p>This sorter will require at least 16MB ({@link #BUFFER_BYTES} * {@link #HISTOGRAM_SIZE}) of
   * RAM.
   */
  static class ForwardIndexSorter {

    private static final int HISTOGRAM_SIZE = 256;
    private static final int BUFFER_SIZE = 8192;
    private static final int BUFFER_BYTES = BUFFER_SIZE * Long.BYTES;
    private final Directory directory;
    private final Bucket[] buckets = new Bucket[HISTOGRAM_SIZE];

    private static class Bucket {
      private final ByteBuffersDataOutput fps = new ByteBuffersDataOutput();
      private final long[] buffer = new long[BUFFER_SIZE];
      private IndexOutput output;
      private int bufferUsed;
      private int blockNum;
      private long lastFp;
      private int finalBlockSize;

      private void addEntry(long l) throws IOException {
        buffer[bufferUsed++] = l;
        if (bufferUsed == BUFFER_SIZE) {
          flush(false);
        }
      }

      private void flush(boolean isFinal) throws IOException {
        if (isFinal) {
          finalBlockSize = bufferUsed;
        }
        long fp = output.getFilePointer();
        fps.writeVLong(encode(fp - lastFp));
        lastFp = fp;
        for (int i = 0; i < bufferUsed; i++) {
          output.writeLong(buffer[i]);
        }
        blockNum++;
        bufferUsed = 0;
      }

      private void reset(IndexOutput resetOutput) {
        output = resetOutput;
        finalBlockSize = 0;
        bufferUsed = 0;
        blockNum = 0;
        lastFp = 0;
        fps.reset();
      }
    }

    private static long encode(long fpDelta) {
      assert (fpDelta & 0x07) == 0 : "fpDelta should be multiple of 8";
      if (fpDelta % BUFFER_BYTES == 0) {
        return ((fpDelta / BUFFER_BYTES) << 1) | 1;
      } else {
        return fpDelta;
      }
    }

    private static long decode(long fpDelta) {
      if ((fpDelta & 1) == 1) {
        return (fpDelta >>> 1) * BUFFER_BYTES;
      } else {
        return fpDelta;
      }
    }

    ForwardIndexSorter(Directory directory) {
      this.directory = directory;
      for (int i = 0; i < HISTOGRAM_SIZE; i++) {
        buckets[i] = new Bucket();
      }
    }

    private void consume(String fileName, LongConsumer consumer) throws IOException {
      try (IndexInput in = directory.openInput(fileName, IOContext.READONCE)) {
        final long end = in.length() - CodecUtil.footerLength();
        while (in.getFilePointer() < end) {
          consumer.accept(in.readLong());
        }
      }
      consumer.onFinish();
    }

    private void consume(String fileName, long indexFP, LongConsumer consumer) throws IOException {
      try (IndexInput index = directory.openInput(fileName, IOContext.READONCE);
          IndexInput value = directory.openInput(fileName, IOContext.READONCE)) {
        index.seek(indexFP);
        for (int i = 0; i < buckets.length; i++) {
          int blockNum = index.readVInt();
          int finalBlockSize = index.readVInt();
          long fp = decode(index.readVLong());
          for (int block = 0; block < blockNum - 1; block++) {
            value.seek(fp);
            for (int j = 0; j < BUFFER_SIZE; j++) {
              consumer.accept(value.readLong());
            }
            fp += decode(index.readVLong());
          }
          value.seek(fp);
          for (int j = 0; j < finalBlockSize; j++) {
            consumer.accept(value.readLong());
          }
        }
        consumer.onFinish();
      }
    }

    private LongConsumer consumer(int shift) {
      return new LongConsumer() {
        @Override
        public void accept(long value) throws IOException {
          int b = (int) ((value >>> shift) & 0xFF);
          Bucket bucket = buckets[b];
          bucket.addEntry(value);
        }

        @Override
        public void onFinish() throws IOException {
          for (Bucket bucket : buckets) {
            bucket.flush(true);
          }
        }
      };
    }
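
    // Each pass of sortAndConsume scatters the longs into 256 buckets according to 8 bits of the
    // doc ID, then concatenates the buckets back in order. After ceil(bitsRequired(maxDoc) / 8)
    // stable passes, entries are sorted by doc ID; ties keep their original (term ID) order.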
    void sortAndConsume(String fileName, int maxDoc, LongConsumer consumer) throws IOException {
      int bitsRequired = PackedInts.bitsRequired(maxDoc);
      String sourceFileName = fileName;
      long indexFP = -1;
      for (int shift = 0; shift < bitsRequired; shift += 8) {
        try (IndexOutput output = directory.createTempOutput(fileName, "sort", IOContext.DEFAULT)) {
          Arrays.stream(buckets).forEach(b -> b.reset(output));
          if (shift == 0) {
            consume(sourceFileName, consumer(shift));
          } else {
            consume(sourceFileName, indexFP, consumer(shift));
            directory.deleteFile(sourceFileName);
          }
          indexFP = output.getFilePointer();
          for (Bucket bucket : buckets) {
            output.writeVInt(bucket.blockNum);
            output.writeVInt(bucket.finalBlockSize);
            bucket.fps.copyTo(output);
          }
          CodecUtil.writeFooter(output);
          sourceFileName = output.getName();
        }
      }
      consume(sourceFileName, indexFP, consumer);
    }
  }

  interface LongConsumer {
    void accept(long value) throws IOException;

    default void onFinish() throws IOException {}
  }
}