
org.apache.lucene.codecs.uniformsplit.BlockWriter

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.uniformsplit;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;

/**
 * Writes blocks in the block file.
 *
 * <p>According to the Uniform Split technique, the writing combines three steps per block, and it
 * is repeated for all the field blocks:
 *
 * <ol>
 *   <li>Select the term with the shortest {@link TermBytes minimal distinguishing prefix} (MDP) in
 *       the neighborhood of the {@link #targetNumBlockLines target block size} (+- {@link
 *       #deltaNumLines delta size})
 *   <li>The selected term becomes the first term of the next block, and its MDP is the next block
 *       key.
 *   <li>The current block is written to the {@link
 *       UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block file}. And its block key is {@link
 *       IndexDictionary.Builder#add(BytesRef, long) added} to the {@link IndexDictionary index
 *       dictionary}.
 * </ol>
 *
 * <p>This stateful {@link BlockWriter} is called repeatedly to {@link #addLine(BytesRef,
 * BlockTermState, IndexDictionary.Builder) add} all the {@link BlockLine} terms of a field. Then
 * {@link #finishLastBlock} is called. And then this {@link BlockWriter} can be reused to add the
 * terms of another field.
 *
 * @lucene.experimental
 */
public class BlockWriter {

  protected final int targetNumBlockLines;
  protected final int deltaNumLines;
  protected final List<BlockLine> blockLines;

  protected final IndexOutput blockOutput;
  protected final ByteBuffersDataOutput blockLinesWriteBuffer;
  protected final ByteBuffersDataOutput termStatesWriteBuffer;

  protected final BlockHeader.Serializer blockHeaderWriter;
  protected final BlockLine.Serializer blockLineWriter;
  protected final DeltaBaseTermStateSerializer termStateSerializer;
  protected final BlockEncoder blockEncoder;
  protected final ByteBuffersDataOutput blockWriteBuffer;

  protected FieldMetadata fieldMetadata;
  protected BytesRef lastTerm;

  protected final BlockHeader reusableBlockHeader;
  protected BytesRef scratchBytesRef;

  protected BlockWriter(
      IndexOutput blockOutput,
      int targetNumBlockLines,
      int deltaNumLines,
      BlockEncoder blockEncoder) {
    assert blockOutput != null;
    assert targetNumBlockLines > 0;
    assert deltaNumLines >= 0;
    assert deltaNumLines < targetNumBlockLines;
    this.blockOutput = blockOutput;
    this.targetNumBlockLines = targetNumBlockLines;
    this.deltaNumLines = deltaNumLines;
    this.blockEncoder = blockEncoder;

    this.blockLines = new ArrayList<>(targetNumBlockLines);
    this.blockHeaderWriter = createBlockHeaderSerializer();
    this.blockLineWriter = createBlockLineSerializer();
    this.termStateSerializer = createDeltaBaseTermStateSerializer();

    this.blockLinesWriteBuffer = ByteBuffersDataOutput.newResettableInstance();
    this.termStatesWriteBuffer = ByteBuffersDataOutput.newResettableInstance();
    this.blockWriteBuffer = ByteBuffersDataOutput.newResettableInstance();

    this.reusableBlockHeader = new BlockHeader();
    this.scratchBytesRef = new BytesRef();
  }

  protected BlockHeader.Serializer createBlockHeaderSerializer() {
    return new BlockHeader.Serializer();
  }

  protected BlockLine.Serializer createBlockLineSerializer() {
    return new BlockLine.Serializer();
  }

  protected DeltaBaseTermStateSerializer createDeltaBaseTermStateSerializer() {
    return new DeltaBaseTermStateSerializer();
  }

  /**
   * Adds a new {@link BlockLine} term for the current field.
   *
   * <p>This method determines whether the new term is part of the current block, or if it is part
   * of the next block. In the latter case, a new block is started (including one or more of the
   * lastly added lines), the current block is written to the block file, and the current block key
   * is added to the {@link IndexDictionary.Builder}.
   *
   * @param term The block line term. The {@link BytesRef} instance is used directly, the caller is
   *     responsible to make a deep copy if needed. This is required because we keep a list of
   *     block lines until we decide to write the current block, and each line must have a
   *     different term instance.
   * @param blockTermState Block line details.
   * @param dictionaryBuilder to which the block keys are added.
   */
  protected void addLine(
      BytesRef term, BlockTermState blockTermState, IndexDictionary.Builder dictionaryBuilder)
      throws IOException {
    assert term != null;
    assert blockTermState != null;
    int mdpLength = TermBytes.computeMdpLength(lastTerm, term);
    blockLines.add(new BlockLine(new TermBytes(mdpLength, term), blockTermState));
    lastTerm = term;
    if (blockLines.size() >= targetNumBlockLines + deltaNumLines) {
      splitAndWriteBlock(dictionaryBuilder);
    }
  }

  /**
   * This method is called when there is no more term for the field. It writes the remaining lines
   * added with {@link #addLine} as the last block of the field and resets this {@link BlockWriter}
   * state. Then this {@link BlockWriter} can be used for another field.
   */
  protected void finishLastBlock(IndexDictionary.Builder dictionaryBuilder) throws IOException {
    while (!blockLines.isEmpty()) {
      splitAndWriteBlock(dictionaryBuilder);
    }
    fieldMetadata = null;
    lastTerm = null;
  }

  /**
   * Defines the new block start according to {@link #targetNumBlockLines} and {@link
   * #deltaNumLines}. The new block is started (including one or more of the lastly added lines),
   * the current block is written to the block file, and the current block key is added to the
   * {@link IndexDictionary.Builder}.
   */
  protected void splitAndWriteBlock(IndexDictionary.Builder dictionaryBuilder) throws IOException {
    assert !blockLines.isEmpty();
    int numLines = blockLines.size();

    if (numLines <= targetNumBlockLines - deltaNumLines) {
      writeBlock(blockLines, dictionaryBuilder);
      blockLines.clear();
      return;
    }
    int deltaStart = numLines - deltaNumLines * 2;
    assert deltaStart >= 1 : "blockLines size: " + numLines;
    int minMdpLength = Integer.MAX_VALUE;
    int minMdpEndIndex = 0;

    for (int i = deltaStart; i < numLines; i++) {
      TermBytes term = blockLines.get(i).getTermBytes();
      int mdpLength = term.getMdpLength();
      if (mdpLength <= minMdpLength) {
        minMdpLength = mdpLength;
        minMdpEndIndex = i;
      }
    }

    List<BlockLine> subList = blockLines.subList(0, minMdpEndIndex);
    writeBlock(subList, dictionaryBuilder);
    // Clear the written block lines to keep only the lines composing the next block.
    // ArrayList.subList().clear() is O(N) but still fast since we work on a small list.
    // It is internally an array copy and an iteration to set array refs to null.
    // For clarity we keep that until the day a CircularArrayList is available in the jdk.
    subList.clear();
  }

  /** Writes a block and adds its block key to the dictionary builder. */
  protected void writeBlock(List<BlockLine> blockLines, IndexDictionary.Builder dictionaryBuilder)
      throws IOException {

    long blockStartFP = blockOutput.getFilePointer();

    addBlockKey(blockLines, dictionaryBuilder);

    int middle = blockLines.size() >> 1;
    int middleOffset = -1;
    BlockLine previousLine = null;
    for (int i = 0, size = blockLines.size(); i < size; i++) {
      boolean isIncrementalEncodingSeed = i == 0;
      if (i == middle) {
        middleOffset = Math.toIntExact(blockLinesWriteBuffer.size());
        isIncrementalEncodingSeed = true;
      }
      BlockLine line = blockLines.get(i);
      writeBlockLine(isIncrementalEncodingSeed, line, previousLine);
      previousLine = line;
    }

    reusableBlockHeader.reset(
        blockLines.size(),
        termStateSerializer.getBaseDocStartFP(),
        termStateSerializer.getBasePosStartFP(),
        termStateSerializer.getBasePayStartFP(),
        Math.toIntExact(blockLinesWriteBuffer.size()),
        middleOffset);
    blockHeaderWriter.write(blockWriteBuffer, reusableBlockHeader);

    blockLinesWriteBuffer.copyTo(blockWriteBuffer);
    termStatesWriteBuffer.copyTo(blockWriteBuffer);

    if (blockEncoder == null) {
      blockOutput.writeVInt(Math.toIntExact(blockWriteBuffer.size()));
      blockWriteBuffer.copyTo(blockOutput);
    } else {
      BlockEncoder.WritableBytes encodedBytes =
          blockEncoder.encode(blockWriteBuffer.toDataInput(), blockWriteBuffer.size());
      blockOutput.writeVInt(Math.toIntExact(encodedBytes.size()));
      encodedBytes.writeTo(blockOutput);
    }

    blockLinesWriteBuffer.reset();
    termStatesWriteBuffer.reset();
    blockWriteBuffer.reset();
    termStateSerializer.resetBaseStartFP();

    updateFieldMetadata(blockStartFP);
  }

  /** updates the field metadata after all lines were written for the block. */
  protected void updateFieldMetadata(long blockStartFP) {
    assert fieldMetadata != null;
    if (fieldMetadata.getFirstBlockStartFP() == -1) {
      fieldMetadata.setFirstBlockStartFP(blockStartFP);
    }
    fieldMetadata.setLastBlockStartFP(blockStartFP);
  }

  void setField(FieldMetadata fieldMetadata) {
    this.fieldMetadata = fieldMetadata;
  }

  protected void writeBlockLine(
      boolean isIncrementalEncodingSeed, BlockLine line, BlockLine previousLine)
      throws IOException {
    assert fieldMetadata != null;
    blockLineWriter.writeLine(
        blockLinesWriteBuffer,
        line,
        previousLine,
        Math.toIntExact(termStatesWriteBuffer.size()),
        isIncrementalEncodingSeed);
    blockLineWriter.writeLineTermState(
        termStatesWriteBuffer, line, fieldMetadata.getFieldInfo(), termStateSerializer);
  }

  /**
   * Adds a new block key with its corresponding block file pointer to the {@link
   * IndexDictionary.Builder}. The block key is the MDP (see {@link TermBytes}) of the block first
   * term.
   */
  protected void addBlockKey(List<BlockLine> blockLines, IndexDictionary.Builder dictionaryBuilder)
      throws IOException {
    assert !blockLines.isEmpty();
    assert dictionaryBuilder != null;
    TermBytes firstTerm = blockLines.get(0).getTermBytes();
    assert firstTerm.getTerm().offset == 0;
    assert scratchBytesRef.offset == 0;
    scratchBytesRef.bytes = firstTerm.getTerm().bytes;
    scratchBytesRef.length = firstTerm.getMdpLength();
    dictionaryBuilder.add(scratchBytesRef, blockOutput.getFilePointer());
  }
}
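
The class javadoc above describes the calling protocol: bind a field, add its sorted terms one by one, then flush the last block. The sketch below, which is not part of Lucene, illustrates that protocol. The helper class name and method are hypothetical; it would have to live in the org.apache.lucene.codecs.uniformsplit package because BlockWriter's methods are protected or package-private, and the BlockWriter, FieldMetadata, IndexDictionary.Builder and term inputs are assumed to be supplied by the caller.

package org.apache.lucene.codecs.uniformsplit;

import java.io.IOException;
import java.util.Map;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.util.BytesRef;

// Hypothetical driver, for illustration only.
final class BlockWriterUsageSketch {

  /** Writes all (sorted) terms of one field; the same BlockWriter can then serve the next field. */
  static void writeField(
      BlockWriter blockWriter,
      FieldMetadata fieldMetadata,
      IndexDictionary.Builder dictionaryBuilder,
      Iterable<Map.Entry<BytesRef, BlockTermState>> sortedTerms)
      throws IOException {
    blockWriter.setField(fieldMetadata);
    for (Map.Entry<BytesRef, BlockTermState> term : sortedTerms) {
      // addLine() keeps the BytesRef instance, so pass a distinct copy per term.
      blockWriter.addLine(BytesRef.deepCopyOf(term.getKey()), term.getValue(), dictionaryBuilder);
    }
    blockWriter.finishLastBlock(dictionaryBuilder);
  }
}

The split decision made by splitAndWriteBlock() can also be shown with a small standalone sketch. It computes MDP lengths the way TermBytes.computeMdpLength conceptually does (length of the common prefix with the previous term plus one byte, capped at the term length, and 1 for the first term), then picks the split index like splitAndWriteBlock(): the shortest MDP within the last 2 * deltaNumLines buffered lines, ties going to the later line. The terms and the target/delta values are made-up illustration data.

import java.util.List;

// Standalone illustration (not part of Lucene) of MDP-based block splitting.
public class MdpSplitSketch {

  /** MDP length: length of the common prefix with the previous term, plus one. */
  static int mdpLength(String previous, String term) {
    if (previous == null) {
      return 1; // first term of the field
    }
    int common = 0;
    int max = Math.min(previous.length(), term.length());
    while (common < max && previous.charAt(common) == term.charAt(common)) {
      common++;
    }
    return Math.min(common + 1, term.length());
  }

  public static void main(String[] args) {
    // With targetNumBlockLines=4 and deltaNumLines=2, a split is attempted once 6 lines are buffered.
    List<String> terms = List.of("machine", "macro", "magma", "magnet", "magnolia", "main");
    int targetNumBlockLines = 4;
    int deltaNumLines = 2;

    int[] mdp = new int[terms.size()];
    String previous = null;
    for (int i = 0; i < terms.size(); i++) {
      mdp[i] = mdpLength(previous, terms.get(i));
      previous = terms.get(i);
    }
    // mdp = {1, 4, 3, 4, 5, 3}: e.g. "main" only needs "mai" to be distinguished from "magnolia".

    // Mirror splitAndWriteBlock(): scan the last 2*delta lines for the shortest MDP;
    // that line becomes the first line of the next block and its MDP the next block key.
    int numLines = terms.size();
    int deltaStart = numLines - deltaNumLines * 2;
    int minMdpLength = Integer.MAX_VALUE;
    int minMdpEndIndex = 0;
    for (int i = deltaStart; i < numLines; i++) {
      if (mdp[i] <= minMdpLength) {
        minMdpLength = mdp[i];
        minMdpEndIndex = i;
      }
    }
    System.out.println("write lines [0, " + minMdpEndIndex + "); next block key: "
        + terms.get(minMdpEndIndex).substring(0, mdp[minMdpEndIndex]));
    // -> write lines [0, 5); next block key: mai
  }
}

With these inputs the first five lines form the current block, and "mai" is the key that would be added to the index dictionary for the next block, which starts at "main".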




