
org.apache.lucene.codecs.uniformsplit.BlockWriter

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.uniformsplit;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;

/**
 * Writes blocks in the block file.
 *
 * <p>According to the Uniform Split technique, the writing combines three steps per block, and it
 * is repeated for all the field blocks:
 *
 * <ol>
 *   <li>Select the term with the shortest {@link TermBytes minimal distinguishing prefix} (MDP) in
 *       the neighborhood of the {@link #targetNumBlockLines target block size} (+- {@link
 *       #deltaNumLines delta size})
 *   <li>The selected term becomes the first term of the next block, and its MDP is the next block
 *       key.
 *   <li>The current block is written to the {@link
 *       UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block file}. And its block key is {@link
 *       IndexDictionary.Builder#add(BytesRef, long) added} to the {@link IndexDictionary index
 *       dictionary}.
 * </ol>
 *
 * <p>This stateful {@link BlockWriter} is called repeatedly to {@link #addLine(BytesRef,
 * BlockTermState, IndexDictionary.Builder) add} all the {@link BlockLine} terms of a field. Then
 * {@link #finishLastBlock} is called. And then this {@link BlockWriter} can be reused to add the
 * terms of another field.
 *
 * @lucene.experimental
 */
public class BlockWriter {

  protected final int targetNumBlockLines;
  protected final int deltaNumLines;
  protected final List<BlockLine> blockLines;

  protected final IndexOutput blockOutput;
  protected final ByteBuffersDataOutput blockLinesWriteBuffer;
  protected final ByteBuffersDataOutput termStatesWriteBuffer;

  protected final BlockHeader.Serializer blockHeaderWriter;
  protected final BlockLine.Serializer blockLineWriter;
  protected final DeltaBaseTermStateSerializer termStateSerializer;
  protected final BlockEncoder blockEncoder;
  protected final ByteBuffersDataOutput blockWriteBuffer;

  protected FieldMetadata fieldMetadata;
  protected BytesRef lastTerm;

  protected final BlockHeader reusableBlockHeader;
  protected BytesRef scratchBytesRef;

  protected BlockWriter(
      IndexOutput blockOutput,
      int targetNumBlockLines,
      int deltaNumLines,
      BlockEncoder blockEncoder) {
    assert blockOutput != null;
    assert targetNumBlockLines > 0;
    assert deltaNumLines >= 0;
    assert deltaNumLines < targetNumBlockLines;
    this.blockOutput = blockOutput;
    this.targetNumBlockLines = targetNumBlockLines;
    this.deltaNumLines = deltaNumLines;
    this.blockEncoder = blockEncoder;

    this.blockLines = new ArrayList<>(targetNumBlockLines);
    this.blockHeaderWriter = createBlockHeaderSerializer();
    this.blockLineWriter = createBlockLineSerializer();
    this.termStateSerializer = createDeltaBaseTermStateSerializer();

    this.blockLinesWriteBuffer = ByteBuffersDataOutput.newResettableInstance();
    this.termStatesWriteBuffer = ByteBuffersDataOutput.newResettableInstance();
    this.blockWriteBuffer = ByteBuffersDataOutput.newResettableInstance();

    this.reusableBlockHeader = new BlockHeader();
    this.scratchBytesRef = new BytesRef();
  }

  protected BlockHeader.Serializer createBlockHeaderSerializer() {
    return new BlockHeader.Serializer();
  }

  protected BlockLine.Serializer createBlockLineSerializer() {
    return new BlockLine.Serializer();
  }

  protected DeltaBaseTermStateSerializer createDeltaBaseTermStateSerializer() {
    return new DeltaBaseTermStateSerializer();
  }

  /**
   * Adds a new {@link BlockLine} term for the current field.
   *
   * <p>This method determines whether the new term is part of the current block, or if it is part
   * of the next block. In the latter case, a new block is started (including one or more of the
   * lastly added lines), the current block is written to the block file, and the current block key
   * is added to the {@link IndexDictionary.Builder}.
   *
   * @param term The block line term. The {@link BytesRef} instance is used directly, the caller is
   *     responsible to make a deep copy if needed. This is required because we keep a list of
   *     block lines until we decide to write the current block, and each line must have a
   *     different term instance.
   * @param blockTermState Block line details.
   * @param dictionaryBuilder to which the block keys are added.
   */
  protected void addLine(
      BytesRef term, BlockTermState blockTermState, IndexDictionary.Builder dictionaryBuilder)
      throws IOException {
    assert term != null;
    assert blockTermState != null;
    int mdpLength = TermBytes.computeMdpLength(lastTerm, term);
    blockLines.add(new BlockLine(new TermBytes(mdpLength, term), blockTermState));
    lastTerm = term;
    if (blockLines.size() >= targetNumBlockLines + deltaNumLines) {
      splitAndWriteBlock(dictionaryBuilder);
    }
  }

  /**
   * This method is called when there is no more term for the field. It writes the remaining lines
   * added with {@link #addLine} as the last block of the field and resets this {@link BlockWriter}
   * state. Then this {@link BlockWriter} can be used for another field.
   */
  protected void finishLastBlock(IndexDictionary.Builder dictionaryBuilder) throws IOException {
    while (!blockLines.isEmpty()) {
      splitAndWriteBlock(dictionaryBuilder);
    }
    fieldMetadata = null;
    lastTerm = null;
  }

  /**
   * Defines the new block start according to {@link #targetNumBlockLines} and {@link
   * #deltaNumLines}. The new block is started (including one or more of the lastly added lines),
   * the current block is written to the block file, and the current block key is added to the
   * {@link IndexDictionary.Builder}.
   */
  protected void splitAndWriteBlock(IndexDictionary.Builder dictionaryBuilder) throws IOException {
    assert !blockLines.isEmpty();
    int numLines = blockLines.size();

    if (numLines <= targetNumBlockLines - deltaNumLines) {
      writeBlock(blockLines, dictionaryBuilder);
      blockLines.clear();
      return;
    }
    int deltaStart = numLines - deltaNumLines * 2;
    assert deltaStart >= 1 : "blockLines size: " + numLines;
    int minMdpLength = Integer.MAX_VALUE;
    int minMdpEndIndex = 0;

    for (int i = deltaStart; i < numLines; i++) {
      TermBytes term = blockLines.get(i).getTermBytes();
      int mdpLength = term.getMdpLength();
      if (mdpLength <= minMdpLength) {
        minMdpLength = mdpLength;
        minMdpEndIndex = i;
      }
    }

    List<BlockLine> subList = blockLines.subList(0, minMdpEndIndex);
    writeBlock(subList, dictionaryBuilder);
    // Clear the written block lines to keep only the lines composing the next block.
    // ArrayList.subList().clear() is O(N) but still fast since we work on a small list.
    // It is internally an array copy and an iteration to set array refs to null.
    // For clarity we keep that until the day a CircularArrayList is available in the jdk.
    subList.clear();
  }

  /** Writes a block and adds its block key to the dictionary builder. */
  protected void writeBlock(List<BlockLine> blockLines, IndexDictionary.Builder dictionaryBuilder)
      throws IOException {

    long blockStartFP = blockOutput.getFilePointer();

    addBlockKey(blockLines, dictionaryBuilder);

    int middle = blockLines.size() >> 1;
    int middleOffset = -1;
    BlockLine previousLine = null;
    for (int i = 0, size = blockLines.size(); i < size; i++) {
      boolean isIncrementalEncodingSeed = i == 0;
      if (i == middle) {
        middleOffset = Math.toIntExact(blockLinesWriteBuffer.size());
        isIncrementalEncodingSeed = true;
      }
      BlockLine line = blockLines.get(i);
      writeBlockLine(isIncrementalEncodingSeed, line, previousLine);
      previousLine = line;
    }

    reusableBlockHeader.reset(
        blockLines.size(),
        termStateSerializer.getBaseDocStartFP(),
        termStateSerializer.getBasePosStartFP(),
        termStateSerializer.getBasePayStartFP(),
        Math.toIntExact(blockLinesWriteBuffer.size()),
        middleOffset);
    blockHeaderWriter.write(blockWriteBuffer, reusableBlockHeader);

    blockLinesWriteBuffer.copyTo(blockWriteBuffer);
    termStatesWriteBuffer.copyTo(blockWriteBuffer);

    if (blockEncoder == null) {
      blockOutput.writeVInt(Math.toIntExact(blockWriteBuffer.size()));
      blockWriteBuffer.copyTo(blockOutput);
    } else {
      BlockEncoder.WritableBytes encodedBytes =
          blockEncoder.encode(blockWriteBuffer.toDataInput(), blockWriteBuffer.size());
      blockOutput.writeVInt(Math.toIntExact(encodedBytes.size()));
      encodedBytes.writeTo(blockOutput);
    }

    blockLinesWriteBuffer.reset();
    termStatesWriteBuffer.reset();
    blockWriteBuffer.reset();
    termStateSerializer.resetBaseStartFP();

    updateFieldMetadata(blockStartFP);
  }

  /** updates the field metadata after all lines were written for the block. */
  protected void updateFieldMetadata(long blockStartFP) {
    assert fieldMetadata != null;
    if (fieldMetadata.getFirstBlockStartFP() == -1) {
      fieldMetadata.setFirstBlockStartFP(blockStartFP);
    }
    fieldMetadata.setLastBlockStartFP(blockStartFP);
  }

  void setField(FieldMetadata fieldMetadata) {
    this.fieldMetadata = fieldMetadata;
  }

  protected void writeBlockLine(
      boolean isIncrementalEncodingSeed, BlockLine line, BlockLine previousLine)
      throws IOException {
    assert fieldMetadata != null;
    blockLineWriter.writeLine(
        blockLinesWriteBuffer,
        line,
        previousLine,
        Math.toIntExact(termStatesWriteBuffer.size()),
        isIncrementalEncodingSeed);
    blockLineWriter.writeLineTermState(
        termStatesWriteBuffer, line, fieldMetadata.getFieldInfo(), termStateSerializer);
  }

  /**
   * Adds a new block key with its corresponding block file pointer to the {@link
   * IndexDictionary.Builder}. The block key is the MDP (see {@link TermBytes}) of the block first
   * term.
   */
  protected void addBlockKey(List<BlockLine> blockLines, IndexDictionary.Builder dictionaryBuilder)
      throws IOException {
    assert !blockLines.isEmpty();
    assert dictionaryBuilder != null;
    TermBytes firstTerm = blockLines.get(0).getTermBytes();
    assert firstTerm.getTerm().offset == 0;
    assert scratchBytesRef.offset == 0;
    scratchBytesRef.bytes = firstTerm.getTerm().bytes;
    scratchBytesRef.length = firstTerm.getMdpLength();
    dictionaryBuilder.add(scratchBytesRef, blockOutput.getFilePointer());
  }
}
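
The class javadoc above describes the calling protocol: bind a field, add its sorted terms one by one, then flush the last block. The sketch below, which is not part of Lucene, illustrates that protocol. The helper class name and method are hypothetical; it would have to live in the org.apache.lucene.codecs.uniformsplit package because BlockWriter's methods are protected or package-private, and the BlockWriter, FieldMetadata, IndexDictionary.Builder and term inputs are assumed to be supplied by the caller.

package org.apache.lucene.codecs.uniformsplit;

import java.io.IOException;
import java.util.Map;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.util.BytesRef;

// Hypothetical driver, for illustration only.
final class BlockWriterUsageSketch {

  /** Writes all (sorted) terms of one field; the same BlockWriter can then serve the next field. */
  static void writeField(
      BlockWriter blockWriter,
      FieldMetadata fieldMetadata,
      IndexDictionary.Builder dictionaryBuilder,
      Iterable<Map.Entry<BytesRef, BlockTermState>> sortedTerms)
      throws IOException {
    blockWriter.setField(fieldMetadata);
    for (Map.Entry<BytesRef, BlockTermState> term : sortedTerms) {
      // addLine() keeps the BytesRef instance, so pass a distinct copy per term.
      blockWriter.addLine(BytesRef.deepCopyOf(term.getKey()), term.getValue(), dictionaryBuilder);
    }
    blockWriter.finishLastBlock(dictionaryBuilder);
  }
}

The split decision made by splitAndWriteBlock() can also be shown with a small standalone sketch. It computes MDP lengths the way TermBytes.computeMdpLength conceptually does (length of the common prefix with the previous term plus one byte, capped at the term length, and 1 for the first term), then picks the split index like splitAndWriteBlock(): the shortest MDP within the last 2 * deltaNumLines buffered lines, ties going to the later line. The terms and the target/delta values are made-up illustration data.

import java.util.List;

// Standalone illustration (not part of Lucene) of MDP-based block splitting.
public class MdpSplitSketch {

  /** MDP length: length of the common prefix with the previous term, plus one. */
  static int mdpLength(String previous, String term) {
    if (previous == null) {
      return 1; // first term of the field
    }
    int common = 0;
    int max = Math.min(previous.length(), term.length());
    while (common < max && previous.charAt(common) == term.charAt(common)) {
      common++;
    }
    return Math.min(common + 1, term.length());
  }

  public static void main(String[] args) {
    // With targetNumBlockLines=4 and deltaNumLines=2, a split is attempted once 6 lines are buffered.
    List<String> terms = List.of("machine", "macro", "magma", "magnet", "magnolia", "main");
    int targetNumBlockLines = 4;
    int deltaNumLines = 2;

    int[] mdp = new int[terms.size()];
    String previous = null;
    for (int i = 0; i < terms.size(); i++) {
      mdp[i] = mdpLength(previous, terms.get(i));
      previous = terms.get(i);
    }
    // mdp = {1, 4, 3, 4, 5, 3}: e.g. "main" only needs "mai" to be distinguished from "magnolia".

    // Mirror splitAndWriteBlock(): scan the last 2*delta lines for the shortest MDP;
    // that line becomes the first line of the next block and its MDP the next block key.
    int numLines = terms.size();
    int deltaStart = numLines - deltaNumLines * 2;
    int minMdpLength = Integer.MAX_VALUE;
    int minMdpEndIndex = 0;
    for (int i = deltaStart; i < numLines; i++) {
      if (mdp[i] <= minMdpLength) {
        minMdpLength = mdp[i];
        minMdpEndIndex = i;
      }
    }
    System.out.println("write lines [0, " + minMdpEndIndex + "); next block key: "
        + terms.get(minMdpEndIndex).substring(0, mdp[minMdpEndIndex]));
    // -> write lines [0, 5); next block key: mai
  }
}

With these inputs the first five lines form the current block, and "mai" is the key that would be added to the index dictionary for the next block, which starts at "main".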




