All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.codecs.uniformsplit.BlockLine Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.uniformsplit;

import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * One term block line.
 *
 * 

Contains a term and its details as a {@link BlockTermState}. * *

The line is written to the {@link UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block * file} in two parts. The first part is the term followed by an offset to the details region. The * second part is the term {@link BlockTermState}, written in the details region, after all the * terms of the block. * *

The separate details region allows fast scan of the terms without having to decode the details * for each term. At read time, the {@link BlockLine.Serializer#readLine} only reads the term and * its offset to the details. The corresponding {@link BlockTermState} is decoded on demand in the * {@link BlockReader} (see {@link BlockReader#readTermStateIfNotRead}). * * @lucene.experimental */ public class BlockLine implements Accountable { private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(BlockLine.class); protected TermBytes termBytes; protected int termStateRelativeOffset; /** Only used for writing. */ protected final BlockTermState termState; /** Constructor used for writing a {@link BlockLine}. */ protected BlockLine(TermBytes termBytes, BlockTermState termState) { this(termBytes, -1, termState); } /** Constructor used for reading a {@link BlockLine}. */ protected BlockLine(TermBytes termBytes, int termStateRelativeOffset) { this(termBytes, termStateRelativeOffset, null); } private BlockLine(TermBytes termBytes, int termStateRelativeOffset, BlockTermState termState) { reset(termBytes, termStateRelativeOffset); this.termState = termState; } /** Resets this {@link BlockLine} to reuse it when reading. */ protected BlockLine reset(TermBytes termBytes, int termStateRelativeOffset) { assert termState == null; this.termBytes = termBytes; this.termStateRelativeOffset = termStateRelativeOffset; return this; } public TermBytes getTermBytes() { return termBytes; } /** * @return The offset of the {@link org.apache.lucene.index.TermState} bytes in the block, * relatively to the term states base offset. */ public int getTermStateRelativeOffset() { return termStateRelativeOffset; } @Override public long ramBytesUsed() { return BASE_RAM_USAGE + termBytes.ramBytesUsed() + RamUsageUtil.ramBytesUsed(termState); } /** * Reads/writes block lines with terms encoded incrementally inside a block. This class keeps a * state of the previous term read to decode the next term. */ public static class Serializer implements Accountable { private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(Serializer.class); protected final BytesRef currentTerm; public Serializer() { currentTerm = new BytesRef(64); } /** * Reads the current line. * * @param isIncrementalEncodingSeed Whether the term is a seed of the incremental encoding. * {@code true} for the first and middle term, {@code false} for other terms. * @param reuse A {@link BlockLine} instance to reuse; or null if none. */ public BlockLine readLine( DataInput blockInput, boolean isIncrementalEncodingSeed, BlockLine reuse) throws IOException { int termStateRelativeOffset = blockInput.readVInt(); if (termStateRelativeOffset < 0) { throw new CorruptIndexException( "Illegal termStateRelativeOffset= " + termStateRelativeOffset, blockInput); } return reuse == null ? new BlockLine( readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, null), termStateRelativeOffset) : reuse.reset( readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, reuse.termBytes), termStateRelativeOffset); } /** * Writes a line and its offset to the corresponding term state details in the details region. * * @param blockOutput The output pointing to the block terms region. * @param termStateRelativeOffset The offset to the corresponding term state details in the * details region. * @param isIncrementalEncodingSeed Whether the term is a seed of the incremental encoding. * {@code true} for the first and middle term, {@code false} for other terms. */ public void writeLine( DataOutput blockOutput, BlockLine line, BlockLine previousLine, int termStateRelativeOffset, boolean isIncrementalEncodingSeed) throws IOException { blockOutput.writeVInt(termStateRelativeOffset); writeIncrementallyEncodedTerm( line.getTermBytes(), previousLine == null ? null : previousLine.getTermBytes(), isIncrementalEncodingSeed, blockOutput); } /** * Writes the term state details of a line in the details region. * * @param termStatesOutput The output pointing to the details region. */ protected void writeLineTermState( DataOutput termStatesOutput, BlockLine line, FieldInfo fieldInfo, DeltaBaseTermStateSerializer encoder) throws IOException { assert line.termState != null; encoder.writeTermState(termStatesOutput, fieldInfo, line.termState); } protected void writeIncrementallyEncodedTerm( TermBytes termBytes, TermBytes previousTermBytes, boolean isIncrementalEncodingSeed, DataOutput blockOutput) throws IOException { BytesRef term = termBytes.getTerm(); assert term.offset == 0; if (isIncrementalEncodingSeed) { // Mdp length is always 1 for an incremental encoding seed. blockOutput.writeVLong(term.length); blockOutput.writeBytes(term.bytes, 0, term.length); return; } if (term.length == 0) { // Empty term. blockOutput.writeVLong(0); return; } // For other lines we store: // - Mdp length. // - Suffix length. // - Suffix bytes. // Instead of writing mdp length and suffix length with 2 VInt, we can compress the storage // by merging them in a single VLong. The idea is to leverage the information we have about // the previous line. We know the previous line term length. And we know that // new line mdp length <= (previous line term length + 1) // So if numMdpBits = numBitsToEncode(previous line term length), // then we know we can encode (new line mdp length - 1) in numMdpBits. // Hence we encode (new line mdp length - 1) in the rightmost numMdpBits of the VLong. // And we encode new line suffix length in the remaining left bits of the VLong. // Most of the time both values will be encoded in a single byte. assert previousTermBytes != null; assert termBytes.getMdpLength() >= 1; int numMdpBits = numBitsToEncode(previousTermBytes.getTerm().length); assert numBitsToEncode(termBytes.getMdpLength() - 1) <= numMdpBits; long mdpAndSuffixLengths = (((long) termBytes.getSuffixLength()) << numMdpBits) | (termBytes.getMdpLength() - 1); assert mdpAndSuffixLengths != 0; blockOutput.writeVLong(mdpAndSuffixLengths); blockOutput.writeBytes(term.bytes, termBytes.getSuffixOffset(), termBytes.getSuffixLength()); } protected TermBytes readIncrementallyEncodedTerm( DataInput blockInput, boolean isIncrementalEncodingSeed, TermBytes reuse) throws IOException { assert currentTerm.offset == 0; int mdpLength; if (isIncrementalEncodingSeed) { int length = (int) blockInput.readVLong(); mdpLength = length == 0 ? 0 : 1; readBytes(blockInput, currentTerm, 0, length); } else { long mdpAndSuffixLengths = blockInput.readVLong(); if (mdpAndSuffixLengths == 0) { // Empty term. mdpLength = 0; currentTerm.length = 0; } else { int numMdpBits = numBitsToEncode(currentTerm.length); mdpLength = (int) (mdpAndSuffixLengths & ((1 << numMdpBits) - 1)) + 1; // Get rightmost numMdpBits. int suffixLength = (int) (mdpAndSuffixLengths >>> numMdpBits); // Get remaining left bits. assert mdpLength >= 1; assert suffixLength >= 1; readBytes(blockInput, currentTerm, mdpLength - 1, suffixLength); } } return reuse == null ? new TermBytes(mdpLength, currentTerm) : reuse.reset(mdpLength, currentTerm); } /** * Reads {@code length} bytes from the given {@link DataInput} and stores them at {@code offset} * in {@code bytes.bytes}. */ protected void readBytes(DataInput input, BytesRef bytes, int offset, int length) throws IOException { assert bytes.offset == 0; bytes.length = offset + length; bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length); input.readBytes(bytes.bytes, offset, length); } @Override public long ramBytesUsed() { return BASE_RAM_USAGE + RamUsageUtil.ramBytesUsed(currentTerm); } /** * Gets the number of bits required to encode the value of the provided int. Returns 0 for int * value 0. Equivalent to (log2(i) + 1). */ protected static int numBitsToEncode(int i) { return 32 - Integer.numberOfLeadingZeros(i); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy