// org.apache.lucene.codecs.uniformsplit.DeltaBaseTermStateSerializer (lucene-codecs artifact; Maven / Gradle / Ivy)
// Source listing captured from an artifact browser page ("Show all versions of lucene-codecs", "Show documentation").
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.uniformsplit;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.BLOCK_SIZE;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.RamUsageEstimator;
/**
* {@link TermState} serializer which encodes each file pointer as a delta relative to a base file
* pointer. It differs from {@link Lucene101PostingsWriter#encodeTerm} which encodes each file
* pointer as a delta relative to the previous file pointer.
*
* <p>It automatically sets the base file pointer to the first valid file pointer for doc start FP,
* pos start FP, pay start FP. These base file pointers have to be {@link #resetBaseStartFP() reset}
* by the caller before starting to write a new block.
*
* @lucene.experimental
*/
public class DeltaBaseTermStateSerializer implements Accountable {

  /** Shallow size of one serializer instance, computed once at class initialization. */
  private static final long RAM_USAGE =
      RamUsageEstimator.shallowSizeOfInstance(DeltaBaseTermStateSerializer.class);

  /** Shallow size of one {@link IntBlockTermState} instance, computed once. */
  private static final long INT_BLOCK_TERM_STATE_RAM_USAGE =
      RamUsageEstimator.shallowSizeOfInstance(IntBlockTermState.class);

  /** Base against which doc start file pointers are delta-encoded; 0 means "not captured yet". */
  protected long baseDocStartFP;

  /** Base against which pos start file pointers are delta-encoded; 0 means "not captured yet". */
  protected long basePosStartFP;

  /** Base against which pay start file pointers are delta-encoded; 0 means "not captured yet". */
  protected long basePayStartFP;

  /** Creates a serializer whose three base file pointers are reset. */
  public DeltaBaseTermStateSerializer() {
    resetBaseStartFP();
  }

  /**
   * Sets the three base file pointers back to 0, so that the next written {@link TermState}
   * defines the new bases. Must be invoked before a new block is written.
   */
  public void resetBaseStartFP() {
    baseDocStartFP = basePosStartFP = basePayStartFP = 0L;
  }

  /**
   * @return The current base doc start file pointer. It is captured from the first non-singleton
   *     {@link TermState} written after {@link #resetBaseStartFP()}; it is 0 until then.
   */
  public long getBaseDocStartFP() {
    return baseDocStartFP;
  }

  /**
   * @return The current base position start file pointer. It is captured from the first {@link
   *     TermState} with positions written after {@link #resetBaseStartFP()}; it is 0 until then.
   */
  public long getBasePosStartFP() {
    return basePosStartFP;
  }

  /**
   * @return The current base payload start file pointer. It is captured from the first {@link
   *     TermState} with payloads or offsets written after {@link #resetBaseStartFP()}; it is 0
   *     until then.
   */
  public long getBasePayStartFP() {
    return basePayStartFP;
  }

  /**
   * Serializes a {@link BlockTermState} into the given {@link DataOutput}, encoding each start
   * file pointer as a delta from the matching base file pointer of this serializer.
   *
   * <p>Simpler variant of {@link Lucene101PostingsWriter#encodeTerm(DataOutput, FieldInfo,
   * BlockTermState, boolean)}.
   *
   * @param termStatesOutput destination of the encoded term state.
   * @param fieldInfo field whose index options and payload flag select what gets written.
   * @param termState state to write; it is cast to {@link IntBlockTermState}.
   * @throws IOException if writing to the output fails.
   */
  public void writeTermState(
      DataOutput termStatesOutput, FieldInfo fieldInfo, BlockTermState termState)
      throws IOException {
    final IndexOptions options = fieldInfo.getIndexOptions();
    final boolean withFreqs = options != IndexOptions.DOCS;
    final boolean withPositions =
        options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    final boolean withOffsets =
        options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    final boolean withPayloads = fieldInfo.hasPayloads();
    final IntBlockTermState state = (IntBlockTermState) termState;

    termStatesOutput.writeVInt(state.docFreq);
    if (withFreqs) {
      // Only the surplus over docFreq is stored.
      assert state.totalTermFreq >= state.docFreq;
      termStatesOutput.writeVLong(state.totalTermFreq - state.docFreq);
    }

    // NOTE(review): the reader chooses this branch with docFreq == 1; presumably the postings
    // writer sets singletonDocID != -1 exactly in that case - confirm against the producer.
    if (state.singletonDocID == -1) {
      if (baseDocStartFP == 0) {
        // First doc start FP since the last reset becomes the base.
        baseDocStartFP = state.docStartFP;
      }
      termStatesOutput.writeVLong(state.docStartFP - baseDocStartFP);
    } else {
      termStatesOutput.writeVInt(state.singletonDocID);
    }

    if (!withPositions) {
      return;
    }

    if (basePosStartFP == 0) {
      basePosStartFP = state.posStartFP;
    }
    termStatesOutput.writeVLong(state.posStartFP - basePosStartFP);

    if (withOffsets || withPayloads) {
      if (basePayStartFP == 0) {
        basePayStartFP = state.payStartFP;
      }
      termStatesOutput.writeVLong(state.payStartFP - basePayStartFP);
    }

    // NOTE(review): the reader expects this value when totalTermFreq > BLOCK_SIZE; presumably
    // lastPosBlockOffset != -1 exactly in that case - confirm against the producer.
    if (state.lastPosBlockOffset != -1) {
      termStatesOutput.writeVLong(state.lastPosBlockOffset);
    }
  }

  /**
   * Deserializes a {@link BlockTermState} from the given {@link DataInput}, rebuilding each start
   * file pointer by adding the decoded delta to the supplied base file pointer.
   *
   * <p>Simpler variant of {@link Lucene101PostingsReader#decodeTerm(DataInput, FieldInfo,
   * BlockTermState, boolean)}.
   *
   * @param baseDocStartFP base added to the decoded doc start delta.
   * @param basePosStartFP base added to the decoded pos start delta.
   * @param basePayStartFP base added to the decoded pay start delta.
   * @param termStatesInput source of the encoded term state.
   * @param fieldInfo field whose index options and payload flag select what gets read.
   * @param reuse {@link BlockTermState} to reuse; or null to create a new one.
   * @return the populated term state; it is {@code reuse} itself when {@code reuse} is not null.
   * @throws IOException if reading from the input fails.
   */
  public BlockTermState readTermState(
      long baseDocStartFP,
      long basePosStartFP,
      long basePayStartFP,
      DataInput termStatesInput,
      FieldInfo fieldInfo,
      BlockTermState reuse)
      throws IOException {
    final IndexOptions options = fieldInfo.getIndexOptions();
    final boolean withFreqs = options != IndexOptions.DOCS;
    final boolean withPositions =
        options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;

    final IntBlockTermState state;
    if (reuse == null) {
      state = new IntBlockTermState();
    } else {
      state = reset((IntBlockTermState) reuse);
    }

    state.docFreq = termStatesInput.readVInt();
    // Without frequencies, totalTermFreq is taken to be docFreq.
    long totalTermFreq = state.docFreq;
    if (withFreqs) {
      totalTermFreq += termStatesInput.readVLong();
    }
    state.totalTermFreq = totalTermFreq;
    assert state.totalTermFreq >= state.docFreq;

    if (state.docFreq == 1) {
      state.singletonDocID = termStatesInput.readVInt();
    } else {
      state.docStartFP = baseDocStartFP + termStatesInput.readVLong();
    }

    if (!withPositions) {
      return state;
    }

    state.posStartFP = basePosStartFP + termStatesInput.readVLong();

    final boolean withOffsets =
        options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    if (withOffsets || fieldInfo.hasPayloads()) {
      state.payStartFP = basePayStartFP + termStatesInput.readVLong();
    }

    if (state.totalTermFreq > BLOCK_SIZE) {
      state.lastPosBlockOffset = termStatesInput.readVLong();
    }
    return state;
  }

  /**
   * Clears a term state so it can be reused: counters, ordinals and file pointers are set to 0,
   * {@code lastPosBlockOffset} and {@code singletonDocID} are set to -1.
   *
   * @param termState the state to clear.
   * @return the same {@code termState} instance.
   */
  protected IntBlockTermState reset(IntBlockTermState termState) {
    // IntBlockTermState.
    termState.singletonDocID = -1;
    termState.lastPosBlockOffset = -1;
    termState.payStartFP = 0;
    termState.posStartFP = 0;
    termState.docStartFP = 0;
    // BlockTermState.
    termState.blockFilePointer = 0;
    termState.termBlockOrd = 0;
    termState.totalTermFreq = 0;
    termState.docFreq = 0;
    // OrdTermState.
    termState.ord = 0;
    return termState;
  }

  /**
   * @return The shallow RAM usage of this serializer instance.
   */
  @Override
  public long ramBytesUsed() {
    return RAM_USAGE;
  }

  /**
   * @return The estimated RAM usage of the given {@link TermState}: a precomputed shallow size for
   *     an {@link IntBlockTermState}, otherwise the shallow size measured on the instance.
   */
  public static long ramBytesUsed(TermState termState) {
    if (termState instanceof IntBlockTermState) {
      return INT_BLOCK_TERM_STATE_RAM_USAGE;
    }
    return RamUsageEstimator.shallowSizeOf(termState);
  }
}