org.apache.lucene.codecs.uniformsplit.FSTDictionary Maven / Gradle / Ivy
Show all versions of lucene-codecs Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.uniformsplit;
import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.OffHeapFSTStore;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
/**
* Immutable stateless {@link FST}-based index dictionary kept in memory.
*
* Use {@link IndexDictionary.Builder} to build the {@link IndexDictionary}.
*
*
Create a stateful {@link IndexDictionary.Browser} to seek a term in this {@link
* IndexDictionary} and get its corresponding block file pointer to the terms block file.
*
*
Its greatest advantage is to be very compact in memory thanks to both the compaction of the
* {@link FST} as a byte array, and the incremental encoding of the leaves block pointer values,
* which are long integers in increasing order, with {@link PositiveIntOutputs}.
* With a compact dictionary in memory we can increase the number of blocks. This allows us to
* reduce the average block size, which means faster scan inside a block.
*
* @lucene.experimental
*/
public class FSTDictionary implements IndexDictionary {
protected final FST fst;
protected FSTDictionary(FST fst) {
this.fst = fst;
}
@Override
public void write(DataOutput output, BlockEncoder blockEncoder) throws IOException {
if (blockEncoder == null) {
fst.save(output, output);
} else {
ByteBuffersDataOutput bytesDataOutput = ByteBuffersDataOutput.newResettableInstance();
fst.save(bytesDataOutput, bytesDataOutput);
BlockEncoder.WritableBytes encodedBytes =
blockEncoder.encode(bytesDataOutput.toDataInput(), bytesDataOutput.size());
output.writeVLong(encodedBytes.size());
encodedBytes.writeTo(output);
}
}
/**
* Reads a {@link FSTDictionary} from the provided input.
*
* @param blockDecoder The {@link BlockDecoder} to use for specific decoding; or null if none.
*/
protected static FSTDictionary read(
DataInput input, BlockDecoder blockDecoder, boolean isFSTOnHeap) throws IOException {
DataInput fstDataInput;
if (blockDecoder == null) {
fstDataInput = input;
} else {
long numBytes = input.readVLong();
BytesRef decodedBytes = blockDecoder.decode(input, numBytes);
fstDataInput = new ByteArrayDataInput(decodedBytes.bytes, 0, decodedBytes.length);
// OffHeapFSTStore.init() requires a DataInput which is an instance of IndexInput.
// When the block is decoded we must load the FST on heap.
isFSTOnHeap = true;
}
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
FST.FSTMetadata metadata = FST.readMetadata(fstDataInput, fstOutputs);
FST fst;
if (isFSTOnHeap) {
fst = new FST<>(metadata, fstDataInput);
} else {
final IndexInput indexInput = (IndexInput) fstDataInput;
fst =
FST.fromFSTReader(
metadata, new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), metadata));
}
return new FSTDictionary(fst);
}
@Override
public Browser browser() {
return new Browser();
}
/**
* Stateful {@link Browser} to seek a term in this {@link FSTDictionary} and get its corresponding
* block file pointer in the block file.
*/
protected class Browser implements IndexDictionary.Browser {
protected final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum<>(fst);
@Override
public long seekBlock(BytesRef term) throws IOException {
BytesRefFSTEnum.InputOutput seekFloor = fstEnum.seekFloor(term);
return seekFloor == null ? -1 : seekFloor.output;
}
}
/**
* Provides stateful {@link Browser} to seek in the {@link FSTDictionary}.
*
* @lucene.experimental
*/
public static class BrowserSupplier implements IndexDictionary.BrowserSupplier {
protected final IndexInput dictionaryInput;
protected final BlockDecoder blockDecoder;
protected final boolean isFSTOnHeap;
/**
* Lazy loaded immutable index dictionary FST. The FST is either kept off-heap, or hold in RAM
* on-heap.
*/
protected IndexDictionary dictionary;
public BrowserSupplier(
IndexInput dictionaryInput,
long dictionaryStartFP,
BlockDecoder blockDecoder,
boolean isFSTOnHeap)
throws IOException {
this.dictionaryInput = dictionaryInput.clone();
this.dictionaryInput.seek(dictionaryStartFP);
this.blockDecoder = blockDecoder;
this.isFSTOnHeap = isFSTOnHeap;
}
@Override
public IndexDictionary.Browser get() throws IOException {
// This double-check idiom does not require the dictionary to be volatile
// because it is immutable. See section "Double-Checked Locking Immutable Objects"
// of https://www.cs.umd.edu/~pugh/java/memoryModel/DoubleCheckedLocking.html.
if (dictionary == null) {
synchronized (this) {
if (dictionary == null) {
dictionary = read(dictionaryInput, blockDecoder, isFSTOnHeap);
}
}
}
return dictionary.browser();
}
}
/**
* Builds an immutable {@link FSTDictionary}.
*
* @lucene.experimental
*/
public static class Builder implements IndexDictionary.Builder {
protected final FSTCompiler fstCompiler;
protected final IntsRefBuilder scratchInts;
public Builder() throws IOException {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
scratchInts = new IntsRefBuilder();
}
@Override
public void add(BytesRef blockKey, long blockFilePointer) throws IOException {
fstCompiler.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer);
}
@Override
public FSTDictionary build() throws IOException {
return new FSTDictionary(
FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()));
}
}
}