org.apache.lucene.codecs.blockterms.FixedGapTermsIndexWriter Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blockterms;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
/**
* Selects every Nth term as and index term, and hold term bytes (mostly) fully expanded in memory.
* This terms index supports seeking by ord. See {@link VariableGapTermsIndexWriter} for a more
* memory efficient terms index that does not support seeking by ord.
*
* @lucene.experimental
*/
public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
protected IndexOutput out;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tii";
static final String CODEC_NAME = "FixedGapTermsIndex";
static final int VERSION_START = 4;
static final int VERSION_CURRENT = VERSION_START;
static final int BLOCKSIZE = 4096;
private final int termIndexInterval;
public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;
private final List fields = new ArrayList<>();
public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException {
this(state, DEFAULT_TERM_INDEX_INTERVAL);
}
public FixedGapTermsIndexWriter(SegmentWriteState state, int termIndexInterval)
throws IOException {
if (termIndexInterval <= 0) {
throw new IllegalArgumentException("invalid termIndexInterval: " + termIndexInterval);
}
this.termIndexInterval = termIndexInterval;
final String indexFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
out = state.directory.createOutput(indexFileName, state.context);
boolean success = false;
try {
CodecUtil.writeIndexHeader(
out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
out.writeVInt(termIndexInterval);
out.writeVInt(PackedInts.VERSION_CURRENT);
out.writeVInt(BLOCKSIZE);
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(out);
}
}
}
@Override
public FieldWriter addField(FieldInfo field, long termsFilePointer) {
// System.out.println("FGW: addFfield=" + field.name);
SimpleFieldWriter writer = new SimpleFieldWriter(field, termsFilePointer);
fields.add(writer);
return writer;
}
/**
* NOTE: if your codec does not sort in unicode code point order, you must override this method,
* to simply return indexedTerm.length.
*/
protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) {
// As long as codec sorts terms in unicode codepoint
// order, we can safely strip off the non-distinguishing
// suffix to save RAM in the loaded terms index.
return StringHelper.sortKeyLength(priorTerm, indexedTerm);
}
private class SimpleFieldWriter extends FieldWriter {
final FieldInfo fieldInfo;
int numIndexTerms;
final long indexStart;
final long termsStart;
long packedIndexStart;
long packedOffsetsStart;
private long numTerms;
private ByteBuffersDataOutput offsetsBuffer = ByteBuffersDataOutput.newResettableInstance();
private MonotonicBlockPackedWriter termOffsets =
new MonotonicBlockPackedWriter(offsetsBuffer, BLOCKSIZE);
private long currentOffset;
private ByteBuffersDataOutput addressBuffer = ByteBuffersDataOutput.newResettableInstance();
private MonotonicBlockPackedWriter termAddresses =
new MonotonicBlockPackedWriter(addressBuffer, BLOCKSIZE);
private final BytesRefBuilder lastTerm = new BytesRefBuilder();
SimpleFieldWriter(FieldInfo fieldInfo, long termsFilePointer) {
this.fieldInfo = fieldInfo;
indexStart = out.getFilePointer();
termsStart = termsFilePointer;
// we write terms+1 offsets, term n's length is n+1 - n
try {
termOffsets.add(0L);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
@Override
public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
// First term is first indexed term:
// System.out.println("FGW: checkIndexTerm text=" + text.utf8ToString());
if (0 == (numTerms++ % termIndexInterval)) {
return true;
} else {
if (0 == numTerms % termIndexInterval) {
// save last term just before next index term so we
// can compute wasted suffix
lastTerm.copyBytes(text);
}
return false;
}
}
@Override
public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
final int indexedTermLength;
if (numIndexTerms == 0) {
// no previous term: no bytes to write
indexedTermLength = 0;
} else {
indexedTermLength = indexedTermPrefixLength(lastTerm.get(), text);
}
// System.out.println("FGW: add text=" + text.utf8ToString() + " " + text + " fp=" +
// termsFilePointer);
// write only the min prefix that shows the diff
// against prior term
out.writeBytes(text.bytes, text.offset, indexedTermLength);
// save delta terms pointer
termAddresses.add(termsFilePointer - termsStart);
// save term length (in bytes)
assert indexedTermLength <= Short.MAX_VALUE;
currentOffset += indexedTermLength;
termOffsets.add(currentOffset);
lastTerm.copyBytes(text);
numIndexTerms++;
}
@Override
public void finish(long termsFilePointer) throws IOException {
// write primary terms dict offsets
packedIndexStart = out.getFilePointer();
// relative to our indexStart
termAddresses.finish();
addressBuffer.copyTo(out);
packedOffsetsStart = out.getFilePointer();
// write offsets into the byte[] terms
termOffsets.finish();
offsetsBuffer.copyTo(out);
// our referrer holds onto us, while other fields are
// being written, so don't tie up this RAM:
termOffsets = termAddresses = null;
addressBuffer = null;
offsetsBuffer = null;
}
}
@Override
public void close() throws IOException {
if (out != null) {
boolean success = false;
try {
final long dirStart = out.getFilePointer();
final int fieldCount = fields.size();
int nonNullFieldCount = 0;
for (int i = 0; i < fieldCount; i++) {
SimpleFieldWriter field = fields.get(i);
if (field.numIndexTerms > 0) {
nonNullFieldCount++;
}
}
out.writeVInt(nonNullFieldCount);
for (int i = 0; i < fieldCount; i++) {
SimpleFieldWriter field = fields.get(i);
if (field.numIndexTerms > 0) {
out.writeVInt(field.fieldInfo.number);
out.writeVInt(field.numIndexTerms);
out.writeVLong(field.termsStart);
out.writeVLong(field.indexStart);
out.writeVLong(field.packedIndexStart);
out.writeVLong(field.packedOffsetsStart);
}
}
writeTrailer(dirStart);
CodecUtil.writeFooter(out);
success = true;
} finally {
if (success) {
IOUtils.close(out);
} else {
IOUtils.closeWhileHandlingException(out);
}
out = null;
}
}
}
private void writeTrailer(long dirStart) throws IOException {
out.writeLong(dirStart);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy