// org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter (lucene-codecs)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blockterms;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
/**
* Selects index terms according to provided pluggable {@link IndexTermSelector}, and stores them in
* a prefix trie that's loaded entirely in RAM stored as an FST. This terms index only supports
* unsigned byte term sort order (unicode codepoint order when the bytes are UTF8).
*
* @lucene.experimental
*/
public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {

  /** Output for the terms metadata file ({@code .tmv}); set to null once {@link #close} runs. */
  protected IndexOutput metaOut;

  /** Output for the terms index file ({@code .tiv}); set to null once {@link #close} runs. */
  protected IndexOutput out;

  /** Extension of terms index file */
  static final String TERMS_INDEX_EXTENSION = "tiv";

  /** Extension of terms meta file */
  static final String TERMS_META_EXTENSION = "tmv";

  static final String META_CODEC_NAME = "VariableGapTermsMeta";
  static final String CODEC_NAME = "VariableGapTermsIndex";
  static final int VERSION_START = 4;
  static final int VERSION_CURRENT = VERSION_START;

  @SuppressWarnings("unused")
  private final FieldInfos fieldInfos; // unread

  /** Pluggable policy that decides which terms become index (seek) points. */
  private final IndexTermSelector policy;

  /**
   * Hook for selecting which terms should be placed in the terms index.
   *
   * <p>{@link #newField} is called at the start of each new field, and {@link #isIndexTerm} for
   * each term in that field.
   *
   * @lucene.experimental
   */
  public abstract static class IndexTermSelector {
    /**
     * Called sequentially on every term being written, returning true if this term should be
     * indexed
     */
    public abstract boolean isIndexTerm(BytesRef term, TermStats stats);

    /** Called when a new field is started. */
    public abstract void newField(FieldInfo fieldInfo);
  }

  /** Same policy as {@link FixedGapTermsIndexWriter} */
  public static final class EveryNTermSelector extends IndexTermSelector {
    private int count;
    private final int interval;

    /**
     * Creates a selector that indexes every {@code interval}'th term.
     *
     * @param interval gap between indexed terms; the first term of each field is always indexed
     */
    public EveryNTermSelector(int interval) {
      this.interval = interval;
      // First term is first indexed term:
      count = interval;
    }

    @Override
    public boolean isIndexTerm(BytesRef term, TermStats stats) {
      if (count >= interval) {
        count = 1;
        return true;
      } else {
        count++;
        return false;
      }
    }

    @Override
    public void newField(FieldInfo fieldInfo) {
      // Reset so the first term of the new field is indexed:
      count = interval;
    }
  }

  /**
   * Sets an index term when docFreq &gt;= docFreqThresh, or every interval terms. This should
   * reduce seek time to high docFreq terms.
   */
  public static final class EveryNOrDocFreqTermSelector extends IndexTermSelector {
    private int count;
    private final int docFreqThresh;
    private final int interval;

    /**
     * Creates a selector that indexes every {@code interval}'th term, plus any term whose docFreq
     * is at least {@code docFreqThresh}.
     *
     * @param docFreqThresh terms at or above this docFreq are always indexed
     * @param interval gap between indexed terms otherwise
     */
    public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval) {
      this.interval = interval;
      this.docFreqThresh = docFreqThresh;
      // First term is first indexed term:
      count = interval;
    }

    @Override
    public boolean isIndexTerm(BytesRef term, TermStats stats) {
      if (stats.docFreq() >= docFreqThresh || count >= interval) {
        count = 1;
        return true;
      } else {
        count++;
        return false;
      }
    }

    @Override
    public void newField(FieldInfo fieldInfo) {
      // Reset so the first term of the new field is indexed:
      count = interval;
    }
  }

  // TODO: it'd be nice to let the FST builder prune based
  // on term count of each node (the prune1/prune2 that it
  // accepts), and build the index based on that.  This
  // should result in a more compact terms index, more like
  // a prefix trie than the other selectors, because it
  // only stores enough leading bytes to get down to N
  // terms that may complete that prefix.  It becomes
  // "deeper" when terms are dense, and "shallow" when they
  // are less dense.
  //
  // However, it's not easy to make that work with this
  // API, because that pruning doesn't immediately know on
  // seeing each term whether that term will be a seek point
  // or not.  It requires some non-causality in the API, ie
  // only on seeing some number of future terms will the
  // builder decide which past terms are seek points.
  // Somehow the API'd need to be able to return a "I don't
  // know" value, eg like a Future, which only later on is
  // flipped (frozen) to true or false.
  //
  // We could solve this with a 2-pass approach, where the
  // first pass would build an FSA (no outputs) solely to
  // determine which prefixes are the 'leaves' in the
  // pruning.  The 2nd pass would then look at this prefix
  // trie to mark the seek points and build the FST mapping
  // to the true output.
  //
  // But, one downside to this approach is that it'd result
  // in uneven index term selection.  EG with prune1=10, the
  // resulting index terms could be as frequent as every 10
  // terms or as rare as every 256 * 10 (eg 2560) terms,
  // in the extremes.

  /**
   * Creates the writer, opening the terms-meta and terms-index outputs for the given segment and
   * writing their index headers. On any failure, everything opened so far is closed via {@link
   * #close()} (exceptions from the cleanup are suppressed).
   *
   * @param state segment being written; supplies directory, name, suffix and IO context
   * @param policy decides which terms become seek points
   * @throws IOException if either output cannot be created or written
   */
  public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy)
      throws IOException {
    fieldInfos = state.fieldInfos;
    this.policy = policy;
    final String metaFileName =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name, state.segmentSuffix, TERMS_META_EXTENSION);
    final String indexFileName =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
    boolean success = false;
    try {
      metaOut = state.directory.createOutput(metaFileName, state.context);
      out = state.directory.createOutput(indexFileName, state.context);
      CodecUtil.writeIndexHeader(
          metaOut,
          META_CODEC_NAME,
          VERSION_CURRENT,
          state.segmentInfo.getId(),
          state.segmentSuffix);
      CodecUtil.writeIndexHeader(
          out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      success = true;
    } finally {
      if (!success) {
        // Don't leak a half-opened pair of outputs if header writing failed:
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  @Override
  public FieldWriter addField(FieldInfo field, long termsFilePointer) throws IOException {
    //// System.out.println("VGW: field=" + field.name);
    // Let the policy reset its per-field state before the field's terms stream in:
    policy.newField(field);
    return new FSTFieldWriter(field, termsFilePointer);
  }

  /**
   * NOTE: if your codec does not sort in unicode code point order, you must override this method,
   * to simply return indexedTerm.length.
   *
   * @param priorTerm the term written immediately before {@code indexedTerm}
   * @param indexedTerm the term being added to the index
   * @return number of leading bytes of {@code indexedTerm} that must be kept so it still sorts
   *     strictly after {@code priorTerm}
   */
  protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) {
    // As long as codec sorts terms in unicode codepoint
    // order, we can safely strip off the non-distinguishing
    // suffix to save RAM in the loaded terms index.
    final int idxTermOffset = indexedTerm.offset;
    final int priorTermOffset = priorTerm.offset;
    final int limit = Math.min(priorTerm.length, indexedTerm.length);
    for (int byteIdx = 0; byteIdx < limit; byteIdx++) {
      if (priorTerm.bytes[priorTermOffset + byteIdx]
          != indexedTerm.bytes[idxTermOffset + byteIdx]) {
        // First differing byte: keep it plus everything before it.
        return byteIdx + 1;
      }
    }
    // priorTerm is a prefix of indexedTerm (or vice versa): keep one byte past the shared prefix,
    // clamped to indexedTerm's own length.
    return Math.min(1 + priorTerm.length, indexedTerm.length);
  }

  /** Per-field writer that accumulates selected index terms into an FST of term → file pointer. */
  private class FSTFieldWriter extends FieldWriter {
    private final FSTCompiler<Long> fstCompiler;
    private final PositiveIntOutputs fstOutputs;
    private final long startTermsFilePointer;

    final FieldInfo fieldInfo;
    FST<Long> fst;
    // Last non-indexed term seen; used to compute the shortest distinguishing prefix of the next
    // indexed term.
    private final BytesRefBuilder lastTerm = new BytesRefBuilder();
    private boolean first = true;

    public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
      this.fieldInfo = fieldInfo;
      fstOutputs = PositiveIntOutputs.getSingleton();
      fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs).build();
      //// System.out.println("VGW: field=" + fieldInfo.name);

      // Always put empty string in
      fstCompiler.add(new IntsRef(), termsFilePointer);
      startTermsFilePointer = termsFilePointer;
    }

    @Override
    public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
      // System.out.println("VGW: index term=" + text.utf8ToString());
      // NOTE: we must force the first term per field to be
      // indexed, in case policy doesn't:
      if (policy.isIndexTerm(text, stats) || first) {
        first = false;
        // System.out.println("  YES");
        return true;
      } else {
        // Remember the skipped term: the next indexed term only needs enough leading bytes to
        // sort after it (see indexedTermPrefixLength).
        lastTerm.copyBytes(text);
        return false;
      }
    }

    private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();

    @Override
    public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
      if (text.length == 0) {
        // We already added empty string in ctor
        assert termsFilePointer == startTermsFilePointer;
        return;
      }
      final int lengthSave = text.length;
      // Temporarily truncate to the shortest prefix that still distinguishes this term from the
      // previous one, so the FST stores fewer bytes:
      text.length = indexedTermPrefixLength(lastTerm.get(), text);
      try {
        fstCompiler.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
      } finally {
        text.length = lengthSave;
      }
      lastTerm.copyBytes(text);
    }

    @Override
    public void finish(long termsFilePointer) throws IOException {
      fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader());
      if (fst != null) {
        // Record field number and where this field's FST starts in the index file, then save the
        // FST (metadata to metaOut, data to out):
        metaOut.writeInt(fieldInfo.number);
        metaOut.writeVLong(out.getFilePointer());
        fst.save(metaOut, out);
      }
    }
  }

  @Override
  public void close() throws IOException {
    try {
      if (metaOut != null) {
        // -1 is the end-of-fields sentinel read back by the index reader:
        metaOut.writeInt(-1);
        CodecUtil.writeFooter(metaOut);
      }
      if (out != null) {
        CodecUtil.writeFooter(out);
      }
    } finally {
      try {
        IOUtils.close(out, metaOut);
      } finally {
        // Null out so a second close() is a no-op:
        out = null;
        metaOut = null;
      }
    }
  }
}