org.apache.lucene.codecs.blockterms.FixedGapTermsIndexWriter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-codecs Show documentation
Show all versions of lucene-codecs Show documentation
Apache Lucene (module: codecs)
package org.apache.lucene.codecs.blockterms;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
import java.util.List;
import java.util.ArrayList;
import java.io.IOException;
/**
* Selects every Nth term as and index term, and hold term
* bytes (mostly) fully expanded in memory. This terms index
* supports seeking by ord. See {@link
* VariableGapTermsIndexWriter} for a more memory efficient
* terms index that does not support seeking by ord.
*
* @lucene.experimental */
public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
protected final IndexOutput out;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tii";
final static String CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX";
final static int VERSION_START = 0;
final static int VERSION_APPEND_ONLY = 1;
final static int VERSION_CURRENT = VERSION_APPEND_ONLY;
final private int termIndexInterval;
private final List fields = new ArrayList();
@SuppressWarnings("unused") private final FieldInfos fieldInfos; // unread
public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException {
final String indexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
termIndexInterval = state.termIndexInterval;
out = state.directory.createOutput(indexFileName, state.context);
boolean success = false;
try {
fieldInfos = state.fieldInfos;
writeHeader(out);
out.writeInt(termIndexInterval);
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(out);
}
}
}
private void writeHeader(IndexOutput out) throws IOException {
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
}
@Override
public FieldWriter addField(FieldInfo field, long termsFilePointer) {
//System.out.println("FGW: addFfield=" + field.name);
SimpleFieldWriter writer = new SimpleFieldWriter(field, termsFilePointer);
fields.add(writer);
return writer;
}
/** NOTE: if your codec does not sort in unicode code
* point order, you must override this method, to simply
* return indexedTerm.length. */
protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) {
// As long as codec sorts terms in unicode codepoint
// order, we can safely strip off the non-distinguishing
// suffix to save RAM in the loaded terms index.
final int idxTermOffset = indexedTerm.offset;
final int priorTermOffset = priorTerm.offset;
final int limit = Math.min(priorTerm.length, indexedTerm.length);
for(int byteIdx=0;byteIdx-->