Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blocktreeords;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter; // javadocs
import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.PackedInts;
/*
TODO:
- Currently there is a one-to-one mapping of indexed
term to term block, but we could decouple the two, ie,
put more terms into the index than there are blocks.
The index would take up more RAM but then it'd be able
to avoid seeking more often and could make PK/FuzzyQ
faster if the additional indexed terms could store
the offset into the terms block.
- The blocks are not written in true depth-first
order, meaning if you just next() the file pointer will
sometimes jump backwards. For example, block foo* will
be written before block f* because it finished before.
This could possibly hurt performance if the terms dict is
not hot, since OSs anticipate sequential file access. We
could fix the writer to re-order the blocks as a 2nd
pass.
- Each block encodes the term suffixes packed
sequentially using a separate vInt per term, which is
1) wasteful and 2) slow (must linear scan to find a
particular suffix). We should instead 1) make
random-access array so we can directly access the Nth
suffix, and 2) bulk-encode this array using bulk int[]
codecs; then at search time we can binary search when
we seek a particular term.
*/
/**
* This is just like {@link BlockTreeTermsWriter}, except it also stores a version per term, and adds a method to its TermsEnum
* implementation to seekExact only if the version is >= the specified version. The version is added to the terms index to avoid seeking if
* no term in the block has a high enough version. The term blocks file is .tiv and the terms index extension is .tipv.
*
* @lucene.experimental
*/
public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
static final FSTOrdsOutputs FST_OUTPUTS = new FSTOrdsOutputs();
static final Output NO_OUTPUT = FST_OUTPUTS.getNoOutput();
/** Suggested default value for the {@code
* minItemsInBlock} parameter to {@link
* #OrdsBlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
public final static int DEFAULT_MIN_BLOCK_SIZE = 25;
/** Suggested default value for the {@code
* maxItemsInBlock} parameter to {@link
* #OrdsBlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
public final static int DEFAULT_MAX_BLOCK_SIZE = 48;
// public final static boolean DEBUG = false;
//private final static boolean SAVE_DOT_FILES = false;
static final int OUTPUT_FLAGS_NUM_BITS = 2;
static final int OUTPUT_FLAGS_MASK = 0x3;
static final int OUTPUT_FLAG_IS_FLOOR = 0x1;
static final int OUTPUT_FLAG_HAS_TERMS = 0x2;
/** Extension of terms file */
static final String TERMS_EXTENSION = "tio";
final static String TERMS_CODEC_NAME = "OrdsBlockTreeTerms";
/** Initial terms format. */
public static final int VERSION_START = 1;
/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_START;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tipo";
final static String TERMS_INDEX_CODEC_NAME = "OrdsBlockTreeIndex";
private final IndexOutput out;
private final IndexOutput indexOut;
final int maxDoc;
final int minItemsInBlock;
final int maxItemsInBlock;
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
private static class FieldMetaData {
public final FieldInfo fieldInfo;
public final Output rootCode;
public final long numTerms;
public final long indexStartFP;
public final long sumTotalTermFreq;
public final long sumDocFreq;
public final int docCount;
private final int longsSize;
public final BytesRef minTerm;
public final BytesRef maxTerm;
public FieldMetaData(FieldInfo fieldInfo, Output rootCode, long numTerms, long indexStartFP,
long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize,
BytesRef minTerm, BytesRef maxTerm) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms;
this.rootCode = rootCode;
this.indexStartFP = indexStartFP;
this.numTerms = numTerms;
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.longsSize = longsSize;
this.minTerm = minTerm;
this.maxTerm = maxTerm;
}
}
private final List fields = new ArrayList<>();
// private final String segment;
/** Create a new writer. The number of items (terms or
* sub-blocks) per block will aim to be between
* minItemsPerBlock and maxItemsPerBlock, though in some
* cases the blocks may be smaller than the min. */
public OrdsBlockTreeTermsWriter(
SegmentWriteState state,
PostingsWriterBase postingsWriter,
int minItemsInBlock,
int maxItemsInBlock)
throws IOException
{
BlockTreeTermsWriter.validateSettings(minItemsInBlock, maxItemsInBlock);
maxDoc = state.segmentInfo.maxDoc();
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
out = state.directory.createOutput(termsFileName, state.context);
boolean success = false;
IndexOutput indexOut = null;
try {
fieldInfos = state.fieldInfos;
this.minItemsInBlock = minItemsInBlock;
this.maxItemsInBlock = maxItemsInBlock;
CodecUtil.writeIndexHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
indexOut = state.directory.createOutput(termsIndexFileName, state.context);
CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
this.postingsWriter = postingsWriter;
// segment = state.segmentInfo.name;
// System.out.println("BTW.init seg=" + state.segmentName);
postingsWriter.init(out, state); // have consumer write its format/header
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(out, indexOut);
}
}
this.indexOut = indexOut;
}
@Override
public void write(Fields fields) throws IOException {
String lastField = null;
for(String field : fields) {
assert lastField == null || lastField.compareTo(field) < 0;
lastField = field;
Terms terms = fields.terms(field);
if (terms == null) {
continue;
}
TermsEnum termsEnum = terms.iterator();
TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field));
while (true) {
BytesRef term = termsEnum.next();
if (term == null) {
break;
}
termsWriter.write(term, termsEnum);
}
termsWriter.finish();
}
}
static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
assert fp < (1L << 62);
return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0);
}
private static class PendingEntry {
public final boolean isTerm;
protected PendingEntry(boolean isTerm) {
this.isTerm = isTerm;
}
}
private static final class PendingTerm extends PendingEntry {
public final byte[] termBytes;
// stats + metadata
public final BlockTermState state;
public PendingTerm(BytesRef term, BlockTermState state) {
super(true);
this.termBytes = new byte[term.length];
System.arraycopy(term.bytes, term.offset, termBytes, 0, term.length);
this.state = state;
}
@Override
public String toString() {
return brToString(termBytes);
}
}
// for debugging
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
// If BytesRef isn't actually UTF8, or it's eg a
// prefix of UTF8 that ends mid-unicode-char, we
// fallback to hex:
return b.toString();
}
}
// for debugging
@SuppressWarnings("unused")
static String brToString(byte[] b) {
return brToString(new BytesRef(b));
}
private static final class SubIndex {
public final FST