
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.compressing;
import java.io.Closeable;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.packed.BlockPackedReaderIterator;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.CODEC_SFX_DAT;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.CODEC_SFX_IDX;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.FLAGS_BITS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.OFFSETS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.PACKED_BLOCK_SIZE;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.PAYLOADS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.POSITIONS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_INDEX_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_CURRENT;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_START;
/**
* {@link TermVectorsReader} for {@link CompressingTermVectorsFormat}.
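* <p>Instances are normally obtained from the owning {@code TermVectorsFormat} rather
* than constructed directly. A minimal, illustrative sketch (the {@code format},
* {@code dir}, {@code segmentInfo}, {@code fieldInfos} and {@code docID} setup and the
* field name {@code "body"} are assumptions for the example, not part of this class):
* <pre class="prettyprint">
* TermVectorsReader tvReader = format.vectorsReader(dir, segmentInfo, fieldInfos, IOContext.READ);
* Fields vectors = tvReader.get(docID); // term vectors of one document, or null if it has none
* Terms terms = vectors == null ? null : vectors.terms("body");
* </pre>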
* @lucene.experimental
*/
public final class CompressingTermVectorsReader extends TermVectorsReader implements Closeable {
private final FieldInfos fieldInfos;
final CompressingStoredFieldsIndexReader indexReader;
final IndexInput vectorsStream;
private final int version;
private final int packedIntsVersion;
private final CompressionMode compressionMode;
private final Decompressor decompressor;
private final int chunkSize;
private final int numDocs;
private boolean closed;
private final BlockPackedReaderIterator reader;
private final long numChunks; // number of compressed blocks written
private final long numDirtyChunks; // number of incomplete compressed blocks written
private final long maxPointer; // end of the data section
// used by clone
private CompressingTermVectorsReader(CompressingTermVectorsReader reader) {
this.fieldInfos = reader.fieldInfos;
this.vectorsStream = reader.vectorsStream.clone();
this.indexReader = reader.indexReader.clone();
this.packedIntsVersion = reader.packedIntsVersion;
this.compressionMode = reader.compressionMode;
this.decompressor = reader.decompressor.clone();
this.chunkSize = reader.chunkSize;
this.numDocs = reader.numDocs;
this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, PACKED_BLOCK_SIZE, 0);
this.version = reader.version;
this.numChunks = reader.numChunks;
this.numDirtyChunks = reader.numDirtyChunks;
this.maxPointer = reader.maxPointer;
this.closed = false;
}
/** Sole constructor. */
public CompressingTermVectorsReader(Directory d, SegmentInfo si, String segmentSuffix, FieldInfos fn,
IOContext context, String formatName, CompressionMode compressionMode) throws IOException {
this.compressionMode = compressionMode;
final String segment = si.name;
boolean success = false;
fieldInfos = fn;
numDocs = si.maxDoc();
int version = -1;
CompressingStoredFieldsIndexReader indexReader = null;
long maxPointer = -1;
// Load the index into memory
final String indexName = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_INDEX_EXTENSION);
try (ChecksumIndexInput input = d.openChecksumInput(indexName, context)) {
Throwable priorE = null;
try {
final String codecNameIdx = formatName + CODEC_SFX_IDX;
version = CodecUtil.checkIndexHeader(input, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == input.getFilePointer();
indexReader = new CompressingStoredFieldsIndexReader(input, si);
maxPointer = input.readVLong(); // the end of the data section
} catch (Throwable exception) {
priorE = exception;
} finally {
CodecUtil.checkFooter(input, priorE);
}
}
this.version = version;
this.indexReader = indexReader;
this.maxPointer = maxPointer;
try {
// Open the data file and read metadata
final String vectorsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
vectorsStream = d.openInput(vectorsStreamFN, context);
final String codecNameDat = formatName + CODEC_SFX_DAT;
int version2 = CodecUtil.checkIndexHeader(vectorsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
if (version != version2) {
throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + version2, vectorsStream);
}
assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer();
long pos = vectorsStream.getFilePointer();
vectorsStream.seek(maxPointer);
numChunks = vectorsStream.readVLong();
numDirtyChunks = vectorsStream.readVLong();
if (numDirtyChunks > numChunks) {
throw new CorruptIndexException("invalid chunk counts: dirty=" + numDirtyChunks + ", total=" + numChunks, vectorsStream);
}
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(vectorsStream);
vectorsStream.seek(pos);
packedIntsVersion = vectorsStream.readVInt();
chunkSize = vectorsStream.readVInt();
decompressor = compressionMode.newDecompressor();
this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, PACKED_BLOCK_SIZE, 0);
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(this);
}
}
}
public CompressionMode getCompressionMode() {
return compressionMode;
}
public int getChunkSize() {
return chunkSize;
}
public int getPackedIntsVersion() {
return packedIntsVersion;
}
public int getVersion() {
return version;
}
public CompressingStoredFieldsIndexReader getIndexReader() {
return indexReader;
}
public IndexInput getVectorsStream() {
return vectorsStream;
}
public long getMaxPointer() {
return maxPointer;
}
public long getNumChunks() {
return numChunks;
}
public long getNumDirtyChunks() {
return numDirtyChunks;
}
/**
* @throws AlreadyClosedException if this TermVectorsReader is closed
*/
private void ensureOpen() throws AlreadyClosedException {
if (closed) {
throw new AlreadyClosedException("this FieldsReader is closed");
}
}
@Override
public void close() throws IOException {
if (!closed) {
IOUtils.close(vectorsStream);
closed = true;
}
}
@Override
public TermVectorsReader clone() {
return new CompressingTermVectorsReader(this);
}
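/**
 * Reads the term vectors of a single document. Seeks to the compressed chunk that
 * contains {@code doc}, decodes the shared per-chunk metadata (field numbers, flags,
 * term counts, prefix/suffix lengths, frequencies, positions, offsets and payload
 * lengths), then decompresses the slice of the chunk that belongs to this document.
 * Returns {@code null} if the document has no term vectors.
 */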
@Override
public Fields get(int doc) throws IOException {
ensureOpen();
// seek to the right place
{
final long startPointer = indexReader.getStartPointer(doc);
vectorsStream.seek(startPointer);
}
// decode
// - docBase: first doc ID of the chunk
// - chunkDocs: number of docs of the chunk
final int docBase = vectorsStream.readVInt();
final int chunkDocs = vectorsStream.readVInt();
if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc, vectorsStream);
}
final int skip; // number of fields to skip
final int numFields; // number of fields of the document we're looking for
final int totalFields; // total number of fields of the chunk (sum for all docs)
if (chunkDocs == 1) {
skip = 0;
numFields = totalFields = vectorsStream.readVInt();
} else {
reader.reset(vectorsStream, chunkDocs);
int sum = 0;
for (int i = docBase; i < doc; ++i) {
sum += reader.next();
}
skip = sum;
numFields = (int) reader.next();
sum += numFields;
for (int i = doc + 1; i < docBase + chunkDocs; ++i) {
sum += reader.next();
}
totalFields = sum;
}
if (numFields == 0) {
// no vectors
return null;
}
// read field numbers that have term vectors
final int[] fieldNums;
{
final int token = vectorsStream.readByte() & 0xFF;
assert token != 0; // means no term vectors, cannot happen since we checked for numFields == 0
final int bitsPerFieldNum = token & 0x1F;
int totalDistinctFields = token >>> 5;
if (totalDistinctFields == 0x07) {
totalDistinctFields += vectorsStream.readVInt();
}
++totalDistinctFields;
final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalDistinctFields, bitsPerFieldNum, 1);
fieldNums = new int[totalDistinctFields];
for (int i = 0; i < totalDistinctFields; ++i) {
fieldNums[i] = (int) it.next();
}
}
// read field numbers and flags
final int[] fieldNumOffs = new int[numFields];
final PackedInts.Reader flags;
{
final int bitsPerOff = PackedInts.bitsRequired(fieldNums.length - 1);
final PackedInts.Reader allFieldNumOffs = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsPerOff);
switch (vectorsStream.readVInt()) {
case 0:
final PackedInts.Reader fieldFlags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, fieldNums.length, FLAGS_BITS);
PackedInts.Mutable f = PackedInts.getMutable(totalFields, FLAGS_BITS, PackedInts.COMPACT);
for (int i = 0; i < totalFields; ++i) {
final int fieldNumOff = (int) allFieldNumOffs.get(i);
assert fieldNumOff >= 0 && fieldNumOff < fieldNums.length;
final int fgs = (int) fieldFlags.get(fieldNumOff);
f.set(i, fgs);
}
flags = f;
break;
case 1:
flags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, FLAGS_BITS);
break;
default:
throw new AssertionError();
}
for (int i = 0; i < numFields; ++i) {
fieldNumOffs[i] = (int) allFieldNumOffs.get(skip + i);
}
}
// number of terms per field for all fields
final PackedInts.Reader numTerms;
final int totalTerms;
{
final int bitsRequired = vectorsStream.readVInt();
numTerms = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsRequired);
int sum = 0;
for (int i = 0; i < totalFields; ++i) {
sum += numTerms.get(i);
}
totalTerms = sum;
}
// term lengths
int docOff = 0, docLen = 0, totalLen;
final int[] fieldLengths = new int[numFields];
final int[][] prefixLengths = new int[numFields][];
final int[][] suffixLengths = new int[numFields][];
{
reader.reset(vectorsStream, totalTerms);
// skip
int toSkip = 0;
for (int i = 0; i < skip; ++i) {
toSkip += numTerms.get(i);
}
reader.skip(toSkip);
// read prefix lengths
for (int i = 0; i < numFields; ++i) {
final int termCount = (int) numTerms.get(skip + i);
final int[] fieldPrefixLengths = new int[termCount];
prefixLengths[i] = fieldPrefixLengths;
for (int j = 0; j < termCount; ) {
final LongsRef next = reader.next(termCount - j);
for (int k = 0; k < next.length; ++k) {
fieldPrefixLengths[j++] = (int) next.longs[next.offset + k];
}
}
}
reader.skip(totalTerms - reader.ord());
reader.reset(vectorsStream, totalTerms);
// skip
toSkip = 0;
for (int i = 0; i < skip; ++i) {
for (int j = 0; j < numTerms.get(i); ++j) {
docOff += reader.next();
}
}
for (int i = 0; i < numFields; ++i) {
final int termCount = (int) numTerms.get(skip + i);
final int[] fieldSuffixLengths = new int[termCount];
suffixLengths[i] = fieldSuffixLengths;
for (int j = 0; j < termCount; ) {
final LongsRef next = reader.next(termCount - j);
for (int k = 0; k < next.length; ++k) {
fieldSuffixLengths[j++] = (int) next.longs[next.offset + k];
}
}
fieldLengths[i] = sum(suffixLengths[i]);
docLen += fieldLengths[i];
}
totalLen = docOff + docLen;
for (int i = skip + numFields; i < totalFields; ++i) {
for (int j = 0; j < numTerms.get(i); ++j) {
totalLen += reader.next();
}
}
}
// term freqs
final int[] termFreqs = new int[totalTerms];
{
reader.reset(vectorsStream, totalTerms);
for (int i = 0; i < totalTerms; ) {
final LongsRef next = reader.next(totalTerms - i);
for (int k = 0; k < next.length; ++k) {
termFreqs[i++] = 1 + (int) next.longs[next.offset + k];
}
}
}
// total number of positions, offsets and payloads
int totalPositions = 0, totalOffsets = 0, totalPayloads = 0;
for (int i = 0, termIndex = 0; i < totalFields; ++i) {
final int f = (int) flags.get(i);
final int termCount = (int) numTerms.get(i);
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex++];
if ((f & POSITIONS) != 0) {
totalPositions += freq;
}
if ((f & OFFSETS) != 0) {
totalOffsets += freq;
}
if ((f & PAYLOADS) != 0) {
totalPayloads += freq;
}
}
assert i != totalFields - 1 || termIndex == totalTerms : termIndex + " " + totalTerms;
}
final int[][] positionIndex = positionIndex(skip, numFields, numTerms, termFreqs);
final int[][] positions, startOffsets, lengths;
if (totalPositions > 0) {
positions = readPositions(skip, numFields, flags, numTerms, termFreqs, POSITIONS, totalPositions, positionIndex);
} else {
positions = new int[numFields][];
}
if (totalOffsets > 0) {
// average number of chars per term
final float[] charsPerTerm = new float[fieldNums.length];
for (int i = 0; i < charsPerTerm.length; ++i) {
charsPerTerm[i] = Float.intBitsToFloat(vectorsStream.readInt());
}
startOffsets = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
lengths = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
for (int i = 0; i < numFields; ++i) {
final int[] fStartOffsets = startOffsets[i];
final int[] fPositions = positions[i];
// patch offsets from positions
if (fStartOffsets != null && fPositions != null) {
final float fieldCharsPerTerm = charsPerTerm[fieldNumOffs[i]];
for (int j = 0; j < startOffsets[i].length; ++j) {
fStartOffsets[j] += (int) (fieldCharsPerTerm * fPositions[j]);
}
}
if (fStartOffsets != null) {
final int[] fPrefixLengths = prefixLengths[i];
final int[] fSuffixLengths = suffixLengths[i];
final int[] fLengths = lengths[i];
for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
// delta-decode start offsets and patch lengths using term lengths
final int termLength = fPrefixLengths[j] + fSuffixLengths[j];
lengths[i][positionIndex[i][j]] += termLength;
for (int k = positionIndex[i][j] + 1; k < positionIndex[i][j + 1]; ++k) {
fStartOffsets[k] += fStartOffsets[k - 1];
fLengths[k] += termLength;
}
}
}
}
} else {
startOffsets = lengths = new int[numFields][];
}
if (totalPositions > 0) {
// delta-decode positions
for (int i = 0; i < numFields; ++i) {
final int[] fPositions = positions[i];
final int[] fpositionIndex = positionIndex[i];
if (fPositions != null) {
for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
// delta-decode start offsets
for (int k = fpositionIndex[j] + 1; k < fpositionIndex[j + 1]; ++k) {
fPositions[k] += fPositions[k - 1];
}
}
}
}
}
// payload lengths
final int[][] payloadIndex = new int[numFields][];
int totalPayloadLength = 0;
int payloadOff = 0;
int payloadLen = 0;
if (totalPayloads > 0) {
reader.reset(vectorsStream, totalPayloads);
// skip
int termIndex = 0;
for (int i = 0; i < skip; ++i) {
final int f = (int) flags.get(i);
final int termCount = (int) numTerms.get(i);
if ((f & PAYLOADS) != 0) {
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex + j];
for (int k = 0; k < freq; ++k) {
final int l = (int) reader.next();
payloadOff += l;
}
}
}
termIndex += termCount;
}
totalPayloadLength = payloadOff;
// read doc payload lengths
for (int i = 0; i < numFields; ++i) {
final int f = (int) flags.get(skip + i);
final int termCount = (int) numTerms.get(skip + i);
if ((f & PAYLOADS) != 0) {
final int totalFreq = positionIndex[i][termCount];
payloadIndex[i] = new int[totalFreq + 1];
int posIdx = 0;
payloadIndex[i][posIdx] = payloadLen;
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex + j];
for (int k = 0; k < freq; ++k) {
final int payloadLength = (int) reader.next();
payloadLen += payloadLength;
payloadIndex[i][posIdx+1] = payloadLen;
++posIdx;
}
}
assert posIdx == totalFreq;
}
termIndex += termCount;
}
totalPayloadLength += payloadLen;
for (int i = skip + numFields; i < totalFields; ++i) {
final int f = (int) flags.get(i);
final int termCount = (int) numTerms.get(i);
if ((f & PAYLOADS) != 0) {
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex + j];
for (int k = 0; k < freq; ++k) {
totalPayloadLength += reader.next();
}
}
}
termIndex += termCount;
}
assert termIndex == totalTerms : termIndex + " " + totalTerms;
}
// decompress data
final BytesRef suffixBytes = new BytesRef();
decompressor.decompress(vectorsStream, totalLen + totalPayloadLength, docOff + payloadOff, docLen + payloadLen, suffixBytes);
suffixBytes.length = docLen;
final BytesRef payloadBytes = new BytesRef(suffixBytes.bytes, suffixBytes.offset + docLen, payloadLen);
final int[] fieldFlags = new int[numFields];
for (int i = 0; i < numFields; ++i) {
fieldFlags[i] = (int) flags.get(skip + i);
}
final int[] fieldNumTerms = new int[numFields];
for (int i = 0; i < numFields; ++i) {
fieldNumTerms[i] = (int) numTerms.get(skip + i);
}
final int[][] fieldTermFreqs = new int[numFields][];
{
int termIdx = 0;
for (int i = 0; i < skip; ++i) {
termIdx += numTerms.get(i);
}
for (int i = 0; i < numFields; ++i) {
final int termCount = (int) numTerms.get(skip + i);
fieldTermFreqs[i] = new int[termCount];
for (int j = 0; j < termCount; ++j) {
fieldTermFreqs[i][j] = termFreqs[termIdx++];
}
}
}
assert sum(fieldLengths) == docLen : sum(fieldLengths) + " != " + docLen;
return new TVFields(fieldNums, fieldFlags, fieldNumOffs, fieldNumTerms, fieldLengths,
prefixLengths, suffixLengths, fieldTermFreqs,
positionIndex, positions, startOffsets, lengths,
payloadBytes, payloadIndex,
suffixBytes);
}
// field -> term index -> position index
private int[][] positionIndex(int skip, int numFields, PackedInts.Reader numTerms, int[] termFreqs) {
final int[][] positionIndex = new int[numFields][];
int termIndex = 0;
for (int i = 0; i < skip; ++i) {
final int termCount = (int) numTerms.get(i);
termIndex += termCount;
}
for (int i = 0; i < numFields; ++i) {
final int termCount = (int) numTerms.get(skip + i);
positionIndex[i] = new int[termCount + 1];
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex+j];
positionIndex[i][j + 1] = positionIndex[i][j] + freq;
}
termIndex += termCount;
}
return positionIndex;
}
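// Reads one packed stream of per-occurrence integers (positions, start offsets or
// offset lengths, depending on the requested flag) for the fields of the current
// document, skipping over the entries that belong to the other documents of the chunk.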
private int[][] readPositions(int skip, int numFields, PackedInts.Reader flags, PackedInts.Reader numTerms, int[] termFreqs, int flag, final int totalPositions, int[][] positionIndex) throws IOException {
final int[][] positions = new int[numFields][];
reader.reset(vectorsStream, totalPositions);
// skip
int toSkip = 0;
int termIndex = 0;
for (int i = 0; i < skip; ++i) {
final int f = (int) flags.get(i);
final int termCount = (int) numTerms.get(i);
if ((f & flag) != 0) {
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex+j];
toSkip += freq;
}
}
termIndex += termCount;
}
reader.skip(toSkip);
// read doc positions
for (int i = 0; i < numFields; ++i) {
final int f = (int) flags.get(skip + i);
final int termCount = (int) numTerms.get(skip + i);
if ((f & flag) != 0) {
final int totalFreq = positionIndex[i][termCount];
final int[] fieldPositions = new int[totalFreq];
positions[i] = fieldPositions;
for (int j = 0; j < totalFreq; ) {
final LongsRef nextPositions = reader.next(totalFreq - j);
for (int k = 0; k < nextPositions.length; ++k) {
fieldPositions[j++] = (int) nextPositions.longs[nextPositions.offset + k];
}
}
}
termIndex += termCount;
}
reader.skip(totalPositions - reader.ord());
return positions;
}
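/**
 * {@link Fields} view over the arrays decoded in {@link #get(int)}: one {@link Terms}
 * instance per field that has term vectors for the requested document.
 */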
private class TVFields extends Fields {
private final int[] fieldNums, fieldFlags, fieldNumOffs, numTerms, fieldLengths;
private final int[][] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
private final BytesRef suffixBytes, payloadBytes;
public TVFields(int[] fieldNums, int[] fieldFlags, int[] fieldNumOffs, int[] numTerms, int[] fieldLengths,
int[][] prefixLengths, int[][] suffixLengths, int[][] termFreqs,
int[][] positionIndex, int[][] positions, int[][] startOffsets, int[][] lengths,
BytesRef payloadBytes, int[][] payloadIndex,
BytesRef suffixBytes) {
this.fieldNums = fieldNums;
this.fieldFlags = fieldFlags;
this.fieldNumOffs = fieldNumOffs;
this.numTerms = numTerms;
this.fieldLengths = fieldLengths;
this.prefixLengths = prefixLengths;
this.suffixLengths = suffixLengths;
this.termFreqs = termFreqs;
this.positionIndex = positionIndex;
this.positions = positions;
this.startOffsets = startOffsets;
this.lengths = lengths;
this.payloadBytes = payloadBytes;
this.payloadIndex = payloadIndex;
this.suffixBytes = suffixBytes;
}
@Override
public Iterator<String> iterator() {
return new Iterator<String>() {
int i = 0;
@Override
public boolean hasNext() {
return i < fieldNumOffs.length;
}
@Override
public String next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
final int fieldNum = fieldNums[fieldNumOffs[i++]];
return fieldInfos.fieldInfo(fieldNum).name;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
@Override
public Terms terms(String field) throws IOException {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
return null;
}
int idx = -1;
for (int i = 0; i < fieldNumOffs.length; ++i) {
if (fieldNums[fieldNumOffs[i]] == fieldInfo.number) {
idx = i;
break;
}
}
if (idx == -1 || numTerms[idx] == 0) {
// no term
return null;
}
int fieldOff = 0, fieldLen = -1;
for (int i = 0; i < fieldNumOffs.length; ++i) {
if (i < idx) {
fieldOff += fieldLengths[i];
} else {
fieldLen = fieldLengths[i];
break;
}
}
assert fieldLen >= 0;
return new TVTerms(numTerms[idx], fieldFlags[idx],
prefixLengths[idx], suffixLengths[idx], termFreqs[idx],
positionIndex[idx], positions[idx], startOffsets[idx], lengths[idx],
payloadIndex[idx], payloadBytes,
new BytesRef(suffixBytes.bytes, suffixBytes.offset + fieldOff, fieldLen));
}
@Override
public int size() {
return fieldNumOffs.length;
}
}
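/** {@link Terms} implementation over the decoded terms of a single field of a single document. */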
private static class TVTerms extends Terms {
private final int numTerms, flags;
private final int[] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
private final BytesRef termBytes, payloadBytes;
TVTerms(int numTerms, int flags, int[] prefixLengths, int[] suffixLengths, int[] termFreqs,
int[] positionIndex, int[] positions, int[] startOffsets, int[] lengths,
int[] payloadIndex, BytesRef payloadBytes,
BytesRef termBytes) {
this.numTerms = numTerms;
this.flags = flags;
this.prefixLengths = prefixLengths;
this.suffixLengths = suffixLengths;
this.termFreqs = termFreqs;
this.positionIndex = positionIndex;
this.positions = positions;
this.startOffsets = startOffsets;
this.lengths = lengths;
this.payloadIndex = payloadIndex;
this.payloadBytes = payloadBytes;
this.termBytes = termBytes;
}
@Override
public TermsEnum iterator() throws IOException {
TVTermsEnum termsEnum = new TVTermsEnum();
termsEnum.reset(numTerms, flags, prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths,
payloadIndex, payloadBytes,
new ByteArrayDataInput(termBytes.bytes, termBytes.offset, termBytes.length));
return termsEnum;
}
@Override
public long size() throws IOException {
return numTerms;
}
@Override
public long getSumTotalTermFreq() throws IOException {
return -1L;
}
@Override
public long getSumDocFreq() throws IOException {
return numTerms;
}
@Override
public int getDocCount() throws IOException {
return 1;
}
@Override
public boolean hasFreqs() {
return true;
}
@Override
public boolean hasOffsets() {
return (flags & OFFSETS) != 0;
}
@Override
public boolean hasPositions() {
return (flags & POSITIONS) != 0;
}
@Override
public boolean hasPayloads() {
return (flags & PAYLOADS) != 0;
}
}
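/**
 * {@link TermsEnum} over prefix-compressed terms: each term keeps the first
 * {@code prefixLengths[ord]} bytes of the previous term and reads
 * {@code suffixLengths[ord]} new bytes from the suffix stream.
 */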
private static class TVTermsEnum extends TermsEnum {
private int numTerms, startPos, ord;
private int[] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
private ByteArrayDataInput in;
private BytesRef payloads;
private final BytesRef term;
private TVTermsEnum() {
term = new BytesRef(16);
}
void reset(int numTerms, int flags, int[] prefixLengths, int[] suffixLengths, int[] termFreqs, int[] positionIndex, int[] positions, int[] startOffsets, int[] lengths,
int[] payloadIndex, BytesRef payloads, ByteArrayDataInput in) {
this.numTerms = numTerms;
this.prefixLengths = prefixLengths;
this.suffixLengths = suffixLengths;
this.termFreqs = termFreqs;
this.positionIndex = positionIndex;
this.positions = positions;
this.startOffsets = startOffsets;
this.lengths = lengths;
this.payloadIndex = payloadIndex;
this.payloads = payloads;
this.in = in;
startPos = in.getPosition();
reset();
}
void reset() {
term.length = 0;
in.setPosition(startPos);
ord = -1;
}
@Override
public BytesRef next() throws IOException {
if (ord == numTerms - 1) {
return null;
} else {
assert ord < numTerms;
++ord;
}
// read term
term.offset = 0;
term.length = prefixLengths[ord] + suffixLengths[ord];
if (term.length > term.bytes.length) {
term.bytes = ArrayUtil.grow(term.bytes, term.length);
}
in.readBytes(term.bytes, prefixLengths[ord], suffixLengths[ord]);
return term;
}
@Override
public SeekStatus seekCeil(BytesRef text)
throws IOException {
if (ord < numTerms && ord >= 0) {
final int cmp = term().compareTo(text);
if (cmp == 0) {
return SeekStatus.FOUND;
} else if (cmp > 0) {
reset();
}
}
// linear scan
while (true) {
final BytesRef term = next();
if (term == null) {
return SeekStatus.END;
}
final int cmp = term.compareTo(text);
if (cmp > 0) {
return SeekStatus.NOT_FOUND;
} else if (cmp == 0) {
return SeekStatus.FOUND;
}
}
}
@Override
public void seekExact(long ord) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public BytesRef term() throws IOException {
return term;
}
@Override
public long ord() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int docFreq() throws IOException {
return 1;
}
@Override
public long totalTermFreq() throws IOException {
return termFreqs[ord];
}
@Override
public final PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
final TVPostingsEnum docsEnum;
if (reuse != null && reuse instanceof TVPostingsEnum) {
docsEnum = (TVPostingsEnum) reuse;
} else {
docsEnum = new TVPostingsEnum();
}
docsEnum.reset(termFreqs[ord], positionIndex[ord], positions, startOffsets, lengths, payloads, payloadIndex);
return docsEnum;
}
}
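/**
 * {@link PostingsEnum} for a single term of a single document: it iterates exactly one
 * document (docID 0) and serves positions, offsets and payloads from the shared decoded arrays.
 */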
private static class TVPostingsEnum extends PostingsEnum {
private int doc = -1;
private int termFreq;
private int positionIndex;
private int[] positions;
private int[] startOffsets;
private int[] lengths;
private final BytesRef payload;
private int[] payloadIndex;
private int basePayloadOffset;
private int i;
TVPostingsEnum() {
payload = new BytesRef();
}
public void reset(int freq, int positionIndex, int[] positions,
int[] startOffsets, int[] lengths, BytesRef payloads,
int[] payloadIndex) {
this.termFreq = freq;
this.positionIndex = positionIndex;
this.positions = positions;
this.startOffsets = startOffsets;
this.lengths = lengths;
this.basePayloadOffset = payloads.offset;
this.payload.bytes = payloads.bytes;
payload.offset = payload.length = 0;
this.payloadIndex = payloadIndex;
doc = i = -1;
}
private void checkDoc() {
if (doc == NO_MORE_DOCS) {
throw new IllegalStateException("DocsEnum exhausted");
} else if (doc == -1) {
throw new IllegalStateException("DocsEnum not started");
}
}
private void checkPosition() {
checkDoc();
if (i < 0) {
throw new IllegalStateException("Position enum not started");
} else if (i >= termFreq) {
throw new IllegalStateException("Read past last position");
}
}
@Override
public int nextPosition() throws IOException {
if (doc != 0) {
throw new IllegalStateException();
} else if (i >= termFreq - 1) {
throw new IllegalStateException("Read past last position");
}
++i;
if (payloadIndex != null) {
payload.offset = basePayloadOffset + payloadIndex[positionIndex + i];
payload.length = payloadIndex[positionIndex + i + 1] - payloadIndex[positionIndex + i];
}
if (positions == null) {
return -1;
} else {
return positions[positionIndex + i];
}
}
@Override
public int startOffset() throws IOException {
checkPosition();
if (startOffsets == null) {
return -1;
} else {
return startOffsets[positionIndex + i];
}
}
@Override
public int endOffset() throws IOException {
checkPosition();
if (startOffsets == null) {
return -1;
} else {
return startOffsets[positionIndex + i] + lengths[positionIndex + i];
}
}
@Override
public BytesRef getPayload() throws IOException {
checkPosition();
if (payloadIndex == null || payload.length == 0) {
return null;
} else {
return payload;
}
}
@Override
public int freq() throws IOException {
checkDoc();
return termFreq;
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
if (doc == -1) {
return (doc = 0);
} else {
return (doc = NO_MORE_DOCS);
}
}
@Override
public int advance(int target) throws IOException {
return slowAdvance(target);
}
@Override
public long cost() {
return 1;
}
}
private static int sum(int[] arr) {
int sum = 0;
for (int el : arr) {
sum += el;
}
return sum;
}
@Override
public long ramBytesUsed() {
return indexReader.ramBytesUsed();
}
@Override
public Collection<Accountable> getChildResources() {
return Collections.singleton(Accountables.namedAccountable("term vector index", indexReader));
}
@Override
public void checkIntegrity() throws IOException {
CodecUtil.checksumEntireFile(vectorsStream);
}
@Override
public String toString() {
return getClass().getSimpleName() + "(mode=" + compressionMode + ",chunksize=" + chunkSize + ")";
}
}