// org.apache.lucene.codecs.uniformsplit.FieldMetadata Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.uniformsplit;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
/**
 * Metadata and stats for one field in the index.
 *
 * <p>There is only one instance of {@link FieldMetadata} per {@link FieldInfo}.
 *
 * @lucene.experimental
 */
public class FieldMetadata {

  /** Field described by this metadata; null when built by the "virtual" reading constructor. */
  protected final FieldInfo fieldInfo;

  /** True when this instance was created for writing, false when created for reading. */
  protected final boolean isMutable;

  /** Docs seen while writing; null when this instance is immutable. */
  protected final FixedBitSet docsSeen;

  /** Sum of the doc frequencies of all terms accumulated by {@link #updateStats}. */
  protected long sumDocFreq;

  /** Number of terms accumulated by {@link #updateStats}. */
  protected long numTerms;

  /** Sum of the (positive) total term frequencies accumulated by {@link #updateStats}. */
  protected long sumTotalTermFreq;

  /** Doc count loaded by {@link Serializer#read}; only returned when this instance is immutable. */
  protected int docCount;

  /** File pointer to the start of the dictionary; -1 until set. */
  protected long dictionaryStartFP;

  /** File pointer to the start of the first block; -1 until set. */
  protected long firstBlockStartFP;

  /** File pointer to the start of the last block; -1 until set. */
  protected long lastBlockStartFP;

  /** Last term of the field; null until set. */
  protected BytesRef lastTerm;

  /**
   * Constructs field metadata for writing.
   *
   * @param fieldInfo The field this metadata describes.
   * @param maxDoc The total number of documents in the segment being written.
   */
  public FieldMetadata(FieldInfo fieldInfo, int maxDoc) {
    this(fieldInfo, maxDoc, true);
  }

  /**
   * Constructs immutable virtual field metadata for reading.
   *
   * <p>The resulting instance has a null {@link FieldInfo} and a null docs-seen bit set.
   *
   * @param dictionaryStartFP The file pointer to the start of the dictionary.
   * @param firstBlockStartFP The file pointer to the start of the first block.
   * @param lastBlockStartFP The file pointer to the start of the last block.
   * @param lastTerm The last term.
   */
  public FieldMetadata(
      long dictionaryStartFP, long firstBlockStartFP, long lastBlockStartFP, BytesRef lastTerm) {
    this(null, 0, false);
    this.dictionaryStartFP = dictionaryStartFP;
    this.firstBlockStartFP = firstBlockStartFP;
    this.lastBlockStartFP = lastBlockStartFP;
    this.lastTerm = lastTerm;
  }

  /**
   * Constructs field metadata for reading or writing.
   *
   * @param fieldInfo The field this metadata describes.
   * @param maxDoc The total number of documents in the segment being written. Must be 0 when
   *     {@code isMutable} is false.
   * @param isMutable Set true if this FieldMetadata is created for writing the index. Set false if
   *     it is used for reading the index.
   */
  protected FieldMetadata(FieldInfo fieldInfo, int maxDoc, boolean isMutable) {
    assert isMutable || maxDoc == 0;
    this.fieldInfo = fieldInfo;
    this.isMutable = isMutable;
    // docsSeen must not be set if this FieldMetadata is immutable, that means it is used for
    // reading the index.
    this.docsSeen = isMutable ? new FixedBitSet(maxDoc) : null;
    // -1 marks the file pointers as "not set yet"; Serializer#write asserts they are >= 0.
    this.dictionaryStartFP = -1;
    this.firstBlockStartFP = -1;
    this.lastBlockStartFP = -1;
  }

  /**
   * Updates the field stats with the given {@link BlockTermState} for the current block line (for
   * one term).
   *
   * @param state The term state; its {@code docFreq} must be positive.
   */
  public void updateStats(BlockTermState state) {
    assert isMutable;
    assert state.docFreq > 0;
    sumDocFreq += state.docFreq;
    // Only positive total term frequencies are accumulated.
    // NOTE(review): presumably non-positive means frequencies are not indexed - confirm.
    if (state.totalTermFreq > 0) {
      sumTotalTermFreq += state.totalTermFreq;
    }
    numTerms++;
  }

  /**
   * Provides the {@link FixedBitSet} to keep track of the docs seen when calling {@link
   * org.apache.lucene.codecs.PostingsWriterBase#writeTerm(BytesRef, TermsEnum, FixedBitSet,
   * org.apache.lucene.codecs.NormsProducer)}.
   *
   * <p>The returned {@link FixedBitSet} is created once in this {@link FieldMetadata} constructor.
   *
   * @return The {@link FixedBitSet} for the docs seen, during segment writing; or null if this
   *     {@link FieldMetadata} is created immutable during segment reading.
   */
  public FixedBitSet getDocsSeen() {
    return docsSeen;
  }

  /**
   * @return The {@link FieldInfo} of the field; null for virtual field metadata.
   */
  public FieldInfo getFieldInfo() {
    return fieldInfo;
  }

  /**
   * @return The sum of the doc frequencies of the terms of the field.
   */
  public long getSumDocFreq() {
    return sumDocFreq;
  }

  /**
   * @return The number of terms of the field.
   */
  public long getNumTerms() {
    return numTerms;
  }

  /**
   * @return The sum of the total term frequencies of the terms of the field.
   */
  public long getSumTotalTermFreq() {
    return sumTotalTermFreq;
  }

  /**
   * @return The cardinality of the docs-seen bit set when this instance is mutable (writing);
   *     otherwise the stored doc count (reading).
   */
  public int getDocCount() {
    return isMutable ? docsSeen.cardinality() : docCount;
  }

  /**
   * @return The file pointer to the start of the first block of the field.
   */
  public long getFirstBlockStartFP() {
    return firstBlockStartFP;
  }

  /** Sets the file pointer to the start of the first block of the field. */
  public void setFirstBlockStartFP(long firstBlockStartFP) {
    assert isMutable;
    this.firstBlockStartFP = firstBlockStartFP;
  }

  /**
   * @return The start file pointer for the last block of the field.
   */
  public long getLastBlockStartFP() {
    return lastBlockStartFP;
  }

  /**
   * Sets the start file pointer for the last block of the field, i.e. the value returned by {@link
   * #getLastBlockStartFP()}.
   */
  public void setLastBlockStartFP(long lastBlockStartFP) {
    assert isMutable;
    this.lastBlockStartFP = lastBlockStartFP;
  }

  /**
   * @return The file pointer to the start of the dictionary of the field.
   */
  public long getDictionaryStartFP() {
    return dictionaryStartFP;
  }

  /** Sets the file pointer to the start of the dictionary of the field. */
  public void setDictionaryStartFP(long dictionaryStartFP) {
    assert isMutable;
    this.dictionaryStartFP = dictionaryStartFP;
  }

  /**
   * Sets the last term of the field.
   *
   * <p>Unlike the other setters this one does not require a mutable instance: {@link
   * Serializer#read} calls it on the immutable metadata it builds.
   *
   * @param lastTerm The last term; must not be null.
   */
  public void setLastTerm(BytesRef lastTerm) {
    assert lastTerm != null;
    this.lastTerm = lastTerm;
  }

  /**
   * @return The last term of the field; null if it has not been set.
   */
  public BytesRef getLastTerm() {
    return lastTerm;
  }

  /** Reads/writes field metadata. */
  public static class Serializer {

    /** Stateless singleton. */
    public static final Serializer INSTANCE = new Serializer();

    /**
     * Writes the given field metadata.
     *
     * <p>Written in order: field number (VInt), number of terms (VLong), sumDocFreq (VLong),
     * {@code sumTotalTermFreq - sumDocFreq} (VLong, only when the field index options are at least
     * {@link IndexOptions#DOCS_AND_FREQS}), doc count (VInt), dictionary / first block / last
     * block start file pointers (VLong each), last term length (VInt) followed by the last term
     * bytes when the length is positive.
     *
     * @param output Where to write the metadata.
     * @param fieldMetadata The metadata to write; its file pointers and last term must be set and
     *     it must have at least one term.
     * @throws IOException If the output fails.
     */
    public void write(DataOutput output, FieldMetadata fieldMetadata) throws IOException {
      assert fieldMetadata.dictionaryStartFP >= 0;
      assert fieldMetadata.firstBlockStartFP >= 0;
      assert fieldMetadata.lastBlockStartFP >= 0;
      assert fieldMetadata.numTerms > 0
          : "There should be at least one term for field "
              + fieldMetadata.fieldInfo.name
              + ": "
              + fieldMetadata.numTerms;
      assert fieldMetadata.firstBlockStartFP <= fieldMetadata.lastBlockStartFP
          : "start: " + fieldMetadata.firstBlockStartFP + " end: " + fieldMetadata.lastBlockStartFP;
      assert fieldMetadata.lastTerm != null : "you must set the last term";

      output.writeVInt(fieldMetadata.fieldInfo.number);

      output.writeVLong(fieldMetadata.numTerms);
      output.writeVLong(fieldMetadata.sumDocFreq);

      if (fieldMetadata.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) {
        assert fieldMetadata.sumTotalTermFreq >= fieldMetadata.sumDocFreq
            : "sumTotalFQ: "
                + fieldMetadata.sumTotalTermFreq
                + " sumDocFQ: "
                + fieldMetadata.sumDocFreq;
        // Stored as a delta over sumDocFreq; read() adds sumDocFreq back.
        output.writeVLong(fieldMetadata.sumTotalTermFreq - fieldMetadata.sumDocFreq);
      }

      output.writeVInt(fieldMetadata.getDocCount());

      output.writeVLong(fieldMetadata.dictionaryStartFP);
      output.writeVLong(fieldMetadata.firstBlockStartFP);
      output.writeVLong(fieldMetadata.lastBlockStartFP);

      if (fieldMetadata.lastTerm.length > 0) {
        output.writeVInt(fieldMetadata.lastTerm.length);
        output.writeBytes(
            fieldMetadata.lastTerm.bytes,
            fieldMetadata.lastTerm.offset,
            fieldMetadata.lastTerm.length);
      } else {
        output.writeVInt(0);
      }
    }

    /**
     * Reads field metadata previously written by {@link #write}, validating each value.
     *
     * @param input Where to read the metadata from.
     * @param fieldInfos Used to resolve the field number read from the input.
     * @param maxNumDocs Upper bound for the stored doc count.
     * @return A new immutable {@link FieldMetadata}.
     * @throws CorruptIndexException If the field id is unknown, or if the number of terms, the
     *     frequencies sums, the doc count or the last term length is illegal.
     * @throws IOException If the input fails.
     */
    public FieldMetadata read(DataInput input, FieldInfos fieldInfos, int maxNumDocs)
        throws IOException {
      int fieldId = input.readVInt();
      FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldId);
      if (fieldInfo == null) {
        throw new CorruptIndexException("Illegal field id= " + fieldId, input);
      }
      FieldMetadata fieldMetadata = new FieldMetadata(fieldInfo, 0, false);

      fieldMetadata.numTerms = input.readVLong();
      if (fieldMetadata.numTerms <= 0) {
        throw new CorruptIndexException(
            "Illegal number of terms= " + fieldMetadata.numTerms + " for field= " + fieldId, input);
      }

      fieldMetadata.sumDocFreq = input.readVLong();
      // sumTotalTermFreq is stored as a delta over sumDocFreq, and only when frequencies are
      // indexed; otherwise it equals sumDocFreq.
      fieldMetadata.sumTotalTermFreq = fieldMetadata.sumDocFreq;
      if (fieldMetadata.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) {
        fieldMetadata.sumTotalTermFreq += input.readVLong();
        if (fieldMetadata.sumTotalTermFreq < fieldMetadata.sumDocFreq) {
          // #positions must be >= #postings.
          throw new CorruptIndexException(
              "Illegal sumTotalTermFreq= "
                  + fieldMetadata.sumTotalTermFreq
                  + " sumDocFreq= "
                  + fieldMetadata.sumDocFreq
                  + " for field= "
                  + fieldId,
              input);
        }
      }

      fieldMetadata.docCount = input.readVInt();
      if (fieldMetadata.docCount < 0 || fieldMetadata.docCount > maxNumDocs) {
        // #docs with field must be <= #docs.
        throw new CorruptIndexException(
            "Illegal number of docs= "
                + fieldMetadata.docCount
                + " maxNumDocs= "
                + maxNumDocs
                + " for field="
                + fieldId,
            input);
      }
      if (fieldMetadata.sumDocFreq < fieldMetadata.docCount) {
        // #postings must be >= #docs with field.
        throw new CorruptIndexException(
            "Illegal sumDocFreq= "
                + fieldMetadata.sumDocFreq
                + " docCount= "
                + fieldMetadata.docCount
                + " for field= "
                + fieldId,
            input);
      }

      fieldMetadata.dictionaryStartFP = input.readVLong();
      fieldMetadata.firstBlockStartFP = input.readVLong();
      fieldMetadata.lastBlockStartFP = input.readVLong();

      int lastTermLength = input.readVInt();
      // Validate before allocating: sizing the BytesRef with a negative length would fail with a
      // NegativeArraySizeException instead of reporting the corruption.
      if (lastTermLength < 0) {
        throw new CorruptIndexException(
            "Illegal last term length= " + lastTermLength + " for field= " + fieldId, input);
      }
      BytesRef lastTerm = new BytesRef(lastTermLength);
      if (lastTermLength > 0) {
        input.readBytes(lastTerm.bytes, 0, lastTermLength);
        lastTerm.length = lastTermLength;
      }
      fieldMetadata.setLastTerm(lastTerm);
      return fieldMetadata;
    }
  }
}