// org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat Maven / Gradle / Ivy
// Show all versions of aem-sdk-api Show documentation
/*
* COPIED FROM APACHE LUCENE 4.7.2
*
* Git URL: git@github.com:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
*
* (see https://issues.apache.org/jira/browse/OAK-10786 for details)
*/
package org.apache.lucene.codecs.lucene45;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
/**
* Lucene 4.5 DocValues format.
*
* Encodes the four per-document value types (Numeric,Binary,Sorted,SortedSet) with these strategies:
*
* {@link DocValuesType#NUMERIC NUMERIC}:
*
* - Delta-compressed: per-document integers written in blocks of 16k. For each block
* the minimum value in that block is encoded, and each entry is a delta from that
* minimum value. Each block of deltas is compressed with bitpacking. For more
* information, see {@link BlockPackedWriter}.
*
* - Table-compressed: when the number of unique values is very small (&lt; 256), and
* when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
* a lookup table is written instead. Each per-document entry is instead the ordinal
* to this table, and those ordinals are compressed with bitpacking ({@link PackedInts}).
*
* - GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
* common divisor (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
*
*
* {@link DocValuesType#BINARY BINARY}:
*
* - Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
* Each document's value can be addressed directly with multiplication ({@code docID * length}).
*
* - Variable-width Binary: one large concatenated byte[] is written, along with end addresses
* for each document. The addresses are written in blocks of 16k, with the current absolute
* start for the block, and the average (expected) delta per entry. For each document the
* deviation from the delta (actual - expected) is written.
*
* - Prefix-compressed Binary: values are written in chunks of 16, with the first value written
* completely and other values sharing prefixes. Chunk addresses are written in blocks of 16k,
* with the current absolute start for the block, and the average (expected) delta per entry.
* For each chunk the deviation from the delta (actual - expected) is written.
*
*
* {@link DocValuesType#SORTED SORTED}:
*
* - Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary,
* along with the per-document ordinals written using one of the numeric strategies above.
*
*
* {@link DocValuesType#SORTED_SET SORTED_SET}:
*
* - SortedSet: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary,
* an ordinal list and per-document index into this list are written using the numeric strategies
* above.
*
*
* Files:
*
* - .dvd: DocValues data
* - .dvm: DocValues metadata
*
*
* - The DocValues metadata or .dvm file.
*
* For each DocValues field, this stores metadata, such as the offset into the
* DocValues data (.dvd).
*
* DocValues metadata (.dvm) --> Header,&lt;Entry&gt;<sup>NumFields</sup>
*
* - Entry --> NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry
* - NumericEntry --> GCDNumericEntry | TableNumericEntry | DeltaNumericEntry
* - GCDNumericEntry --> NumericHeader,MinValue,GCD
* - TableNumericEntry --> NumericHeader,TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>
* - DeltaNumericEntry --> NumericHeader
* - NumericHeader --> FieldNumber,EntryType,NumericType,MissingOffset,PackedVersion,DataOffset,Count,BlockSize
* - BinaryEntry --> FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry
* - FixedBinaryEntry --> BinaryHeader
* - VariableBinaryEntry --> BinaryHeader,AddressOffset,PackedVersion,BlockSize
* - PrefixBinaryEntry --> BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize
* - BinaryHeader --> FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset
* - SortedEntry --> FieldNumber,EntryType,BinaryEntry,NumericEntry
* - SortedSetEntry --> EntryType,BinaryEntry,NumericEntry,NumericEntry
* - FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}
* - EntryType,CompressionType --> {@link DataOutput#writeByte Byte}
* - Header --> {@link CodecUtil#writeHeader CodecHeader}
* - MinValue,GCD,MissingOffset,AddressOffset,DataOffset --> {@link DataOutput#writeLong Int64}
* - TableSize --> {@link DataOutput#writeVInt vInt}
*
* Sorted fields have two entries: a BinaryEntry with the value metadata,
* and an ordinary NumericEntry for the document-to-ord metadata.
* SortedSet fields have three entries: a BinaryEntry with the value metadata,
* and two NumericEntries for the document-to-ord-index and ordinal list metadata.
* FieldNumber of -1 indicates the end of metadata.
* EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)
* DataOffset is the pointer to the start of the data in the DocValues data (.dvd)
* NumericType indicates how Numeric values will be compressed:
*
* - 0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded
* from the minimum value within the block.
*
* - 1 --> gcd-compressed. When all integers share a common divisor, only quotients are stored
* using blocks of delta-encoded ints.
*
* - 2 --> table-compressed. When the number of unique numeric values is small and it would save space,
* a lookup table of unique values is written, followed by the ordinal for each document.
*
* BinaryType indicates how Binary values will be stored:
*
* - 0 --> fixed-width. All values have the same length, addressing by multiplication.
*
* - 1 --> variable-width. An address for each value is stored.
*
* - 2 --> prefix-compressed. An address to the start of every interval'th value is stored.
*
* MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
* If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
* Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
* is written for the addresses.
*
* MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field.
* If it is -1, then there are no missing values.
*
* - The DocValues data or .dvd file.
*
* For each DocValues field, this stores the actual per-document data (the heavy-lifting).
*
* DocValues data (.dvd) --> Header,&lt;NumericData | BinaryData | SortedData&gt;<sup>NumFields</sup>
*
* - NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | GCDCompressedNumerics
* - BinaryData --> {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses
* - SortedData --> {@link FST FST&lt;Int64&gt;}
* - DeltaCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}
* - TableCompressedNumerics --> {@link PackedInts PackedInts}
* - GCDCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}
* - Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}
*
* SortedSet entries store the list of ordinals in their BinaryData as a
* sequence of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.
*
* @lucene.experimental
*/
public final class Lucene45DocValuesFormat extends DocValuesFormat {

  // Codec names and file extensions handed to both the consumer and the producer,
  // so the write side and the read side always agree on them.
  static final String DATA_CODEC = "Lucene45DocValuesData";
  static final String DATA_EXTENSION = "dvd";
  static final String META_CODEC = "Lucene45ValuesMetadata";
  static final String META_EXTENSION = "dvm";

  // Format version numbers; VERSION_CURRENT aliases the newest one declared here.
  static final int VERSION_START = 0;
  static final int VERSION_SORTED_SET_SINGLE_VALUE_OPTIMIZED = 1;
  static final int VERSION_CURRENT = VERSION_SORTED_SET_SINGLE_VALUE_OPTIMIZED;

  // One-byte tags, one per doc-values type.
  // NOTE(review): presumably these are the EntryType values written to the
  // metadata file -- confirm against Lucene45DocValuesConsumer/Producer.
  static final byte NUMERIC = 0;
  static final byte BINARY = 1;
  static final byte SORTED = 2;
  static final byte SORTED_SET = 3;

  /** Sole constructor; passes the format name {@code "Lucene45"} to the superclass. */
  public Lucene45DocValuesFormat() {
    super("Lucene45");
  }

  /**
   * Creates the write-side implementation for this format.
   *
   * @param state write state forwarded unchanged to {@code Lucene45DocValuesConsumer}
   * @return a new {@code Lucene45DocValuesConsumer} configured with this format's
   *     data/metadata codec names and file extensions
   * @throws IOException declared by this method's signature; the construction it
   *     wraps is the only operation performed here
   */
  @Override
  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    return new Lucene45DocValuesConsumer(
        state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
  }

  /**
   * Creates the read-side implementation for this format.
   *
   * @param state read state forwarded unchanged to {@code Lucene45DocValuesProducer}
   * @return a new {@code Lucene45DocValuesProducer} configured with this format's
   *     data/metadata codec names and file extensions
   * @throws IOException declared by this method's signature; the construction it
   *     wraps is the only operation performed here
   */
  @Override
  public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
    return new Lucene45DocValuesProducer(
        state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
  }
}