org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of oak-lucene Show documentation
Oak Lucene integration subproject
There is a newer version: 1.9.8
/*
 * COPIED FROM APACHE LUCENE 4.7.2
 *
 * Git URL: git@github.com:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
 *
 * (see https://issues.apache.org/jira/browse/OAK-10786 for details)
 */

package org.apache.lucene.codecs.lucene42;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.BlockPackedWriter;

/**
 * Lucene 4.2 DocValues format.
 * 
 * Encodes the four per-document value types (Numeric,Binary,Sorted,SortedSet) with eight basic strategies.
 * 

 * 

 * <ul>
 *    <li>Delta-compressed Numerics: per-document integers written in blocks of 4096. For each block
 *        the minimum value is encoded, and each entry is a delta from that minimum value.</li>
 *    <li>Table-compressed Numerics: when the number of unique values is very small, a lookup table
 *        is written instead. Each per-document entry is instead the ordinal to this table.</li>
 *    <li>Uncompressed Numerics: when all values would fit into a single byte, and the
 *        <code>acceptableOverheadRatio</code> would pack values into 8 bits per value anyway, they
 *        are written as absolute values (with no indirection or packing) for performance.</li>
 *    <li>GCD-compressed Numerics: when all numbers share a common divisor, such as dates, the greatest
 *        common divisor (GCD) is computed, and quotients are stored using Delta-compressed Numerics.</li>
 *    <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
 *        Each document's value can be addressed by docID*length.</li>
 *    <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
 *        for each document. The addresses are written in blocks of 4096, with the current absolute
 *        start for the block, and the average (expected) delta per entry. For each document the
 *        deviation from the delta (actual - expected) is written.</li>
 *    <li>Sorted: an FST mapping deduplicated terms to ordinals is written, along with the per-document
 *        ordinals written using one of the numeric strategies above.</li>
 *    <li>SortedSet: an FST mapping deduplicated terms to ordinals is written, along with the per-document
 *        ordinal list written using one of the binary strategies above.</li>
 * </ul>
 * 
 * 
 * Files:
 * 

 *   .dvd: DocValues data
 *   .dvm: DocValues metadata
 * 
 * 
 *   
 *   The DocValues metadata or .dvm file.
 *   For each DocValues field, this stores metadata, such as the offset into the
 *      DocValues data (.dvd)
 *   DocValues metadata (.dvm) --&gt; Header,&lt;FieldNumber,EntryType,Entry&gt;^NumFields
 *   
 *     Entry --> NumericEntry | BinaryEntry | SortedEntry
 *     NumericEntry --> DataOffset,CompressionType,PackedVersion
 *     BinaryEntry --> DataOffset,DataLength,MinLength,MaxLength,PackedVersion?,BlockSize?
 *     SortedEntry --> DataOffset,ValueCount
 *     FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}
 *     DataOffset,DataLength --> {@link DataOutput#writeLong Int64}
 *     EntryType,CompressionType --> {@link DataOutput#writeByte Byte}
 *     Header --> {@link CodecUtil#writeHeader CodecHeader}
 *   
 *   Sorted fields have two entries: a SortedEntry with the FST metadata,
 *      and an ordinary NumericEntry for the document-to-ord metadata.
 *   SortedSet fields have two entries: a SortedEntry with the FST metadata,
 *      and an ordinary BinaryEntry for the document-to-ord-list metadata.
 *   FieldNumber of -1 indicates the end of metadata.
 *   EntryType is 0 (NumericEntry), 1 (BinaryEntry), or 2 (SortedEntry)
 *   DataOffset is the pointer to the start of the data in the DocValues data (.dvd)
 *   CompressionType indicates how Numeric values will be compressed:
 *      

 *      <ul>
 *         <li>0 --&gt; delta-compressed. For each block of 4096 integers, every integer is delta-encoded
 *             from the minimum value within the block.</li>
 *         <li>1 --&gt; table-compressed. When the number of unique numeric values is small and it would save space,
 *             a lookup table of unique values is written, followed by the ordinal for each document.</li>
 *         <li>2 --&gt; uncompressed. When the <code>acceptableOverheadRatio</code> parameter would upgrade the number
 *             of bits required to 8, and all values fit in a byte, these are written as absolute binary values
 *             for performance.</li>
 *         <li>3 --&gt; gcd-compressed. When all integers share a common divisor, only quotients are stored
 *             using blocks of delta-encoded ints.</li>
 *      </ul>
 *      
 *   MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
 *      If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
 *      Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
 *      is written for the addresses.
 *   

 *   The DocValues data or .dvd file.
 *   For each DocValues field, this stores the actual per-document data (the heavy-lifting)
 *   DocValues data (.dvd) --&gt; Header,&lt;NumericData | BinaryData | SortedData&gt;^NumFields
 *   
 *     NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | UncompressedNumerics | GCDCompressedNumerics
 *     BinaryData -->  {@link DataOutput#writeByte Byte}^DataLength,Addresses
 *     SortedData --&gt; {@link FST FST&lt;Int64&gt;}
 *     DeltaCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=4096)}
 *     TableCompressedNumerics --> TableSize,{@link DataOutput#writeLong Int64}^TableSize,{@link PackedInts PackedInts}
 *     UncompressedNumerics --> {@link DataOutput#writeByte Byte}^maxdoc
 *     Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=4096)}
 *   
 *   SortedSet entries store the list of ordinals in their BinaryData as a
 *      sequence of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.
 * 
 * 
 * Limitations:
 * 

 *    Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length.
 * 
 * @deprecated Only for reading old 4.2 segments
 */
@Deprecated
public class Lucene42DocValuesFormat extends DocValuesFormat {

  /** Maximum length allowed for a single binary doc values field: {@code (1 << 15) - 2}, i.e. 32766. */
  public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2;

  /** Codec name handed to the producer for the DocValues data file. */
  static final String DATA_CODEC = "Lucene42DocValuesData";

  /** File extension handed to the producer for the DocValues data file. */
  static final String DATA_EXTENSION = "dvd";

  /** Codec name handed to the producer for the DocValues metadata file. */
  static final String METADATA_CODEC = "Lucene42DocValuesMetadata";

  /** File extension handed to the producer for the DocValues metadata file. */
  static final String METADATA_EXTENSION = "dvm";

  /**
   * Compression parameter supplied at construction time. It is only stored here;
   * no method of this class reads it.
   */
  final float acceptableOverheadRatio;

  /**
   * Creates the format with the default overhead ratio; equivalent to
   * {@link #Lucene42DocValuesFormat(float) Lucene42DocValuesFormat(PackedInts.DEFAULT)}.
   */
  public Lucene42DocValuesFormat() {
    this(PackedInts.DEFAULT);
  }

  /**
   * Creates the format, registered under the name {@code "Lucene42"}, with an
   * explicit acceptableOverheadRatio for NumericDocValues.
   *
   * @param acceptableOverheadRatio compression parameter for numerics.
   *        Currently this is only used when the number of unique values is small.
   *
   * @lucene.experimental
   */
  public Lucene42DocValuesFormat(float acceptableOverheadRatio) {
    super("Lucene42");
    this.acceptableOverheadRatio = acceptableOverheadRatio;
  }

  /**
   * Writing is not supported by this format.
   *
   * @param state the segment write state (ignored)
   * @return never returns normally
   * @throws UnsupportedOperationException always
   */
  @Override
  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    throw new UnsupportedOperationException("this codec can only be used for reading");
  }

  /**
   * Opens a reader for the doc values of the given segment.
   *
   * @param state the segment read state passed through to the producer
   * @return a new {@code Lucene42DocValuesProducer} configured with this format's
   *         data/metadata codec names and file extensions
   * @throws IOException declared by the overridden method; may be thrown while the producer is created
   */
  @Override
  public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
    final DocValuesProducer producer =
        new Lucene42DocValuesProducer(
            state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
    return producer;
  }
}