All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.codecs.lucene40.Lucene40DocValuesFormat Maven / Gradle / Ivy

There is a newer version: 1.9.8
Show newest version
/*
 * COPIED FROM APACHE LUCENE 4.7.2
 *
 * Git URL: git@github.com:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
 *
 * (see https://issues.apache.org/jira/browse/OAK-10786 for details)
 */

package org.apache.lucene.codecs.lucene40;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Lucene 4.0 DocValues format.
 * 

* Files: *

    *
  • .dv.cfs: {@link CompoundFileDirectory compound container}
  • *
  • .dv.cfe: {@link CompoundFileDirectory compound entries}
  • *
* Entries within the compound file: *
    *
  • <segment>_<fieldNumber>.dat: data values
  • *
  • <segment>_<fieldNumber>.idx: index into the .dat for DEREF types
  • *
*

* There are several many types of {@code DocValues} with different encodings. * From the perspective of filenames, all types store their values in .dat * entries within the compound file. In the case of dereferenced/sorted types, the .dat * actually contains only the unique values, and an additional .idx file contains * pointers to these unique values. *

* Formats: *
    *
  • {@code VAR_INTS} .dat --> Header, PackedType, MinValue, * DefaultValue, PackedStream
  • *
  • {@code FIXED_INTS_8} .dat --> Header, ValueSize, * {@link DataOutput#writeByte Byte}maxdoc
  • *
  • {@code FIXED_INTS_16} .dat --> Header, ValueSize, * {@link DataOutput#writeShort Short}maxdoc
  • *
  • {@code FIXED_INTS_32} .dat --> Header, ValueSize, * {@link DataOutput#writeInt Int32}maxdoc
  • *
  • {@code FIXED_INTS_64} .dat --> Header, ValueSize, * {@link DataOutput#writeLong Int64}maxdoc
  • *
  • {@code FLOAT_32} .dat --> Header, ValueSize, Float32maxdoc
  • *
  • {@code FLOAT_64} .dat --> Header, ValueSize, Float64maxdoc
  • *
  • {@code BYTES_FIXED_STRAIGHT} .dat --> Header, ValueSize, * ({@link DataOutput#writeByte Byte} * ValueSize)maxdoc
  • *
  • {@code BYTES_VAR_STRAIGHT} .idx --> Header, TotalBytes, Addresses
  • *
  • {@code BYTES_VAR_STRAIGHT} .dat --> Header, ({@link DataOutput#writeByte Byte} * variable ValueSize)maxdoc
  • *
  • {@code BYTES_FIXED_DEREF} .idx --> Header, NumValues, Addresses
  • *
  • {@code BYTES_FIXED_DEREF} .dat --> Header, ValueSize, * ({@link DataOutput#writeByte Byte} * ValueSize)NumValues
  • *
  • {@code BYTES_VAR_DEREF} .idx --> Header, TotalVarBytes, Addresses
  • *
  • {@code BYTES_VAR_DEREF} .dat --> Header, * (LengthPrefix + {@link DataOutput#writeByte Byte} * variable ValueSize)NumValues
  • *
  • {@code BYTES_FIXED_SORTED} .idx --> Header, NumValues, Ordinals
  • *
  • {@code BYTES_FIXED_SORTED} .dat --> Header, ValueSize, * ({@link DataOutput#writeByte Byte} * ValueSize)NumValues
  • *
  • {@code BYTES_VAR_SORTED} .idx --> Header, TotalVarBytes, Addresses, Ordinals
  • *
  • {@code BYTES_VAR_SORTED} .dat --> Header, * ({@link DataOutput#writeByte Byte} * variable ValueSize)NumValues
  • *
* Data Types: *
    *
  • Header --> {@link CodecUtil#writeHeader CodecHeader}
  • *
  • PackedType --> {@link DataOutput#writeByte Byte}
  • *
  • MaxAddress, MinValue, DefaultValue --> {@link DataOutput#writeLong Int64}
  • *
  • PackedStream, Addresses, Ordinals --> {@link PackedInts}
  • *
  • ValueSize, NumValues --> {@link DataOutput#writeInt Int32}
  • *
  • Float32 --> 32-bit float encoded with {@link Float#floatToRawIntBits(float)} * then written as {@link DataOutput#writeInt Int32}
  • *
  • Float64 --> 64-bit float encoded with {@link Double#doubleToRawLongBits(double)} * then written as {@link DataOutput#writeLong Int64}
  • *
  • TotalBytes --> {@link DataOutput#writeVLong VLong}
  • *
  • TotalVarBytes --> {@link DataOutput#writeLong Int64}
  • *
  • LengthPrefix --> Length of the data value as {@link DataOutput#writeVInt VInt} (maximum * of 2 bytes)
  • *
* Notes: *
    *
  • PackedType is a 0 when compressed, 1 when the stream is written as 64-bit integers.
  • *
  • Addresses stores pointers to the actual byte location (indexed by docid). In the VAR_STRAIGHT * case, each entry can have a different length, so to determine the length, docid+1 is * retrieved. A sentinel address is written at the end for the VAR_STRAIGHT case, so the Addresses * stream contains maxdoc+1 indices. For the deduplicated VAR_DEREF case, each length * is encoded as a prefix to the data itself as a {@link DataOutput#writeVInt VInt} * (maximum of 2 bytes).
  • *
  • Ordinals stores the term ID in sorted order (indexed by docid). In the FIXED_SORTED case, * the address into the .dat can be computed from the ordinal as * Header+ValueSize+(ordinal*ValueSize) because the byte length is fixed. * In the VAR_SORTED case, there is double indirection (docid -> ordinal -> address), but * an additional sentinel ordinal+address is always written (so there are NumValues+1 ordinals). To * determine the length, ord+1's address is looked up as well.
  • *
  • {@code BYTES_VAR_STRAIGHT BYTES_VAR_STRAIGHT} in contrast to other straight * variants uses a .idx file to improve lookup perfromance. In contrast to * {@code BYTES_VAR_DEREF BYTES_VAR_DEREF} it doesn't apply deduplication of the document values. *
  • *
*

* Limitations: *

    *
  • Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length. *
* @deprecated Only for reading old 4.0 and 4.1 segments */ @Deprecated // NOTE: not registered in SPI, doesnt respect segment suffix, etc // for back compat only! public class Lucene40DocValuesFormat extends DocValuesFormat { /** Maximum length for each binary doc values field. */ public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2; /** Sole constructor. */ public Lucene40DocValuesFormat() { super("Lucene40"); } @Override public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { throw new UnsupportedOperationException("this codec can only be used for reading"); } @Override public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { String filename = IndexFileNames.segmentFileName(state.segmentInfo.name, "dv", IndexFileNames.COMPOUND_FILE_EXTENSION); return new Lucene40DocValuesReader(state, filename, Lucene40FieldInfosReader.LEGACY_DV_TYPE_KEY); } // constants for VAR_INTS static final String VAR_INTS_CODEC_NAME = "PackedInts"; static final int VAR_INTS_VERSION_START = 0; static final int VAR_INTS_VERSION_CURRENT = VAR_INTS_VERSION_START; static final byte VAR_INTS_PACKED = 0x00; static final byte VAR_INTS_FIXED_64 = 0x01; // constants for FIXED_INTS_8, FIXED_INTS_16, FIXED_INTS_32, FIXED_INTS_64 static final String INTS_CODEC_NAME = "Ints"; static final int INTS_VERSION_START = 0; static final int INTS_VERSION_CURRENT = INTS_VERSION_START; // constants for FLOAT_32, FLOAT_64 static final String FLOATS_CODEC_NAME = "Floats"; static final int FLOATS_VERSION_START = 0; static final int FLOATS_VERSION_CURRENT = FLOATS_VERSION_START; // constants for BYTES_FIXED_STRAIGHT static final String BYTES_FIXED_STRAIGHT_CODEC_NAME = "FixedStraightBytes"; static final int BYTES_FIXED_STRAIGHT_VERSION_START = 0; static final int BYTES_FIXED_STRAIGHT_VERSION_CURRENT = BYTES_FIXED_STRAIGHT_VERSION_START; // constants for BYTES_VAR_STRAIGHT static final String BYTES_VAR_STRAIGHT_CODEC_NAME_IDX = "VarStraightBytesIdx"; 
static final String BYTES_VAR_STRAIGHT_CODEC_NAME_DAT = "VarStraightBytesDat"; static final int BYTES_VAR_STRAIGHT_VERSION_START = 0; static final int BYTES_VAR_STRAIGHT_VERSION_CURRENT = BYTES_VAR_STRAIGHT_VERSION_START; // constants for BYTES_FIXED_DEREF static final String BYTES_FIXED_DEREF_CODEC_NAME_IDX = "FixedDerefBytesIdx"; static final String BYTES_FIXED_DEREF_CODEC_NAME_DAT = "FixedDerefBytesDat"; static final int BYTES_FIXED_DEREF_VERSION_START = 0; static final int BYTES_FIXED_DEREF_VERSION_CURRENT = BYTES_FIXED_DEREF_VERSION_START; // constants for BYTES_VAR_DEREF static final String BYTES_VAR_DEREF_CODEC_NAME_IDX = "VarDerefBytesIdx"; static final String BYTES_VAR_DEREF_CODEC_NAME_DAT = "VarDerefBytesDat"; static final int BYTES_VAR_DEREF_VERSION_START = 0; static final int BYTES_VAR_DEREF_VERSION_CURRENT = BYTES_VAR_DEREF_VERSION_START; // constants for BYTES_FIXED_SORTED static final String BYTES_FIXED_SORTED_CODEC_NAME_IDX = "FixedSortedBytesIdx"; static final String BYTES_FIXED_SORTED_CODEC_NAME_DAT = "FixedSortedBytesDat"; static final int BYTES_FIXED_SORTED_VERSION_START = 0; static final int BYTES_FIXED_SORTED_VERSION_CURRENT = BYTES_FIXED_SORTED_VERSION_START; // constants for BYTES_VAR_SORTED // NOTE THIS IS NOT A BUG! 4.0 actually screwed this up (VAR_SORTED and VAR_DEREF have same codec header) static final String BYTES_VAR_SORTED_CODEC_NAME_IDX = "VarDerefBytesIdx"; static final String BYTES_VAR_SORTED_CODEC_NAME_DAT = "VarDerefBytesDat"; static final int BYTES_VAR_SORTED_VERSION_START = 0; static final int BYTES_VAR_SORTED_VERSION_CURRENT = BYTES_VAR_SORTED_VERSION_START; }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy