
// org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat (Maven / Gradle / Ivy)
/*
* COPIED FROM APACHE LUCENE 4.7.2
*
* Git URL: git@github.com:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
*
* (see https://issues.apache.org/jira/browse/OAK-10786 for details)
*/
package org.apache.lucene.codecs.lucene41;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsFormat;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.packed.PackedInts;
/**
* Lucene 4.1 stored fields format.
*
* Principle
* This {@link StoredFieldsFormat} compresses blocks of 16KB of documents in
* order to improve the compression ratio compared to document-level
* compression. It uses the LZ4
* compression algorithm, which is fast to compress and very fast to decompress
* data. Although the compression method that is used focuses more on speed
* than on compression ratio, it should provide interesting compression ratios
* for redundant inputs (such as log files, HTML or plain text).
* File formats
* Stored fields are represented by two files:
*
* - A fields data file (extension .fdt). This file stores a compact
* representation of documents in compressed blocks of 16KB or more. When
* writing a segment, documents are appended to an in-memory byte[]
* buffer. When its size reaches 16KB or more, some metadata about the documents
* is flushed to disk, immediately followed by a compressed representation of
* the buffer using the
* LZ4
* compression format.
* Here is a more detailed description of the field data file format:
*
* - FieldData (.fdt) --> <Header>, PackedIntsVersion, <Chunk><sup>ChunkCount</sup>
* - Header --> {@link CodecUtil#writeHeader CodecHeader}
* - PackedIntsVersion --> {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}
* - ChunkCount is not known in advance and is the number of chunks necessary to store all documents of the segment
* - Chunk --> DocBase, ChunkDocs, DocFieldCounts, DocLengths, <CompressedDocs>
* - DocBase --> the ID of the first document of the chunk as a {@link DataOutput#writeVInt VInt}
* - ChunkDocs --> the number of documents in the chunk as a {@link DataOutput#writeVInt VInt}
* - DocFieldCounts --> the number of stored fields of every document in the chunk, encoded as follows:
* - if chunkDocs=1, the unique value is encoded as a {@link DataOutput#writeVInt VInt}
* - else read a {@link DataOutput#writeVInt VInt} (let's call it bitsRequired)
* - if bitsRequired is 0 then all values are equal, and the common value is the following {@link DataOutput#writeVInt VInt}
* - else bitsRequired is the number of bits required to store any value, and values are stored in a {@link PackedInts packed} array where every value is stored on exactly bitsRequired bits
*
*
* - DocLengths --> the lengths of all documents in the chunk, encoded with the same method as DocFieldCounts
* - CompressedDocs --> a compressed representation of <Docs> using the LZ4 compression format
* - Docs --> <Doc><sup>ChunkDocs</sup>
* - Doc --> <FieldNumAndType, Value><sup>DocFieldCount</sup>
* - FieldNumAndType --> a {@link DataOutput#writeVLong VLong}, whose 3 last bits are Type and other bits are FieldNum
* - Type -->
* - 0: Value is String
* - 1: Value is BinaryValue
* - 2: Value is Int
* - 3: Value is Float
* - 4: Value is Long
* - 5: Value is Double
* - 6, 7: unused
*
* - FieldNum --> an ID of the field
* - Value --> {@link DataOutput#writeString(String) String} | BinaryValue | Int | Float | Long | Double depending on Type
* - BinaryValue --> ValueLength <Byte><sup>ValueLength</sup>
*
* Notes
*
* - If documents are larger than 16KB then chunks will likely contain only
* one document. However, documents can never spread across several chunks (all
* fields of a single document are in the same chunk).
* - When at least one document in a chunk is large enough so that the chunk
* is larger than 32KB, the chunk will actually be compressed in several LZ4
* blocks of 16KB. This allows {@link StoredFieldVisitor}s which are only
* interested in the first fields of a document to not have to decompress 10MB
* of data if the document is 10MB, but only 16KB.
* - Given that the original lengths are written in the metadata of the chunk,
* the decompressor can leverage this information to stop decoding as soon as
* enough data has been decompressed.
* - In case documents are incompressible, CompressedDocs will be less than
* 0.5% larger than Docs.
*
*
* - A fields index file (extension .fdx).
*
* - FieldsIndex (.fdx) --> <Header>, <ChunkIndex>
* - Header --> {@link CodecUtil#writeHeader CodecHeader}
* - ChunkIndex: See {@link CompressingStoredFieldsIndexWriter}
*
*
*
* Known limitations
* This {@link StoredFieldsFormat} does not support individual documents
* larger than (2<sup>31</sup> - 2<sup>14</sup>) bytes. In case this
* is a problem, you should use another format, such as
* {@link Lucene40StoredFieldsFormat}.
* @lucene.experimental
*/
public final class Lucene41StoredFieldsFormat extends CompressingStoredFieldsFormat {

  /** First argument handed to the parent constructor. */
  private static final String FORMAT_NAME = "Lucene41StoredFields";

  /**
   * Third argument handed to the parent constructor: {@code 1 << 14} == 16384,
   * which lines up with the 16KB block size mentioned in the class documentation.
   */
  private static final int CHUNK_SIZE = 1 << 14;

  /**
   * Sole constructor. Delegates to {@link CompressingStoredFieldsFormat} with
   * this format's fixed name, {@link CompressionMode#FAST} and the chunk size.
   */
  public Lucene41StoredFieldsFormat() {
    super(FORMAT_NAME, CompressionMode.FAST, CHUNK_SIZE);
  }
}