// org.apache.lucene.codecs.CodecUtil Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.store.BufferedChecksumIndexInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
/**
 * Utility class for reading and writing versioned headers.
 * <p>
 * Writing codec headers is useful to ensure that a file is in
 * the format you think it is.
 *
 * @lucene.experimental
 */
public final class CodecUtil {
  private CodecUtil() {} // no instance

  /**
   * Constant to identify the start of a codec header.
   */
  public static final int CODEC_MAGIC = 0x3fd76c17;

  /**
   * Constant to identify the start of a codec footer.
   */
  public static final int FOOTER_MAGIC = ~CODEC_MAGIC;

  /**
   * Writes a codec header, which records both a string to
   * identify the file and a version number. This header can
   * be parsed and validated with
   * {@link #checkHeader(DataInput, String, int, int) checkHeader()}.
   * <p>
   * CodecHeader --&gt; Magic,CodecName,Version
   * <ul>
   *   <li>Magic --&gt; {@link DataOutput#writeInt Uint32}. This
   *       identifies the start of the header. It is always {@value #CODEC_MAGIC}.
   *   <li>CodecName --&gt; {@link DataOutput#writeString String}. This
   *       is a string to identify this file.
   *   <li>Version --&gt; {@link DataOutput#writeInt Uint32}. Records
   *       the version of the file.
   * </ul>
   * <p>
   * Note that the length of a codec header depends only upon the
   * name of the codec, so this length can be computed at any time
   * with {@link #headerLength(String)}.
   *
   * @param out Output stream
   * @param codec String to identify this file. It should be simple ASCII,
   *              less than 128 characters in length.
   * @param version Version number
   * @throws IOException If there is an I/O error writing to the underlying medium.
   * @throws IllegalArgumentException If the codec name is not simple ASCII, or is more than 127 characters in length
   */
  public static void writeHeader(DataOutput out, String codec, int version) throws IOException {
    BytesRef bytes = new BytesRef(codec);
    // The name is rejected before anything is written, so an invalid name leaves the output untouched.
    if (bytes.length != codec.length() || bytes.length >= 128) {
      throw new IllegalArgumentException("codec must be simple ASCII, less than 128 characters in length [got " + codec + "]");
    }
    out.writeInt(CODEC_MAGIC);
    out.writeString(codec);
    out.writeInt(version);
  }

  /**
   * Writes a codec header for an index file, which records both a string to
   * identify the format of the file, a version number, and data to identify
   * the file instance (ID and auxiliary suffix such as generation).
   * <p>
   * This header can be parsed and validated with
   * {@link #checkIndexHeader(DataInput, String, int, int, byte[], String) checkIndexHeader()}.
   * <p>
   * IndexHeader --&gt; CodecHeader,ObjectID,ObjectSuffix
   * <ul>
   *   <li>CodecHeader   --&gt; {@link #writeHeader}
   *   <li>ObjectID      --&gt; {@link DataOutput#writeByte byte}<sup>16</sup>
   *   <li>ObjectSuffix  --&gt; SuffixLength,SuffixBytes
   *   <li>SuffixLength  --&gt; {@link DataOutput#writeByte byte}
   *   <li>SuffixBytes   --&gt; {@link DataOutput#writeByte byte}<sup>SuffixLength</sup>
   * </ul>
   * <p>
   * Note that the length of an index header depends only upon the
   * name of the codec and suffix, so this length can be computed at any time
   * with {@link #indexHeaderLength(String,String)}.
   * <p>
   * All arguments are validated before the first byte is written, so an
   * {@link IllegalArgumentException} never leaves a partial header in {@code out}.
   *
   * @param out Output stream
   * @param codec String to identify the format of this file. It should be simple ASCII,
   *              less than 128 characters in length.
   * @param version Version number
   * @param id Unique identifier for this particular file instance.
   * @param suffix auxiliary suffix information for the file. It should be simple ASCII,
   *              less than 256 characters in length.
   * @throws IOException If there is an I/O error writing to the underlying medium.
   * @throws IllegalArgumentException If the codec name is not simple ASCII, or
   *         is more than 127 characters in length, or if id is invalid,
   *         or if the suffix is not simple ASCII, or more than 255 characters
   *         in length.
   */
  public static void writeIndexHeader(DataOutput out, String codec, int version, byte[] id, String suffix) throws IOException {
    if (id.length != StringHelper.ID_LENGTH) {
      throw new IllegalArgumentException("Invalid id: " + StringHelper.idToString(id));
    }
    // Check the suffix up front: doing it after writeHeader() would leave a
    // half-written header behind when the suffix turns out to be invalid.
    BytesRef suffixBytes = new BytesRef(suffix);
    if (suffixBytes.length != suffix.length() || suffixBytes.length >= 256) {
      throw new IllegalArgumentException("suffix must be simple ASCII, less than 256 characters in length [got " + suffix + "]");
    }
    // writeHeader() validates the codec name itself before it writes anything.
    writeHeader(out, codec, version);
    out.writeBytes(id, 0, id.length);
    out.writeByte((byte) suffixBytes.length);
    out.writeBytes(suffixBytes.bytes, suffixBytes.offset, suffixBytes.length);
  }

  /**
   * Computes the length of a codec header.
   * <p>
   * The result is {@code 9 + codec.length()}; it matches what
   * {@link #writeHeader(DataOutput, String, int)} writes for the names that method accepts
   * (simple ASCII, less than 128 characters).
   *
   * @param codec Codec name.
   * @return length of the entire codec header.
   * @see #writeHeader(DataOutput, String, int)
   */
  public static int headerLength(String codec) {
    return 9 + codec.length();
  }

  /**
   * Computes the length of an index header.
   *
   * @param codec Codec name.
   * @param suffix Auxiliary suffix of the file.
   * @return length of the entire index header.
   * @see #writeIndexHeader(DataOutput, String, int, byte[], String)
   */
  public static int indexHeaderLength(String codec, String suffix) {
    // codec header + object id + one suffix-length byte + suffix bytes
    return headerLength(codec) + StringHelper.ID_LENGTH + 1 + suffix.length();
  }

  /**
   * Reads and validates a header previously written with
   * {@link #writeHeader(DataOutput, String, int)}.
   * <p>
   * When reading a file, supply the expected <code>codec</code> and
   * an expected version range (<code>minVersion to maxVersion</code>).
   *
   * @param in Input stream, positioned at the point where the
   *        header was previously written. Typically this is located
   *        at the beginning of the file.
   * @param codec The expected codec name.
   * @param minVersion The minimum supported expected version number.
   * @param maxVersion The maximum supported expected version number.
   * @return The actual version found, when a valid header is found
   *         that matches <code>codec</code>, with an actual version
   *         where {@code minVersion <= actual <= maxVersion}.
   *         Otherwise an exception is thrown.
   * @throws CorruptIndexException If the first four bytes are not
   *         {@link #CODEC_MAGIC}, or if the actual codec found is
   *         not <code>codec</code>.
   * @throws IndexFormatTooOldException If the actual version is less
   *         than <code>minVersion</code>.
   * @throws IndexFormatTooNewException If the actual version is greater
   *         than <code>maxVersion</code>.
   * @throws IOException If there is an I/O error reading from the underlying medium.
   * @see #writeHeader(DataOutput, String, int)
   */
  public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion) throws IOException {
    // Safety to guard against reading a bogus string:
    final int actualHeader = in.readInt();
    if (actualHeader != CODEC_MAGIC) {
      throw new CorruptIndexException("codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CODEC_MAGIC, in);
    }
    return checkHeaderNoMagic(in, codec, minVersion, maxVersion);
  }

  /**
   * Like {@link #checkHeader(DataInput,String,int,int)} except this
   * version assumes the first int has already been read
   * and validated from the input.
   *
   * @param in Input stream, positioned right after the magic int.
   * @param codec The expected codec name.
   * @param minVersion The minimum supported expected version number.
   * @param maxVersion The maximum supported expected version number.
   * @return The actual version found.
   * @throws CorruptIndexException If the actual codec found is not <code>codec</code>.
   * @throws IndexFormatTooOldException If the actual version is less than <code>minVersion</code>.
   * @throws IndexFormatTooNewException If the actual version is greater than <code>maxVersion</code>.
   * @throws IOException If there is an I/O error reading from the underlying medium.
   */
  public static int checkHeaderNoMagic(DataInput in, String codec, int minVersion, int maxVersion) throws IOException {
    final String actualCodec = in.readString();
    if (!actualCodec.equals(codec)) {
      throw new CorruptIndexException("codec mismatch: actual codec=" + actualCodec + " vs expected codec=" + codec, in);
    }

    final int actualVersion = in.readInt();
    if (actualVersion < minVersion) {
      throw new IndexFormatTooOldException(in, actualVersion, minVersion, maxVersion);
    }
    if (actualVersion > maxVersion) {
      throw new IndexFormatTooNewException(in, actualVersion, minVersion, maxVersion);
    }
    return actualVersion;
  }

  /**
   * Reads and validates a header previously written with
   * {@link #writeIndexHeader(DataOutput, String, int, byte[], String)}.
   * <p>
   * When reading a file, supply the expected <code>codec</code>,
   * expected version range (<code>minVersion to maxVersion</code>),
   * and object ID and suffix.
   *
   * @param in Input stream, positioned at the point where the
   *        header was previously written. Typically this is located
   *        at the beginning of the file.
   * @param codec The expected codec name.
   * @param minVersion The minimum supported expected version number.
   * @param maxVersion The maximum supported expected version number.
   * @param expectedID The expected object identifier for this file.
   * @param expectedSuffix The expected auxiliary suffix for this file.
   * @return The actual version found, when a valid header is found
   *         that matches <code>codec</code>, with an actual version
   *         where {@code minVersion <= actual <= maxVersion},
   *         and matching <code>expectedID</code> and <code>expectedSuffix</code>.
   *         Otherwise an exception is thrown.
   * @throws CorruptIndexException If the first four bytes are not
   *         {@link #CODEC_MAGIC}, or if the actual codec found is
   *         not <code>codec</code>, or if the <code>expectedID</code>
   *         or <code>expectedSuffix</code> do not match.
   * @throws IndexFormatTooOldException If the actual version is less
   *         than <code>minVersion</code>.
   * @throws IndexFormatTooNewException If the actual version is greater
   *         than <code>maxVersion</code>.
   * @throws IOException If there is an I/O error reading from the underlying medium.
   * @see #writeIndexHeader(DataOutput, String, int, byte[],String)
   */
  public static int checkIndexHeader(DataInput in, String codec, int minVersion, int maxVersion, byte[] expectedID, String expectedSuffix) throws IOException {
    int version = checkHeader(in, codec, minVersion, maxVersion);
    checkIndexHeaderID(in, expectedID);
    checkIndexHeaderSuffix(in, expectedSuffix);
    return version;
  }

  /**
   * Expert: verifies the incoming {@link IndexInput} has an index header
   * and that its segment ID matches the expected one, and then copies
   * that index header into the provided {@link DataOutput}. This is
   * useful when building compound files.
   *
   * @param in Input stream, positioned at the point where the
   *        index header was previously written. Typically this is located
   *        at the beginning of the file.
   * @param out Output stream, where the header will be copied to.
   * @param expectedID Expected segment ID
   * @throws CorruptIndexException If the file is too small to hold an index header
   *         and a footer, if the first four bytes are not
   *         {@link #CODEC_MAGIC}, or if the <code>expectedID</code>
   *         does not match.
   * @throws IOException If there is an I/O error reading from the underlying medium.
   *
   * @lucene.internal
   */
  public static void verifyAndCopyIndexHeader(IndexInput in, DataOutput out, byte[] expectedID) throws IOException {
    // make sure it's large enough to have a header and footer; the smallest possible
    // index header (empty codec name, empty suffix) still carries the id and the suffix-length byte
    if (in.length() < footerLength() + indexHeaderLength("", "")) {
      throw new CorruptIndexException("compound sub-files must have a valid codec header and footer: file is too small (" + in.length() + " bytes)", in);
    }

    int actualHeader = in.readInt();
    if (actualHeader != CODEC_MAGIC) {
      throw new CorruptIndexException("compound sub-files must have a valid codec header and footer: codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CodecUtil.CODEC_MAGIC, in);
    }

    // we can't verify these, so we pass-through:
    String codec = in.readString();
    int version = in.readInt();

    // verify id:
    checkIndexHeaderID(in, expectedID);

    // we can't verify extension either, so we pass-through:
    int suffixLength = in.readByte() & 0xFF;
    byte[] suffixBytes = new byte[suffixLength];
    in.readBytes(suffixBytes, 0, suffixLength);

    // now write the header we just verified
    out.writeInt(CodecUtil.CODEC_MAGIC);
    out.writeString(codec);
    out.writeInt(version);
    // the id read from the input was just checked to equal expectedID
    out.writeBytes(expectedID, 0, expectedID.length);
    out.writeByte((byte) suffixLength);
    out.writeBytes(suffixBytes, 0, suffixLength);
  }

  /**
   * Retrieves the full index header from the provided {@link IndexInput}.
   * This throws {@link CorruptIndexException} if this file does
   * not appear to be an index file.
   * <p>
   * The input is repositioned: the header is parsed from position 0 and then
   * re-read from position 0 as raw bytes.
   *
   * @param in Input to read the header from.
   * @return the raw bytes of the index header (codec header, object ID, suffix length and suffix).
   * @throws CorruptIndexException If the first four bytes are not {@link #CODEC_MAGIC}.
   * @throws IOException If there is an I/O error reading from the underlying medium.
   */
  public static byte[] readIndexHeader(IndexInput in) throws IOException {
    in.seek(0);
    final int actualHeader = in.readInt();
    if (actualHeader != CODEC_MAGIC) {
      throw new CorruptIndexException("codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CODEC_MAGIC, in);
    }
    in.readString(); // codec name: only its encoded size matters here
    in.readInt();    // version
    in.seek(in.getFilePointer() + StringHelper.ID_LENGTH);
    int suffixLength = in.readByte() & 0xFF;
    // Size the result from the bytes actually consumed rather than from headerLength(codec),
    // which is only right for an ASCII codec name shorter than 128 characters.
    int headerLength = Math.toIntExact(in.getFilePointer()) + suffixLength;
    byte[] bytes = new byte[headerLength];
    in.seek(0);
    in.readBytes(bytes, 0, bytes.length);
    return bytes;
  }

  /**
   * Retrieves the full footer from the provided {@link IndexInput}. This throws
   * {@link CorruptIndexException} if this file does not have a valid footer.
   *
   * @param in Input to read the footer from; it is repositioned by this call.
   * @return the raw bytes of the footer ({@link #footerLength()} bytes).
   * @throws IOException If there is an I/O error reading from the underlying medium.
   */
  public static byte[] readFooter(IndexInput in) throws IOException {
    if (in.length() < footerLength()) {
      throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), in);
    }
    final long footerStart = in.length() - footerLength();
    in.seek(footerStart);
    validateFooter(in);
    // validateFooter() consumed part of the footer: rewind to copy it whole
    in.seek(footerStart);
    byte[] bytes = new byte[footerLength()];
    in.readBytes(bytes, 0, bytes.length);
    return bytes;
  }

  /**
   * Expert: just reads and verifies the object ID of an index header
   *
   * @param in Input stream, positioned at the object ID.
   * @param expectedID The expected object identifier.
   * @return the ID that was read.
   * @throws CorruptIndexException If the ID read differs from <code>expectedID</code>.
   * @throws IOException If there is an I/O error reading from the underlying medium.
   */
  public static byte[] checkIndexHeaderID(DataInput in, byte[] expectedID) throws IOException {
    byte[] id = new byte[StringHelper.ID_LENGTH];
    in.readBytes(id, 0, id.length);
    if (!Arrays.equals(id, expectedID)) {
      throw new CorruptIndexException("file mismatch, expected id=" + StringHelper.idToString(expectedID)
                                                         + ", got=" + StringHelper.idToString(id), in);
    }
    return id;
  }

  /**
   * Expert: just reads and verifies the suffix of an index header
   *
   * @param in Input stream, positioned at the suffix-length byte.
   * @param expectedSuffix The expected suffix.
   * @return the suffix that was read.
   * @throws CorruptIndexException If the suffix read differs from <code>expectedSuffix</code>.
   * @throws IOException If there is an I/O error reading from the underlying medium.
   */
  public static String checkIndexHeaderSuffix(DataInput in, String expectedSuffix) throws IOException {
    // the length is stored in a single byte, read back as an unsigned value (0..255)
    int suffixLength = in.readByte() & 0xFF;
    byte[] suffixBytes = new byte[suffixLength];
    in.readBytes(suffixBytes, 0, suffixBytes.length);
    String suffix = new String(suffixBytes, 0, suffixBytes.length, StandardCharsets.UTF_8);
    if (!suffix.equals(expectedSuffix)) {
      throw new CorruptIndexException("file mismatch, expected suffix=" + expectedSuffix
                                                             + ", got=" + suffix, in);
    }
    return suffix;
  }

  /**
   * Writes a codec footer, which records both a checksum
   * algorithm ID and a checksum. This footer can
   * be parsed and validated with
   * {@link #checkFooter(ChecksumIndexInput) checkFooter()}.
   * <p>
   * CodecFooter --&gt; Magic,AlgorithmID,Checksum
   * <ul>
   *   <li>Magic --&gt; {@link DataOutput#writeInt Uint32}. This
   *       identifies the start of the footer. It is always {@value #FOOTER_MAGIC}.
   *   <li>AlgorithmID --&gt; {@link DataOutput#writeInt Uint32}. This
   *       indicates the checksum algorithm used. Currently this is always 0,
   *       for zlib-crc32.
   *   <li>Checksum --&gt; {@link DataOutput#writeLong Uint64}. The
   *       actual checksum value for all previous bytes in the stream, including
   *       the bytes from Magic and AlgorithmID.
   * </ul>
   *
   * @param out Output stream
   * @throws IOException If there is an I/O error writing to the underlying medium.
   */
  public static void writeFooter(IndexOutput out) throws IOException {
    out.writeInt(FOOTER_MAGIC);
    out.writeInt(0); // algorithm ID
    writeCRC(out);
  }

  /**
   * Computes the length of a codec footer.
   *
   * @return length of the entire codec footer.
   * @see #writeFooter(IndexOutput)
   */
  public static int footerLength() {
    // magic (4 bytes) + algorithm ID (4 bytes) + checksum (8 bytes)
    return 16;
  }

  /**
   * Validates the codec footer previously written by {@link #writeFooter}.
   *
   * @param in Input positioned right before the footer at the end of the stream.
   * @return actual checksum value
   * @throws IOException if the footer is invalid, if the checksum does not match,
   *                     or if {@code in} is not properly positioned before the footer
   *                     at the end of the stream.
   */
  public static long checkFooter(ChecksumIndexInput in) throws IOException {
    validateFooter(in);
    // take the running checksum before the stored value is read from the input
    long actualChecksum = in.getChecksum();
    long expectedChecksum = readCRC(in);
    if (expectedChecksum != actualChecksum) {
      throw new CorruptIndexException("checksum failed (hardware problem?) : expected=" + Long.toHexString(expectedChecksum) +
                                                       " actual=" + Long.toHexString(actualChecksum), in);
    }
    return actualChecksum;
  }

  /**
   * Validates the codec footer previously written by {@link #writeFooter}, optionally
   * passing an unexpected exception that has already occurred.
   * <p>
   * When a {@code priorException} is provided, this method will add a suppressed exception
   * indicating whether the checksum for the stream passes, fails, or cannot be computed, and
   * rethrow it. Otherwise it behaves the same as {@link #checkFooter(ChecksumIndexInput)}.
   * <p>
   * Example usage:
   * <pre class="prettyprint">
   * try (ChecksumIndexInput input = ...) {
   *   Throwable priorE = null;
   *   try {
   *     // ... read a bunch of stuff ...
   *   } catch (Throwable exception) {
   *     priorE = exception;
   *   } finally {
   *     CodecUtil.checkFooter(input, priorE);
   *   }
   * }
   * </pre>
   *
   * @param in Input whose footer should be checked.
   * @param priorException Exception that already occurred while reading, or {@code null}.
   * @throws IOException if the footer check fails, or when {@code priorException} is rethrown.
   */
  public static void checkFooter(ChecksumIndexInput in, Throwable priorException) throws IOException {
    if (priorException == null) {
      checkFooter(in);
    } else {
      try {
        long remaining = in.length() - in.getFilePointer();
        if (remaining < footerLength()) {
          // corruption caused us to read into the checksum footer already: we can't proceed
          priorException.addSuppressed(new CorruptIndexException("checksum status indeterminate: remaining=" + remaining +
                                                                 ", please run checkindex for more details", in));
        } else {
          // otherwise, skip any unread bytes.
          in.skipBytes(remaining - footerLength());

          // now check the footer
          try {
            long checksum = checkFooter(in);
            priorException.addSuppressed(new CorruptIndexException("checksum passed (" + Long.toHexString(checksum) +
                                                                   "). possibly transient resource issue, or a Lucene or JVM bug", in));
          } catch (CorruptIndexException t) {
            priorException.addSuppressed(t);
          }
        }
      } catch (Throwable t) {
        // catch-all for things that shouldn't go wrong (e.g. OOM during readInt) but could...
        priorException.addSuppressed(new CorruptIndexException("checksum status indeterminate: unexpected exception", in, t));
      }
      // the original failure is always the one reported; the checksum verdict rides along as suppressed
      throw IOUtils.rethrowAlways(priorException);
    }
  }

  /**
   * Returns (but does not validate) the checksum previously written by {@link #writeFooter}.
   *
   * @param in Input to read the footer from; it is repositioned by this call.
   * @return actual checksum value
   * @throws IOException if the footer is invalid
   */
  public static long retrieveChecksum(IndexInput in) throws IOException {
    if (in.length() < footerLength()) {
      throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), in);
    }
    in.seek(in.length() - footerLength());
    validateFooter(in);
    return readCRC(in);
  }

  /**
   * Checks that exactly one footer's worth of bytes remains in {@code in} and that
   * those bytes start with {@link #FOOTER_MAGIC} followed by algorithm ID 0.
   * On success the input is left positioned at the checksum value.
   */
  private static void validateFooter(IndexInput in) throws IOException {
    long remaining = in.length() - in.getFilePointer();
    long expected = footerLength();
    if (remaining < expected) {
      throw new CorruptIndexException("misplaced codec footer (file truncated?): remaining=" + remaining + ", expected=" + expected + ", fp=" + in.getFilePointer(), in);
    } else if (remaining > expected) {
      throw new CorruptIndexException("misplaced codec footer (file extended?): remaining=" + remaining + ", expected=" + expected + ", fp=" + in.getFilePointer(), in);
    }

    final int magic = in.readInt();
    if (magic != FOOTER_MAGIC) {
      throw new CorruptIndexException("codec footer mismatch (file truncated?): actual footer=" + magic + " vs expected footer=" + FOOTER_MAGIC, in);
    }

    final int algorithmID = in.readInt();
    if (algorithmID != 0) {
      throw new CorruptIndexException("codec footer mismatch: unknown algorithmID: " + algorithmID, in);
    }
  }

  /**
   * Clones the provided input, reads all bytes from the file, and calls {@link #checkFooter}
   * <p>
   * Note that this method may be slow, as it must process the entire file.
   * If you just need to extract the checksum value, call {@link #retrieveChecksum}.
   *
   * @param input Input to verify; the work is done on a clone of it.
   * @return the checksum value of the file.
   * @throws IOException if the footer is invalid, the checksum does not match,
   *                     or there is an I/O error reading from the underlying medium.
   */
  public static long checksumEntireFile(IndexInput input) throws IOException {
    IndexInput clone = input.clone();
    clone.seek(0);
    ChecksumIndexInput in = new BufferedChecksumIndexInput(clone);
    assert in.getFilePointer() == 0;
    if (in.length() < footerLength()) {
      throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), input);
    }
    in.seek(in.length() - footerLength());
    return checkFooter(in);
  }

  /**
   * Reads CRC32 value as a 64-bit long from the input.
   *
   * @param input Input positioned at the stored checksum.
   * @return the checksum value read.
   * @throws CorruptIndexException if CRC is formatted incorrectly (wrong bits set)
   * @throws IOException if an i/o error occurs
   */
  static long readCRC(IndexInput input) throws IOException {
    long value = input.readLong();
    // a CRC-32 only occupies the low 32 bits
    if ((value & 0xFFFFFFFF00000000L) != 0) {
      throw new CorruptIndexException("Illegal CRC-32 checksum: " + value, input);
    }
    return value;
  }

  /**
   * Writes CRC32 value as a 64-bit long to the output.
   *
   * @param output Output whose current checksum is appended to it.
   * @throws IllegalStateException if CRC is formatted incorrectly (wrong bits set)
   * @throws IOException if an i/o error occurs
   */
  static void writeCRC(IndexOutput output) throws IOException {
    long value = output.getChecksum();
    // a CRC-32 only occupies the low 32 bits
    if ((value & 0xFFFFFFFF00000000L) != 0) {
      throw new IllegalStateException("Illegal CRC-32 checksum: " + value + " (resource=" + output + ")");
    }
    output.writeLong(value);
  }
}