All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.codecs.CodecUtil Maven / Gradle / Ivy

There is a newer version: 4.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs;


import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.store.BufferedChecksumIndexInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;

/**
 * Utility class for reading and writing versioned headers.
 * 

* Writing codec headers is useful to ensure that a file is in * the format you think it is. * * @lucene.experimental */ public final class CodecUtil { private CodecUtil() {} // no instance /** * Constant to identify the start of a codec header. */ public final static int CODEC_MAGIC = 0x3fd76c17; /** * Constant to identify the start of a codec footer. */ public final static int FOOTER_MAGIC = ~CODEC_MAGIC; /** * Writes a codec header, which records both a string to * identify the file and a version number. This header can * be parsed and validated with * {@link #checkHeader(DataInput, String, int, int) checkHeader()}. *

* CodecHeader --> Magic,CodecName,Version *

    *
  • Magic --> {@link DataOutput#writeInt Uint32}. This * identifies the start of the header. It is always {@value #CODEC_MAGIC}. *
  • CodecName --> {@link DataOutput#writeString String}. This * is a string to identify this file. *
  • Version --> {@link DataOutput#writeInt Uint32}. Records * the version of the file. *
*

* Note that the length of a codec header depends only upon the * name of the codec, so this length can be computed at any time * with {@link #headerLength(String)}. * * @param out Output stream * @param codec String to identify this file. It should be simple ASCII, * less than 128 characters in length. * @param version Version number * @throws IOException If there is an I/O error writing to the underlying medium. * @throws IllegalArgumentException If the codec name is not simple ASCII, or is more than 127 characters in length */ public static void writeHeader(DataOutput out, String codec, int version) throws IOException { BytesRef bytes = new BytesRef(codec); if (bytes.length != codec.length() || bytes.length >= 128) { throw new IllegalArgumentException("codec must be simple ASCII, less than 128 characters in length [got " + codec + "]"); } out.writeInt(CODEC_MAGIC); out.writeString(codec); out.writeInt(version); } /** * Writes a codec header for an index file, which records both a string to * identify the format of the file, a version number, and data to identify * the file instance (ID and auxiliary suffix such as generation). *

* This header can be parsed and validated with * {@link #checkIndexHeader(DataInput, String, int, int, byte[], String) checkIndexHeader()}. *

* IndexHeader --> CodecHeader,ObjectID,ObjectSuffix *

    *
  • CodecHeader --> {@link #writeHeader} *
  • ObjectID --> {@link DataOutput#writeByte byte}16 *
  • ObjectSuffix --> SuffixLength,SuffixBytes *
  • SuffixLength --> {@link DataOutput#writeByte byte} *
  • SuffixBytes --> {@link DataOutput#writeByte byte}SuffixLength *
*

* Note that the length of an index header depends only upon the * name of the codec and suffix, so this length can be computed at any time * with {@link #indexHeaderLength(String,String)}. * * @param out Output stream * @param codec String to identify the format of this file. It should be simple ASCII, * less than 128 characters in length. * @param id Unique identifier for this particular file instance. * @param suffix auxiliary suffix information for the file. It should be simple ASCII, * less than 256 characters in length. * @param version Version number * @throws IOException If there is an I/O error writing to the underlying medium. * @throws IllegalArgumentException If the codec name is not simple ASCII, or * is more than 127 characters in length, or if id is invalid, * or if the suffix is not simple ASCII, or more than 255 characters * in length. */ public static void writeIndexHeader(DataOutput out, String codec, int version, byte[] id, String suffix) throws IOException { if (id.length != StringHelper.ID_LENGTH) { throw new IllegalArgumentException("Invalid id: " + StringHelper.idToString(id)); } writeHeader(out, codec, version); out.writeBytes(id, 0, id.length); BytesRef suffixBytes = new BytesRef(suffix); if (suffixBytes.length != suffix.length() || suffixBytes.length >= 256) { throw new IllegalArgumentException("suffix must be simple ASCII, less than 256 characters in length [got " + suffix + "]"); } out.writeByte((byte) suffixBytes.length); out.writeBytes(suffixBytes.bytes, suffixBytes.offset, suffixBytes.length); } /** * Computes the length of a codec header. * * @param codec Codec name. * @return length of the entire codec header. * @see #writeHeader(DataOutput, String, int) */ public static int headerLength(String codec) { return 9+codec.length(); } /** * Computes the length of an index header. * * @param codec Codec name. * @return length of the entire index header. * @see #writeIndexHeader(DataOutput, String, int, byte[], String) */ public static int indexHeaderLength(String codec, String suffix) { return headerLength(codec) + StringHelper.ID_LENGTH + 1 + suffix.length(); } /** * Reads and validates a header previously written with * {@link #writeHeader(DataOutput, String, int)}. *

* When reading a file, supply the expected codec and * an expected version range (minVersion to maxVersion). * * @param in Input stream, positioned at the point where the * header was previously written. Typically this is located * at the beginning of the file. * @param codec The expected codec name. * @param minVersion The minimum supported expected version number. * @param maxVersion The maximum supported expected version number. * @return The actual version found, when a valid header is found * that matches codec, with an actual version * where {@code minVersion <= actual <= maxVersion}. * Otherwise an exception is thrown. * @throws CorruptIndexException If the first four bytes are not * {@link #CODEC_MAGIC}, or if the actual codec found is * not codec. * @throws IndexFormatTooOldException If the actual version is less * than minVersion. * @throws IndexFormatTooNewException If the actual version is greater * than maxVersion. * @throws IOException If there is an I/O error reading from the underlying medium. * @see #writeHeader(DataOutput, String, int) */ public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion) throws IOException { // Safety to guard against reading a bogus string: final int actualHeader = in.readInt(); if (actualHeader != CODEC_MAGIC) { throw new CorruptIndexException("codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CODEC_MAGIC, in); } return checkHeaderNoMagic(in, codec, minVersion, maxVersion); } /** Like {@link * #checkHeader(DataInput,String,int,int)} except this * version assumes the first int has already been read * and validated from the input. */ public static int checkHeaderNoMagic(DataInput in, String codec, int minVersion, int maxVersion) throws IOException { final String actualCodec = in.readString(); if (!actualCodec.equals(codec)) { throw new CorruptIndexException("codec mismatch: actual codec=" + actualCodec + " vs expected codec=" + codec, in); } final int actualVersion = in.readInt(); if (actualVersion < minVersion) { throw new IndexFormatTooOldException(in, actualVersion, minVersion, maxVersion); } if (actualVersion > maxVersion) { throw new IndexFormatTooNewException(in, actualVersion, minVersion, maxVersion); } return actualVersion; } /** * Reads and validates a header previously written with * {@link #writeIndexHeader(DataOutput, String, int, byte[], String)}. *

* When reading a file, supply the expected codec, * expected version range (minVersion to maxVersion), * and object ID and suffix. * * @param in Input stream, positioned at the point where the * header was previously written. Typically this is located * at the beginning of the file. * @param codec The expected codec name. * @param minVersion The minimum supported expected version number. * @param maxVersion The maximum supported expected version number. * @param expectedID The expected object identifier for this file. * @param expectedSuffix The expected auxiliary suffix for this file. * @return The actual version found, when a valid header is found * that matches codec, with an actual version * where {@code minVersion <= actual <= maxVersion}, * and matching expectedID and expectedSuffix * Otherwise an exception is thrown. * @throws CorruptIndexException If the first four bytes are not * {@link #CODEC_MAGIC}, or if the actual codec found is * not codec, or if the expectedID * or expectedSuffix do not match. * @throws IndexFormatTooOldException If the actual version is less * than minVersion. * @throws IndexFormatTooNewException If the actual version is greater * than maxVersion. * @throws IOException If there is an I/O error reading from the underlying medium. * @see #writeIndexHeader(DataOutput, String, int, byte[],String) */ public static int checkIndexHeader(DataInput in, String codec, int minVersion, int maxVersion, byte[] expectedID, String expectedSuffix) throws IOException { int version = checkHeader(in, codec, minVersion, maxVersion); checkIndexHeaderID(in, expectedID); checkIndexHeaderSuffix(in, expectedSuffix); return version; } /** * Expert: verifies the incoming {@link IndexInput} has an index header * and that its segment ID matches the expected one, and then copies * that index header into the provided {@link DataOutput}. This is * useful when building compound files. * * @param in Input stream, positioned at the point where the * index header was previously written. Typically this is located * at the beginning of the file. * @param out Output stream, where the header will be copied to. * @param expectedID Expected segment ID * @throws CorruptIndexException If the first four bytes are not * {@link #CODEC_MAGIC}, or if the expectedID * does not match. * @throws IOException If there is an I/O error reading from the underlying medium. * * @lucene.internal */ public static void verifyAndCopyIndexHeader(IndexInput in, DataOutput out, byte[] expectedID) throws IOException { // make sure it's large enough to have a header and footer if (in.length() < footerLength() + headerLength("")) { throw new CorruptIndexException("compound sub-files must have a valid codec header and footer: file is too small (" + in.length() + " bytes)", in); } int actualHeader = in.readInt(); if (actualHeader != CODEC_MAGIC) { throw new CorruptIndexException("compound sub-files must have a valid codec header and footer: codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CodecUtil.CODEC_MAGIC, in); } // we can't verify these, so we pass-through: String codec = in.readString(); int version = in.readInt(); // verify id: checkIndexHeaderID(in, expectedID); // we can't verify extension either, so we pass-through: int suffixLength = in.readByte() & 0xFF; byte[] suffixBytes = new byte[suffixLength]; in.readBytes(suffixBytes, 0, suffixLength); // now write the header we just verified out.writeInt(CodecUtil.CODEC_MAGIC); out.writeString(codec); out.writeInt(version); out.writeBytes(expectedID, 0, expectedID.length); out.writeByte((byte) suffixLength); out.writeBytes(suffixBytes, 0, suffixLength); } /** Retrieves the full index header from the provided {@link IndexInput}. * This throws {@link CorruptIndexException} if this file does * not appear to be an index file. */ public static byte[] readIndexHeader(IndexInput in) throws IOException { in.seek(0); final int actualHeader = in.readInt(); if (actualHeader != CODEC_MAGIC) { throw new CorruptIndexException("codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CODEC_MAGIC, in); } String codec = in.readString(); in.readInt(); in.seek(in.getFilePointer() + StringHelper.ID_LENGTH); int suffixLength = in.readByte() & 0xFF; byte[] bytes = new byte[headerLength(codec) + StringHelper.ID_LENGTH + 1 + suffixLength]; in.seek(0); in.readBytes(bytes, 0, bytes.length); return bytes; } /** Retrieves the full footer from the provided {@link IndexInput}. This throws * {@link CorruptIndexException} if this file does not have a valid footer. */ public static byte[] readFooter(IndexInput in) throws IOException { if (in.length() < footerLength()) { throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), in); } in.seek(in.length() - footerLength()); validateFooter(in); in.seek(in.length() - footerLength()); byte[] bytes = new byte[footerLength()]; in.readBytes(bytes, 0, bytes.length); return bytes; } /** Expert: just reads and verifies the object ID of an index header */ public static byte[] checkIndexHeaderID(DataInput in, byte[] expectedID) throws IOException { byte id[] = new byte[StringHelper.ID_LENGTH]; in.readBytes(id, 0, id.length); if (!Arrays.equals(id, expectedID)) { throw new CorruptIndexException("file mismatch, expected id=" + StringHelper.idToString(expectedID) + ", got=" + StringHelper.idToString(id), in); } return id; } /** Expert: just reads and verifies the suffix of an index header */ public static String checkIndexHeaderSuffix(DataInput in, String expectedSuffix) throws IOException { int suffixLength = in.readByte() & 0xFF; byte suffixBytes[] = new byte[suffixLength]; in.readBytes(suffixBytes, 0, suffixBytes.length); String suffix = new String(suffixBytes, 0, suffixBytes.length, StandardCharsets.UTF_8); if (!suffix.equals(expectedSuffix)) { throw new CorruptIndexException("file mismatch, expected suffix=" + expectedSuffix + ", got=" + suffix, in); } return suffix; } /** * Writes a codec footer, which records both a checksum * algorithm ID and a checksum. This footer can * be parsed and validated with * {@link #checkFooter(ChecksumIndexInput) checkFooter()}. *

* CodecFooter --> Magic,AlgorithmID,Checksum *

    *
  • Magic --> {@link DataOutput#writeInt Uint32}. This * identifies the start of the footer. It is always {@value #FOOTER_MAGIC}. *
  • AlgorithmID --> {@link DataOutput#writeInt Uint32}. This * indicates the checksum algorithm used. Currently this is always 0, * for zlib-crc32. *
  • Checksum --> {@link DataOutput#writeLong Uint64}. The * actual checksum value for all previous bytes in the stream, including * the bytes from Magic and AlgorithmID. *
* * @param out Output stream * @throws IOException If there is an I/O error writing to the underlying medium. */ public static void writeFooter(IndexOutput out) throws IOException { out.writeInt(FOOTER_MAGIC); out.writeInt(0); writeCRC(out); } /** * Computes the length of a codec footer. * * @return length of the entire codec footer. * @see #writeFooter(IndexOutput) */ public static int footerLength() { return 16; } /** * Validates the codec footer previously written by {@link #writeFooter}. * @return actual checksum value * @throws IOException if the footer is invalid, if the checksum does not match, * or if {@code in} is not properly positioned before the footer * at the end of the stream. */ public static long checkFooter(ChecksumIndexInput in) throws IOException { validateFooter(in); long actualChecksum = in.getChecksum(); long expectedChecksum = readCRC(in); if (expectedChecksum != actualChecksum) { throw new CorruptIndexException("checksum failed (hardware problem?) : expected=" + Long.toHexString(expectedChecksum) + " actual=" + Long.toHexString(actualChecksum), in); } return actualChecksum; } /** * Validates the codec footer previously written by {@link #writeFooter}, optionally * passing an unexpected exception that has already occurred. *

* When a {@code priorException} is provided, this method will add a suppressed exception * indicating whether the checksum for the stream passes, fails, or cannot be computed, and * rethrow it. Otherwise it behaves the same as {@link #checkFooter(ChecksumIndexInput)}. *

* Example usage: *

   * try (ChecksumIndexInput input = ...) {
   *   Throwable priorE = null;
   *   try {
   *     // ... read a bunch of stuff ... 
   *   } catch (Throwable exception) {
   *     priorE = exception;
   *   } finally {
   *     CodecUtil.checkFooter(input, priorE);
   *   }
   * }
   * 
*/ public static void checkFooter(ChecksumIndexInput in, Throwable priorException) throws IOException { if (priorException == null) { checkFooter(in); } else { try { long remaining = in.length() - in.getFilePointer(); if (remaining < footerLength()) { // corruption caused us to read into the checksum footer already: we can't proceed priorException.addSuppressed(new CorruptIndexException("checksum status indeterminate: remaining=" + remaining + ", please run checkindex for more details", in)); } else { // otherwise, skip any unread bytes. in.skipBytes(remaining - footerLength()); // now check the footer try { long checksum = checkFooter(in); priorException.addSuppressed(new CorruptIndexException("checksum passed (" + Long.toHexString(checksum) + "). possibly transient resource issue, or a Lucene or JVM bug", in)); } catch (CorruptIndexException t) { priorException.addSuppressed(t); } } } catch (Throwable t) { // catch-all for things that shouldn't go wrong (e.g. OOM during readInt) but could... priorException.addSuppressed(new CorruptIndexException("checksum status indeterminate: unexpected exception", in, t)); } throw IOUtils.rethrowAlways(priorException); } } /** * Returns (but does not validate) the checksum previously written by {@link #checkFooter}. * @return actual checksum value * @throws IOException if the footer is invalid */ public static long retrieveChecksum(IndexInput in) throws IOException { if (in.length() < footerLength()) { throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), in); } in.seek(in.length() - footerLength()); validateFooter(in); return readCRC(in); } private static void validateFooter(IndexInput in) throws IOException { long remaining = in.length() - in.getFilePointer(); long expected = footerLength(); if (remaining < expected) { throw new CorruptIndexException("misplaced codec footer (file truncated?): remaining=" + remaining + ", expected=" + expected + ", fp=" + in.getFilePointer(), in); } else if (remaining > expected) { throw new CorruptIndexException("misplaced codec footer (file extended?): remaining=" + remaining + ", expected=" + expected + ", fp=" + in.getFilePointer(), in); } final int magic = in.readInt(); if (magic != FOOTER_MAGIC) { throw new CorruptIndexException("codec footer mismatch (file truncated?): actual footer=" + magic + " vs expected footer=" + FOOTER_MAGIC, in); } final int algorithmID = in.readInt(); if (algorithmID != 0) { throw new CorruptIndexException("codec footer mismatch: unknown algorithmID: " + algorithmID, in); } } /** * Clones the provided input, reads all bytes from the file, and calls {@link #checkFooter} *

* Note that this method may be slow, as it must process the entire file. * If you just need to extract the checksum value, call {@link #retrieveChecksum}. */ public static long checksumEntireFile(IndexInput input) throws IOException { IndexInput clone = input.clone(); clone.seek(0); ChecksumIndexInput in = new BufferedChecksumIndexInput(clone); assert in.getFilePointer() == 0; if (in.length() < footerLength()) { throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), input); } in.seek(in.length() - footerLength()); return checkFooter(in); } /** * Reads CRC32 value as a 64-bit long from the input. * @throws CorruptIndexException if CRC is formatted incorrectly (wrong bits set) * @throws IOException if an i/o error occurs */ static long readCRC(IndexInput input) throws IOException { long value = input.readLong(); if ((value & 0xFFFFFFFF00000000L) != 0) { throw new CorruptIndexException("Illegal CRC-32 checksum: " + value, input); } return value; } /** * Writes CRC32 value as a 64-bit long to the output. * @throws IllegalStateException if CRC is formatted incorrectly (wrong bits set) * @throws IOException if an i/o error occurs */ static void writeCRC(IndexOutput output) throws IOException { long value = output.getChecksum(); if ((value & 0xFFFFFFFF00000000L) != 0) { throw new IllegalStateException("Illegal CRC-32 checksum: " + value + " (resource=" + output + ")"); } output.writeLong(value); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy