// org.apache.hadoop.hbase.io.hfile.ChecksumUtil Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.io.hfile;
import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.hbase.nio.ByteBuff;
import org.apache.hadoop.hbase.nio.SingleByteBuff;
import org.apache.hadoop.hbase.util.ChecksumType;
import org.apache.hadoop.util.DataChecksum;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Utility methods to compute and validate HBase-level checksums of HFile blocks.
 * <p>
 * Validation methods report corruption by returning {@code false}; they are written so that
 * malformed header values read from the block (which is exactly the data being checked) lead to
 * a {@code false} result rather than an unrelated runtime exception.
 */
@InterfaceAudience.Private
public class ChecksumUtil {
  public static final Logger LOG = LoggerFactory.getLogger(ChecksumUtil.class);

  /** Size, in bytes, of the scratch array used to feed multi-buffer data into the checksum. */
  public static final int CHECKSUM_BUF_SIZE = 256;

  /**
   * This is used by unit tests to make checksum failures throw an exception instead of reporting
   * a failed validation. A failed validation will cause the higher layer to retry that read with
   * hdfs-level checksums. Instead, we would like checksum failures to cause the entire unit test
   * to fail.
   * <p>
   * NOTE(review): the flag is only written by
   * {@link #generateExceptionForChecksumFailureForTest(boolean)}; no code in this class reads it,
   * so setting it currently does not change how validation failures are reported here.
   */
  private static boolean generateExceptions = false;

  /**
   * Generates a checksum for all the data in indata. The checksum is written to outdata.
   * @param indata           input data stream
   * @param startOffset      starting offset in the indata stream from where to compute checksums
   * @param endOffset        ending offset in the indata stream up to which checksums need to be
   *                         computed
   * @param outdata          the output buffer where checksum values are written
   * @param outOffset        the starting offset in the outdata where the checksum values are
   *                         written
   * @param checksumType     type of checksum; {@link ChecksumType#NULL} makes this a no-op
   * @param bytesPerChecksum number of bytes per checksum value
   * @throws IllegalArgumentException if no {@link DataChecksum} can be created for the given
   *                                  checksum type and {@code bytesPerChecksum}
   */
  static void generateChecksums(byte[] indata, int startOffset, int endOffset, byte[] outdata,
    int outOffset, ChecksumType checksumType, int bytesPerChecksum) throws IOException {
    if (checksumType == ChecksumType.NULL) {
      return; // No checksum for this block.
    }

    DataChecksum checksum =
      DataChecksum.newDataChecksum(checksumType.getDataChecksumType(), bytesPerChecksum);
    if (checksum == null) {
      // The factory signals unusable arguments by returning null; fail with a descriptive message
      // instead of a bare NullPointerException on the next line.
      throw new IllegalArgumentException("Unable to create DataChecksum for checksumType="
        + checksumType + ", bytesPerChecksum=" + bytesPerChecksum);
    }
    checksum.calculateChunkedSums(ByteBuffer.wrap(indata, startOffset, endOffset - startOffset),
      ByteBuffer.wrap(outdata, outOffset, outdata.length - outOffset));
  }

  /**
   * Like the hadoop's {@link DataChecksum#verifyChunkedSums(ByteBuffer, ByteBuffer, String, long)},
   * this method will also verify checksum of each chunk in data. the difference is: this method can
   * accept {@link ByteBuff} as arguments, we can not add it in hadoop-common so defined here.
   * <p>
   * The positions of {@code data} and {@code checksums} are restored before returning from the
   * multi-buffer path; the single-buffer path works on duplicates of the underlying buffers.
   * @param dataChecksum to calculate the checksum.
   * @param data         as the input
   * @param checksums    to compare
   * @param pathName     indicate that the data is read from which file.
   * @return a flag indicate the checksum match or mismatch.
   * @see org.apache.hadoop.util.DataChecksum#verifyChunkedSums(ByteBuffer, ByteBuffer, String,
   *      long)
   */
  private static boolean verifyChunkedSums(DataChecksum dataChecksum, ByteBuff data,
    ByteBuff checksums, String pathName) {
    // Almost all of the HFile Block are about 64KB, and it would be a SingleByteBuff, use the
    // Hadoop's verify checksum directly, because it'll use the native checksum, which has no extra
    // byte[] allocation or copying. (HBASE-21917)
    if (data instanceof SingleByteBuff && checksums instanceof SingleByteBuff) {
      // the checksums ByteBuff must also be an SingleByteBuff because it's duplicated from data.
      ByteBuffer dataBB = (ByteBuffer) (data.nioByteBuffers()[0]).duplicate()
        .position(data.position()).limit(data.limit());
      ByteBuffer checksumBB = (ByteBuffer) (checksums.nioByteBuffers()[0]).duplicate()
        .position(checksums.position()).limit(checksums.limit());
      try {
        dataChecksum.verifyChunkedSums(dataBB, checksumBB, pathName, 0);
        return true;
      } catch (ChecksumException e) {
        return false;
      }
    }

    // If the block is a MultiByteBuff. we use a small byte[] to update the checksum many times for
    // reducing GC pressure. it's a rare case.
    int checksumTypeSize = dataChecksum.getChecksumType().size;
    if (checksumTypeSize == 0) {
      return true;
    }
    // we have 5 checksum type now: NULL,DEFAULT,MIXED,CRC32,CRC32C. the former three need 0 byte,
    // and the other two need 4 bytes.
    assert checksumTypeSize == 4;

    int bytesPerChecksum = dataChecksum.getBytesPerChecksum();
    int startDataPos = data.position();
    data.mark();
    checksums.mark();
    try {
      // allocate an small buffer for reducing young GC (HBASE-21917), and copy 256 bytes from
      // ByteBuff to update the checksum each time. if we upgrade to an future JDK and hadoop
      // version which support DataCheckSum#update(ByteBuffer), we won't need to update the checksum
      // multiple times then.
      byte[] buf = new byte[CHECKSUM_BUF_SIZE];
      byte[] sum = new byte[checksumTypeSize];
      while (data.remaining() > 0) {
        int n = Math.min(data.remaining(), bytesPerChecksum);
        if (checksums.remaining() < checksumTypeSize) {
          // The stored checksum region ends before every data chunk has been covered. Treat the
          // truncated block as a verification failure instead of reading past the end of it.
          if (LOG.isTraceEnabled()) {
            LOG.trace("Checksum error: {} at {} has only {} checksum byte(s) left, need {}",
              pathName, data.position() - startDataPos, checksums.remaining(), checksumTypeSize);
          }
          return false;
        }
        checksums.get(sum);
        dataChecksum.reset();
        for (int remain = n, len; remain > 0; remain -= len) {
          // Copy 256 bytes from ByteBuff to update the checksum each time, if the remaining
          // bytes is less than 256, then just update the remaining bytes.
          len = Math.min(CHECKSUM_BUF_SIZE, remain);
          data.get(buf, 0, len);
          dataChecksum.update(buf, 0, len);
        }
        int calculated = (int) dataChecksum.getValue();
        // Assemble the 4 stored bytes into an int, most significant byte first.
        int stored = (sum[0] << 24 & 0xff000000) | (sum[1] << 16 & 0xff0000)
          | (sum[2] << 8 & 0xff00) | (sum[3] & 0xff);
        if (calculated != stored) {
          if (LOG.isTraceEnabled()) {
            // data.position() is already past the failing chunk, so step back by its length.
            long errPos = data.position() - startDataPos - n;
            LOG.trace("Checksum error: {} at {} expected: {} got: {}", pathName, errPos, stored,
              calculated);
          }
          return false;
        }
      }
    } finally {
      // Leave both buffers positioned where the caller handed them in.
      data.reset();
      checksums.reset();
    }
    return true;
  }

  /**
   * Validates that the data in the specified HFileBlock matches the checksum. Generates the
   * checksums for the data and then validate that it matches those stored in the end of the data.
   * <p>
   * The checksum parameters are read from the block header itself, so they may be garbage when the
   * block is corrupt. Header values that cannot be used (no {@link DataChecksum} can be built from
   * them, or the on-disk size does not fit inside {@code buf}) are reported as a failed validation
   * rather than surfacing as a runtime exception.
   * @param buf      Contains the data in following order: HFileBlock header, data, checksums.
   * @param pathName Path of the HFile to which the {@code buf} belongs. Only used for logging.
   * @param offset   offset of the data being validated. Only used for logging.
   * @param hdrSize  Size of the block header in {@code buf}. Only used for logging.
   * @return True if checksum matches, else false.
   */
  static boolean validateChecksum(ByteBuff buf, String pathName, long offset, int hdrSize) {
    ChecksumType ctype = ChecksumType.codeToType(buf.get(HFileBlock.Header.CHECKSUM_TYPE_INDEX));
    if (ctype == ChecksumType.NULL) {
      return true;// No checksum validations needed for this block.
    }

    // read in the stored value of the checksum size from the header.
    int bytesPerChecksum = buf.getInt(HFileBlock.Header.BYTES_PER_CHECKSUM_INDEX);
    DataChecksum dataChecksum =
      DataChecksum.newDataChecksum(ctype.getDataChecksumType(), bytesPerChecksum);
    if (dataChecksum == null) {
      // A null here means the header values could not be turned into a checksum; that is a
      // property of the (possibly corrupt) input, so it must not be guarded by an assert only.
      LOG.warn("Unable to create DataChecksum for checksumType={}, bytesPerChecksum={}, file={}, "
        + "offset={}; treating block as failing checksum validation", ctype.getName(),
        bytesPerChecksum, pathName, offset);
      return false;
    }

    int onDiskDataSizeWithHeader =
      buf.getInt(HFileBlock.Header.ON_DISK_DATA_SIZE_WITH_HEADER_INDEX);
    LOG.trace(
      "dataLength={}, sizeWithHeader={}, checksumType={}, file={}, "
        + "offset={}, headerSize={}, bytesPerChecksum={}",
      buf.capacity(), onDiskDataSizeWithHeader, ctype.getName(), pathName, offset, hdrSize,
      bytesPerChecksum);

    if (onDiskDataSizeWithHeader < 0 || onDiskDataSizeWithHeader > buf.limit()) {
      // The size is used below as a limit and as a position inside buf; a value outside
      // [0, buf.limit()] cannot describe this buffer and would make those calls throw.
      LOG.warn("Invalid onDiskDataSizeWithHeader={} for buffer limit={}, file={}, offset={}; "
        + "treating block as failing checksum validation", onDiskDataSizeWithHeader, buf.limit(),
        pathName, offset);
      return false;
    }

    // Header + data occupy [0, onDiskDataSizeWithHeader); the stored checksums follow up to limit.
    ByteBuff data = buf.duplicate().position(0).limit(onDiskDataSizeWithHeader);
    ByteBuff checksums = buf.duplicate().position(onDiskDataSizeWithHeader).limit(buf.limit());
    return verifyChunkedSums(dataChecksum, data, checksums, pathName);
  }

  /**
   * Returns the number of bytes needed to store the checksums for a specified data size
   * @param datasize         number of bytes of data
   * @param bytesPerChecksum number of bytes in a checksum chunk
   * @return The number of bytes needed to store the checksum values
   * @throws ArithmeticException if {@code bytesPerChecksum} is zero
   */
  static long numBytes(long datasize, int bytesPerChecksum) {
    return numChunks(datasize, bytesPerChecksum) * HFileBlock.CHECKSUM_SIZE;
  }

  /**
   * Returns the number of checksum chunks needed to store the checksums for a specified data size
   * @param datasize         number of bytes of data
   * @param bytesPerChecksum number of bytes in a checksum chunk
   * @return The number of checksum chunks
   * @throws ArithmeticException if {@code bytesPerChecksum} is zero
   */
  static long numChunks(long datasize, int bytesPerChecksum) {
    long numChunks = datasize / bytesPerChecksum;
    if (datasize % bytesPerChecksum != 0) {
      // A trailing partial chunk still needs its own checksum.
      numChunks++;
    }
    return numChunks;
  }

  /**
   * Mechanism to throw an exception in case of hbase checksum failure. This is used by unit tests
   * only.
   * @param value Setting this to true will cause hbase checksum verification failures to generate
   *              exceptions.
   */
  public static void generateExceptionForChecksumFailureForTest(boolean value) {
    generateExceptions = value;
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy