org.apache.hadoop.hbase.io.hfile.HFileBlock

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile;

import static org.apache.hadoop.hbase.io.ByteBuffAllocator.HEAP;
import static org.apache.hadoop.hbase.io.hfile.trace.HFileContextAttributesBuilderConsumer.CONTEXT_KEY;

import io.opentelemetry.api.common.Attributes;
import io.opentelemetry.api.common.AttributesBuilder;
import io.opentelemetry.api.trace.Span;
import io.opentelemetry.context.Context;
import io.opentelemetry.context.Scope;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.fs.HFileSystem;
import org.apache.hadoop.hbase.io.ByteArrayOutputStream;
import org.apache.hadoop.hbase.io.ByteBuffAllocator;
import org.apache.hadoop.hbase.io.ByteBuffInputStream;
import org.apache.hadoop.hbase.io.ByteBufferWriterDataOutputStream;
import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.encoding.EncodingState;
import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext;
import org.apache.hadoop.hbase.io.encoding.HFileBlockDefaultDecodingContext;
import org.apache.hadoop.hbase.io.encoding.HFileBlockDefaultEncodingContext;
import org.apache.hadoop.hbase.io.encoding.HFileBlockEncodingContext;
import org.apache.hadoop.hbase.io.hfile.trace.HFileContextAttributesBuilderConsumer;
import org.apache.hadoop.hbase.io.util.BlockIOUtils;
import org.apache.hadoop.hbase.nio.ByteBuff;
import org.apache.hadoop.hbase.nio.MultiByteBuff;
import org.apache.hadoop.hbase.nio.SingleByteBuff;
import org.apache.hadoop.hbase.regionserver.ShipperListener;
import org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.ReadType;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ChecksumType;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;

/**
 * Cacheable Blocks of an {@link HFile} version 2 file. Version 2 was introduced in hbase-0.92.0.
 * <p>
 * Version 1 was the original file block. Version 2 was introduced when we changed the hbase file
 * format to support multi-level block indexes and compound bloom filters (HBASE-3857). Support for
 * Version 1 was removed in hbase-1.3.0.
 * <h2>HFileBlock: Version 2</h2>
 * In version 2, a block is structured as follows:
 * <ul>
 * <li><b>Header:</b> See Writer#putHeader() for where the header is written; header total size is
 * HFILEBLOCK_HEADER_SIZE
 * <ul>
 * <li>0. blockType: Magic record identifying the {@link BlockType} (8 bytes): e.g.
 * <code>DATABLK*</code></li>
 * <li>1. onDiskSizeWithoutHeader: Compressed -- a.k.a 'on disk' -- block size, excluding header,
 * but including tailing checksum bytes (4 bytes)</li>
 * <li>2. uncompressedSizeWithoutHeader: Uncompressed block size, excluding header, and excluding
 * checksum bytes (4 bytes)</li>
 * <li>3. prevBlockOffset: The offset of the previous block of the same type (8 bytes). This is
 * used to navigate to the previous block without having to go to the block index</li>
 * <li>4: For minorVersions >=1, the ordinal describing checksum type (1 byte)</li>
 * <li>5: For minorVersions >=1, the number of data bytes/checksum chunk (4 bytes)</li>
 * <li>6: onDiskDataSizeWithHeader: For minorVersions >=1, the size of data 'on disk', including
 * header, excluding checksums (4 bytes)</li>
 * </ul>
 * </li>
 * <li><b>Raw/Compressed/Encrypted/Encoded data:</b> The compression algorithm is the same for all
 * the blocks in an {@link HFile}. If compression is NONE, this is just raw, serialized Cells.</li>
 * <li><b>Tail:</b> For minorVersions >=1, a series of 4 byte checksums, one each for the number of
 * bytes specified by bytesPerChecksum.</li>
 * </ul>
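 * <p>
 * For illustration only (not part of this class's API), the fixed header offsets above can be read
 * off a header <code>byte[]</code> with {@link Bytes}; the <code>header</code> array below is
 * hypothetical:
 * <pre>
 * int onDiskSizeWithoutHeader = Bytes.toInt(header, Header.ON_DISK_SIZE_WITHOUT_HEADER_INDEX);
 * int uncompressedSizeWithoutHeader = Bytes.toInt(header, Header.UNCOMPRESSED_SIZE_WITHOUT_HEADER_INDEX);
 * long prevBlockOffset = Bytes.toLong(header, Header.PREV_BLOCK_OFFSET_INDEX);
 * byte checksumType = header[Header.CHECKSUM_TYPE_INDEX];
 * int bytesPerChecksum = Bytes.toInt(header, Header.BYTES_PER_CHECKSUM_INDEX);
 * int onDiskDataSizeWithHeader = Bytes.toInt(header, Header.ON_DISK_DATA_SIZE_WITH_HEADER_INDEX);
 * </pre>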

 * <h2>Caching</h2>
 * Caches cache whole blocks with trailing checksums if any. We then tag on some metadata: the
 * content of BLOCK_METADATA_SPACE, which carries a flag for whether we are doing 'hbase' checksums
 * and the offset into the file, which is needed when we re-make a cache key when we return the
 * block to the cache as 'done'. See {@link Cacheable#serialize(ByteBuffer, boolean)} and
 * {@link Cacheable#getDeserializer()}.
 * <p>
 * TODO: Should we cache the checksums? Down in Writer#getBlockForCaching(CacheConfig) where we
 * make a block to cache-on-write, there is an attempt at turning off checksums. This is not the
 * only place we get blocks to cache. We also will cache the raw return from an hdfs read. In this
 * case, the checksums may be present. If the cache is backed by something that doesn't do ECC, say
 * an SSD, we might want to preserve checksums. For now this is an open question.

* TODO: Over in BucketCache, we save a block allocation by doing a custom serialization. Be sure to * change it if serialization changes in here. Could we add a method here that takes an IOEngine and * that then serializes to it rather than expose our internals over in BucketCache? IOEngine is in * the bucket subpackage. Pull it up? Then this class knows about bucketcache. Ugh. */ @InterfaceAudience.Private public class HFileBlock implements Cacheable { private static final Logger LOG = LoggerFactory.getLogger(HFileBlock.class); public static final long FIXED_OVERHEAD = ClassSize.estimateBase(HFileBlock.class, false); // Block Header fields. // TODO: encapsulate Header related logic in this inner class. static class Header { // Format of header is: // 8 bytes - block magic // 4 bytes int - onDiskSizeWithoutHeader // 4 bytes int - uncompressedSizeWithoutHeader // 8 bytes long - prevBlockOffset // The following 3 are only present if header contains checksum information // 1 byte - checksum type // 4 byte int - bytes per checksum // 4 byte int - onDiskDataSizeWithHeader static int BLOCK_MAGIC_INDEX = 0; static int ON_DISK_SIZE_WITHOUT_HEADER_INDEX = 8; static int UNCOMPRESSED_SIZE_WITHOUT_HEADER_INDEX = 12; static int PREV_BLOCK_OFFSET_INDEX = 16; static int CHECKSUM_TYPE_INDEX = 24; static int BYTES_PER_CHECKSUM_INDEX = 25; static int ON_DISK_DATA_SIZE_WITH_HEADER_INDEX = 29; } /** Type of block. Header field 0. */ private BlockType blockType; /** * Size on disk excluding header, including checksum. Header field 1. * @see Writer#putHeader(byte[], int, int, int, int) */ private int onDiskSizeWithoutHeader; /** * Size of pure data. Does not include header or checksums. Header field 2. * @see Writer#putHeader(byte[], int, int, int, int) */ private int uncompressedSizeWithoutHeader; /** * The offset of the previous block on disk. Header field 3. * @see Writer#putHeader(byte[], int, int, int, int) */ private long prevBlockOffset; /** * Size on disk of header + data. Excludes checksum. Header field 6, OR calculated from * {@link #onDiskSizeWithoutHeader} when using HDFS checksum. * @see Writer#putHeader(byte[], int, int, int, int) */ private final int onDiskDataSizeWithHeader; // End of Block Header fields. /** * The in-memory representation of the hfile block. Can be on or offheap. Can be backed by a * single ByteBuffer or by many. Make no assumptions. *

 * <p>
 * Be careful reading from this buf. Duplicate and work on the duplicate, or if not, be sure to
 * reset position and limit, else trouble down the road.
 * <p>
 * TODO: Make this read-only once made.
 * <p>
 * We are using the ByteBuff type. ByteBuffer is not extensible, yet we need to be able to have a
 * ByteBuffer-like API across multiple ByteBuffers reading from a cache such as BucketCache. So we
 * have this ByteBuff type. Unfortunately, it is spread all about HFileBlock. It would be good if
 * it could be confined to cache-use only, but that is hard to do.
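 * <p>
 * For example (illustrative only; <code>block</code> stands for some HFileBlock instance), read
 * from a duplicate rather than the internal buffer:
 * <pre>
 * ByteBuff dup = block.getBufferReadOnly(); // a duplicate; repositioning it is safe
 * dup.position(block.headerSize());         // skip the header without disturbing the block
 * </pre>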

* NOTE: this byteBuff including HFileBlock header and data, but excluding checksum. */ private ByteBuff bufWithoutChecksum; /** * Meta data that holds meta information on the hfileblock. */ private final HFileContext fileContext; /** * The offset of this block in the file. Populated by the reader for convenience of access. This * offset is not part of the block header. */ private long offset = UNSET; /** * The on-disk size of the next block, including the header and checksums if present. UNSET if * unknown. Blocks try to carry the size of the next block to read in this data member. Usually we * get block sizes from the hfile index but sometimes the index is not available: e.g. when we * read the indexes themselves (indexes are stored in blocks, we do not have an index for the * indexes). Saves seeks especially around file open when there is a flurry of reading in hfile * metadata. */ private int nextBlockOnDiskSize = UNSET; private ByteBuffAllocator allocator; /** * On a checksum failure, do these many succeeding read requests using hdfs checksums before * auto-reenabling hbase checksum verification. */ static final int CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD = 3; private static int UNSET = -1; public static final boolean FILL_HEADER = true; public static final boolean DONT_FILL_HEADER = false; // How to get the estimate correctly? if it is a singleBB? public static final int MULTI_BYTE_BUFFER_HEAP_SIZE = (int) ClassSize.estimateBase(MultiByteBuff.class, false); /** * Space for metadata on a block that gets stored along with the block when we cache it. There are * a few bytes stuck on the end of the HFileBlock that we pull in from HDFS. 8 bytes are for the * offset of this block (long) in the file. Offset is important because is is used when we remake * the CacheKey when we return block to the cache when done. There is also a flag on whether * checksumming is being done by hbase or not. See class comment for note on uncertain state of * checksumming of blocks that come out of cache (should we or should we not?). Finally there are * 4 bytes to hold the length of the next block which can save a seek on occasion if available. * (This EXTRA info came in with original commit of the bucketcache, HBASE-7404. It was formerly * known as EXTRA_SERIALIZATION_SPACE). */ public static final int BLOCK_METADATA_SPACE = Bytes.SIZEOF_BYTE + Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT; /** * Each checksum value is an integer that can be stored in 4 bytes. */ static final int CHECKSUM_SIZE = Bytes.SIZEOF_INT; static final byte[] DUMMY_HEADER_NO_CHECKSUM = new byte[HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM]; /** * Used deserializing blocks from Cache. * ++++++++++++++ * + HFileBlock + * ++++++++++++++ * + Checksums + <= Optional * ++++++++++++++ * + Metadata! + <= See note on BLOCK_METADATA_SPACE above. * ++++++++++++++ * * @see #serialize(ByteBuffer, boolean) */ public static final CacheableDeserializer BLOCK_DESERIALIZER = new BlockDeserializer(); public static final class BlockDeserializer implements CacheableDeserializer { private BlockDeserializer() { } @Override public HFileBlock deserialize(ByteBuff buf, ByteBuffAllocator alloc) throws IOException { // The buf has the file block followed by block metadata. // Set limit to just before the BLOCK_METADATA_SPACE then rewind. buf.limit(buf.limit() - BLOCK_METADATA_SPACE).rewind(); // Get a new buffer to pass the HFileBlock for it to 'own'. ByteBuff newByteBuff = buf.slice(); // Read out the BLOCK_METADATA_SPACE content and shove into our HFileBlock. 
buf.position(buf.limit()); buf.limit(buf.limit() + HFileBlock.BLOCK_METADATA_SPACE); boolean usesChecksum = buf.get() == (byte) 1; long offset = buf.getLong(); int nextBlockOnDiskSize = buf.getInt(); return createFromBuff(newByteBuff, usesChecksum, offset, nextBlockOnDiskSize, null, alloc); } @Override public int getDeserializerIdentifier() { return DESERIALIZER_IDENTIFIER; } } private static final int DESERIALIZER_IDENTIFIER; static { DESERIALIZER_IDENTIFIER = CacheableDeserializerIdManager.registerDeserializer(BLOCK_DESERIALIZER); } private final int totalChecksumBytes; /** * Creates a new {@link HFile} block from the given fields. This constructor is used only while * writing blocks and caching, and is sitting in a byte buffer and we want to stuff the block into * cache. See {@link Writer#getBlockForCaching(CacheConfig)}. *

 * <p>
 * TODO: The caller presumes no checksumming
 * <p>
 * TODO: HFile block writer can also off-heap ?

* required of this block instance since going into cache; checksum already verified on underlying * block data pulled in from filesystem. Is that correct? What if cache is SSD? * @param blockType the type of this block, see {@link BlockType} * @param onDiskSizeWithoutHeader see {@link #onDiskSizeWithoutHeader} * @param uncompressedSizeWithoutHeader see {@link #uncompressedSizeWithoutHeader} * @param prevBlockOffset see {@link #prevBlockOffset} * @param buf block buffer with header * ({@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes) * @param fillHeader when true, write the first 4 header fields into passed * buffer. * @param offset the file offset the block was read from * @param onDiskDataSizeWithHeader see {@link #onDiskDataSizeWithHeader} * @param fileContext HFile meta data */ public HFileBlock(BlockType blockType, int onDiskSizeWithoutHeader, int uncompressedSizeWithoutHeader, long prevBlockOffset, ByteBuff buf, boolean fillHeader, long offset, int nextBlockOnDiskSize, int onDiskDataSizeWithHeader, HFileContext fileContext, ByteBuffAllocator allocator) { this.blockType = blockType; this.onDiskSizeWithoutHeader = onDiskSizeWithoutHeader; this.uncompressedSizeWithoutHeader = uncompressedSizeWithoutHeader; this.prevBlockOffset = prevBlockOffset; this.offset = offset; this.onDiskDataSizeWithHeader = onDiskDataSizeWithHeader; this.nextBlockOnDiskSize = nextBlockOnDiskSize; this.fileContext = fileContext; this.allocator = allocator; this.bufWithoutChecksum = buf; if (fillHeader) { overwriteHeader(); } this.bufWithoutChecksum.rewind(); this.totalChecksumBytes = computeTotalChecksumBytes(); } /** * Creates a block from an existing buffer starting with a header. Rewinds and takes ownership of * the buffer. By definition of rewind, ignores the buffer position, but if you slice the buffer * beforehand, it will rewind to that point. * @param buf Has header, content, and trailing checksums if present. */ static HFileBlock createFromBuff(ByteBuff buf, boolean usesHBaseChecksum, final long offset, final int nextBlockOnDiskSize, HFileContext fileContext, ByteBuffAllocator allocator) throws IOException { buf.rewind(); final BlockType blockType = BlockType.read(buf); final int onDiskSizeWithoutHeader = buf.getInt(Header.ON_DISK_SIZE_WITHOUT_HEADER_INDEX); final int uncompressedSizeWithoutHeader = buf.getInt(Header.UNCOMPRESSED_SIZE_WITHOUT_HEADER_INDEX); final long prevBlockOffset = buf.getLong(Header.PREV_BLOCK_OFFSET_INDEX); // This constructor is called when we deserialize a block from cache and when we read a block in // from the fs. fileCache is null when deserialized from cache so need to make up one. HFileContextBuilder fileContextBuilder = fileContext != null ? new HFileContextBuilder(fileContext) : new HFileContextBuilder(); fileContextBuilder.withHBaseCheckSum(usesHBaseChecksum); int onDiskDataSizeWithHeader; if (usesHBaseChecksum) { byte checksumType = buf.get(Header.CHECKSUM_TYPE_INDEX); int bytesPerChecksum = buf.getInt(Header.BYTES_PER_CHECKSUM_INDEX); onDiskDataSizeWithHeader = buf.getInt(Header.ON_DISK_DATA_SIZE_WITH_HEADER_INDEX); // Use the checksum type and bytes per checksum from header, not from fileContext. 
fileContextBuilder.withChecksumType(ChecksumType.codeToType(checksumType)); fileContextBuilder.withBytesPerCheckSum(bytesPerChecksum); } else { fileContextBuilder.withChecksumType(ChecksumType.NULL); fileContextBuilder.withBytesPerCheckSum(0); // Need to fix onDiskDataSizeWithHeader; there are not checksums after-block-data onDiskDataSizeWithHeader = onDiskSizeWithoutHeader + headerSize(usesHBaseChecksum); } fileContext = fileContextBuilder.build(); assert usesHBaseChecksum == fileContext.isUseHBaseChecksum(); return new HFileBlockBuilder().withBlockType(blockType) .withOnDiskSizeWithoutHeader(onDiskSizeWithoutHeader) .withUncompressedSizeWithoutHeader(uncompressedSizeWithoutHeader) .withPrevBlockOffset(prevBlockOffset).withOffset(offset) .withOnDiskDataSizeWithHeader(onDiskDataSizeWithHeader) .withNextBlockOnDiskSize(nextBlockOnDiskSize).withHFileContext(fileContext) .withByteBuffAllocator(allocator).withByteBuff(buf.rewind()).withShared(!buf.hasArray()) .build(); } /** * Parse total on disk size including header and checksum. * @param headerBuf Header ByteBuffer. Presumed exact size of header. * @param checksumSupport true if checksum verification is in use. * @return Size of the block with header included. */ private static int getOnDiskSizeWithHeader(final ByteBuff headerBuf, boolean checksumSupport) { return headerBuf.getInt(Header.ON_DISK_SIZE_WITHOUT_HEADER_INDEX) + headerSize(checksumSupport); } /** * @return the on-disk size of the next block (including the header size and any checksums if * present) read by peeking into the next block's header; use as a hint when doing a read * of the next block when scanning or running over a file. */ int getNextBlockOnDiskSize() { return nextBlockOnDiskSize; } @Override public BlockType getBlockType() { return blockType; } @Override public int refCnt() { return bufWithoutChecksum.refCnt(); } @Override public HFileBlock retain() { bufWithoutChecksum.retain(); return this; } /** * Call {@link ByteBuff#release()} to decrease the reference count, if no other reference, it will * return back the {@link ByteBuffer} to {@link org.apache.hadoop.hbase.io.ByteBuffAllocator} */ @Override public boolean release() { return bufWithoutChecksum.release(); } /** * Calling this method in strategic locations where HFileBlocks are referenced may help diagnose * potential buffer leaks. We pass the block itself as a default hint, but one can use * {@link #touch(Object)} to pass their own hint as well. */ @Override public HFileBlock touch() { return touch(this); } @Override public HFileBlock touch(Object hint) { bufWithoutChecksum.touch(hint); return this; } /** Returns get data block encoding id that was used to encode this block */ short getDataBlockEncodingId() { if (blockType != BlockType.ENCODED_DATA) { throw new IllegalArgumentException("Querying encoder ID of a block " + "of type other than " + BlockType.ENCODED_DATA + ": " + blockType); } return bufWithoutChecksum.getShort(headerSize()); } /** Returns the on-disk size of header + data part + checksum. */ public int getOnDiskSizeWithHeader() { return onDiskSizeWithoutHeader + headerSize(); } /** Returns the on-disk size of the data part + checksum (header excluded). */ int getOnDiskSizeWithoutHeader() { return onDiskSizeWithoutHeader; } /** Returns the uncompressed size of data part (header and checksum excluded). 
*/ int getUncompressedSizeWithoutHeader() { return uncompressedSizeWithoutHeader; } /** Returns the offset of the previous block of the same type in the file, or -1 if unknown */ long getPrevBlockOffset() { return prevBlockOffset; } /** * Rewinds {@code buf} and writes first 4 header fields. {@code buf} position is modified as * side-effect. */ private void overwriteHeader() { bufWithoutChecksum.rewind(); blockType.write(bufWithoutChecksum); bufWithoutChecksum.putInt(onDiskSizeWithoutHeader); bufWithoutChecksum.putInt(uncompressedSizeWithoutHeader); bufWithoutChecksum.putLong(prevBlockOffset); if (this.fileContext.isUseHBaseChecksum()) { bufWithoutChecksum.put(fileContext.getChecksumType().getCode()); bufWithoutChecksum.putInt(fileContext.getBytesPerChecksum()); bufWithoutChecksum.putInt(onDiskDataSizeWithHeader); } } /** * Returns a buffer that does not include the header and checksum. * @return the buffer with header skipped and checksum omitted. */ public ByteBuff getBufferWithoutHeader() { ByteBuff dup = getBufferReadOnly(); return dup.position(headerSize()).slice(); } /** * Returns a read-only duplicate of the buffer this block stores internally ready to be read. * Clients must not modify the buffer object though they may set position and limit on the * returned buffer since we pass back a duplicate. This method has to be public because it is used * in {@link CompoundBloomFilter} to avoid object creation on every Bloom filter lookup, but has * to be used with caution. Buffer holds header, block content, and any follow-on checksums if * present. * @return the buffer of this block for read-only operations,the buffer includes header,but not * checksum. */ public ByteBuff getBufferReadOnly() { // TODO: ByteBuf does not support asReadOnlyBuffer(). Fix. ByteBuff dup = this.bufWithoutChecksum.duplicate(); assert dup.position() == 0; return dup; } public ByteBuffAllocator getByteBuffAllocator() { return this.allocator; } private void sanityCheckAssertion(long valueFromBuf, long valueFromField, String fieldName) throws IOException { if (valueFromBuf != valueFromField) { throw new AssertionError(fieldName + " in the buffer (" + valueFromBuf + ") is different from that in the field (" + valueFromField + ")"); } } private void sanityCheckAssertion(BlockType valueFromBuf, BlockType valueFromField) throws IOException { if (valueFromBuf != valueFromField) { throw new IOException("Block type stored in the buffer: " + valueFromBuf + ", block type field: " + valueFromField); } } /** * Checks if the block is internally consistent, i.e. the first * {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of the buffer contain a valid header consistent * with the fields. Assumes a packed block structure. This function is primary for testing and * debugging, and is not thread-safe, because it alters the internal buffer pointer. Used by tests * only. 
*/ void sanityCheck() throws IOException { // Duplicate so no side-effects ByteBuff dup = this.bufWithoutChecksum.duplicate().rewind(); sanityCheckAssertion(BlockType.read(dup), blockType); sanityCheckAssertion(dup.getInt(), onDiskSizeWithoutHeader, "onDiskSizeWithoutHeader"); sanityCheckAssertion(dup.getInt(), uncompressedSizeWithoutHeader, "uncompressedSizeWithoutHeader"); sanityCheckAssertion(dup.getLong(), prevBlockOffset, "prevBlockOffset"); if (this.fileContext.isUseHBaseChecksum()) { sanityCheckAssertion(dup.get(), this.fileContext.getChecksumType().getCode(), "checksumType"); sanityCheckAssertion(dup.getInt(), this.fileContext.getBytesPerChecksum(), "bytesPerChecksum"); sanityCheckAssertion(dup.getInt(), onDiskDataSizeWithHeader, "onDiskDataSizeWithHeader"); } if (dup.limit() != onDiskDataSizeWithHeader) { throw new AssertionError( "Expected limit " + onDiskDataSizeWithHeader + ", got " + dup.limit()); } // We might optionally allocate HFILEBLOCK_HEADER_SIZE more bytes to read the next // block's header, so there are two sensible values for buffer capacity. int hdrSize = headerSize(); dup.rewind(); if ( dup.remaining() != onDiskDataSizeWithHeader && dup.remaining() != onDiskDataSizeWithHeader + hdrSize ) { throw new AssertionError("Invalid buffer capacity: " + dup.remaining() + ", expected " + onDiskDataSizeWithHeader + " or " + (onDiskDataSizeWithHeader + hdrSize)); } } @Override public String toString() { StringBuilder sb = new StringBuilder().append("[").append("blockType=").append(blockType) .append(", fileOffset=").append(offset).append(", headerSize=").append(headerSize()) .append(", onDiskSizeWithoutHeader=").append(onDiskSizeWithoutHeader) .append(", uncompressedSizeWithoutHeader=").append(uncompressedSizeWithoutHeader) .append(", prevBlockOffset=").append(prevBlockOffset).append(", isUseHBaseChecksum=") .append(fileContext.isUseHBaseChecksum()); if (fileContext.isUseHBaseChecksum()) { sb.append(", checksumType=").append(ChecksumType.codeToType(this.bufWithoutChecksum.get(24))) .append(", bytesPerChecksum=").append(this.bufWithoutChecksum.getInt(24 + 1)) .append(", onDiskDataSizeWithHeader=").append(onDiskDataSizeWithHeader); } else { sb.append(", onDiskDataSizeWithHeader=").append(onDiskDataSizeWithHeader).append("(") .append(onDiskSizeWithoutHeader).append("+") .append(HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM).append(")"); } String dataBegin; if (bufWithoutChecksum.hasArray()) { dataBegin = Bytes.toStringBinary(bufWithoutChecksum.array(), bufWithoutChecksum.arrayOffset() + headerSize(), Math.min(32, bufWithoutChecksum.limit() - bufWithoutChecksum.arrayOffset() - headerSize())); } else { ByteBuff bufWithoutHeader = getBufferWithoutHeader(); byte[] dataBeginBytes = new byte[Math.min(32, bufWithoutHeader.limit() - bufWithoutHeader.position())]; bufWithoutHeader.get(dataBeginBytes); dataBegin = Bytes.toStringBinary(dataBeginBytes); } sb.append(", getOnDiskSizeWithHeader=").append(getOnDiskSizeWithHeader()) .append(", totalChecksumBytes=").append(totalChecksumBytes()).append(", isUnpacked=") .append(isUnpacked()).append(", buf=[").append(bufWithoutChecksum).append("]") .append(", dataBeginsWith=").append(dataBegin).append(", fileContext=").append(fileContext) .append(", nextBlockOnDiskSize=").append(nextBlockOnDiskSize).append("]"); return sb.toString(); } /** * Retrieves the decompressed/decrypted view of this block. An encoded block remains in its * encoded structure. Internal structures are shared between instances where applicable. 
*/ HFileBlock unpack(HFileContext fileContext, FSReader reader) throws IOException { if (!fileContext.isCompressedOrEncrypted()) { // TODO: cannot use our own fileContext here because HFileBlock(ByteBuffer, boolean), // which is used for block serialization to L2 cache, does not preserve encoding and // encryption details. return this; } ByteBuff newBuf = allocateBufferForUnpacking(); // allocates space for the decompressed block HFileBlock unpacked = shallowClone(this, newBuf); boolean succ = false; final Context context = Context.current().with(CONTEXT_KEY, new HFileContextAttributesBuilderConsumer(fileContext)); try (Scope ignored = context.makeCurrent()) { HFileBlockDecodingContext ctx = blockType == BlockType.ENCODED_DATA ? reader.getBlockDecodingContext() : reader.getDefaultBlockDecodingContext(); // Create a duplicated buffer without the header part. int headerSize = this.headerSize(); ByteBuff dup = this.bufWithoutChecksum.duplicate(); dup.position(headerSize); dup = dup.slice(); // Decode the dup into unpacked#buf ctx.prepareDecoding(unpacked.getOnDiskDataSizeWithHeader() - headerSize, unpacked.getUncompressedSizeWithoutHeader(), unpacked.getBufferWithoutHeader(), dup); succ = true; return unpacked; } finally { if (!succ) { unpacked.release(); } } } /** * Always allocates a new buffer of the correct size. Copies header bytes from the existing * buffer. Does not change header fields. Reserve room to keep checksum bytes too. */ private ByteBuff allocateBufferForUnpacking() { int headerSize = headerSize(); int capacityNeeded = headerSize + uncompressedSizeWithoutHeader; ByteBuff source = bufWithoutChecksum.duplicate(); ByteBuff newBuf = allocator.allocate(capacityNeeded); // Copy header bytes into newBuf. source.position(0); newBuf.put(0, source, 0, headerSize); // set limit to exclude next block's header newBuf.limit(capacityNeeded); return newBuf; } /** * Return true when this block's buffer has been unpacked, false otherwise. Note this is a * calculated heuristic, not tracked attribute of the block. */ public boolean isUnpacked() { final int headerSize = headerSize(); final int expectedCapacity = headerSize + uncompressedSizeWithoutHeader; final int bufCapacity = bufWithoutChecksum.remaining(); return bufCapacity == expectedCapacity || bufCapacity == expectedCapacity + headerSize; } /** * Cannot be {@link #UNSET}. Must be a legitimate value. Used re-making the {@link BlockCacheKey} * when block is returned to the cache. * @return the offset of this block in the file it was read from */ long getOffset() { if (offset < 0) { throw new IllegalStateException("HFile block offset not initialized properly"); } return offset; } /** Returns a byte stream reading the data(excluding header and checksum) of this block */ DataInputStream getByteStream() { ByteBuff dup = this.bufWithoutChecksum.duplicate(); dup.position(this.headerSize()); return new DataInputStream(new ByteBuffInputStream(dup)); } @Override public long heapSize() { long size = FIXED_OVERHEAD; size += fileContext.heapSize(); if (bufWithoutChecksum != null) { // Deep overhead of the byte buffer. Needs to be aligned separately. size += ClassSize.align(bufWithoutChecksum.capacity() + MULTI_BYTE_BUFFER_HEAP_SIZE); } return ClassSize.align(size); } /** * Will be override by {@link SharedMemHFileBlock} or {@link ExclusiveMemHFileBlock}. Return true * by default. */ public boolean isSharedMem() { return true; } /** * Unified version 2 {@link HFile} block writer. The intended usage pattern is as follows: *
 * <ol>
 * <li>Construct an {@link HFileBlock.Writer}, providing a compression algorithm.</li>
 * <li>Call {@link Writer#startWriting} and get a data stream to write to.</li>
 * <li>Write your data into the stream.</li>
 * <li>Call Writer#writeHeaderAndData(FSDataOutputStream) as many times as you need to store the
 * serialized block into an external stream.</li>
 * <li>Repeat to write more blocks.</li>
 * </ol>
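 * <p>
 * A minimal sketch of that pattern (illustrative only; <code>conf</code>, <code>fileContext</code>,
 * <code>payload</code> and <code>out</code> are assumed to exist in the caller's context):
 * <pre>
 * HFileBlock.Writer w = new HFileBlock.Writer(conf, NoOpDataBlockEncoder.INSTANCE, fileContext);
 * DataOutputStream dos = w.startWriting(BlockType.META);  // step 2
 * dos.write(payload);                                     // step 3: user data bytes
 * w.writeHeaderAndData(out);                              // step 4: out is an FSDataOutputStream
 * // step 5: repeat startWriting / write / writeHeaderAndData for further blocks
 * </pre>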

*/ static class Writer implements ShipperListener { private enum State { INIT, WRITING, BLOCK_READY }; /** Writer state. Used to ensure the correct usage protocol. */ private State state = State.INIT; /** Data block encoder used for data blocks */ private final HFileDataBlockEncoder dataBlockEncoder; private HFileBlockEncodingContext dataBlockEncodingCtx; /** block encoding context for non-data blocks */ private HFileBlockDefaultEncodingContext defaultBlockEncodingCtx; /** * The stream we use to accumulate data into a block in an uncompressed format. We reset this * stream at the end of each block and reuse it. The header is written as the first * {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes into this stream. */ private ByteArrayOutputStream baosInMemory; /** * Current block type. Set in {@link #startWriting(BlockType)}. Could be changed in * {@link #finishBlock()} from {@link BlockType#DATA} to {@link BlockType#ENCODED_DATA}. */ private BlockType blockType; /** * A stream that we write uncompressed bytes to, which compresses them and writes them to * {@link #baosInMemory}. */ private DataOutputStream userDataStream; /** * Bytes to be written to the file system, including the header. Compressed if compression is * turned on. It also includes the checksum data that immediately follows the block data. * (header + data + checksums) */ private ByteArrayOutputStream onDiskBlockBytesWithHeader; /** * The size of the checksum data on disk. It is used only if data is not compressed. If data is * compressed, then the checksums are already part of onDiskBytesWithHeader. If data is * uncompressed, then this variable stores the checksum data for this block. */ private byte[] onDiskChecksum = HConstants.EMPTY_BYTE_ARRAY; /** * Current block's start offset in the {@link HFile}. Set in * {@link #writeHeaderAndData(FSDataOutputStream)}. */ private long startOffset; /** * Offset of previous block by block type. Updated when the next block is started. */ private long[] prevOffsetByType; /** The offset of the previous block of the same type */ private long prevOffset; /** Meta data that holds information about the hfileblock **/ private HFileContext fileContext; private final ByteBuffAllocator allocator; @Override public void beforeShipped() { if (getEncodingState() != null) { getEncodingState().beforeShipped(); } } EncodingState getEncodingState() { return dataBlockEncodingCtx.getEncodingState(); } /** * @param dataBlockEncoder data block encoding algorithm to use */ public Writer(Configuration conf, HFileDataBlockEncoder dataBlockEncoder, HFileContext fileContext) { this(conf, dataBlockEncoder, fileContext, ByteBuffAllocator.HEAP); } public Writer(Configuration conf, HFileDataBlockEncoder dataBlockEncoder, HFileContext fileContext, ByteBuffAllocator allocator) { if (fileContext.getBytesPerChecksum() < HConstants.HFILEBLOCK_HEADER_SIZE) { throw new RuntimeException("Unsupported value of bytesPerChecksum. " + " Minimum is " + HConstants.HFILEBLOCK_HEADER_SIZE + " but the configured value is " + fileContext.getBytesPerChecksum()); } this.allocator = allocator; this.dataBlockEncoder = dataBlockEncoder != null ? dataBlockEncoder : NoOpDataBlockEncoder.INSTANCE; this.dataBlockEncodingCtx = this.dataBlockEncoder.newDataBlockEncodingContext(conf, HConstants.HFILEBLOCK_DUMMY_HEADER, fileContext); // TODO: This should be lazily instantiated this.defaultBlockEncodingCtx = new HFileBlockDefaultEncodingContext(conf, null, HConstants.HFILEBLOCK_DUMMY_HEADER, fileContext); // TODO: Set BAOS initial size. 
Use fileContext.getBlocksize() and add for header/checksum baosInMemory = new ByteArrayOutputStream(); prevOffsetByType = new long[BlockType.values().length]; for (int i = 0; i < prevOffsetByType.length; ++i) { prevOffsetByType[i] = UNSET; } // TODO: Why fileContext saved away when we have dataBlockEncoder and/or // defaultDataBlockEncoder? this.fileContext = fileContext; } /** * Starts writing into the block. The previous block's data is discarded. * @return the stream the user can write their data into */ DataOutputStream startWriting(BlockType newBlockType) throws IOException { if (state == State.BLOCK_READY && startOffset != -1) { // We had a previous block that was written to a stream at a specific // offset. Save that offset as the last offset of a block of that type. prevOffsetByType[blockType.getId()] = startOffset; } startOffset = -1; blockType = newBlockType; baosInMemory.reset(); baosInMemory.write(HConstants.HFILEBLOCK_DUMMY_HEADER); state = State.WRITING; // We will compress it later in finishBlock() userDataStream = new ByteBufferWriterDataOutputStream(baosInMemory); if (newBlockType == BlockType.DATA) { this.dataBlockEncoder.startBlockEncoding(dataBlockEncodingCtx, userDataStream); } return userDataStream; } /** * Writes the Cell to this block */ void write(Cell cell) throws IOException { expectState(State.WRITING); this.dataBlockEncoder.encode(cell, dataBlockEncodingCtx, this.userDataStream); } /** * Transitions the block writer from the "writing" state to the "block ready" state. Does * nothing if a block is already finished. */ void ensureBlockReady() throws IOException { Preconditions.checkState(state != State.INIT, "Unexpected state: " + state); if (state == State.BLOCK_READY) { return; } // This will set state to BLOCK_READY. finishBlock(); } /** * Finish up writing of the block. Flushes the compressing stream (if using compression), fills * out the header, does any compression/encryption of bytes to flush out to disk, and manages * the cache on write content, if applicable. Sets block write state to "block ready". */ private void finishBlock() throws IOException { if (blockType == BlockType.DATA) { this.dataBlockEncoder.endBlockEncoding(dataBlockEncodingCtx, userDataStream, baosInMemory.getBuffer(), blockType); blockType = dataBlockEncodingCtx.getBlockType(); } userDataStream.flush(); prevOffset = prevOffsetByType[blockType.getId()]; // We need to set state before we can package the block up for cache-on-write. In a way, the // block is ready, but not yet encoded or compressed. state = State.BLOCK_READY; Bytes compressAndEncryptDat; if (blockType == BlockType.DATA || blockType == BlockType.ENCODED_DATA) { compressAndEncryptDat = dataBlockEncodingCtx.compressAndEncrypt(baosInMemory.getBuffer(), 0, baosInMemory.size()); } else { compressAndEncryptDat = defaultBlockEncodingCtx.compressAndEncrypt(baosInMemory.getBuffer(), 0, baosInMemory.size()); } if (compressAndEncryptDat == null) { compressAndEncryptDat = new Bytes(baosInMemory.getBuffer(), 0, baosInMemory.size()); } if (onDiskBlockBytesWithHeader == null) { onDiskBlockBytesWithHeader = new ByteArrayOutputStream(compressAndEncryptDat.getLength()); } onDiskBlockBytesWithHeader.reset(); onDiskBlockBytesWithHeader.write(compressAndEncryptDat.get(), compressAndEncryptDat.getOffset(), compressAndEncryptDat.getLength()); // Calculate how many bytes we need for checksum on the tail of the block. 
int numBytes = (int) ChecksumUtil.numBytes(onDiskBlockBytesWithHeader.size(), fileContext.getBytesPerChecksum()); // Put the header for the on disk bytes; header currently is unfilled-out putHeader(onDiskBlockBytesWithHeader, onDiskBlockBytesWithHeader.size() + numBytes, baosInMemory.size(), onDiskBlockBytesWithHeader.size()); if (onDiskChecksum.length != numBytes) { onDiskChecksum = new byte[numBytes]; } ChecksumUtil.generateChecksums(onDiskBlockBytesWithHeader.getBuffer(), 0, onDiskBlockBytesWithHeader.size(), onDiskChecksum, 0, fileContext.getChecksumType(), fileContext.getBytesPerChecksum()); } /** * Put the header into the given byte array at the given offset. * @param onDiskSize size of the block on disk header + data + checksum * @param uncompressedSize size of the block after decompression (but before optional data block * decoding) including header * @param onDiskDataSize size of the block on disk with header and data but not including the * checksums */ private void putHeader(byte[] dest, int offset, int onDiskSize, int uncompressedSize, int onDiskDataSize) { offset = blockType.put(dest, offset); offset = Bytes.putInt(dest, offset, onDiskSize - HConstants.HFILEBLOCK_HEADER_SIZE); offset = Bytes.putInt(dest, offset, uncompressedSize - HConstants.HFILEBLOCK_HEADER_SIZE); offset = Bytes.putLong(dest, offset, prevOffset); offset = Bytes.putByte(dest, offset, fileContext.getChecksumType().getCode()); offset = Bytes.putInt(dest, offset, fileContext.getBytesPerChecksum()); Bytes.putInt(dest, offset, onDiskDataSize); } private void putHeader(ByteBuff buff, int onDiskSize, int uncompressedSize, int onDiskDataSize) { buff.rewind(); blockType.write(buff); buff.putInt(onDiskSize - HConstants.HFILEBLOCK_HEADER_SIZE); buff.putInt(uncompressedSize - HConstants.HFILEBLOCK_HEADER_SIZE); buff.putLong(prevOffset); buff.put(fileContext.getChecksumType().getCode()); buff.putInt(fileContext.getBytesPerChecksum()); buff.putInt(onDiskDataSize); } private void putHeader(ByteArrayOutputStream dest, int onDiskSize, int uncompressedSize, int onDiskDataSize) { putHeader(dest.getBuffer(), 0, onDiskSize, uncompressedSize, onDiskDataSize); } /** * Similar to {@link #writeHeaderAndData(FSDataOutputStream)}, but records the offset of this * block so that it can be referenced in the next block of the same type. */ void writeHeaderAndData(FSDataOutputStream out) throws IOException { long offset = out.getPos(); if (startOffset != UNSET && offset != startOffset) { throw new IOException("A " + blockType + " block written to a " + "stream twice, first at offset " + startOffset + ", then at " + offset); } startOffset = offset; finishBlockAndWriteHeaderAndData(out); } /** * Writes the header and the compressed data of this block (or uncompressed data when not using * compression) into the given stream. Can be called in the "writing" state or in the "block * ready" state. If called in the "writing" state, transitions the writer to the "block ready" * state. * @param out the output stream to write the */ protected void finishBlockAndWriteHeaderAndData(DataOutputStream out) throws IOException { ensureBlockReady(); long startTime = EnvironmentEdgeManager.currentTime(); out.write(onDiskBlockBytesWithHeader.getBuffer(), 0, onDiskBlockBytesWithHeader.size()); out.write(onDiskChecksum); HFile.updateWriteLatency(EnvironmentEdgeManager.currentTime() - startTime); } /** * Returns the header or the compressed data (or uncompressed data when not using compression) * as a byte array. 
Can be called in the "writing" state or in the "block ready" state. If * called in the "writing" state, transitions the writer to the "block ready" state. This * returns the header + data + checksums stored on disk. * @return header and data as they would be stored on disk in a byte array */ byte[] getHeaderAndDataForTest() throws IOException { ensureBlockReady(); // This is not very optimal, because we are doing an extra copy. // But this method is used only by unit tests. byte[] output = new byte[onDiskBlockBytesWithHeader.size() + onDiskChecksum.length]; System.arraycopy(onDiskBlockBytesWithHeader.getBuffer(), 0, output, 0, onDiskBlockBytesWithHeader.size()); System.arraycopy(onDiskChecksum, 0, output, onDiskBlockBytesWithHeader.size(), onDiskChecksum.length); return output; } /** * Releases resources used by this writer. */ void release() { if (dataBlockEncodingCtx != null) { dataBlockEncodingCtx.close(); dataBlockEncodingCtx = null; } if (defaultBlockEncodingCtx != null) { defaultBlockEncodingCtx.close(); defaultBlockEncodingCtx = null; } } /** * Returns the on-disk size of the data portion of the block. This is the compressed size if * compression is enabled. Can only be called in the "block ready" state. Header is not * compressed, and its size is not included in the return value. * @return the on-disk size of the block, not including the header. */ int getOnDiskSizeWithoutHeader() { expectState(State.BLOCK_READY); return onDiskBlockBytesWithHeader.size() + onDiskChecksum.length - HConstants.HFILEBLOCK_HEADER_SIZE; } /** * Returns the on-disk size of the block. Can only be called in the "block ready" state. * @return the on-disk size of the block ready to be written, including the header size, the * data and the checksum data. */ int getOnDiskSizeWithHeader() { expectState(State.BLOCK_READY); return onDiskBlockBytesWithHeader.size() + onDiskChecksum.length; } /** * The uncompressed size of the block data. Does not include header size. */ int getUncompressedSizeWithoutHeader() { expectState(State.BLOCK_READY); return baosInMemory.size() - HConstants.HFILEBLOCK_HEADER_SIZE; } /** * The uncompressed size of the block data, including header size. */ int getUncompressedSizeWithHeader() { expectState(State.BLOCK_READY); return baosInMemory.size(); } /** Returns true if a block is being written */ boolean isWriting() { return state == State.WRITING; } /** * Returns the number of bytes written into the current block so far, or zero if not writing the * block at the moment. Note that this will return zero in the "block ready" state as well. * @return the number of bytes written */ public int encodedBlockSizeWritten() { return state != State.WRITING ? 0 : this.getEncodingState().getEncodedDataSizeWritten(); } /** * Returns the number of bytes written into the current block so far, or zero if not writing the * block at the moment. Note that this will return zero in the "block ready" state as well. * @return the number of bytes written */ int blockSizeWritten() { return state != State.WRITING ? 0 : this.getEncodingState().getUnencodedDataSizeWritten(); } /** * Clones the header followed by the uncompressed data, even if using compression. This is * needed for storing uncompressed blocks in the block cache. Can be called in the "writing" * state or the "block ready" state. Returns only the header and data, does not include checksum * data. 
* @return Returns an uncompressed block ByteBuff for caching on write */ ByteBuff cloneUncompressedBufferWithHeader() { expectState(State.BLOCK_READY); ByteBuff bytebuff = allocator.allocate(baosInMemory.size()); baosInMemory.toByteBuff(bytebuff); int numBytes = (int) ChecksumUtil.numBytes(onDiskBlockBytesWithHeader.size(), fileContext.getBytesPerChecksum()); putHeader(bytebuff, onDiskBlockBytesWithHeader.size() + numBytes, baosInMemory.size(), onDiskBlockBytesWithHeader.size()); bytebuff.rewind(); return bytebuff; } /** * Clones the header followed by the on-disk (compressed/encoded/encrypted) data. This is needed * for storing packed blocks in the block cache. Returns only the header and data, Does not * include checksum data. * @return Returns a copy of block bytes for caching on write */ private ByteBuff cloneOnDiskBufferWithHeader() { expectState(State.BLOCK_READY); ByteBuff bytebuff = allocator.allocate(onDiskBlockBytesWithHeader.size()); onDiskBlockBytesWithHeader.toByteBuff(bytebuff); bytebuff.rewind(); return bytebuff; } private void expectState(State expectedState) { if (state != expectedState) { throw new IllegalStateException( "Expected state: " + expectedState + ", actual state: " + state); } } /** * Takes the given {@link BlockWritable} instance, creates a new block of its appropriate type, * writes the writable into this block, and flushes the block into the output stream. The writer * is instructed not to buffer uncompressed bytes for cache-on-write. * @param bw the block-writable object to write as a block * @param out the file system output stream */ void writeBlock(BlockWritable bw, FSDataOutputStream out) throws IOException { bw.writeToBlock(startWriting(bw.getBlockType())); writeHeaderAndData(out); } /** * Creates a new HFileBlock. Checksums have already been validated, so the byte buffer passed * into the constructor of this newly created block does not have checksum data even though the * header minor version is MINOR_VERSION_WITH_CHECKSUM. This is indicated by setting a 0 value * in bytesPerChecksum. This method copies the on-disk or uncompressed data to build the * HFileBlock which is used only while writing blocks and caching. *

* TODO: Should there be an option where a cache can ask that hbase preserve block checksums for * checking after a block comes out of the cache? Otehrwise, cache is responsible for blocks * being wholesome (ECC memory or if file-backed, it does checksumming). */ HFileBlock getBlockForCaching(CacheConfig cacheConf) { HFileContext newContext = new HFileContextBuilder().withBlockSize(fileContext.getBlocksize()) .withBytesPerCheckSum(0).withChecksumType(ChecksumType.NULL) // no checksums in cached data .withCompression(fileContext.getCompression()) .withDataBlockEncoding(fileContext.getDataBlockEncoding()) .withHBaseCheckSum(fileContext.isUseHBaseChecksum()) .withCompressTags(fileContext.isCompressTags()) .withIncludesMvcc(fileContext.isIncludesMvcc()) .withIncludesTags(fileContext.isIncludesTags()) .withColumnFamily(fileContext.getColumnFamily()).withTableName(fileContext.getTableName()) .build(); // Build the HFileBlock. HFileBlockBuilder builder = new HFileBlockBuilder(); ByteBuff buff; if (cacheConf.shouldCacheCompressed(blockType.getCategory())) { buff = cloneOnDiskBufferWithHeader(); } else { buff = cloneUncompressedBufferWithHeader(); } return builder.withBlockType(blockType) .withOnDiskSizeWithoutHeader(getOnDiskSizeWithoutHeader()) .withUncompressedSizeWithoutHeader(getUncompressedSizeWithoutHeader()) .withPrevBlockOffset(prevOffset).withByteBuff(buff).withFillHeader(FILL_HEADER) .withOffset(startOffset).withNextBlockOnDiskSize(UNSET) .withOnDiskDataSizeWithHeader(onDiskBlockBytesWithHeader.size() + onDiskChecksum.length) .withHFileContext(newContext).withByteBuffAllocator(cacheConf.getByteBuffAllocator()) .withShared(!buff.hasArray()).build(); } } /** Something that can be written into a block. */ interface BlockWritable { /** The type of block this data should use. */ BlockType getBlockType(); /** * Writes the block to the provided stream. Must not write any magic records. * @param out a stream to write uncompressed data into */ void writeToBlock(DataOutput out) throws IOException; } /** * Iterator for reading {@link HFileBlock}s in load-on-open-section, such as root data index * block, meta index block, file info block etc. */ interface BlockIterator { /** * Get the next block, or null if there are no more blocks to iterate. */ HFileBlock nextBlock() throws IOException; /** * Similar to {@link #nextBlock()} but checks block type, throws an exception if incorrect, and * returns the HFile block */ HFileBlock nextBlockWithBlockType(BlockType blockType) throws IOException; /** * Now we use the {@link ByteBuffAllocator} to manage the nio ByteBuffers for HFileBlocks, so we * must deallocate all of the ByteBuffers in the end life. the BlockIterator's life cycle is * starting from opening an HFileReader and stopped when the HFileReader#close, so we will keep * track all the read blocks until we call {@link BlockIterator#freeBlocks()} when closing the * HFileReader. Sum bytes of those blocks in load-on-open section should be quite small, so * tracking them should be OK. */ void freeBlocks(); } /** An HFile block reader with iteration ability. */ interface FSReader { /** * Reads the block at the given offset in the file with the given on-disk size and uncompressed * size. * @param offset of the file to read * @param onDiskSize the on-disk size of the entire block, including all applicable headers, * or -1 if unknown * @param pread true to use pread, otherwise use the stream read. * @param updateMetrics update the metrics or not. 
* @param intoHeap allocate the block's ByteBuff by {@link ByteBuffAllocator} or JVM heap. * For LRUBlockCache, we must ensure that the block to cache is an heap * one, because the memory occupation is based on heap now, also for * {@link CombinedBlockCache}, we use the heap LRUBlockCache as L1 cache to * cache small blocks such as IndexBlock or MetaBlock for faster access. So * introduce an flag here to decide whether allocate from JVM heap or not * so that we can avoid an extra off-heap to heap memory copy when using * LRUBlockCache. For most cases, we known what's the expected block type * we'll read, while for some special case (Example: * HFileReaderImpl#readNextDataBlock()), we cannot pre-decide what's the * expected block type, then we can only allocate block's ByteBuff from * {@link ByteBuffAllocator} firstly, and then when caching it in * {@link LruBlockCache} we'll check whether the ByteBuff is from heap or * not, if not then we'll clone it to an heap one and cache it. * @return the newly read block */ HFileBlock readBlockData(long offset, long onDiskSize, boolean pread, boolean updateMetrics, boolean intoHeap) throws IOException; /** * Creates a block iterator over the given portion of the {@link HFile}. The iterator returns * blocks starting with offset such that offset <= startOffset < endOffset. Returned * blocks are always unpacked. Used when no hfile index available; e.g. reading in the hfile * index blocks themselves on file open. * @param startOffset the offset of the block to start iteration with * @param endOffset the offset to end iteration at (exclusive) * @return an iterator of blocks between the two given offsets */ BlockIterator blockRange(long startOffset, long endOffset); /** Closes the backing streams */ void closeStreams() throws IOException; /** Get a decoder for {@link BlockType#ENCODED_DATA} blocks from this file. */ HFileBlockDecodingContext getBlockDecodingContext(); /** Get the default decoder for blocks from this file. */ HFileBlockDecodingContext getDefaultBlockDecodingContext(); void setIncludesMemStoreTS(boolean includesMemstoreTS); void setDataBlockEncoder(HFileDataBlockEncoder encoder, Configuration conf); /** * To close the stream's socket. Note: This can be concurrently called from multiple threads and * implementation should take care of thread safety. */ void unbufferStream(); } /** * Data-structure to use caching the header of the NEXT block. Only works if next read that comes * in here is next in sequence in this block. When we read, we read current block and the next * blocks' header. We do this so we have the length of the next block to read if the hfile index * is not available (rare, at hfile open only). */ private static class PrefetchedHeader { long offset = -1; byte[] header = new byte[HConstants.HFILEBLOCK_HEADER_SIZE]; final ByteBuff buf = new SingleByteBuff(ByteBuffer.wrap(header, 0, header.length)); @Override public String toString() { return "offset=" + this.offset + ", header=" + Bytes.toStringBinary(header); } } /** * Reads version 2 HFile blocks from the filesystem. */ static class FSReaderImpl implements FSReader { /** * The file system stream of the underlying {@link HFile} that does or doesn't do checksum * validations in the filesystem */ private FSDataInputStreamWrapper streamWrapper; private HFileBlockDecodingContext encodedBlockDecodingCtx; /** Default context used when BlockType != {@link BlockType#ENCODED_DATA}. 
*/ private final HFileBlockDefaultDecodingContext defaultDecodingCtx; /** * Cache of the NEXT header after this. Check it is indeed next blocks header before using it. * TODO: Review. This overread into next block to fetch next blocks header seems unnecessary * given we usually get the block size from the hfile index. Review! */ private AtomicReference prefetchedHeader = new AtomicReference<>(new PrefetchedHeader()); /** The size of the file we are reading from, or -1 if unknown. */ private long fileSize; /** The size of the header */ protected final int hdrSize; /** The filesystem used to access data */ private HFileSystem hfs; private HFileContext fileContext; // Cache the fileName private String pathName; private final ByteBuffAllocator allocator; private final Lock streamLock = new ReentrantLock(); private final boolean isPreadAllBytes; private final long readWarnTime; /** * If reading block cost time in milliseconds more than the threshold, a warning will be logged. */ public static final String FS_READER_WARN_TIME_MS = "hbase.fs.reader.warn.time.ms"; FSReaderImpl(ReaderContext readerContext, HFileContext fileContext, ByteBuffAllocator allocator, Configuration conf) throws IOException { this.fileSize = readerContext.getFileSize(); this.hfs = readerContext.getFileSystem(); if (readerContext.getFilePath() != null) { this.pathName = readerContext.getFilePath().toString(); } this.fileContext = fileContext; this.hdrSize = headerSize(fileContext.isUseHBaseChecksum()); this.allocator = allocator; this.streamWrapper = readerContext.getInputStreamWrapper(); // Older versions of HBase didn't support checksum. this.streamWrapper.prepareForBlockReader(!fileContext.isUseHBaseChecksum()); defaultDecodingCtx = new HFileBlockDefaultDecodingContext(conf, fileContext); encodedBlockDecodingCtx = defaultDecodingCtx; isPreadAllBytes = readerContext.isPreadAllBytes(); // Default warn threshold set to -1, it means skipping record the read block slow warning log. readWarnTime = conf.getLong(FS_READER_WARN_TIME_MS, -1L); } @Override public BlockIterator blockRange(final long startOffset, final long endOffset) { final FSReader owner = this; // handle for inner class return new BlockIterator() { private volatile boolean freed = false; // Tracking all read blocks until we call freeBlocks. private List blockTracker = new ArrayList<>(); private long offset = startOffset; // Cache length of next block. Current block has the length of next block in it. private long length = -1; @Override public HFileBlock nextBlock() throws IOException { if (offset >= endOffset) { return null; } HFileBlock b = readBlockData(offset, length, false, false, true); offset += b.getOnDiskSizeWithHeader(); length = b.getNextBlockOnDiskSize(); HFileBlock uncompressed = b.unpack(fileContext, owner); if (uncompressed != b) { b.release(); // Need to release the compressed Block now. } blockTracker.add(uncompressed); return uncompressed; } @Override public HFileBlock nextBlockWithBlockType(BlockType blockType) throws IOException { HFileBlock blk = nextBlock(); if (blk.getBlockType() != blockType) { throw new IOException( "Expected block of type " + blockType + " but found " + blk.getBlockType()); } return blk; } @Override public void freeBlocks() { if (freed) { return; } blockTracker.forEach(HFileBlock::release); blockTracker = null; freed = true; } }; } /** * Does a positional read or a seek and read into the given byte buffer. 
  /**
   * Does a positional read or a seek and read into the given byte buffer. We need to take care
   * that we call {@link ByteBuff#release()} on every exit path to deallocate the ByteBuffers,
   * otherwise a memory leak may happen.
   * @param dest              destination buffer
   * @param size              size of read
   * @param peekIntoNextBlock whether to read the next block's on-disk size
   * @param fileOffset        position in the stream to read at
   * @param pread             whether we should do a positional read
   * @param istream           The input source of data
   * @return true to indicate the destination buffer includes the next block header, otherwise it
   *         only includes the current block data without the next block header.
   * @throws IOException if any IO error happens.
   */
  protected boolean readAtOffset(FSDataInputStream istream, ByteBuff dest, int size,
    boolean peekIntoNextBlock, long fileOffset, boolean pread) throws IOException {
    if (!pread) {
      // Seek + read. Better for scanning.
      istream.seek(fileOffset);
      long realOffset = istream.getPos();
      if (realOffset != fileOffset) {
        throw new IOException("Tried to seek to " + fileOffset + " to read " + size
          + " bytes, but pos=" + realOffset + " after seek");
      }
      if (!peekIntoNextBlock) {
        BlockIOUtils.readFully(dest, istream, size);
        return false;
      }
      // Try to read the next block header
      if (!BlockIOUtils.readWithExtra(dest, istream, size, hdrSize)) {
        // did not read the next block header.
        return false;
      }
    } else {
      // Positional read. Better for random reads; or when the streamLock is already locked.
      int extraSize = peekIntoNextBlock ? hdrSize : 0;
      if (
        !BlockIOUtils.preadWithExtra(dest, istream, fileOffset, size, extraSize, isPreadAllBytes)
      ) {
        // did not read the next block header.
        return false;
      }
    }
    assert peekIntoNextBlock;
    return true;
  }

  /**
   * Reads a version 2 block (version 1 blocks not supported and not expected). Tries to do as
   * little memory allocation as possible, using the provided on-disk size.
   * @param offset                the offset in the stream to read at
   * @param onDiskSizeWithHeaderL the on-disk size of the block, including the header, or -1 if
   *                              unknown; i.e. when iterating over blocks reading in the file
   *                              metadata info.
   * @param pread                 whether to use a positional read
   * @param updateMetrics         whether to update the metrics
   * @param intoHeap              allocate the ByteBuff of the block from heap or off-heap.
   * @see FSReader#readBlockData(long, long, boolean, boolean, boolean) for more details about
   *      the intoHeap flag.
   */
  @Override
  public HFileBlock readBlockData(long offset, long onDiskSizeWithHeaderL, boolean pread,
    boolean updateMetrics, boolean intoHeap) throws IOException {
    // Get a copy of the current state of whether to validate
    // hbase checksums or not for this read call. This is not
    // thread-safe but the one constraint is that if we decide
    // to skip hbase checksum verification then we are
    // guaranteed to use hdfs checksum verification.
    boolean doVerificationThruHBaseChecksum = streamWrapper.shouldUseHBaseChecksum();
    FSDataInputStream is = streamWrapper.getStream(doVerificationThruHBaseChecksum);
    final Context context = Context.current().with(CONTEXT_KEY,
      new HFileContextAttributesBuilderConsumer(fileContext)
        .setSkipChecksum(doVerificationThruHBaseChecksum)
        .setReadType(pread ? ReadType.POSITIONAL_READ : ReadType.SEEK_PLUS_READ));
    try (Scope ignored = context.makeCurrent()) {
      HFileBlock blk = readBlockDataInternal(is, offset, onDiskSizeWithHeaderL, pread,
        doVerificationThruHBaseChecksum, updateMetrics, intoHeap);
      if (blk == null) {
        HFile.LOG.warn("HBase checksum verification failed for file {} at offset {} filesize {}."
          + " Retrying read with HDFS checksums turned on...", pathName, offset, fileSize);
        if (!doVerificationThruHBaseChecksum) {
          String msg = "HBase checksum verification failed for file " + pathName + " at offset "
            + offset + " filesize " + fileSize + " but this cannot happen because doVerify is "
            + doVerificationThruHBaseChecksum;
          HFile.LOG.warn(msg);
          throw new IOException(msg); // cannot happen case here
        }
        HFile.CHECKSUM_FAILURES.increment(); // update metrics
        // If we have a checksum failure, we fall back into a mode where
        // the next few reads use HDFS level checksums. We aim to make the
        // next CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD reads avoid
        // hbase checksum verification, but since this value is set without
        // holding any locks, it can so happen that we might actually do
        // a few more than precisely this number.
        is = this.streamWrapper.fallbackToFsChecksum(CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD);
        doVerificationThruHBaseChecksum = false;
        blk = readBlockDataInternal(is, offset, onDiskSizeWithHeaderL, pread,
          doVerificationThruHBaseChecksum, updateMetrics, intoHeap);
        if (blk != null) {
          HFile.LOG.warn(
            "HDFS checksum verification succeeded for file {} at offset {} filesize" + " {}",
            pathName, offset, fileSize);
        }
      }
      if (blk == null && !doVerificationThruHBaseChecksum) {
        String msg = "readBlockData failed, possibly due to " + "checksum verification failed for file "
          + pathName + " at offset " + offset + " filesize " + fileSize;
        HFile.LOG.warn(msg);
        throw new IOException(msg);
      }

      // If there is a checksum mismatch earlier, then retry with
      // HBase checksums switched off and use HDFS checksum verification.
      // This triggers HDFS to detect and fix corrupt replicas. The
      // next checksumOffCount read requests will use HDFS checksums.
      // The decrementing of this.checksumOffCount is not thread-safe,
      // but it is harmless because eventually checksumOffCount will be
      // a negative number.
      streamWrapper.checksumOk();
      return blk;
    }
  }

  /**
   * Check that {@code value} read from a block header seems reasonable, within a large margin of
   * error.
   * @return {@code true} if the value is safe to proceed, {@code false} otherwise.
   */
  private boolean checkOnDiskSizeWithHeader(int value) {
    if (value < 0) {
      if (LOG.isTraceEnabled()) {
        LOG.trace(
          "onDiskSizeWithHeader={}; value represents a size, so it should never be negative.",
          value);
      }
      return false;
    }
    if (value - hdrSize < 0) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("onDiskSizeWithHeader={}, hdrSize={}; don't accept a value that is negative"
          + " after the header size is excluded.", value, hdrSize);
      }
      return false;
    }
    return true;
  }

  /**
   * Check that {@code value} provided by the calling context seems reasonable, within a large
   * margin of error.
   * @return {@code true} if the value is safe to proceed, {@code false} otherwise.
   */
  private boolean checkCallerProvidedOnDiskSizeWithHeader(long value) {
    // same validation logic as is used by Math.toIntExact(long)
    int intValue = (int) value;
    if (intValue != value) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("onDiskSizeWithHeaderL={}; value exceeds int size limits.", value);
      }
      return false;
    }
    if (intValue == -1) {
      // a magic value we expect to see.
      return true;
    }
    return checkOnDiskSizeWithHeader(intValue);
  }
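  /**
   * Editor's note: illustrative sketch added for this listing, not part of the upstream HBase
   * source. The "(int) value != value" test in checkCallerProvidedOnDiskSizeWithHeader is the
   * same overflow check that {@link Math#toIntExact(long)} performs; this sketch makes the
   * equivalence explicit. The method name is hypothetical.
   */
  private static boolean exampleFitsInInt(long value) {
    try {
      Math.toIntExact(value); // throws ArithmeticException when the long overflows an int
      return true;
    } catch (ArithmeticException e) {
      return false; // same outcome as the (int) value != value comparison above
    }
  }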
  /**
   * Check the atomic reference cache for this block's header. The cache is only good if the next
   * read coming through is the next in sequence in the file. We read the next block's header on
   * the tail of reading the previous block to save a seek. Otherwise, we have to do a seek to
   * read the header before we can pull in the block OR we have to back up the stream because we
   * over-read (the next block's header).
   * @see PrefetchedHeader
   * @return The cached block header or null if not found.
   * @see #cacheNextBlockHeader(long, ByteBuff, int, int)
   */
  private ByteBuff getCachedHeader(final long offset) {
    PrefetchedHeader ph = this.prefetchedHeader.get();
    return ph != null && ph.offset == offset ? ph.buf : null;
  }

  /**
   * Save away the next block's header in the atomic reference.
   * @see #getCachedHeader(long)
   * @see PrefetchedHeader
   */
  private void cacheNextBlockHeader(final long offset, ByteBuff onDiskBlock,
    int onDiskSizeWithHeader, int headerLength) {
    PrefetchedHeader ph = new PrefetchedHeader();
    ph.offset = offset;
    onDiskBlock.get(onDiskSizeWithHeader, ph.header, 0, headerLength);
    this.prefetchedHeader.set(ph);
  }

  /**
   * Clear the cached value when its integrity is suspect.
   */
  private void invalidateNextBlockHeader() {
    prefetchedHeader.set(null);
  }

  private int getNextBlockOnDiskSize(ByteBuff onDiskBlock, int onDiskSizeWithHeader) {
    return onDiskBlock.getIntAfterPosition(onDiskSizeWithHeader + BlockType.MAGIC_LENGTH)
      + hdrSize;
  }

  private ByteBuff allocate(int size, boolean intoHeap) {
    return intoHeap ? HEAP.allocate(size) : allocator.allocate(size);
  }

  /**
   * Reads a version 2 block.
   * @param offset                the offset in the stream to read at.
   * @param onDiskSizeWithHeaderL the on-disk size of the block, including the header and
   *                              checksums if present, or -1 if unknown (as a long). Can be -1
   *                              if we are doing raw iteration of blocks as when loading up file
   *                              metadata; i.e. the first read of a new file. Usually gotten
   *                              from the file index.
   * @param pread                 whether to use a positional read
   * @param verifyChecksum        Whether to use HBase checksums. If HBase checksum is switched
   *                              off, then use HDFS checksum. Can also flip on/off reading the
   *                              same file if we hit a troublesome patch in an hfile.
   * @param updateMetrics         whether to update the metrics.
   * @param intoHeap              allocate the ByteBuff of the block from heap or off-heap.
   * @return the HFileBlock or null if there is an HBase checksum mismatch
   */
  protected HFileBlock readBlockDataInternal(FSDataInputStream is, long offset,
    long onDiskSizeWithHeaderL, boolean pread, boolean verifyChecksum, boolean updateMetrics,
    boolean intoHeap) throws IOException {
    final Span span = Span.current();
    final AttributesBuilder attributesBuilder = Attributes.builder();
    Optional.of(Context.current()).map(val -> val.get(CONTEXT_KEY))
      .ifPresent(c -> c.accept(attributesBuilder));
    if (offset < 0) {
      throw new IOException("Invalid offset=" + offset + " trying to read " + "block (onDiskSize="
        + onDiskSizeWithHeaderL + ")");
    }
    if (!checkCallerProvidedOnDiskSizeWithHeader(onDiskSizeWithHeaderL)) {
      LOG.trace("Caller provided invalid onDiskSizeWithHeaderL={}", onDiskSizeWithHeaderL);
      onDiskSizeWithHeaderL = -1;
    }
    int onDiskSizeWithHeader = (int) onDiskSizeWithHeaderL;

    // Try to use the cached header. Will serve us in the rare case where onDiskSizeWithHeaderL
    // is -1 and will save us having to seek the stream backwards to reread the header we read
    // the last time through here.
    ByteBuff headerBuf = getCachedHeader(offset);
    LOG.trace(
      "Reading {} at offset={}, pread={}, verifyChecksum={}, cachedHeader={}, "
        + "onDiskSizeWithHeader={}",
      this.fileContext.getHFileName(), offset, pread, verifyChecksum, headerBuf,
      onDiskSizeWithHeader);
    // This is NOT the same as verifyChecksum. The latter is whether to do hbase checksums.
    // Can change with circumstances. The below flag is whether the file has support for
    // checksums (version 2+).
    boolean checksumSupport = this.fileContext.isUseHBaseChecksum();
    long startTime = EnvironmentEdgeManager.currentTime();
    if (onDiskSizeWithHeader == -1) {
      // The caller does not know the block size. Need to get it from the header. If header was
      // not cached (see getCachedHeader above), need to seek to pull it in. This is costly
      // and should happen very rarely. Currently happens on open of a hfile reader where we
      // read the trailer blocks to pull in the indices. Otherwise, we are reading block sizes
      // out of the hfile index. To check, enable TRACE in this file and you'll get an exception
      // in a LOG every time we seek. See HBASE-17072 for more detail.
      if (headerBuf == null) {
        if (LOG.isTraceEnabled()) {
          LOG.trace("Extra seek to get block size!", new RuntimeException());
        }
        span.addEvent("Extra seek to get block size!", attributesBuilder.build());
        headerBuf = HEAP.allocate(hdrSize);
        readAtOffset(is, headerBuf, hdrSize, false, offset, pread);
        headerBuf.rewind();
      }
      onDiskSizeWithHeader = getOnDiskSizeWithHeader(headerBuf, checksumSupport);
    }

    // The common case is that onDiskSizeWithHeader was produced by a read without checksum
    // validation, so give it a sanity check before trying to use it.
    if (!checkOnDiskSizeWithHeader(onDiskSizeWithHeader)) {
      if (verifyChecksum) {
        invalidateNextBlockHeader();
        span.addEvent("Falling back to HDFS checksumming.", attributesBuilder.build());
        return null;
      } else {
        throw new IOException("Invalid onDiskSizeWithHeader=" + onDiskSizeWithHeader);
      }
    }

    int preReadHeaderSize = headerBuf == null ? 0 : hdrSize;
    // Allocate enough space to fit the next block's header too; saves a seek next time through.
    // onDiskBlock is whole block + header + checksums then extra hdrSize to read next header;
    // onDiskSizeWithHeader is header, body, and any checksums if present. preReadHeaderSize
    // says where to start reading. If we have the header cached, then we don't need to read
    // it again and we can likely read from last place we left off w/o need to backup and reread
    // the header we read last time through here.
    ByteBuff onDiskBlock = this.allocate(onDiskSizeWithHeader + hdrSize, intoHeap);
    boolean initHFileBlockSuccess = false;
    try {
      if (headerBuf != null) {
        onDiskBlock.put(0, headerBuf, 0, hdrSize).position(hdrSize);
      }
      boolean readNextHeader = readAtOffset(is, onDiskBlock,
        onDiskSizeWithHeader - preReadHeaderSize, true, offset + preReadHeaderSize, pread);
      onDiskBlock.rewind(); // in case of moving position when copying a cached header

      // the call to validateChecksum for this block excludes the next block header over-read, so
      // no reason to delay extracting this value.
      int nextBlockOnDiskSize = -1;
      if (readNextHeader) {
        int parsedVal = getNextBlockOnDiskSize(onDiskBlock, onDiskSizeWithHeader);
        if (checkOnDiskSizeWithHeader(parsedVal)) {
          nextBlockOnDiskSize = parsedVal;
        }
      }
      if (headerBuf == null) {
        headerBuf = onDiskBlock.duplicate().position(0).limit(hdrSize);
      }

      ByteBuff curBlock = onDiskBlock.duplicate().position(0).limit(onDiskSizeWithHeader);
      // Verify checksum of the data before using it for building HFileBlock.
      if (verifyChecksum && !validateChecksum(offset, curBlock, hdrSize)) {
        invalidateNextBlockHeader();
        span.addEvent("Falling back to HDFS checksumming.", attributesBuilder.build());
        return null;
      }

      // TODO: is this check necessary or can we proceed with a provided value regardless of
      // what is in the header?
      int fromHeader = getOnDiskSizeWithHeader(headerBuf, checksumSupport);
      if (onDiskSizeWithHeader != fromHeader) {
        if (LOG.isTraceEnabled()) {
          LOG.trace("Passed in onDiskSizeWithHeader={} != {}, offset={}, fileContext={}",
            onDiskSizeWithHeader, fromHeader, offset, this.fileContext);
        }
        if (checksumSupport && verifyChecksum) {
          // This file supports HBase checksums and verification of those checksums was
          // requested. The block size provided by the caller (presumably from the block index)
          // does not match the block size written to the block header. Treat this as an
          // HBase-checksum failure.
          span.addEvent("Falling back to HDFS checksumming.", attributesBuilder.build());
          invalidateNextBlockHeader();
          return null;
        }
        throw new IOException("Passed in onDiskSizeWithHeader=" + onDiskSizeWithHeader + " != "
          + fromHeader + ", offset=" + offset + ", fileContext=" + this.fileContext);
      }

      // remove checksum from buffer now that it's verified
      int sizeWithoutChecksum = curBlock.getInt(Header.ON_DISK_DATA_SIZE_WITH_HEADER_INDEX);
      curBlock.limit(sizeWithoutChecksum);
      long duration = EnvironmentEdgeManager.currentTime() - startTime;
      if (updateMetrics) {
        HFile.updateReadLatency(duration, pread);
      }
      // The onDiskBlock will become the headerAndDataBuffer for this block.
      // If nextBlockOnDiskSizeWithHeader is not zero, the onDiskBlock already
      // contains the header of next block, so no need to set next block's header in it.
      HFileBlock hFileBlock = createFromBuff(curBlock, checksumSupport, offset,
        nextBlockOnDiskSize, fileContext, intoHeap ? HEAP : allocator);
      // Run check on uncompressed sizings.
      if (!fileContext.isCompressedOrEncrypted()) {
        hFileBlock.sanityCheckUncompressed();
      }
      LOG.trace("Read {} in {} ms", hFileBlock, duration);
      if (!LOG.isTraceEnabled() && this.readWarnTime >= 0 && duration > this.readWarnTime) {
        LOG.warn("Read Block Slow: read {} cost {} ms, threshold = {} ms", hFileBlock, duration,
          this.readWarnTime);
      }
      span.addEvent("Read block", attributesBuilder.build());
      // Cache next block header if we read it for the next time through here.
      if (nextBlockOnDiskSize != -1) {
        cacheNextBlockHeader(offset + hFileBlock.getOnDiskSizeWithHeader(), onDiskBlock,
          onDiskSizeWithHeader, hdrSize);
      }
      initHFileBlockSuccess = true;
      return hFileBlock;
    } finally {
      if (!initHFileBlockSuccess) {
        onDiskBlock.release();
      }
    }
  }

  @Override
  public void setIncludesMemStoreTS(boolean includesMemstoreTS) {
    this.fileContext =
      new HFileContextBuilder(this.fileContext).withIncludesMvcc(includesMemstoreTS).build();
  }

  @Override
  public void setDataBlockEncoder(HFileDataBlockEncoder encoder, Configuration conf) {
    encodedBlockDecodingCtx = encoder.newDataBlockDecodingContext(conf, fileContext);
  }

  @Override
  public HFileBlockDecodingContext getBlockDecodingContext() {
    return this.encodedBlockDecodingCtx;
  }

  @Override
  public HFileBlockDecodingContext getDefaultBlockDecodingContext() {
    return this.defaultDecodingCtx;
  }
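  /**
   * Editor's note: illustrative sketch added for this listing, not part of the upstream HBase
   * source. The "Read Block Slow" warning above is only emitted when the
   * {@value #FS_READER_WARN_TIME_MS} property is set to a non-negative threshold (the default of
   * -1 disables it). The method name and the 500 ms threshold are arbitrary examples.
   */
  private static Configuration exampleEnableSlowReadWarning() {
    Configuration conf = new Configuration();
    // Warn whenever reading a single block from the filesystem takes longer than 500 ms.
    conf.setLong(FS_READER_WARN_TIME_MS, 500L);
    return conf;
  }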
  /**
   * Generates the checksum for the header as well as the data and then validates it. If the
   * block does not use checksums, returns false.
   * @return True if checksum matches, else false.
   */
  private boolean validateChecksum(long offset, ByteBuff data, int hdrSize) {
    // If this is an older version of the block that does not have checksums, then return false
    // indicating that checksum verification did not succeed. Actually, this method should never
    // be called when the minorVersion is 0, thus this is a defensive check for a cannot-happen
    // case. Since this is a cannot-happen case, it is better to return false to indicate a
    // checksum validation failure.
    if (!fileContext.isUseHBaseChecksum()) {
      return false;
    }
    return ChecksumUtil.validateChecksum(data, pathName, offset, hdrSize);
  }

  @Override
  public void closeStreams() throws IOException {
    streamWrapper.close();
  }

  @Override
  public void unbufferStream() {
    // To handle concurrent reads, ensure that no other client is accessing the streams while we
    // unbuffer it.
    if (streamLock.tryLock()) {
      try {
        this.streamWrapper.unbuffer();
      } finally {
        streamLock.unlock();
      }
    }
  }

  @Override
  public String toString() {
    return "hfs=" + hfs + ", path=" + pathName + ", fileContext=" + fileContext;
  }
}

/** An additional sanity-check in case no compression or encryption is being used. */
void sanityCheckUncompressed() throws IOException {
  if (onDiskSizeWithoutHeader != uncompressedSizeWithoutHeader + totalChecksumBytes()) {
    throw new IOException("Using no compression but " + "onDiskSizeWithoutHeader="
      + onDiskSizeWithoutHeader + ", " + "uncompressedSizeWithoutHeader="
      + uncompressedSizeWithoutHeader + ", numChecksumbytes=" + totalChecksumBytes());
  }
}

// Cacheable implementation
@Override
public int getSerializedLength() {
  if (bufWithoutChecksum != null) {
    // Include extra bytes for block metadata.
    return this.bufWithoutChecksum.limit() + BLOCK_METADATA_SPACE;
  }
  return 0;
}

// Cacheable implementation
@Override
public void serialize(ByteBuffer destination, boolean includeNextBlockMetadata) {
  this.bufWithoutChecksum.get(destination, 0, getSerializedLength() - BLOCK_METADATA_SPACE);
  destination = addMetaData(destination, includeNextBlockMetadata);

  // Make it ready for reading. flip sets position to zero and limit to current position which
  // is what we want if we do not want to serialize the block plus checksums if present plus
  // metadata.
  destination.flip();
}

/**
 * For use by bucketcache. This exposes internals.
 */
public ByteBuffer getMetaData(ByteBuffer bb) {
  bb = addMetaData(bb, true);
  bb.flip();
  return bb;
}

/**
 * Adds metadata at current position (position is moved forward). Does not flip or reset.
 * @return The passed destination with metadata added.
 */
private ByteBuffer addMetaData(final ByteBuffer destination, boolean includeNextBlockMetadata) {
  destination.put(this.fileContext.isUseHBaseChecksum() ? (byte) 1 : (byte) 0);
  destination.putLong(this.offset);
  if (includeNextBlockMetadata) {
    destination.putInt(this.nextBlockOnDiskSize);
  }
  return destination;
}

// Cacheable implementation
@Override
public CacheableDeserializer<Cacheable> getDeserializer() {
  return HFileBlock.BLOCK_DESERIALIZER;
}

@Override
public int hashCode() {
  int result = 1;
  result = result * 31 + blockType.hashCode();
  result = result * 31 + nextBlockOnDiskSize;
  result = result * 31 + (int) (offset ^ (offset >>> 32));
  result = result * 31 + onDiskSizeWithoutHeader;
  result = result * 31 + (int) (prevBlockOffset ^ (prevBlockOffset >>> 32));
  result = result * 31 + uncompressedSizeWithoutHeader;
  result = result * 31 + bufWithoutChecksum.hashCode();
  return result;
}
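/**
 * Editor's note: illustrative sketch added for this listing, not part of the upstream HBase
 * source. It shows the round trip implied by {@link #getSerializedLength()},
 * {@link #serialize(ByteBuffer, boolean)} and {@link #addMetaData(ByteBuffer, boolean)}: the
 * serialized form is the block bytes (without checksums) followed by a one-byte checksum-support
 * flag, the long offset and, optionally, the int next-block-on-disk size. The method name is
 * hypothetical.
 */
private static ByteBuffer exampleSerializeForCache(HFileBlock block) {
  ByteBuffer dest = ByteBuffer.allocate(block.getSerializedLength());
  // Writes the block bytes followed by the trailing metadata and flips the buffer for reading.
  block.serialize(dest, true);
  return dest;
}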
@Override
public boolean equals(Object comparison) {
  if (this == comparison) {
    return true;
  }
  if (comparison == null) {
    return false;
  }
  if (!(comparison instanceof HFileBlock)) {
    return false;
  }

  HFileBlock castedComparison = (HFileBlock) comparison;

  if (castedComparison.blockType != this.blockType) {
    return false;
  }
  if (castedComparison.nextBlockOnDiskSize != this.nextBlockOnDiskSize) {
    return false;
  }
  // Offset is important. Needed when we have to remake the cache key when the block is returned
  // to the cache.
  if (castedComparison.offset != this.offset) {
    return false;
  }
  if (castedComparison.onDiskSizeWithoutHeader != this.onDiskSizeWithoutHeader) {
    return false;
  }
  if (castedComparison.prevBlockOffset != this.prevBlockOffset) {
    return false;
  }
  if (castedComparison.uncompressedSizeWithoutHeader != this.uncompressedSizeWithoutHeader) {
    return false;
  }
  if (
    ByteBuff.compareTo(this.bufWithoutChecksum, 0, this.bufWithoutChecksum.limit(),
      castedComparison.bufWithoutChecksum, 0, castedComparison.bufWithoutChecksum.limit()) != 0
  ) {
    return false;
  }
  return true;
}

DataBlockEncoding getDataBlockEncoding() {
  if (blockType == BlockType.ENCODED_DATA) {
    return DataBlockEncoding.getEncodingById(getDataBlockEncodingId());
  }
  return DataBlockEncoding.NONE;
}

byte getChecksumType() {
  return this.fileContext.getChecksumType().getCode();
}

int getBytesPerChecksum() {
  return this.fileContext.getBytesPerChecksum();
}

/** Returns the size of data on disk + header. Excludes checksum. */
int getOnDiskDataSizeWithHeader() {
  return this.onDiskDataSizeWithHeader;
}
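/**
 * Editor's note: illustrative sketch added for this listing, not part of the upstream HBase
 * source. Each checksum chunk covers bytesPerChecksum bytes of the on-disk data and is stored as
 * a 4-byte integer, so the space needed is roughly
 * ceil(onDiskDataSizeWithHeader / bytesPerChecksum) * 4; this is intended to mirror what
 * ChecksumUtil.numBytes computes. The method name is hypothetical.
 */
private static long exampleChecksumBytes(long onDiskDataSizeWithHeader, int bytesPerChecksum) {
  long chunks = (onDiskDataSizeWithHeader + bytesPerChecksum - 1) / bytesPerChecksum;
  return chunks * 4L; // 4 bytes (one int) per checksum chunk
}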
/**
 * Return the number of bytes required to store all the checksums for this block. Each checksum
 * value is a 4 byte integer. NOTE: The ByteBuff returned by
 * {@link HFileBlock#getBufferWithoutHeader()} and {@link HFileBlock#getBufferReadOnly} or the
 * DataInputStream returned by {@link HFileBlock#getByteStream()} does not include checksums.
 */
int totalChecksumBytes() {
  return totalChecksumBytes;
}

private int computeTotalChecksumBytes() {
  // If the hfile block has minorVersion 0, then there are no checksum
  // data to validate. Similarly, a zero value in this.bytesPerChecksum
  // indicates that cached blocks do not have checksum data because
  // checksums were already validated when the block was read from disk.
  if (!fileContext.isUseHBaseChecksum() || this.fileContext.getBytesPerChecksum() == 0) {
    return 0;
  }
  return (int) ChecksumUtil.numBytes(onDiskDataSizeWithHeader,
    this.fileContext.getBytesPerChecksum());
}

/**
 * Returns the size of this block header.
 */
public int headerSize() {
  return headerSize(this.fileContext.isUseHBaseChecksum());
}

/**
 * Maps a minor version to the size of the header.
 */
public static int headerSize(boolean usesHBaseChecksum) {
  return usesHBaseChecksum
    ? HConstants.HFILEBLOCK_HEADER_SIZE
    : HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM;
}

/**
 * Return the appropriate DUMMY_HEADER for the minor version
 */
// TODO: Why is this in here?
byte[] getDummyHeaderForVersion() {
  return getDummyHeaderForVersion(this.fileContext.isUseHBaseChecksum());
}

/**
 * Return the appropriate DUMMY_HEADER for the minor version
 */
static private byte[] getDummyHeaderForVersion(boolean usesHBaseChecksum) {
  return usesHBaseChecksum ? HConstants.HFILEBLOCK_DUMMY_HEADER : DUMMY_HEADER_NO_CHECKSUM;
}

/**
 * @return This HFileBlock's fileContext, which will be a derivative of the fileContext for the
 *         file from which this block's data was originally read.
 */
public HFileContext getHFileContext() {
  return this.fileContext;
}

/**
 * Convert the contents of the block header into a human readable string. This is mostly helpful
 * for debugging. This assumes that the block has minor version > 0.
 */
static String toStringHeader(ByteBuff buf) throws IOException {
  byte[] magicBuf = new byte[Math.min(buf.limit() - buf.position(), BlockType.MAGIC_LENGTH)];
  buf.get(magicBuf);
  BlockType bt = BlockType.parse(magicBuf, 0, BlockType.MAGIC_LENGTH);
  int compressedBlockSizeNoHeader = buf.getInt();
  int uncompressedBlockSizeNoHeader = buf.getInt();
  long prevBlockOffset = buf.getLong();
  byte cksumtype = buf.get();
  long bytesPerChecksum = buf.getInt();
  long onDiskDataSizeWithHeader = buf.getInt();
  return " Header dump: magic: " + Bytes.toString(magicBuf) + " blockType " + bt
    + " compressedBlockSizeNoHeader " + compressedBlockSizeNoHeader
    + " uncompressedBlockSizeNoHeader " + uncompressedBlockSizeNoHeader + " prevBlockOffset "
    + prevBlockOffset + " checksumType " + ChecksumType.codeToType(cksumtype)
    + " bytesPerChecksum " + bytesPerChecksum + " onDiskDataSizeWithHeader "
    + onDiskDataSizeWithHeader;
}
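/**
 * Editor's note: illustrative sketch added for this listing, not part of the upstream HBase
 * source. It just spells out the field widths that {@link #toStringHeader(ByteBuff)} reads in
 * order: an 8-byte block magic, two 4-byte sizes, an 8-byte previous-block offset, and, when
 * HBase checksums are in use, a 1-byte checksum type, a 4-byte bytesPerChecksum and a 4-byte
 * onDiskDataSizeWithHeader. The method name is hypothetical.
 */
private static int exampleHeaderSizeWithChecksums() {
  return 8 // block magic (BlockType.MAGIC_LENGTH)
    + 4 // compressedBlockSizeNoHeader
    + 4 // uncompressedBlockSizeNoHeader
    + 8 // prevBlockOffset
    + 1 // checksum type code
    + 4 // bytesPerChecksum
    + 4; // onDiskDataSizeWithHeader
}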
/**
 * Creates a new HFileBlockBuilder from the existing block and a new ByteBuff. The builder will
 * be loaded with all of the original fields from blk, except now using the newBuff and setting
 * isSharedMem based on the source of the passed in newBuff. An existing HFileBlock may have been
 * an {@link ExclusiveMemHFileBlock}, but the new buffer might call for a
 * {@link SharedMemHFileBlock}. Or vice versa.
 * @param blk     the block to clone from
 * @param newBuff the new buffer to use
 */
private static HFileBlockBuilder createBuilder(HFileBlock blk, ByteBuff newBuff) {
  return new HFileBlockBuilder().withBlockType(blk.blockType)
    .withOnDiskSizeWithoutHeader(blk.onDiskSizeWithoutHeader)
    .withUncompressedSizeWithoutHeader(blk.uncompressedSizeWithoutHeader)
    .withPrevBlockOffset(blk.prevBlockOffset).withByteBuff(newBuff).withOffset(blk.offset)
    .withOnDiskDataSizeWithHeader(blk.onDiskDataSizeWithHeader)
    .withNextBlockOnDiskSize(blk.nextBlockOnDiskSize).withHFileContext(blk.fileContext)
    .withByteBuffAllocator(blk.allocator).withShared(!newBuff.hasArray());
}

private static HFileBlock shallowClone(HFileBlock blk, ByteBuff newBuf) {
  return createBuilder(blk, newBuf).build();
}

static HFileBlock deepCloneOnHeap(HFileBlock blk) {
  ByteBuff deepCloned = ByteBuff
    .wrap(ByteBuffer.wrap(blk.bufWithoutChecksum.toBytes(0, blk.bufWithoutChecksum.limit())));
  return createBuilder(blk, deepCloned).build();
}
}
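/*
 * Editor's note: illustrative sketch added for this listing, not part of the upstream HBase
 * source. It ties the intoHeap discussion in FSReader#readBlockData back to the cloning helpers
 * above: when a block that may be backed by off-heap memory has to go into the on-heap
 * LruBlockCache, a deep on-heap copy can be made with HFileBlock.deepCloneOnHeap. The class and
 * method names are hypothetical, and testing getBufferWithoutHeader().hasArray() is only one
 * possible way to detect a heap-backed buffer.
 */
class HFileBlockHeapCloneExample {
  static HFileBlock ensureOnHeap(HFileBlock block) {
    // Heap-backed buffers expose a backing array; off-heap ones do not.
    if (block.getBufferWithoutHeader().hasArray()) {
      return block;
    }
    // Copies the block bytes into a byte[] on the JVM heap before handing it to an on-heap cache.
    return HFileBlock.deepCloneOnHeap(block);
  }
}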




