org.apache.hadoop.hbase.io.hfile.HFile Maven / Gradle / Ivy
Show all versions of hbase-server Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.io.hfile;
import java.io.Closeable;
import java.io.DataInput;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.atomic.LongAdder;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
import org.apache.hadoop.hbase.io.MetricsIO;
import org.apache.hadoop.hbase.io.MetricsIOWrapperImpl;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.ReaderContext.ReaderType;
import org.apache.hadoop.hbase.regionserver.CellSink;
import org.apache.hadoop.hbase.regionserver.ShipperListener;
import org.apache.hadoop.hbase.util.BloomFilterWriter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.io.Writable;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
/**
* File format for hbase. A file of sorted key/value pairs. Both keys and values are byte arrays.
*
* The memory footprint of a HFile includes the following (below is taken from the TFile documentation but applies also
* to HFile):
*
* - Some constant overhead of reading or writing a compressed block.
*
* - Each compressed block requires one compression/decompression codec for I/O.
*
* - Temporary space to buffer the key.
*
* - Temporary space to buffer the value.
*
* - HFile index, which is proportional to the total number of Data Blocks. The total amount of
* memory needed to hold the index can be estimated as (56+AvgKeySize)*NumBlocks.
*
* Suggestions on performance optimization.
*
* - Minimum block size. We recommend a setting of minimum block size between 8KB to 1MB for
* general usage. Larger block size is preferred if files are primarily for sequential access.
* However, it would lead to inefficient random access (because there are more data to decompress).
* Smaller blocks are good for random access, but require more memory to hold the block index, and
* may be slower to create (because we must flush the compressor stream at the conclusion of each
* data block, which leads to an FS I/O flush). Further, due to the internal caching in Compression
* codec, the smallest possible block size would be around 20KB-30KB.
*
* - The current implementation does not offer true multi-threading for reading. The
* implementation uses FSDataInputStream seek()+read(), which is shown to be much faster than
* positioned-read call in single thread mode. However, it also means that if multiple threads
* attempt to access the same HFile (using multiple scanners) simultaneously, the actual I/O is
* carried out sequentially even if they access different DFS blocks (Reexamine! pread seems to be
* 10% faster than seek+read in my testing -- stack).
*
* - Compression codec. Use "none" if the data is not very compressible (by compressible, I mean
* a compression ratio of at least 2:1). Generally, use "lzo" as the starting point for
* experimenting. "gz" offers a slightly better compression ratio than "lzo" but requires 4x CPU
* to compress and 2x CPU to decompress, compared to "lzo".
*
* For more on the background behind HFile, see HBASE-61.
*
* File is made of data blocks followed by meta data blocks (if any), a fileinfo block, data block
* index, meta data block index, and a fixed size trailer which records the offsets at which file
* changes content type.
*
*
* <data blocks><meta blocks><fileinfo><
* data index><meta index><trailer>
*
*
* Each block has a bit of magic at its start. Block are comprised of key/values. In data blocks,
* they are both byte arrays. Metadata blocks are a String key and a byte array value. An empty file
* looks like this:
*
*
* <fileinfo><trailer>
*
*
* That is, there are neither data nor meta blocks present.
*
* TODO: Do scanners need to be able to take a start and end row? TODO: Should BlockIndex know the
* name of its file? Should it have a Path that points at its file say for the case where an index
* lives apart from an HFile instance?
*/
@InterfaceAudience.Private
public final class HFile {
// LOG is package-private rather than private because it is also used in HFileBlock and
// CheckSumUtil
static final Logger LOG = LoggerFactory.getLogger(HFile.class);
/**
* Maximum length of key in HFile ({@link Integer#MAX_VALUE}).
*/
public final static int MAXIMUM_KEY_LENGTH = Integer.MAX_VALUE;
/**
* Default compression: none.
*/
public final static Compression.Algorithm DEFAULT_COMPRESSION_ALGORITHM =
Compression.Algorithm.NONE;
/** Minimum supported HFile format version */
public static final int MIN_FORMAT_VERSION = 2;
/**
* Maximum supported HFile format version
*/
public static final int MAX_FORMAT_VERSION = 3;
/**
* Minimum HFile format version with support for persisting cell tags
*/
public static final int MIN_FORMAT_VERSION_WITH_TAGS = 3;
/** Default compression name: the name of {@link #DEFAULT_COMPRESSION_ALGORITHM} (none). */
public final static String DEFAULT_COMPRESSION = DEFAULT_COMPRESSION_ALGORITHM.getName();
/** Meta data block name for bloom filter bits. */
public static final String BLOOM_FILTER_DATA_KEY = "BLOOM_FILTER_DATA";
/**
* We assume that HFile path ends with ROOT_DIR/TABLE_NAME/REGION_NAME/CF_NAME/HFILE, so it has at
* least this many levels of nesting. This is needed for identifying table and CF name from an
* HFile path.
*/
public final static int MIN_NUM_HFILE_PATH_LEVELS = 5;
/**
* The default number of bytes per checksum (16 * 1024, i.e. 16 KB).
*/
public static final int DEFAULT_BYTES_PER_CHECKSUM = 16 * 1024;
// For measuring number of checksum failures. Read by getChecksumFailuresCount() and
// read-and-reset by getAndResetChecksumFailuresCount().
static final LongAdder CHECKSUM_FAILURES = new LongAdder();
// For tests. Gets incremented when we read a block whether from HDFS or from Cache.
public static final LongAdder DATABLOCK_READ_COUNT = new LongAdder();
/**
* Static instance for the metrics so that HFileReaders access the same instance. The read and
* write latency helpers of this class ({@code updateReadLatency}, {@code updateWriteLatency})
* record into it.
*/
static final MetricsIO metrics = new MetricsIO(new MetricsIOWrapperImpl());
/**
* Private no-op constructor. Every member of this final class is static (constants, static
* methods, nested types), so the class is not meant to be instantiated.
*/
private HFile() {
}
/**
 * Reads the number of checksum verification failures accumulated so far and resets the counter
 * as part of the same call.
 * @return the failure count that was accumulated before the reset
 */
public static final long getAndResetChecksumFailuresCount() {
  final long failuresSoFar = CHECKSUM_FAILURES.sumThenReset();
  return failuresSoFar;
}
/**
* Number of checksum verification failures. Unlike {@link #getAndResetChecksumFailuresCount()},
* this only reads the counter; it does not clear it.
* @return the current sum of the checksum failure counter
*/
public static final long getChecksumFailuresCount() {
return CHECKSUM_FAILURES.sum();
}
/**
 * Records a filesystem read latency sample on the shared metrics instance, choosing the metric
 * according to the kind of read.
 * @param latencyMillis the latency sample to record
 * @param pread         {@code true} to record it as a pread time, {@code false} to record it as
 *                      a plain read time
 */
public static final void updateReadLatency(long latencyMillis, boolean pread) {
  if (!pread) {
    metrics.updateFsReadTime(latencyMillis);
    return;
  }
  metrics.updateFsPreadTime(latencyMillis);
}
/**
* Records a filesystem write latency sample on the shared {@code metrics} instance.
* @param latencyMillis the latency sample to record (milliseconds, going by the parameter name)
*/
public static final void updateWriteLatency(long latencyMillis) {
metrics.updateFsWriteTime(latencyMillis);
}
/**
* API required to write an {@link HFile}. A writer is a {@link Closeable} {@link CellSink} and
* {@link ShipperListener} that additionally accepts file-info entries, meta blocks, inline block
* writers and Bloom filter writers.
*/
public interface Writer extends Closeable, CellSink, ShipperListener {
/** Max memstore (mvcc) timestamp in FileInfo */
public static final byte[] MAX_MEMSTORE_TS_KEY = Bytes.toBytes("MAX_MEMSTORE_TS_KEY");
/**
* Add an element to the file info map.
* @param key   file info key
* @param value file info value
* @throws IOException declared by the interface; the conditions depend on the implementation
*/
void appendFileInfo(byte[] key, byte[] value) throws IOException;
/** Returns the path to this {@link HFile} */
Path getPath();
/**
* Adds an inline block writer such as a multi-level block index writer or a compound Bloom
* filter writer.
*/
void addInlineBlockWriter(InlineBlockWriter bloomWriter);
// The below three methods take Writables. We'd like to undo Writables but undoing the below
// would be pretty painful. Could take a byte [] or a Message but we want to be backward
// compatible around hfiles so would need to map between Message and Writable or byte [] and
// current Writable serialization. This would be a bit of work to little gain. Thats my
// thinking at moment. St.Ack 20121129
/**
* Appends a meta block to the file.
* @param bloomFilterMetaKey name of the meta block; despite the parameter name, presumably not
*                           restricted to Bloom filter blocks -- TODO confirm
* @param metaWriter         the {@link Writable} supplying the block content
*/
void appendMetaBlock(String bloomFilterMetaKey, Writable metaWriter);
/**
* Store general Bloom filter in the file. This does not deal with Bloom filter internals but is
* necessary, since Bloom filters are stored differently in HFile version 1 and version 2.
* NOTE(review): this class declares 2 as the minimum supported format version, so the mention
* of version 1 looks historical -- confirm.
*/
void addGeneralBloomFilter(BloomFilterWriter bfw);
/**
* Store delete family Bloom filter in the file, which is only supported in HFile V2.
*/
void addDeleteFamilyBloomFilter(BloomFilterWriter bfw) throws IOException;
/**
* Return the file context for the HFile this writer belongs to
*/
HFileContext getFileContext();
}
/**
 * Builder-style factory for {@link Writer} instances. This variety of ways to construct writers
 * is used throughout the code, and we want to be able to swap writer implementations.
 * <p>
 * Exactly one destination must be configured before {@link #create()} is called: either a
 * filesystem/path pair via {@link #withPath(FileSystem, Path)} or an already opened stream via
 * {@link #withOutputStream(FSDataOutputStream)}.
 */
public static class WriterFactory {
  protected final Configuration conf;
  protected final CacheConfig cacheConf;
  protected FileSystem fs;
  protected Path path;
  protected FSDataOutputStream ostream;
  protected InetSocketAddress[] favoredNodes;
  private HFileContext fileContext;
  protected boolean shouldDropBehind = false;

  WriterFactory(Configuration conf, CacheConfig cacheConf) {
    this.conf = conf;
    this.cacheConf = cacheConf;
  }

  /**
   * Selects a filesystem path as the destination; the output stream is opened by
   * {@link #create()}.
   * @param fs   filesystem to create the file on, must not be null
   * @param path path of the file to create, must not be null
   * @return this factory
   */
  public WriterFactory withPath(FileSystem fs, Path path) {
    Preconditions.checkNotNull(fs);
    Preconditions.checkNotNull(path);
    this.fs = fs;
    this.path = path;
    return this;
  }

  /**
   * Selects an already opened stream as the destination.
   * @param ostream stream to write to, must not be null
   * @return this factory
   */
  public WriterFactory withOutputStream(FSDataOutputStream ostream) {
    Preconditions.checkNotNull(ostream);
    this.ostream = ostream;
    return this;
  }

  /**
   * Sets the favored nodes handed to the output stream creation when a path is used.
   * @param favoredNodes favored nodes, may be null
   * @return this factory
   */
  public WriterFactory withFavoredNodes(InetSocketAddress[] favoredNodes) {
    // Deliberately not checking for null here.
    this.favoredNodes = favoredNodes;
    return this;
  }

  /**
   * Sets the file context passed to the created writer.
   * @return this factory
   */
  public WriterFactory withFileContext(HFileContext fileContext) {
    this.fileContext = fileContext;
    return this;
  }

  /**
   * Requests drop-behind on the output stream. It only takes effect for path-based writers and
   * only when the cache configuration also enables drop-behind for compactions.
   * @return this factory
   */
  public WriterFactory withShouldDropCacheBehind(boolean shouldDropBehind) {
    this.shouldDropBehind = shouldDropBehind;
    return this;
  }

  /**
   * Creates the writer for the configured destination.
   * @return a new {@link Writer}
   * @throws IOException    if opening the output stream or constructing the writer fails
   * @throws AssertionError if neither or both of path and output stream were configured
   */
  public Writer create() throws IOException {
    final boolean hasPath = path != null;
    final boolean hasStream = ostream != null;
    if (hasPath == hasStream) {
      // The two alternatives are a filesystem/path pair or an output stream; the previous
      // message named "path" twice.
      throw new AssertionError("Please specify exactly one of filesystem/path or output stream");
    }
    if (hasPath) {
      ostream = HFileWriterImpl.createOutputStream(conf, fs, path, favoredNodes);
      try {
        ostream.setDropBehind(shouldDropBehind && cacheConf.shouldDropBehindCompaction());
      } catch (UnsupportedOperationException uoe) {
        // Drop-behind is best effort: streams that do not support it are still usable.
        LOG.trace("Unable to set drop behind on {}", path, uoe);
        LOG.debug("Unable to set drop behind on {}", path.getName());
      }
    }
    return new HFileWriterImpl(conf, cacheConf, path, ostream, fileContext);
  }
}
/** The configuration key for HFile version to use for new files */
public static final String FORMAT_VERSION_KEY = "hfile.format.version";

/**
 * Reads the HFile format version from the configuration, falling back to
 * {@link #MAX_FORMAT_VERSION} when {@link #FORMAT_VERSION_KEY} is not set, and validates it with
 * {@link #checkFormatVersion(int)}.
 * @param conf configuration to read the version from
 * @return the validated format version
 * @throws IllegalArgumentException if the configured version is outside the supported range
 */
public static int getFormatVersion(Configuration conf) {
  final int configuredVersion = conf.getInt(FORMAT_VERSION_KEY, MAX_FORMAT_VERSION);
  checkFormatVersion(configuredVersion);
  return configuredVersion;
}
/**
 * Returns a factory for {@link HFile} writers that uses {@code CacheConfig.DISABLED}, i.e. block
 * cache access is disabled for all writers created through the returned factory.
 * @param conf configuration handed to the factory
 * @return a writer factory with the block cache disabled
 */
public static final WriterFactory getWriterFactoryNoCache(Configuration conf) {
  final CacheConfig noCache = CacheConfig.DISABLED;
  return getWriterFactory(conf, noCache);
}
/**
 * Returns the factory to be used to create {@link HFile} writers. Only format version 3 can be
 * written; a configured version of 2, or any other value, is rejected.
 * @param conf      configuration, also consulted for the format version
 * @param cacheConf cache configuration handed to the factory
 * @return a new writer factory
 * @throws IllegalArgumentException if the configured format version is not 3
 */
public static final WriterFactory getWriterFactory(Configuration conf, CacheConfig cacheConf) {
  final int formatVersion = getFormatVersion(conf);
  if (formatVersion == 3) {
    return new HFile.WriterFactory(conf, cacheConf);
  }
  if (formatVersion == 2) {
    throw new IllegalArgumentException("This should never happen. "
      + "Did you change hfile.format.version to read v2? This version of the software writes v3"
      + " hfiles only (but it can read v2 files without having to update hfile.format.version "
      + "in hbase-site.xml)");
  }
  throw new IllegalArgumentException(
    "Cannot create writer for HFile format version " + formatVersion);
}
/**
* An abstraction used by the block index. Implementations will check cache for any asked-for
* block and return cached block if found. Otherwise, after reading from fs, will try and put
* block into cache before returning.
*/
public interface CachingBlockReader {
/**
* Read in a file block.
* @param offset offset to read.
* @param onDiskBlockSize size of the block
* @param cacheBlock presumably whether the block read should be put into the block
*                                  cache -- TODO confirm against implementations
* @param pread presumably whether to use a positional read rather than
*                                  seek+read -- TODO confirm against implementations
* @param isCompaction is this block being read as part of a compaction
* @param updateCacheMetrics presumably whether cache hit/miss metrics should be updated
*                                  for this read -- TODO confirm against implementations
* @param expectedBlockType the block type we are expecting to read with this read
* operation, or null to read whatever block type is available
* and avoid checking (that might reduce caching efficiency of
* encoded data blocks)
* @param expectedDataBlockEncoding the data block encoding the caller is expecting data blocks
* to be in, or null to not perform this check and return the
* block irrespective of the encoding. This check only applies
* to data blocks and can be set to null when the caller is
* expecting to read a non-data block and has set
* expectedBlockType accordingly.
* @return the block that was read, as an {@link HFileBlock}
* @throws IOException declared by the interface; the conditions depend on the implementation
*/
HFileBlock readBlock(long offset, long onDiskBlockSize, boolean cacheBlock, final boolean pread,
final boolean isCompaction, final boolean updateCacheMetrics, BlockType expectedBlockType,
DataBlockEncoding expectedDataBlockEncoding) throws IOException;
/**
* Variant of the read above with one extra flag; the first eight parameters are the same.
* @param cacheOnly NOTE(review): presumably restricts the read to what is already in the
*                  cache -- the semantics are not visible here, confirm against implementations
*/
HFileBlock readBlock(long offset, long onDiskBlockSize, boolean cacheBlock, final boolean pread,
final boolean isCompaction, final boolean updateCacheMetrics, BlockType expectedBlockType,
DataBlockEncoding expectedDataBlockEncoding, boolean cacheOnly) throws IOException;
}
/**
* An interface used by clients to open and iterate an {@link HFile}. Besides scanner creation it
* exposes accessors for the trailer, block index readers, Bloom filter metadata and file context.
* <p>
* NOTE(review): the {@code Optional} return types below are raw; their type arguments appear to
* be missing. Confirm the intended element types against the implementations before relying on
* them.
*/
public interface Reader extends Closeable, CachingBlockReader {
/**
* Returns this reader's "name". Usually the last component of the path. Needs to be constant as
* the file is being moved to support caching on write.
*/
String getName();
/** Returns the {@link CellComparator} associated with this file. */
CellComparator getComparator();
/**
* Creates a scanner over this file.
* @param cacheBlocks  presumably whether blocks read by the scanner should be cached -- TODO
*                     confirm
* @param pread        presumably whether the scanner uses positional reads -- TODO confirm
* @param isCompaction whether the scanner is created on behalf of a compaction, going by the
*                     parameter name
*/
HFileScanner getScanner(boolean cacheBlocks, final boolean pread, final boolean isCompaction);
/**
* Returns the meta block stored under the given name.
* @param metaBlockName name of the meta block
* @param cacheBlock    presumably whether the block should be cached after reading -- TODO
*                      confirm
*/
HFileBlock getMetaBlock(String metaBlockName, boolean cacheBlock) throws IOException;
Optional getLastKey();
Optional midKey() throws IOException;
long length();
long getEntries();
Optional getFirstKey();
long indexSize();
Optional getFirstRowKey();
Optional getLastRowKey();
FixedFileTrailer getTrailer();
void setDataBlockIndexReader(HFileBlockIndex.CellBasedKeyBlockIndexReader reader);
HFileBlockIndex.CellBasedKeyBlockIndexReader getDataBlockIndexReader();
void setMetaBlockIndexReader(HFileBlockIndex.ByteArrayKeyBlockIndexReader reader);
HFileBlockIndex.ByteArrayKeyBlockIndexReader getMetaBlockIndexReader();
/** Two-argument scanner factory; see the three-argument overload for the shared parameters. */
HFileScanner getScanner(boolean cacheBlocks, boolean pread);
/**
* Retrieves general Bloom filter metadata as appropriate for each {@link HFile} version. Knows
* nothing about how that metadata is structured.
*/
DataInput getGeneralBloomFilterMetadata() throws IOException;
/**
* Retrieves delete family Bloom filter metadata as appropriate for each {@link HFile} version.
* Knows nothing about how that metadata is structured.
*/
DataInput getDeleteBloomFilterMetadata() throws IOException;
Path getPath();
/** Close method with optional evictOnClose */
void close(boolean evictOnClose) throws IOException;
DataBlockEncoding getDataBlockEncoding();
boolean hasMVCCInfo();
/**
* Return the file context of the HFile this reader belongs to
*/
HFileContext getFileContext();
boolean isPrimaryReplicaReader();
DataBlockEncoding getEffectiveEncodingInCache(boolean isCompaction);
HFileBlock.FSReader getUncachedBlockReader();
boolean prefetchComplete();
/**
* To close the stream's socket. Note: This can be concurrently called from multiple threads and
* implementation should take care of thread safety.
*/
void unbufferStream();
/** Returns the {@link ReaderContext} associated with this reader. */
ReaderContext getContext();
/** Returns the {@link HFileInfo} associated with this reader. */
HFileInfo getHFileInfo();
void setDataBlockEncoder(HFileDataBlockEncoder dataBlockEncoder);
}
/**
 * Builds the reader matching the given context. A {@code STREAM} reader type yields an
 * {@code HFileStreamReader}; otherwise the trailer's major version is inspected and versions 2
 * and 3 both yield an {@code HFilePreadReader}. TODO This is a bad abstraction. See HBASE-6635.
 * <p>
 * Any failure closes the context's input stream wrapper and is rethrown wrapped in a
 * {@link CorruptHFileException}. The stream wrapper is unbuffered on every exit path.
 * @param context   Reader context info
 * @param fileInfo  HFile info
 * @param cacheConf Cache configuration values, cannot be null.
 * @param conf      Configuration
 * @return an appropriate instance of HFileReader
 * @throws IOException If file is invalid, will throw CorruptHFileException flavored IOException
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "SF_SWITCH_FALLTHROUGH",
  justification = "Intentional")
public static Reader createReader(ReaderContext context, HFileInfo fileInfo,
  CacheConfig cacheConf, Configuration conf) throws IOException {
  try {
    if (context.getReaderType() == ReaderType.STREAM) {
      // stream reader will share trailer with pread reader, see HFileStreamReader#copyFields
      return new HFileStreamReader(context, fileInfo, cacheConf, conf);
    }
    final FixedFileTrailer fileTrailer = fileInfo.getTrailer();
    switch (fileTrailer.getMajorVersion()) {
      case 2:
        LOG.debug("Opening HFile v2 with v3 reader");
        // Fall through. FindBugs: SF_SWITCH_FALLTHROUGH
      case 3:
        return new HFilePreadReader(context, fileInfo, cacheConf, conf);
      default:
        throw new IllegalArgumentException(
          "Invalid HFile version " + fileTrailer.getMajorVersion());
    }
  } catch (Throwable failure) {
    // Release the stream before reporting the failure; close errors are only logged.
    IOUtils.closeQuietly(context.getInputStreamWrapper(),
      closeError -> LOG.warn("failed to close input stream wrapper", closeError));
    final String message = "Problem reading HFile Trailer from file " + context.getFilePath();
    throw new CorruptHFileException(message, failure);
  } finally {
    context.getInputStreamWrapper().unbuffer();
  }
}
/**
 * Creates a reader that does not use the block cache ({@code CacheConfig.DISABLED}).
 * @param fs   filesystem
 * @param path Path to file to read
 * @param conf Configuration
 * @return an active Reader instance
 * @throws IOException Will throw a CorruptHFileException (DoNotRetryIOException subtype) if hfile
 *                     is corrupt/invalid.
 */
public static Reader createReader(FileSystem fs, Path path, Configuration conf)
  throws IOException {
  // The primaryReplicaReader is mainly used for constructing block cache key, so if we do not use
  // block cache then it is OK to set it as any value. We use true here.
  final boolean primaryReplicaReader = true;
  return createReader(fs, path, CacheConfig.DISABLED, primaryReplicaReader, conf);
}
/**
 * Opens a {@code PREAD} reader on the given file: builds the reader context, loads the file
 * info, creates the reader and then initializes its meta and index data.
 * @param fs                   filesystem
 * @param path                 Path to file to read
 * @param cacheConf            This must not be null.
 * @param primaryReplicaReader true if this is a reader for primary replica
 * @param conf                 Configuration
 * @return an active Reader instance
 * @throws IOException Will throw a CorruptHFileException (DoNotRetryIOException subtype) if hfile
 *                     is corrupt/invalid.
 * @see CacheConfig#CacheConfig(Configuration)
 */
public static Reader createReader(FileSystem fs, Path path, CacheConfig cacheConf,
  boolean primaryReplicaReader, Configuration conf) throws IOException {
  Preconditions.checkNotNull(cacheConf, "Cannot create Reader with null CacheConf");
  // Look the file size up BEFORE opening the stream. Previously the stream was opened first, so
  // a failing getFileStatus() call left the freshly opened stream unclosed.
  final long fileSize = fs.getFileStatus(path).getLen();
  FSDataInputStreamWrapper stream = new FSDataInputStreamWrapper(fs, path);
  ReaderContext context =
    new ReaderContextBuilder().withFilePath(path).withInputStreamWrapper(stream)
      .withFileSize(fileSize).withFileSystem(stream.getHfs())
      .withPrimaryReplicaReader(primaryReplicaReader).withReaderType(ReaderType.PREAD).build();
  // NOTE(review): whether HFileInfo and initMetaAndIndex release the stream on failure is not
  // visible from here -- confirm before adding further cleanup at this level.
  HFileInfo fileInfo = new HFileInfo(context, conf);
  Reader reader = createReader(context, fileInfo, cacheConf, conf);
  fileInfo.initMetaAndIndex(reader);
  return reader;
}
/**
 * Tells whether the file at the given path carries a valid HFile trailer. Resolves the path to
 * its {@link FileStatus} and delegates to the status-based overload.
 * @param fs   filesystem
 * @param path Path to file to verify
 * @return true if the file has a valid HFile Trailer, otherwise false
 * @throws IOException if failed to read from the underlying stream
 */
public static boolean isHFileFormat(final FileSystem fs, final Path path) throws IOException {
  final FileStatus status = fs.getFileStatus(path);
  return isHFileFormat(fs, status);
}
/**
 * Tells whether the given file carries a valid HFile trailer by attempting to read the trailer
 * from it. An {@link IllegalArgumentException} raised while doing so is taken to mean "not an
 * HFile"; the stream opened for the probe is always closed.
 * @param fs         filesystem
 * @param fileStatus the file to verify
 * @return true if the file has a valid HFile Trailer, otherwise false
 * @throws IOException if failed to read from the underlying stream
 */
public static boolean isHFileFormat(final FileSystem fs, final FileStatus fileStatus)
  throws IOException {
  final Path filePath = fileStatus.getPath();
  final long fileLength = fileStatus.getLen();
  try (FSDataInputStreamWrapper wrapper = new FSDataInputStreamWrapper(fs, filePath)) {
    final boolean useHBaseChecksum = wrapper.shouldUseHBaseChecksum();
    assert !useHBaseChecksum; // Initially we must read with FS checksum.
    FixedFileTrailer.readFromStream(wrapper.getStream(useHBaseChecksum), fileLength);
  } catch (IllegalArgumentException notAnHFile) {
    return false;
  }
  return true;
}
/**
* Get names of supported compression algorithms, as reported by
* {@link Compression#getSupportedAlgorithms()}. The names are acceptable by HFile.Writer.
* @return Array of strings, each representing a supported compression algorithm. Examples
* named by the original documentation:
* <ul>
* <li>"none" - No compression.</li>
* <li>"gz" - GZIP compression.</li>
* </ul>
*/
public static String[] getSupportedCompressionAlgorithms() {
return Compression.getSupportedAlgorithms();
}
// Utility methods.
/**
 * Narrows a long to an int by keeping its low 32 bits.
 * @param l Long to convert to an int.
 * @return l cast as an int; values that do not fit wrap around (e.g. 0xFFFFFFFFL becomes -1).
 */
static int longToInt(final long l) {
  // Expecting the size() of a block not exceeding 4GB. Assuming the
  // size() will wrap to negative integer if it exceeds 2GB (From tfile).
  // A narrowing cast already discards the high 32 bits, so no explicit mask is needed.
  return (int) l;
}
/**
 * Returns all HFiles belonging to the given region directory. Could return an empty list.
 * <p>
 * The sub-directories of {@code regionDir} are listed, and every non-directory entry directly
 * inside them is collected, except entries whose full path string contains
 * {@code HConstants.HREGION_OLDLOGDIR_NAME} or {@code HConstants.RECOVERED_EDITS_DIR}.
 * @param fs        The file system reference.
 * @param regionDir The region directory to scan.
 * @return The list of files found.
 * @throws IOException When scanning the files fails.
 */
public static List<Path> getStoreFiles(FileSystem fs, Path regionDir) throws IOException {
  // Typed as List<Path> (was a raw List): only Path instances are ever added.
  List<Path> regionHFiles = new ArrayList<>();
  PathFilter dirFilter = new FSUtils.DirFilter(fs);
  FileStatus[] familyDirs = fs.listStatus(regionDir, dirFilter);
  for (FileStatus dir : familyDirs) {
    FileStatus[] files = fs.listStatus(dir.getPath());
    for (FileStatus file : files) {
      if (
        !file.isDirectory()
          && (!file.getPath().toString().contains(HConstants.HREGION_OLDLOGDIR_NAME))
          && (!file.getPath().toString().contains(HConstants.RECOVERED_EDITS_DIR))
      ) {
        regionHFiles.add(file.getPath());
      }
    }
  }
  return regionHFiles;
}
/**
 * Validates an {@link HFile} format version against the supported range
 * [{@link #MIN_FORMAT_VERSION}, {@link #MAX_FORMAT_VERSION}]. Note that if the version number
 * comes from an input file and has not been verified, the caller needs to re-throw an
 * {@link IOException} to indicate that this is not a software error, but corrupted input.
 * @param version an HFile version
 * @throws IllegalArgumentException if the version is invalid
 */
public static void checkFormatVersion(int version) throws IllegalArgumentException {
  final boolean supported = version >= MIN_FORMAT_VERSION && version <= MAX_FORMAT_VERSION;
  if (supported) {
    return;
  }
  throw new IllegalArgumentException("Invalid HFile version: " + version
    + " (expected to be between " + MIN_FORMAT_VERSION + " and " + MAX_FORMAT_VERSION + ")");
}
/**
 * Verifies that the format version configured under {@link #FORMAT_VERSION_KEY} (defaulting to
 * {@link #MAX_FORMAT_VERSION} when unset) is exactly {@link #MAX_FORMAT_VERSION}.
 * @param c configuration to inspect
 * @throws IllegalArgumentException if the configured version differs from
 *                                  {@link #MAX_FORMAT_VERSION}
 */
public static void checkHFileVersion(final Configuration c) {
  final int configuredVersion = c.getInt(FORMAT_VERSION_KEY, MAX_FORMAT_VERSION);
  if (configuredVersion == MAX_FORMAT_VERSION) {
    return;
  }
  throw new IllegalArgumentException("The setting for " + FORMAT_VERSION_KEY
    + " (in your hbase-*.xml files) is " + configuredVersion + " which does not match "
    + MAX_FORMAT_VERSION
    + "; are you running with a configuration from an older or newer hbase install (an "
    + "incompatible hbase-default.xml or hbase-site.xml on your CLASSPATH)?");
}
/**
* Command-line entry point; forwards the arguments unchanged to {@code HFilePrettyPrinter.main}.
* @param args command-line arguments, passed straight through
* @throws Exception whatever {@code HFilePrettyPrinter.main} throws
*/
public static void main(String[] args) throws Exception {
// delegate to preserve old behavior
HFilePrettyPrinter.main(args);
}
}
| | |