org.apache.hadoop.hbase.regionserver.wal.FSHLog
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver.wal;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Syncable;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.DrainBarrier;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HasThread;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.util.StringUtils;
import org.cloudera.htrace.Trace;
import org.cloudera.htrace.TraceScope;
import com.google.common.annotations.VisibleForTesting;
/**
* HLog stores all the edits to the HStore. It's the HBase write-ahead-log
* implementation.
*
* It performs logfile-rolling, so external callers are not aware that the
* underlying file is being rolled.
*
*
* There is one HLog per RegionServer. All edits for all Regions carried by
* a particular RegionServer are entered first in the HLog.
*
*
* Each HRegion is identified by a unique long int. HRegions do
* not need to declare themselves before using the HLog; they simply include
* their HRegion-id in the append or completeCacheFlush calls.
*
*
* An HLog consists of multiple on-disk files, which have a chronological order.
* As data is flushed to other (better) on-disk structures, the log becomes
* obsolete. We can destroy all the log messages for a given HRegion-id up to
* the most-recent CACHEFLUSH message from that HRegion.
*
*
* It's only practical to delete entire files. Thus, we delete an entire on-disk
* file F when all of the messages in F have a log-sequence-id that's older
* (smaller) than the most-recent CACHEFLUSH message for every HRegion that has
* a message in F.
*
*
* Synchronized methods can never execute in parallel. However, between the
* start of a cache flush and the completion point, appends are allowed but log
* rolling is not. To prevent log rolling taking place during this period, a
* separate reentrant lock is used.
*
*
* To read an HLog, call {@link HLogFactory#createReader(org.apache.hadoop.fs.FileSystem,
* org.apache.hadoop.fs.Path, org.apache.hadoop.conf.Configuration)}.
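*
* A minimal read sketch (hedged; the variable names are illustrative and assume the
* HLog.Reader API referenced above):
*   HLog.Reader reader = HLogFactory.createReader(fs, walPath, conf);
*   HLog.Entry entry;
*   while ((entry = reader.next()) != null) {
*     // inspect entry.getKey() and entry.getEdit()
*   }
*   reader.close();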
*
*/
@InterfaceAudience.Private
class FSHLog implements HLog, Syncable {
static final Log LOG = LogFactory.getLog(FSHLog.class);
private static final int DEFAULT_SLOW_SYNC_TIME_MS = 100; // in ms
private final FileSystem fs;
private final Path rootDir;
private final Path dir;
private final Configuration conf;
// Listeners that are called on WAL events.
private List<WALActionsListener> listeners =
new CopyOnWriteArrayList<WALActionsListener>();
private final long blocksize;
private final String prefix;
private final AtomicLong unflushedEntries = new AtomicLong(0);
private final AtomicLong syncedTillHere = new AtomicLong(0);
private long lastUnSyncedTxid;
private final Path oldLogDir;
// all writes pending on AsyncWriter/AsyncSyncer thread with
// txid <= failedTxid will fail by throwing asyncIOE
private final AtomicLong failedTxid = new AtomicLong(-1);
private volatile IOException asyncIOE = null;
private WALCoprocessorHost coprocessorHost;
private FSDataOutputStream hdfs_out; // FSDataOutputStream associated with the current SequenceFile.writer
// Minimum tolerable replicas, if the actual value is lower than it,
// rollWriter will be triggered
private int minTolerableReplication;
private Method getNumCurrentReplicas; // refers to DFSOutputStream.getNumCurrentReplicas
private final Method getPipeLine; // refers to DFSOutputStream.getPipeLine
private final int slowSyncNs;
final static Object [] NO_ARGS = new Object []{};
/** The barrier used to ensure that close() waits for all log rolls and flushes to finish. */
private DrainBarrier closeBarrier = new DrainBarrier();
/**
* Current log file.
*/
Writer writer;
/**
* This lock synchronizes all operations on oldestUnflushedSeqNums and oldestFlushingSeqNums,
* with the exception of append's putIfAbsent into oldestUnflushedSeqNums.
* We only use these to find out the low bound seqNum, or to find regions with old seqNums to
* force flush them, so we don't care about these numbers messing with anything. */
private final Object oldestSeqNumsLock = new Object();
/**
* This lock makes sure only one log roll runs at the same time. Should not be taken while
* any other lock is held. We don't just use synchronized because that results in bogus and
* tedious findbugs warning when it thinks synchronized controls writer thread safety */
private final ReentrantLock rollWriterLock = new ReentrantLock(true);
/**
* Map of encoded region names to their most recent sequence/edit id in their memstore.
*/
private final ConcurrentSkipListMap<byte[], Long> oldestUnflushedSeqNums =
new ConcurrentSkipListMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
/**
* Map of encoded region names to their most recent sequence/edit id in their memstore;
* contains the regions that are currently flushing. That way we can store two numbers for
* flushing and non-flushing (oldestUnflushedSeqNums) memstore for the same region.
*/
private final Map<byte[], Long> oldestFlushingSeqNums =
new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
private volatile boolean closed = false;
private boolean forMeta = false;
// The timestamp (in ms) when the log file was created.
private volatile long filenum = -1;
//number of transactions in the current Hlog.
private final AtomicInteger numEntries = new AtomicInteger(0);
// If live datanode count is lower than the default replicas value,
// RollWriter will be triggered in each sync(So the RollWriter will be
// triggered one by one in a short time). Using it as a workaround to slow
// down the roll frequency triggered by checkLowReplication().
private AtomicInteger consecutiveLogRolls = new AtomicInteger(0);
private final int lowReplicationRollLimit;
// If consecutiveLogRolls is larger than lowReplicationRollLimit,
// then disable the rolling in checkLowReplication().
// Enable it if the replications recover.
private volatile boolean lowReplicationRollEnabled = true;
// If > than this size, roll the log. This is typically 0.95 times the size
// of the default Hdfs block size.
private final long logrollsize;
/** size of current log */
private long curLogSize = 0;
/**
* The total size of hlog
*/
private AtomicLong totalLogSize = new AtomicLong(0);
// We synchronize on updateLock to prevent updates and to prevent a log roll
// during an update
// locked during appends
private final Object updateLock = new Object();
private final Object pendingWritesLock = new Object();
private final boolean enabled;
/*
* If more than this many logs, force flush of oldest region to oldest edit
* goes to disk. If too many and we crash, then will take forever replaying.
* Keep the number of logs tidy.
*/
private final int maxLogs;
// List of pending writes to the HLog. There corresponds to transactions
// that have not yet returned to the client. We keep them cached here
// instead of writing them to HDFS piecemeal. The goal is to increase
// the batchsize for writing-to-hdfs as well as sync-to-hdfs, so that
// we can get better system throughput.
private List<Entry> pendingWrites = new LinkedList<Entry>();
private final AsyncWriter asyncWriter;
// since the sync phase takes much longer than the other phases (adding WALEdits to the
// local buffer, writing the local buffer to HDFS, notifying pending write handler threads),
// and all other phases wait while a sync is ongoing, we use multiple parallel
// AsyncSyncer threads to improve overall throughput.
private final AsyncSyncer[] asyncSyncers;
private final AsyncNotifier asyncNotifier;
/** Number of log close errors tolerated before we abort */
private final int closeErrorsTolerated;
private final AtomicInteger closeErrorCount = new AtomicInteger();
private final MetricsWAL metrics;
/**
* Map of region encoded names to the latest sequence num obtained from them while appending
* WALEdits to the wal. We create one map for each WAL file at the time it is rolled.
*
* When deciding whether to archive a WAL file, we compare the sequence IDs in this map to
* {@link #oldestFlushingSeqNums} and {@link #oldestUnflushedSeqNums}.
* See {@link FSHLog#areAllRegionsFlushed(Map, Map, Map)} for more info.
*
* This map uses byte[] as the key, and uses reference equality. It works in our use case as we
* use {@link HRegionInfo#getEncodedNameAsBytes()} as keys. For a given region, it always returns
* the same array.
*/
private Map<byte[], Long> latestSequenceNums = new HashMap<byte[], Long>();
/**
* WAL Comparator; it compares the timestamp (log filenum), present in the log file name.
*/
public final Comparator<Path> LOG_NAME_COMPARATOR = new Comparator<Path>() {
@Override
public int compare(Path o1, Path o2) {
long t1 = getFileNumFromFileName(o1);
long t2 = getFileNumFromFileName(o2);
if (t1 == t2) return 0;
return (t1 > t2) ? 1 : -1;
}
};
/**
* Map of log file to the latest sequence nums of all regions it has entries of.
* The map is sorted by the log file creation timestamp (contained in the log file name).
*/
private NavigableMap<Path, Map<byte[], Long>> hlogSequenceNums =
new ConcurrentSkipListMap<Path, Map<byte[], Long>>(LOG_NAME_COMPARATOR);
/**
* Constructor.
*
* @param fs filesystem handle
* @param root path for stored and archived hlogs
* @param logDir dir where hlogs are stored
* @param conf configuration to use
* @throws IOException
*/
public FSHLog(final FileSystem fs, final Path root, final String logDir,
final Configuration conf)
throws IOException {
this(fs, root, logDir, HConstants.HREGION_OLDLOGDIR_NAME,
conf, null, true, null, false);
}
/**
* Constructor.
*
* @param fs filesystem handle
* @param root path for stored and archived hlogs
* @param logDir dir where hlogs are stored
* @param oldLogDir dir where hlogs are archived
* @param conf configuration to use
* @throws IOException
*/
public FSHLog(final FileSystem fs, final Path root, final String logDir,
final String oldLogDir, final Configuration conf)
throws IOException {
this(fs, root, logDir, oldLogDir,
conf, null, true, null, false);
}
/**
* Create an edit log at the given dir location.
*
* You should never have to load an existing log. If there is a log at
* startup, it should have already been processed and deleted by the time the
* HLog object is started up.
*
* @param fs filesystem handle
* @param root path for stored and archived hlogs
* @param logDir dir where hlogs are stored
* @param conf configuration to use
* @param listeners Listeners on WAL events. Listeners passed here will
* be registered before we do anything else; e.g. the
* Constructor {@link #rollWriter()}.
* @param prefix should always be hostname and port in distributed env and
* it will be URL encoded before being used.
* If prefix is null, "hlog" will be used
* @throws IOException
*/
public FSHLog(final FileSystem fs, final Path root, final String logDir,
final Configuration conf, final List<WALActionsListener> listeners,
final String prefix) throws IOException {
this(fs, root, logDir, HConstants.HREGION_OLDLOGDIR_NAME,
conf, listeners, true, prefix, false);
}
/**
* Create an edit log at the given dir location.
*
* You should never have to load an existing log. If there is a log at
* startup, it should have already been processed and deleted by the time the
* HLog object is started up.
*
* @param fs filesystem handle
* @param root path to where logs and oldlogs are stored
* @param logDir dir where hlogs are stored
* @param oldLogDir dir where hlogs are archived
* @param conf configuration to use
* @param listeners Listeners on WAL events. Listeners passed here will
* be registered before we do anything else; e.g. the
* Constructor {@link #rollWriter()}.
* @param failIfLogDirExists If true IOException will be thrown if dir already exists.
* @param prefix should always be hostname and port in distributed env and
* it will be URL encoded before being used.
* If prefix is null, "hlog" will be used
* @param forMeta if this hlog is meant for meta updates
* @throws IOException
*/
public FSHLog(final FileSystem fs, final Path root, final String logDir,
final String oldLogDir, final Configuration conf,
final List<WALActionsListener> listeners,
final boolean failIfLogDirExists, final String prefix, boolean forMeta)
throws IOException {
super();
this.fs = fs;
this.rootDir = root;
this.dir = new Path(this.rootDir, logDir);
this.oldLogDir = new Path(this.rootDir, oldLogDir);
this.forMeta = forMeta;
this.conf = conf;
if (listeners != null) {
for (WALActionsListener i: listeners) {
registerWALActionsListener(i);
}
}
this.blocksize = this.conf.getLong("hbase.regionserver.hlog.blocksize",
FSUtils.getDefaultBlockSize(this.fs, this.dir));
// Roll at 95% of block size.
float multi = conf.getFloat("hbase.regionserver.logroll.multiplier", 0.95f);
this.logrollsize = (long)(this.blocksize * multi);
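// For example, with a 128 MB HDFS block size and the default 0.95 multiplier,
// logs roll once the current file grows past roughly 121.6 MB.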
this.maxLogs = conf.getInt("hbase.regionserver.maxlogs", 32);
this.minTolerableReplication = conf.getInt(
"hbase.regionserver.hlog.tolerable.lowreplication",
FSUtils.getDefaultReplication(fs, this.dir));
this.lowReplicationRollLimit = conf.getInt(
"hbase.regionserver.hlog.lowreplication.rolllimit", 5);
this.enabled = conf.getBoolean("hbase.regionserver.hlog.enabled", true);
this.closeErrorsTolerated = conf.getInt(
"hbase.regionserver.logroll.errors.tolerated", 0);
LOG.info("WAL/HLog configuration: blocksize=" +
StringUtils.byteDesc(this.blocksize) +
", rollsize=" + StringUtils.byteDesc(this.logrollsize) +
", enabled=" + this.enabled);
// If prefix is null||empty then just name it hlog
this.prefix = prefix == null || prefix.isEmpty() ?
"hlog" : URLEncoder.encode(prefix, "UTF8");
boolean dirExists = false;
if (failIfLogDirExists && (dirExists = this.fs.exists(dir))) {
throw new IOException("Target HLog directory already exists: " + dir);
}
if (!dirExists && !fs.mkdirs(dir)) {
throw new IOException("Unable to mkdir " + dir);
}
if (!fs.exists(this.oldLogDir)) {
if (!fs.mkdirs(this.oldLogDir)) {
throw new IOException("Unable to mkdir " + this.oldLogDir);
}
}
// rollWriter sets this.hdfs_out if it can.
rollWriter();
this.slowSyncNs =
1000000 * conf.getInt("hbase.regionserver.hlog.slowsync.ms",
DEFAULT_SLOW_SYNC_TIME_MS);
// handle the reflection necessary to call getNumCurrentReplicas()
this.getNumCurrentReplicas = getGetNumCurrentReplicas(this.hdfs_out);
this.getPipeLine = getGetPipeline(this.hdfs_out);
final String n = Thread.currentThread().getName();
asyncWriter = new AsyncWriter(n + "-WAL.AsyncWriter");
asyncWriter.start();
int syncerNums = conf.getInt("hbase.hlog.asyncer.number", 5);
asyncSyncers = new AsyncSyncer[syncerNums];
for (int i = 0; i < asyncSyncers.length; ++i) {
asyncSyncers[i] = new AsyncSyncer(n + "-WAL.AsyncSyncer" + i);
asyncSyncers[i].start();
}
asyncNotifier = new AsyncNotifier(n + "-WAL.AsyncNotifier");
asyncNotifier.start();
coprocessorHost = new WALCoprocessorHost(this, conf);
this.metrics = new MetricsWAL();
registerWALActionsListener(metrics);
}
/**
* Find the 'getNumCurrentReplicas' on the passed os stream.
* @return Method or null.
*/
private Method getGetNumCurrentReplicas(final FSDataOutputStream os) {
Method m = null;
if (os != null) {
Class<? extends OutputStream> wrappedStreamClass = os.getWrappedStream()
.getClass();
try {
m = wrappedStreamClass.getDeclaredMethod("getNumCurrentReplicas",
new Class>[] {});
m.setAccessible(true);
} catch (NoSuchMethodException e) {
LOG.info("FileSystem's output stream doesn't support"
+ " getNumCurrentReplicas; --HDFS-826 not available; fsOut="
+ wrappedStreamClass.getName());
} catch (SecurityException e) {
LOG.info("Doesn't have access to getNumCurrentReplicas on "
+ "FileSystems's output stream --HDFS-826 not available; fsOut="
+ wrappedStreamClass.getName(), e);
m = null; // could happen on setAccessible()
}
}
if (m != null) {
if (LOG.isTraceEnabled()) LOG.trace("Using getNumCurrentReplicas--HDFS-826");
}
return m;
}
@Override
public void registerWALActionsListener(final WALActionsListener listener) {
this.listeners.add(listener);
}
@Override
public boolean unregisterWALActionsListener(final WALActionsListener listener) {
return this.listeners.remove(listener);
}
@Override
public long getFilenum() {
return this.filenum;
}
/**
* Method used internally by this class and for tests only.
* @return The wrapped stream our writer is using; it's not the
* writer's 'out' FSDataOutputStream but the stream that this 'out' wraps
* (in HDFS it's an instance of DFSDataOutputStream).
*
* usage: see TestLogRolling.java
*/
OutputStream getOutputStream() {
return this.hdfs_out.getWrappedStream();
}
@Override
public byte [][] rollWriter() throws FailedLogCloseException, IOException {
return rollWriter(false);
}
@Override
public byte [][] rollWriter(boolean force)
throws FailedLogCloseException, IOException {
rollWriterLock.lock();
try {
// Return if nothing to flush.
if (!force && this.writer != null && this.numEntries.get() <= 0) {
return null;
}
byte [][] regionsToFlush = null;
if (closed) {
LOG.debug("HLog closed. Skipping rolling of writer");
return null;
}
try {
if (!closeBarrier.beginOp()) {
LOG.debug("HLog closing. Skipping rolling of writer");
return regionsToFlush;
}
// Do all the preparation outside of the updateLock to block
// the incoming writes as little as possible
long currentFilenum = this.filenum;
Path oldPath = null;
if (currentFilenum > 0) {
//computeFilename will take care of meta hlog filename
oldPath = computeFilename(currentFilenum);
}
this.filenum = System.currentTimeMillis();
Path newPath = computeFilename();
while (fs.exists(newPath)) {
this.filenum++;
newPath = computeFilename();
}
// Tell our listeners that a new log is about to be created
if (!this.listeners.isEmpty()) {
for (WALActionsListener i : this.listeners) {
i.preLogRoll(oldPath, newPath);
}
}
FSHLog.Writer nextWriter = this.createWriterInstance(fs, newPath, conf);
// Can we get at the dfsclient outputstream?
FSDataOutputStream nextHdfsOut = null;
if (nextWriter instanceof ProtobufLogWriter) {
nextHdfsOut = ((ProtobufLogWriter)nextWriter).getStream();
// perform the costly sync before we get the lock to roll writers.
try {
nextWriter.sync();
} catch (IOException e) {
// optimization failed, no need to abort here.
LOG.warn("pre-sync failed", e);
}
}
Path oldFile = null;
int oldNumEntries = 0;
synchronized (updateLock) {
// Clean up current writer.
oldNumEntries = this.numEntries.get();
oldFile = cleanupCurrentWriter(currentFilenum);
this.writer = nextWriter;
this.hdfs_out = nextHdfsOut;
this.numEntries.set(0);
if (oldFile != null) {
this.hlogSequenceNums.put(oldFile, this.latestSequenceNums);
this.latestSequenceNums = new HashMap<byte[], Long>();
}
}
if (oldFile == null) LOG.info("New WAL " + FSUtils.getPath(newPath));
else {
long oldFileLen = this.fs.getFileStatus(oldFile).getLen();
this.totalLogSize.addAndGet(oldFileLen);
LOG.info("Rolled WAL " + FSUtils.getPath(oldFile) + " with entries="
+ oldNumEntries + ", filesize="
+ StringUtils.humanReadableInt(oldFileLen) + "; new WAL "
+ FSUtils.getPath(newPath));
}
// Tell our listeners that a new log was created
if (!this.listeners.isEmpty()) {
for (WALActionsListener i : this.listeners) {
i.postLogRoll(oldPath, newPath);
}
}
// Can we delete any of the old log files?
if (getNumRolledLogFiles() > 0) {
cleanOldLogs();
regionsToFlush = findRegionsToForceFlush();
}
} finally {
closeBarrier.endOp();
}
return regionsToFlush;
} finally {
rollWriterLock.unlock();
}
}
/**
* This method allows subclasses to inject different writers without having to
* extend other methods like rollWriter().
*
* @param fs
* @param path
* @param conf
* @return Writer instance
* @throws IOException
*/
protected Writer createWriterInstance(final FileSystem fs, final Path path,
final Configuration conf) throws IOException {
if (forMeta) {
//TODO: set a higher replication for the hlog files (HBASE-6773)
}
return HLogFactory.createWALWriter(fs, path, conf);
}
/**
* Archive old logs that could be archived: a log is eligible for archiving if all its WALEdits
* are already flushed by the corresponding regions.
*
* For each log file, it compares its region-to-sequenceId map
* ({@link FSHLog#latestSequenceNums}) with the corresponding region entries in
* {@link FSHLog#oldestFlushingSeqNums} and {@link FSHLog#oldestUnflushedSeqNums}.
* If all the regions in the map are flushed past their value, then the wal is eligible for
* archiving.
* @throws IOException
*/
private void cleanOldLogs() throws IOException {
Map<byte[], Long> oldestFlushingSeqNumsLocal = null;
Map<byte[], Long> oldestUnflushedSeqNumsLocal = null;
List<Path> logsToArchive = new ArrayList<Path>();
// make a local copy so as to avoid locking when we iterate over these maps.
synchronized (oldestSeqNumsLock) {
oldestFlushingSeqNumsLocal = new HashMap<byte[], Long>(this.oldestFlushingSeqNums);
oldestUnflushedSeqNumsLocal = new HashMap<byte[], Long>(this.oldestUnflushedSeqNums);
}
for (Map.Entry<Path, Map<byte[], Long>> e : hlogSequenceNums.entrySet()) {
// iterate over the log file.
Path log = e.getKey();
Map<byte[], Long> sequenceNums = e.getValue();
// iterate over the map for this log file, and tell whether it should be archived or not.
if (areAllRegionsFlushed(sequenceNums, oldestFlushingSeqNumsLocal,
oldestUnflushedSeqNumsLocal)) {
logsToArchive.add(log);
LOG.debug("log file is ready for archiving " + log);
}
}
for (Path p : logsToArchive) {
this.totalLogSize.addAndGet(-this.fs.getFileStatus(p).getLen());
archiveLogFile(p);
this.hlogSequenceNums.remove(p);
}
}
/**
* Takes a region:sequenceId map for a WAL file, and checks whether the file can be archived.
* It compares the region entries present in the passed sequenceNums map with the local copy of
* {@link #oldestUnflushedSeqNums} and {@link #oldestFlushingSeqNums}. If, for all regions,
* the value is less than the minimum of the values present in the oldestFlushing/UnflushedSeqNums,
* then the wal file is eligible for archiving.
* @param sequenceNums for a HLog, at the time when it was rolled.
* @param oldestFlushingMap
* @param oldestUnflushedMap
* @return true if wal is eligible for archiving, false otherwise.
*/
static boolean areAllRegionsFlushed(Map<byte[], Long> sequenceNums,
Map<byte[], Long> oldestFlushingMap, Map<byte[], Long> oldestUnflushedMap) {
for (Map.Entry<byte[], Long> regionSeqIdEntry : sequenceNums.entrySet()) {
// find region entries in the flushing/unflushed map. If there is no entry, it means
// a region doesn't have any unflushed entry.
long oldestFlushing = oldestFlushingMap.containsKey(regionSeqIdEntry.getKey()) ?
oldestFlushingMap.get(regionSeqIdEntry.getKey()) : Long.MAX_VALUE;
long oldestUnFlushed = oldestUnflushedMap.containsKey(regionSeqIdEntry.getKey()) ?
oldestUnflushedMap.get(regionSeqIdEntry.getKey()) : Long.MAX_VALUE;
// do a minimum to be sure to contain oldest sequence Id
long minSeqNum = Math.min(oldestFlushing, oldestUnFlushed);
if (minSeqNum <= regionSeqIdEntry.getValue()) return false;// can't archive
}
return true;
}
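// Worked example of the rule above (illustrative seqIds): suppose a rolled WAL recorded
// region R at seqId 5. If R's oldest flushing seqId is now 9 and it has no unflushed entry
// (Long.MAX_VALUE), then min(9, MAX_VALUE) = 9 > 5, so every edit for R in that file has
// been flushed and the file may be archived. If the oldest unflushed seqId were 4, then
// 4 <= 5 and the file must be kept.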
/**
* Iterates over the given map of regions, and compares their sequence numbers with corresponding
* entries in {@link #oldestUnflushedSeqNums}. If the sequence number is greater than or equal, the
* region is eligible to flush; otherwise, there is no benefit in flushing (from the perspective of
* the passed regionsSequenceNums map), because the region has already flushed the entries present
* in the WAL file for which this method is called (typically, the oldest wal file).
* @param regionsSequenceNums
* @return regions which should be flushed (whose sequence numbers are larger than their
* corresponding un-flushed entries).
*/
private byte[][] findEligibleMemstoresToFlush(Map<byte[], Long> regionsSequenceNums) {
List<byte[]> regionsToFlush = null;
// Keeping the old behavior of iterating unflushedSeqNums under oldestSeqNumsLock.
synchronized (oldestSeqNumsLock) {
for (Map.Entry<byte[], Long> e : regionsSequenceNums.entrySet()) {
Long unFlushedVal = this.oldestUnflushedSeqNums.get(e.getKey());
if (unFlushedVal != null && unFlushedVal <= e.getValue()) {
if (regionsToFlush == null) regionsToFlush = new ArrayList<byte[]>();
regionsToFlush.add(e.getKey());
}
}
}
return regionsToFlush == null ? null : regionsToFlush
.toArray(new byte[][] { HConstants.EMPTY_BYTE_ARRAY });
}
/**
* If the number of un-archived WAL files is greater than maximum allowed, it checks
* the first (oldest) WAL file, and returns the regions which should be flushed so that it could
* be archived.
* @return regions to flush in order to archive oldest wal file.
* @throws IOException
*/
byte[][] findRegionsToForceFlush() throws IOException {
byte [][] regions = null;
int logCount = getNumRolledLogFiles();
if (logCount > this.maxLogs && logCount > 0) {
Map.Entry<Path, Map<byte[], Long>> firstWALEntry =
this.hlogSequenceNums.firstEntry();
regions = findEligibleMemstoresToFlush(firstWALEntry.getValue());
}
if (regions != null) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < regions.length; i++) {
if (i > 0) sb.append(", ");
sb.append(Bytes.toStringBinary(regions[i]));
}
LOG.info("Too many hlogs: logs=" + logCount + ", maxlogs=" +
this.maxLogs + "; forcing flush of " + regions.length + " regions(s): " +
sb.toString());
}
return regions;
}
/*
* Cleans up the current writer by closing it.
* Presumes we're operating inside an updateLock scope.
* @return Path to current writer or null if none.
* @throws IOException
*/
Path cleanupCurrentWriter(final long currentfilenum) throws IOException {
Path oldFile = null;
if (this.writer != null) {
// Close the current writer, get a new one.
try {
// Wait till all current transactions are written to the hlog.
// No new transactions can occur because we have the updatelock.
if (this.unflushedEntries.get() != this.syncedTillHere.get()) {
LOG.debug("cleanupCurrentWriter " +
" waiting for transactions to get synced " +
" total " + this.unflushedEntries.get() +
" synced till here " + this.syncedTillHere.get());
sync();
}
this.writer.close();
this.writer = null;
closeErrorCount.set(0);
} catch (IOException e) {
LOG.error("Failed close of HLog writer", e);
int errors = closeErrorCount.incrementAndGet();
if (errors <= closeErrorsTolerated && !hasUnSyncedEntries()) {
LOG.warn("Riding over HLog close failure! error count="+errors);
} else {
if (hasUnSyncedEntries()) {
LOG.error("Aborting due to unflushed edits in HLog");
}
// Failed close of log file. Means we're losing edits. For now,
// shut ourselves down to minimize loss. Alternative is to try and
// keep going. See HBASE-930.
FailedLogCloseException flce =
new FailedLogCloseException("#" + currentfilenum);
flce.initCause(e);
throw flce;
}
}
if (currentfilenum >= 0) {
oldFile = computeFilename(currentfilenum);
}
}
return oldFile;
}
private void archiveLogFile(final Path p) throws IOException {
Path newPath = getHLogArchivePath(this.oldLogDir, p);
// Tell our listeners that a log is going to be archived.
if (!this.listeners.isEmpty()) {
for (WALActionsListener i : this.listeners) {
i.preLogArchive(p, newPath);
}
}
if (!FSUtils.renameAndSetModifyTime(this.fs, p, newPath)) {
throw new IOException("Unable to rename " + p + " to " + newPath);
}
// Tell our listeners that a log has been archived.
if (!this.listeners.isEmpty()) {
for (WALActionsListener i : this.listeners) {
i.postLogArchive(p, newPath);
}
}
}
/**
* This is a convenience method that computes a new filename
* using the current HLog file-number.
* @return Path
*/
protected Path computeFilename() {
return computeFilename(this.filenum);
}
/**
* This is a convenience method that computes a new filename with a given
* file-number.
* @param filenum to use
* @return Path
*/
protected Path computeFilename(long filenum) {
if (filenum < 0) {
throw new RuntimeException("hlog file number can't be < 0");
}
String child = prefix + "." + filenum;
if (forMeta) {
child += HLog.META_HLOG_FILE_EXTN;
}
return new Path(dir, child);
}
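// For example (illustrative values), with prefix "host%2C60020%2C1400000000000" and
// filenum 1400000012345 this returns dir/host%2C60020%2C1400000000000.1400000012345,
// plus META_HLOG_FILE_EXTN when this is a meta hlog.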
/**
* A log file has a creation timestamp (in ms) in its file name ({@link #filenum}).
* This helper method returns the creation timestamp from a given log file.
* It extracts the timestamp assuming the filename is created with the
* {@link #computeFilename(long filenum)} method.
* @param fileName
* @return timestamp, as in the log file name.
*/
protected long getFileNumFromFileName(Path fileName) {
if (fileName == null) throw new IllegalArgumentException("file name can't be null");
// The path should start with dir/.
String prefixPathStr = new Path(dir, prefix + ".").toString();
if (!fileName.toString().startsWith(prefixPathStr)) {
throw new IllegalArgumentException("The log file " + fileName + " doesn't belong to" +
" this regionserver " + prefixPathStr);
}
String chompedPath = fileName.toString().substring(prefixPathStr.length());
if (forMeta) chompedPath = chompedPath.substring(0, chompedPath.indexOf(META_HLOG_FILE_EXTN));
return Long.parseLong(chompedPath);
}
@Override
public void closeAndDelete() throws IOException {
close();
if (!fs.exists(this.dir)) return;
FileStatus[] files = fs.listStatus(this.dir);
if (files != null) {
for(FileStatus file : files) {
Path p = getHLogArchivePath(this.oldLogDir, file.getPath());
// Tell our listeners that a log is going to be archived.
if (!this.listeners.isEmpty()) {
for (WALActionsListener i : this.listeners) {
i.preLogArchive(file.getPath(), p);
}
}
if (!FSUtils.renameAndSetModifyTime(fs, file.getPath(), p)) {
throw new IOException("Unable to rename " + file.getPath() + " to " + p);
}
// Tell our listeners that a log was archived.
if (!this.listeners.isEmpty()) {
for (WALActionsListener i : this.listeners) {
i.postLogArchive(file.getPath(), p);
}
}
}
LOG.debug("Moved " + files.length + " WAL file(s) to " + FSUtils.getPath(this.oldLogDir));
}
if (!fs.delete(dir, true)) {
LOG.info("Unable to delete " + dir);
}
}
@Override
public void close() throws IOException {
if (this.closed) {
return;
}
try {
asyncNotifier.interrupt();
asyncNotifier.join();
} catch (InterruptedException e) {
LOG.error("Exception while waiting for " + asyncNotifier.getName() +
" threads to die", e);
}
for (int i = 0; i < asyncSyncers.length; ++i) {
try {
asyncSyncers[i].interrupt();
asyncSyncers[i].join();
} catch (InterruptedException e) {
LOG.error("Exception while waiting for " + asyncSyncers[i].getName() +
" threads to die", e);
}
}
try {
asyncWriter.interrupt();
asyncWriter.join();
} catch (InterruptedException e) {
LOG.error("Exception while waiting for " + asyncWriter.getName() +
" thread to die", e);
}
try {
// Prevent all further flushing and rolling.
closeBarrier.stopAndDrainOps();
} catch (InterruptedException e) {
LOG.error("Exception while waiting for cache flushes and log rolls", e);
Thread.currentThread().interrupt();
}
// Tell our listeners that the log is closing
if (!this.listeners.isEmpty()) {
for (WALActionsListener i : this.listeners) {
i.logCloseRequested();
}
}
synchronized (updateLock) {
this.closed = true;
if (LOG.isDebugEnabled()) {
LOG.debug("Closing WAL writer in " + this.dir.toString());
}
if (this.writer != null) {
this.writer.close();
this.writer = null;
}
}
}
/**
* @param now
* @param encodedRegionName Encoded name of the region as returned by
* HRegionInfo#getEncodedNameAsBytes().
* @param tableName
* @param clusterIds that have consumed the change
* @return New log key.
*/
protected HLogKey makeKey(byte[] encodedRegionName, TableName tableName, long seqnum,
long now, List<UUID> clusterIds, long nonceGroup, long nonce) {
return new HLogKey(encodedRegionName, tableName, seqnum, now, clusterIds, nonceGroup, nonce);
}
@Override
@VisibleForTesting
public void append(HRegionInfo info, TableName tableName, WALEdit edits,
final long now, HTableDescriptor htd, AtomicLong sequenceId) throws IOException {
append(info, tableName, edits, new ArrayList<UUID>(), now, htd, true, true, sequenceId,
HConstants.NO_NONCE, HConstants.NO_NONCE);
}
/**
* Append a set of edits to the log. Log edits are keyed by (encoded)
* regionName, rowname, and log-sequence-id.
*
* Later, if we sort by these keys, we obtain all the relevant edits for a
* given key-range of the HRegion (TODO). Any edits that do not have a
* matching COMPLETE_CACHEFLUSH message can be discarded.
*
*
* Logs cannot be restarted once closed, or once the HLog process dies. Each
* time the HLog starts, it must create a new log. This means that other
* systems should process the log appropriately upon each startup (and prior
* to initializing HLog).
*
* synchronized prevents appends during the completion of a cache flush or for
* the duration of a log roll.
*
* @param info
* @param tableName
* @param edits
* @param clusterIds that have consumed the change (for replication)
* @param now
* @param doSync shall we sync?
* @param sequenceId of the region.
* @return txid of this transaction
* @throws IOException
*/
@SuppressWarnings("deprecation")
private long append(HRegionInfo info, TableName tableName, WALEdit edits, List<UUID> clusterIds,
final long now, HTableDescriptor htd, boolean doSync, boolean isInMemstore,
AtomicLong sequenceId, long nonceGroup, long nonce) throws IOException {
if (edits.isEmpty()) return this.unflushedEntries.get();
if (this.closed) {
throw new IOException("Cannot append; log is closed");
}
TraceScope traceScope = Trace.startSpan("FSHlog.append");
try {
long txid = 0;
synchronized (this.updateLock) {
// get the sequence number from the passed Long. In normal flow, it is coming from the
// region.
long seqNum = sequenceId.incrementAndGet();
// The 'lastSeqWritten' map holds the sequence number of the oldest
// write for each region (i.e. the first edit added to the particular
// memstore). When the cache is flushed, the entry for the
// region being flushed is removed if the sequence number of the flush
// is greater than or equal to the value in lastSeqWritten.
// Use encoded name. It's shorter, guaranteed unique and a subset of
// actual name.
byte [] encodedRegionName = info.getEncodedNameAsBytes();
if (isInMemstore) this.oldestUnflushedSeqNums.putIfAbsent(encodedRegionName, seqNum);
HLogKey logKey = makeKey(
encodedRegionName, tableName, seqNum, now, clusterIds, nonceGroup, nonce);
synchronized (pendingWritesLock) {
doWrite(info, logKey, edits, htd);
txid = this.unflushedEntries.incrementAndGet();
}
this.numEntries.incrementAndGet();
this.asyncWriter.setPendingTxid(txid);
if (htd.isDeferredLogFlush()) {
lastUnSyncedTxid = txid;
}
this.latestSequenceNums.put(encodedRegionName, seqNum);
}
// TODO: note that only tests currently call append w/sync.
// Therefore, this code here is not actually used by anything.
// Sync if catalog region, and if not then check if that table supports
// deferred log flushing
if (doSync &&
(info.isMetaRegion() ||
!htd.isDeferredLogFlush())) {
// sync txn to file system
this.sync(txid);
}
return txid;
} finally {
traceScope.close();
}
}
@Override
public long appendNoSync(HRegionInfo info, TableName tableName, WALEdit edits,
List<UUID> clusterIds, final long now, HTableDescriptor htd, AtomicLong sequenceId,
boolean isInMemstore, long nonceGroup, long nonce) throws IOException {
return append(info, tableName, edits, clusterIds,
now, htd, false, isInMemstore, sequenceId, nonceGroup, nonce);
}
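// A hedged caller-side sketch (variable names are illustrative): a region typically appends
// without syncing and then blocks on the returned txid:
//   long txid = hlog.appendNoSync(info, tableName, edits, clusterIds,
//       EnvironmentEdgeManager.currentTimeMillis(), htd, sequenceId, true,
//       HConstants.NO_NONCE, HConstants.NO_NONCE);
//   hlog.sync(txid); // blocks until the edit is durable, see syncer(long) below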
/* The work of current write process of HLog goes as below:
* 1). All write handler threads append edits to HLog's local pending buffer;
* (it notifies the AsyncWriter thread that there are new edits in the local buffer)
* 2). All write handler threads wait in HLog.syncer() function for underlying threads to
* finish the sync that contains its txid;
* 3). An AsyncWriter thread is responsible for retrieving all edits in HLog's
* local pending buffer and writing to the hdfs (hlog.writer.append);
* (it notifies AsyncSyncer threads that there are new writes to hdfs which need a sync)
* 4). AsyncSyncer threads are responsible for issuing sync request to hdfs to persist the
* writes by AsyncWriter; (they notify the AsyncNotifier thread that sync is done)
* 5). An AsyncNotifier thread is responsible for notifying all pending write handler
* threads which are waiting in the HLog.syncer() function
* 6). There is no LogSyncer thread any more (the AsyncWriter/AsyncSyncer threads
* now do the same job it did)
* note: more than one AsyncSyncer thread is needed here to guarantee good enough performance
* when there are fewer concurrent write handler threads. Since sync is the most time-consuming
* operation in the whole write process, multiple AsyncSyncer threads can provide better
* parallelism of sync to get better overall throughput
*/
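// Illustrative trace of one edit (say txid 42) through the pipeline above:
//   handler: append()/appendNoSync() buffers the edit (unflushedEntries -> 42,
//            setPendingTxid(42)); the handler later blocks in syncer(42) via sync(42)
//   AsyncWriter: drains pendingWrites, calls writer.append(...) for each entry,
//            then hands txid 42 to an idle AsyncSyncer via setWrittenTxid(42)
//   AsyncSyncer: calls writer.sync(), then asyncNotifier.setFlushedTxid(42)
//   AsyncNotifier: sets syncedTillHere to 42 and notifies all handlers waiting in syncer()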
// thread to write locally buffered writes to HDFS
private class AsyncWriter extends HasThread {
private long pendingTxid = 0;
private long txidToWrite = 0;
private long lastWrittenTxid = 0;
private Object writeLock = new Object();
public AsyncWriter(String name) {
super(name);
}
// wake up (called by (write) handler thread) AsyncWriter thread
// to write buffered writes to HDFS
public void setPendingTxid(long txid) {
synchronized (this.writeLock) {
if (txid <= this.pendingTxid)
return;
this.pendingTxid = txid;
this.writeLock.notify();
}
}
public void run() {
try {
while (!this.isInterrupted()) {
// 1. wait until there is new writes in local buffer
synchronized (this.writeLock) {
while (this.pendingTxid <= this.lastWrittenTxid) {
this.writeLock.wait();
}
}
// 2. get all buffered writes and update 'real' pendingTxid
// since maybe newer writes enter buffer as AsyncWriter wakes
// up and holds the lock
// NOTE! We can't hold 'updateLock' here since rollWriter will pend
// on 'sync()' while holding 'updateLock', but 'sync()' waits on the
// AsyncWriter/AsyncSyncer/AsyncNotifier chain. Without updateLock,
// pendWrites can contain more entries than pendingTxid, but that is not a problem.
List<Entry> pendWrites = null;
synchronized (pendingWritesLock) {
this.txidToWrite = unflushedEntries.get();
pendWrites = pendingWrites;
pendingWrites = new LinkedList<Entry>();
}
// 3. write all buffered writes to HDFS(append, without sync)
try {
for (Entry e : pendWrites) {
writer.append(e);
}
} catch(IOException e) {
LOG.error("Error while AsyncWriter write, request close of hlog ", e);
requestLogRoll();
asyncIOE = e;
failedTxid.set(this.txidToWrite);
}
// 4. update 'lastWrittenTxid' and notify AsyncSyncer to do 'sync'
this.lastWrittenTxid = this.txidToWrite;
boolean hasIdleSyncer = false;
for (int i = 0; i < asyncSyncers.length; ++i) {
if (!asyncSyncers[i].isSyncing()) {
hasIdleSyncer = true;
asyncSyncers[i].setWrittenTxid(this.lastWrittenTxid);
break;
}
}
if (!hasIdleSyncer) {
int idx = (int)(this.lastWrittenTxid % asyncSyncers.length);
asyncSyncers[idx].setWrittenTxid(this.lastWrittenTxid);
}
}
} catch (InterruptedException e) {
LOG.debug(getName() + " interrupted while waiting for " +
"newer writes added to local buffer");
} catch (Exception e) {
LOG.error("UNEXPECTED", e);
} finally {
LOG.info(getName() + " exiting");
}
}
}
// thread to request HDFS to sync the WALEdits written by AsyncWriter
// to make those WALEdits durable on HDFS side
private class AsyncSyncer extends HasThread {
private long writtenTxid = 0;
private long txidToSync = 0;
private long lastSyncedTxid = 0;
private volatile boolean isSyncing = false;
private Object syncLock = new Object();
public AsyncSyncer(String name) {
super(name);
}
public boolean isSyncing() {
return this.isSyncing;
}
// wake up (called by AsyncWriter thread) AsyncSyncer thread
// to sync(flush) writes written by AsyncWriter in HDFS
public void setWrittenTxid(long txid) {
synchronized (this.syncLock) {
if (txid <= this.writtenTxid)
return;
this.writtenTxid = txid;
this.syncLock.notify();
}
}
public void run() {
try {
while (!this.isInterrupted()) {
// 1. wait until AsyncWriter has written data to HDFS and
// called setWrittenTxid to wake us up
synchronized (this.syncLock) {
while (this.writtenTxid <= this.lastSyncedTxid) {
this.syncLock.wait();
}
this.txidToSync = this.writtenTxid;
}
// if this syncer's writes have been synced by other syncer:
// 1. just set lastSyncedTxid
// 2. don't do real sync, don't notify AsyncNotifier, don't logroll check
// regardless of whether the writer is null or not
if (this.txidToSync <= syncedTillHere.get()) {
this.lastSyncedTxid = this.txidToSync;
continue;
}
// 2. do 'sync' to HDFS to provide durability
long now = EnvironmentEdgeManager.currentTimeMillis();
try {
if (writer == null) {
// the only possible case where writer == null is as below:
// 1. t1: AsyncWriter appends writes to hdfs,
// invokes AsyncSyncer 1 with writtenTxid==100
// 2. t2: AsyncWriter appends writes to hdfs,
// invokes AsyncSyncer 2 with writtenTxid==200
// 3. t3: rollWriter starts, it grabs the updateLock which
// prevents further writes entering pendingWrites and
// wait for all items(200) in pendingWrites to append/sync
// to hdfs
// 4. t4: AsyncSyncer 2 finishes, now syncedTillHere==200
// 5. t5: rollWriter closes the writer, sets writer=null...
// 6. t6: AsyncSyncer 1 starts to use writer to do sync... before
// rollWriter sets writer to the newly created Writer
//
// Now writer == null and txidToSync > syncedTillHere here:
// we need to fail all the writes with txid <= txidToSync to avoid
// 'data loss' where the user gets a successful write response but can't
// read the writes!
LOG.error("should never happen: has unsynced writes but writer is null!");
asyncIOE = new IOException("has unsynced writes but writer is null!");
failedTxid.set(this.txidToSync);
} else {
this.isSyncing = true;
writer.sync();
this.isSyncing = false;
}
postSync();
} catch (IOException e) {
LOG.warn("Error while AsyncSyncer sync, request close of hlog ", e);
requestLogRoll();
asyncIOE = e;
failedTxid.set(this.txidToSync);
this.isSyncing = false;
}
final long took = EnvironmentEdgeManager.currentTimeMillis() - now;
metrics.finishSync(took);
if (took > (slowSyncNs/1000000)) {
String msg =
new StringBuilder().append("Slow sync cost: ")
.append(took).append(" ms, current pipeline: ")
.append(Arrays.toString(getPipeLine())).toString();
Trace.addTimelineAnnotation(msg);
LOG.info(msg);
}
// 3. wake up AsyncNotifier to notify(wake-up) all pending 'put'
// handler threads on 'sync()'
this.lastSyncedTxid = this.txidToSync;
asyncNotifier.setFlushedTxid(this.lastSyncedTxid);
// 4. check and do logRoll if needed
boolean lowReplication = false;
if (rollWriterLock.tryLock()) {
try {
lowReplication = checkLowReplication();
} finally {
rollWriterLock.unlock();
}
try {
if (lowReplication || writer != null && writer.getLength() > logrollsize) {
requestLogRoll(lowReplication);
}
} catch (IOException e) {
LOG.warn("writer.getLength() failed,this failure won't block here");
}
}
}
} catch (InterruptedException e) {
LOG.debug(getName() + " interrupted while waiting for " +
"notification from AsyncWriter thread");
} catch (Exception e) {
LOG.error("UNEXPECTED", e);
} finally {
LOG.info(getName() + " exiting");
}
}
}
// thread to notify all write handler threads which are pending on
// their written WALEdits' durability(sync)
// an extra 'notifier' thread is needed, rather than letting the
// AsyncSyncer thread itself notify when sync is done, so that the
// AsyncSyncer thread can start the next sync as soon as possible, since 'notify'
// involves heavy synchronization with all pending write handler threads
private class AsyncNotifier extends HasThread {
private long flushedTxid = 0;
private long lastNotifiedTxid = 0;
private Object notifyLock = new Object();
public AsyncNotifier(String name) {
super(name);
}
public void setFlushedTxid(long txid) {
synchronized (this.notifyLock) {
if (txid <= this.flushedTxid) {
return;
}
this.flushedTxid = txid;
this.notifyLock.notify();
}
}
public void run() {
try {
while (!this.isInterrupted()) {
synchronized (this.notifyLock) {
while (this.flushedTxid <= this.lastNotifiedTxid) {
this.notifyLock.wait();
}
this.lastNotifiedTxid = this.flushedTxid;
}
// notify(wake-up) all pending (write) handler thread
// (or logroller thread which also may pend on sync())
synchronized (syncedTillHere) {
syncedTillHere.set(this.lastNotifiedTxid);
syncedTillHere.notifyAll();
}
}
} catch (InterruptedException e) {
LOG.debug(getName() + " interrupted while waiting for " +
" notification from AsyncSyncer thread");
} catch (Exception e) {
LOG.error("UNEXPECTED", e);
} finally {
LOG.info(getName() + " exiting");
}
}
}
// sync all known transactions
private void syncer() throws IOException {
syncer(this.unflushedEntries.get()); // sync all pending items
}
// sync all transactions up to the specified txid
private void syncer(long txid) throws IOException {
synchronized (this.syncedTillHere) {
while (this.syncedTillHere.get() < txid) {
try {
this.syncedTillHere.wait();
} catch (InterruptedException e) {
LOG.debug("interrupted while waiting for notification from AsyncNotifier");
}
}
}
if (txid <= this.failedTxid.get()) {
assert asyncIOE != null :
"current txid is among(under) failed txids, but asyncIOE is null!";
throw asyncIOE;
}
}
@Override
public void postSync() {}
@Override
public void postAppend(List<Entry> entries) {}
/*
* @return whether log roll should be requested
*/
private boolean checkLowReplication() {
boolean logRollNeeded = false;
// if the number of replicas in HDFS has fallen below the configured
// value, then roll logs.
try {
int numCurrentReplicas = getLogReplication();
if (numCurrentReplicas != 0
&& numCurrentReplicas < this.minTolerableReplication) {
if (this.lowReplicationRollEnabled) {
if (this.consecutiveLogRolls.get() < this.lowReplicationRollLimit) {
LOG.warn("HDFS pipeline error detected. " + "Found "
+ numCurrentReplicas + " replicas but expecting no less than "
+ this.minTolerableReplication + " replicas. "
+ " Requesting close of hlog. current pipeline: "
+ Arrays.toString(getPipeLine()));
logRollNeeded = true;
// If rollWriter is requested, increase consecutiveLogRolls. Once it
// is larger than lowReplicationRollLimit, disable the
// LowReplication-Roller
this.consecutiveLogRolls.getAndIncrement();
} else {
LOG.warn("Too many consecutive RollWriter requests, it's a sign of "
+ "the total number of live datanodes is lower than the tolerable replicas.");
this.consecutiveLogRolls.set(0);
this.lowReplicationRollEnabled = false;
}
}
} else if (numCurrentReplicas >= this.minTolerableReplication) {
if (!this.lowReplicationRollEnabled) {
// The new writer's log replica count is always the default value.
// So we should not enable the LowReplication-Roller. If numEntries
// is lower than or equal to 1, we consider it a new writer.
if (this.numEntries.get() <= 1) {
return logRollNeeded;
}
// Once the live datanode number and the replicas return to normal,
// enable the LowReplication-Roller.
this.lowReplicationRollEnabled = true;
LOG.info("LowReplication-Roller was enabled.");
}
}
} catch (Exception e) {
LOG.warn("Unable to invoke DFSOutputStream.getNumCurrentReplicas" + e +
" still proceeding ahead...");
}
return logRollNeeded;
}
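// Worked example (illustrative): with minTolerableReplication=3 and a pipeline reporting
// 2 replicas, a roll is requested on each sync until consecutiveLogRolls reaches
// lowReplicationRollLimit (default 5); after that, low-replication rolling is disabled
// until the replica count recovers and the new writer has more than one entry.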
/**
* This method gets the datanode replication count for the current HLog.
*
* If the pipeline isn't started yet or is empty, you will get the default
* replication factor. Therefore, if this function returns 0, it means you
* are not properly running with the HDFS-826 patch.
* @throws InvocationTargetException
* @throws IllegalAccessException
* @throws IllegalArgumentException
*
* @throws Exception
*/
int getLogReplication()
throws IllegalArgumentException, IllegalAccessException, InvocationTargetException {
if (this.getNumCurrentReplicas != null && this.hdfs_out != null) {
Object repl = this.getNumCurrentReplicas.invoke(getOutputStream(), NO_ARGS);
if (repl instanceof Integer) {
return ((Integer)repl).intValue();
}
}
return 0;
}
boolean canGetCurReplicas() {
return this.getNumCurrentReplicas != null;
}
@Override
public void hsync() throws IOException {
syncer();
}
@Override
public void hflush() throws IOException {
syncer();
}
@Override
public void sync() throws IOException {
syncer();
}
@Override
public void sync(long txid) throws IOException {
syncer(txid);
}
private void requestLogRoll() {
requestLogRoll(false);
}
private void requestLogRoll(boolean tooFewReplicas) {
if (!this.listeners.isEmpty()) {
for (WALActionsListener i: this.listeners) {
i.logRollRequested(tooFewReplicas);
}
}
}
// TODO: Remove info. Unused.
protected void doWrite(HRegionInfo info, HLogKey logKey, WALEdit logEdit,
HTableDescriptor htd)
throws IOException {
if (!this.enabled) {
return;
}
if (!this.listeners.isEmpty()) {
for (WALActionsListener i: this.listeners) {
i.visitLogEntryBeforeWrite(htd, logKey, logEdit);
}
}
try {
long now = EnvironmentEdgeManager.currentTimeMillis();
// coprocessor hook:
if (!coprocessorHost.preWALWrite(info, logKey, logEdit)) {
if (logEdit.isReplay()) {
// set replication scope null so that this won't be replicated
logKey.setScopes(null);
}
// write to our buffer for the Hlog file.
this.pendingWrites.add(new HLog.Entry(logKey, logEdit));
}
long took = EnvironmentEdgeManager.currentTimeMillis() - now;
coprocessorHost.postWALWrite(info, logKey, logEdit);
long len = 0;
for (KeyValue kv : logEdit.getKeyValues()) {
len += kv.getLength();
}
this.metrics.finishAppend(took, len);
} catch (IOException e) {
LOG.warn("Could not append. Requesting close of hlog", e);
requestLogRoll();
throw e;
}
}
/** @return How many items have been added to the log */
int getNumEntries() {
return numEntries.get();
}
/** @return the number of rolled log files */
public int getNumRolledLogFiles() {
return hlogSequenceNums.size();
}
/** @return the number of log files in use */
@Override
public int getNumLogFiles() {
// +1 for the current log in use
return getNumRolledLogFiles() + 1;
}
/** @return the size of log files in use */
@Override
public long getLogFileSize() {
return totalLogSize.get() + curLogSize;
}
@Override
public boolean startCacheFlush(final byte[] encodedRegionName) {
Long oldRegionSeqNum = null;
if (!closeBarrier.beginOp()) {
LOG.info("Flush will not be started for " + Bytes.toString(encodedRegionName) +
" - because the server is closing.");
return false;
}
synchronized (oldestSeqNumsLock) {
oldRegionSeqNum = this.oldestUnflushedSeqNums.remove(encodedRegionName);
if (oldRegionSeqNum != null) {
Long oldValue = this.oldestFlushingSeqNums.put(encodedRegionName, oldRegionSeqNum);
assert oldValue == null : "Flushing map not cleaned up for "
+ Bytes.toString(encodedRegionName);
}
}
if (oldRegionSeqNum == null) {
// TODO: if we have no oldRegionSeqNum, and WAL is not disabled, presumably either
// the region is already flushing (which would make this call invalid), or there
// were no appends after last flush, so why are we starting flush? Maybe we should
// assert not null, and switch to "long" everywhere. Less rigorous, but safer,
// alternative is telling the caller to stop. For now preserve old logic.
LOG.warn("Couldn't find oldest seqNum for the region we are about to flush: ["
+ Bytes.toString(encodedRegionName) + "]");
}
return true;
}
@Override
public void completeCacheFlush(final byte [] encodedRegionName)
{
synchronized (oldestSeqNumsLock) {
this.oldestFlushingSeqNums.remove(encodedRegionName);
}
closeBarrier.endOp();
}
@Override
public void abortCacheFlush(byte[] encodedRegionName) {
Long currentSeqNum = null, seqNumBeforeFlushStarts = null;
synchronized (oldestSeqNumsLock) {
seqNumBeforeFlushStarts = this.oldestFlushingSeqNums.remove(encodedRegionName);
if (seqNumBeforeFlushStarts != null) {
currentSeqNum =
this.oldestUnflushedSeqNums.put(encodedRegionName, seqNumBeforeFlushStarts);
}
}
closeBarrier.endOp();
if ((currentSeqNum != null)
&& (currentSeqNum.longValue() <= seqNumBeforeFlushStarts.longValue())) {
String errorStr = "Region " + Bytes.toString(encodedRegionName) +
"acquired edits out of order current memstore seq=" + currentSeqNum
+ ", previous oldest unflushed id=" + seqNumBeforeFlushStarts;
LOG.error(errorStr);
assert false : errorStr;
Runtime.getRuntime().halt(1);
}
}
@Override
public boolean isLowReplicationRollEnabled() {
return lowReplicationRollEnabled;
}
/**
* Get the directory we are making logs in.
*
* @return dir
*/
protected Path getDir() {
return dir;
}
static Path getHLogArchivePath(Path oldLogDir, Path p) {
return new Path(oldLogDir, p.getName());
}
static String formatRecoveredEditsFileName(final long seqid) {
return String.format("%019d", seqid);
}
public static final long FIXED_OVERHEAD = ClassSize.align(
ClassSize.OBJECT + (5 * ClassSize.REFERENCE) +
ClassSize.ATOMIC_INTEGER + Bytes.SIZEOF_INT + (3 * Bytes.SIZEOF_LONG));
private static void usage() {
System.err.println("Usage: HLog ");
System.err.println("Arguments:");
System.err.println(" --dump Dump textual representation of passed one or more files");
System.err.println(" For example: HLog --dump hdfs://example.com:9000/hbase/.logs/MACHINE/LOGFILE");
System.err.println(" --split Split the passed directory of WAL logs");
System.err.println(" For example: HLog --split hdfs://example.com:9000/hbase/.logs/DIR");
}
private static void split(final Configuration conf, final Path p)
throws IOException {
FileSystem fs = FileSystem.get(conf);
if (!fs.exists(p)) {
throw new FileNotFoundException(p.toString());
}
if (!fs.getFileStatus(p).isDir()) {
throw new IOException(p + " is not a directory");
}
final Path baseDir = FSUtils.getRootDir(conf);
final Path oldLogDir = new Path(baseDir, HConstants.HREGION_OLDLOGDIR_NAME);
HLogSplitter.split(baseDir, p, oldLogDir, fs, conf);
}
@Override
public WALCoprocessorHost getCoprocessorHost() {
return coprocessorHost;
}
/** Provide access to currently deferred sequence num for tests */
boolean hasUnSyncedEntries() {
return this.lastUnSyncedTxid > this.syncedTillHere.get();
}
@Override
public long getEarliestMemstoreSeqNum(byte[] encodedRegionName) {
Long result = oldestUnflushedSeqNums.get(encodedRegionName);
return result == null ? HConstants.NO_SEQNUM : result.longValue();
}
/**
* Pass one or more log file names and it will either dump out a text version
* on stdout or split the specified log files.
*
* @param args
* @throws IOException
*/
public static void main(String[] args) throws IOException {
if (args.length < 2) {
usage();
System.exit(-1);
}
// either dump using the HLogPrettyPrinter or split, depending on args
if (args[0].compareTo("--dump") == 0) {
HLogPrettyPrinter.run(Arrays.copyOfRange(args, 1, args.length));
} else if (args[0].compareTo("--split") == 0) {
Configuration conf = HBaseConfiguration.create();
for (int i = 1; i < args.length; i++) {
try {
Path logPath = new Path(args[i]);
FSUtils.setFsDefault(conf, logPath);
split(conf, logPath);
} catch (Throwable t) {
t.printStackTrace(System.err);
System.exit(-1);
}
}
} else {
usage();
System.exit(-1);
}
}
/**
* Find the 'getPipeline' on the passed os stream.
* @return Method or null.
*/
private Method getGetPipeline(final FSDataOutputStream os) {
Method m = null;
if (os != null) {
Class<? extends OutputStream> wrappedStreamClass = os.getWrappedStream()
.getClass();
try {
m = wrappedStreamClass.getDeclaredMethod("getPipeline",
new Class>[] {});
m.setAccessible(true);
} catch (NoSuchMethodException e) {
LOG.info("FileSystem's output stream doesn't support"
+ " getPipeline; not available; fsOut="
+ wrappedStreamClass.getName());
} catch (SecurityException e) {
LOG.info(
"Doesn't have access to getPipeline on "
+ "FileSystems's output stream ; fsOut="
+ wrappedStreamClass.getName(), e);
m = null; // could happen on setAccessible()
}
}
return m;
}
/**
* This method gets the pipeline for the current HLog.
* @return the DatanodeInfo pipeline of the current output stream, or an empty array if it cannot be determined.
*/
DatanodeInfo[] getPipeLine() {
if (this.getPipeLine != null && this.hdfs_out != null) {
Object repl;
try {
repl = this.getPipeLine.invoke(getOutputStream(), NO_ARGS);
if (repl instanceof DatanodeInfo[]) {
return ((DatanodeInfo[]) repl);
}
} catch (Exception e) {
LOG.info("Get pipeline failed", e);
}
}
return new DatanodeInfo[0];
}
}