/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.wal;

import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NavigableSet;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellScanner;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.coordination.SplitLogWorkerCoordination;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.log.HBaseMarkers;
import org.apache.hadoop.hbase.master.SplitLogManager;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.LastSequenceId;
import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
import org.apache.hadoop.hbase.regionserver.wal.WALCellCodec;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CancelableProgressable;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.wal.WAL.Entry;
import org.apache.hadoop.hbase.wal.WAL.Reader;
import org.apache.hadoop.hbase.wal.WALProvider.Writer;
import org.apache.hadoop.hbase.zookeeper.ZKSplitLog;
import org.apache.hadoop.io.MultipleIOException;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.protobuf.TextFormat;
import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils;
import org.apache.hbase.thirdparty.org.apache.commons.collections4.MapUtils;

import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.WALEntry;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.MutationProto.MutationType;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.RegionStoreSequenceIds;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
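
// A minimal, hedged usage sketch (not part of the upstream source): tools and tests typically
// drive this class through its static entry points further down in the file. The server name in
// the WAL directory and the WALFactory constructor arguments below are illustrative assumptions;
// the method names and parameter order mirror the split()/finishSplitLogFile() methods defined
// in this class.
//
//   Configuration conf = HBaseConfiguration.create();
//   FileSystem fs = FileSystem.get(conf);
//   Path walRootDir = FSUtils.getWALRootDir(conf);
//   Path logDir = new Path(walRootDir, "WALs/host,16020,1576000000000");   // hypothetical dead-RS WAL dir
//   Path oldLogDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME);
//   WALFactory walFactory = new WALFactory(conf, "wal-split-tool");        // assumed factory id
//   List<Path> recoveredEdits =
//       WALSplitter.split(walRootDir, logDir, oldLogDir, fs, conf, walFactory);
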
/**
 * This class is responsible for splitting up a bunch of regionserver commit log
 * (WAL) files that are no longer being written to, into new files, one per region,
 * so the edits can be recovered on startup. The old log files are deleted once
 * splitting is finished.
 */
@InterfaceAudience.Private
public class WALSplitter {
  private static final Logger LOG = LoggerFactory.getLogger(WALSplitter.class);

  /** By default we retry errors in splitting, rather than skipping. */
  public static final boolean SPLIT_SKIP_ERRORS_DEFAULT = false;
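
  // Hedged note: whether a parse error is retried or the file is skipped is governed by
  // "hbase.hlog.split.skip.errors" (read in splitLogFile() below). With the default of false the
  // error propagates and the split task is retried; when set to true an unparseable WAL is marked
  // corrupted and later moved aside under HConstants.CORRUPT_DIR_NAME instead of failing the task.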

  // Parameters for split process
  protected final Path rootDir;
  protected final FileSystem fs;
  protected final Configuration conf;

  // Major subcomponents of the split process.
  // These are separated into inner classes to make testing easier.
  OutputSink outputSink;
  private EntryBuffers entryBuffers;

  private SplitLogWorkerCoordination splitLogWorkerCoordination;
  private final WALFactory walFactory;

  private MonitoredTask status;

  // For checking the latest flushed sequence id
  protected final LastSequenceId sequenceIdChecker;

  // Map encodedRegionName -> lastFlushedSequenceId
  protected Map<String, Long> lastFlushedSequenceIds = new ConcurrentHashMap<>();

  // Map encodedRegionName -> maxSeqIdInStores
  protected Map<String, Map<byte[], Long>> regionMaxSeqIdInStores = new ConcurrentHashMap<>();
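
  // How these two maps are used below (hedged summary): while replaying a WAL, any entry whose
  // sequence id is not greater than the region's lastFlushedSequenceId is skipped outright, and
  // filterCellByStore() additionally drops cells whose column family already has a flushed
  // sequence id at or above the entry's, so only genuinely unflushed edits reach the
  // recovered.edits output.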

  // the file being split currently
  private FileStatus fileBeingSplit;

  // if we limit the number of writers opened for sinking recovered edits
  private final boolean splitWriterCreationBounded;

  public final static String SPLIT_WRITER_CREATION_BOUNDED = "hbase.split.writer.creation.bounded";
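
  // Hedged configuration sketch: bounded writer creation is off by default (see the constructor
  // below); enabling it in hbase-site.xml would look roughly like:
  //
  //   <property>
  //     <name>hbase.split.writer.creation.bounded</name>
  //     <value>true</value>
  //   </property>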


  @VisibleForTesting
  WALSplitter(final WALFactory factory, Configuration conf, Path rootDir,
      FileSystem fs, LastSequenceId idChecker,
      SplitLogWorkerCoordination splitLogWorkerCoordination) {
    this.conf = HBaseConfiguration.create(conf);
    String codecClassName = conf
        .get(WALCellCodec.WAL_CELL_CODEC_CLASS_KEY, WALCellCodec.class.getName());
    this.conf.set(HConstants.RPC_CODEC_CONF_KEY, codecClassName);
    this.rootDir = rootDir;
    this.fs = fs;
    this.sequenceIdChecker = idChecker;
    this.splitLogWorkerCoordination = splitLogWorkerCoordination;

    this.walFactory = factory;
    PipelineController controller = new PipelineController();

    this.splitWriterCreationBounded = conf.getBoolean(SPLIT_WRITER_CREATION_BOUNDED, false);

    entryBuffers = new EntryBuffers(controller,
        this.conf.getInt("hbase.regionserver.hlog.splitlog.buffersize", 128 * 1024 * 1024),
        splitWriterCreationBounded);

    int numWriterThreads = this.conf.getInt("hbase.regionserver.hlog.splitlog.writer.threads", 3);
    if (splitWriterCreationBounded) {
      outputSink = new BoundedLogWriterCreationOutputSink(
          controller, entryBuffers, numWriterThreads);
    } else {
      outputSink = new LogRecoveredEditsOutputSink(controller, entryBuffers, numWriterThreads);
    }
  }
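
  // Hedged summary of the pipeline wired up above: the thread running splitLogFile() appends WAL
  // entries into EntryBuffers (capped by "hbase.regionserver.hlog.splitlog.buffersize", 128 MB by
  // default), "hbase.regionserver.hlog.splitlog.writer.threads" WriterThreads (3 by default)
  // drain those per-region buffers, and the chosen OutputSink (bounded or not) writes the
  // recovered.edits files, one per region.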

  /**
   * Splits a WAL file into region's recovered-edits directory.
   * This is the main entry point for distributed log splitting from SplitLogWorker.
   * <p>
   * If the log file has N regions then N recovered.edits files will be produced.
   * <p>
* @return false if it is interrupted by the progress-able. */ public static boolean splitLogFile(Path rootDir, FileStatus logfile, FileSystem fs, Configuration conf, CancelableProgressable reporter, LastSequenceId idChecker, SplitLogWorkerCoordination splitLogWorkerCoordination, final WALFactory factory) throws IOException { WALSplitter s = new WALSplitter(factory, conf, rootDir, fs, idChecker, splitLogWorkerCoordination); return s.splitLogFile(logfile, reporter); } // A wrapper to split one log folder using the method used by distributed // log splitting. Used by tools and unit tests. It should be package private. // It is public only because TestWALObserver is in a different package, // which uses this method to do log splitting. @VisibleForTesting public static List split(Path rootDir, Path logDir, Path oldLogDir, FileSystem fs, Configuration conf, final WALFactory factory) throws IOException { final FileStatus[] logfiles = SplitLogManager.getFileList(conf, Collections.singletonList(logDir), null); List splits = new ArrayList<>(); if (ArrayUtils.isNotEmpty(logfiles)) { for (FileStatus logfile: logfiles) { WALSplitter s = new WALSplitter(factory, conf, rootDir, fs, null, null); if (s.splitLogFile(logfile, null)) { finishSplitLogFile(rootDir, oldLogDir, logfile.getPath(), conf); if (s.outputSink.splits != null) { splits.addAll(s.outputSink.splits); } } } } if (!fs.delete(logDir, true)) { throw new IOException("Unable to delete src dir: " + logDir); } return splits; } /** * log splitting implementation, splits one log file. * @param logfile should be an actual log file. */ @VisibleForTesting boolean splitLogFile(FileStatus logfile, CancelableProgressable reporter) throws IOException { Preconditions.checkState(status == null); Preconditions.checkArgument(logfile.isFile(), "passed in file status is for something other than a regular file."); boolean isCorrupted = false; boolean skipErrors = conf.getBoolean("hbase.hlog.split.skip.errors", SPLIT_SKIP_ERRORS_DEFAULT); int interval = conf.getInt("hbase.splitlog.report.interval.loglines", 1024); Path logPath = logfile.getPath(); boolean outputSinkStarted = false; boolean progress_failed = false; int editsCount = 0; int editsSkipped = 0; status = TaskMonitor.get().createStatus( "Splitting log file " + logfile.getPath() + "into a temporary staging area."); Reader logFileReader = null; this.fileBeingSplit = logfile; try { long logLength = logfile.getLen(); LOG.info("Splitting WAL={}, length={}", logPath, logLength); status.setStatus("Opening log file"); if (reporter != null && !reporter.progress()) { progress_failed = true; return false; } logFileReader = getReader(logfile, skipErrors, reporter); if (logFileReader == null) { LOG.warn("Nothing to split in WAL={}", logPath); return true; } int numOpenedFilesBeforeReporting = conf.getInt("hbase.splitlog.report.openedfiles", 3); int numOpenedFilesLastCheck = 0; outputSink.setReporter(reporter); outputSink.startWriterThreads(); outputSinkStarted = true; Entry entry; Long lastFlushedSequenceId = -1L; while ((entry = getNextLogLine(logFileReader, logPath, skipErrors)) != null) { byte[] region = entry.getKey().getEncodedRegionName(); String encodedRegionNameAsStr = Bytes.toString(region); lastFlushedSequenceId = lastFlushedSequenceIds.get(encodedRegionNameAsStr); if (lastFlushedSequenceId == null) { if (sequenceIdChecker != null) { RegionStoreSequenceIds ids = sequenceIdChecker.getLastSequenceId(region); Map maxSeqIdInStores = new TreeMap<>(Bytes.BYTES_COMPARATOR); for (StoreSequenceId storeSeqId : 
ids.getStoreSequenceIdList()) { maxSeqIdInStores.put(storeSeqId.getFamilyName().toByteArray(), storeSeqId.getSequenceId()); } regionMaxSeqIdInStores.put(encodedRegionNameAsStr, maxSeqIdInStores); lastFlushedSequenceId = ids.getLastFlushedSequenceId(); if (LOG.isDebugEnabled()) { LOG.debug("DLS Last flushed sequenceid for " + encodedRegionNameAsStr + ": " + TextFormat.shortDebugString(ids)); } } if (lastFlushedSequenceId == null) { lastFlushedSequenceId = -1L; } lastFlushedSequenceIds.put(encodedRegionNameAsStr, lastFlushedSequenceId); } if (lastFlushedSequenceId >= entry.getKey().getSequenceId()) { editsSkipped++; continue; } // Don't send Compaction/Close/Open region events to recovered edit type sinks. if (entry.getEdit().isMetaEdit() && !outputSink.keepRegionEvent(entry)) { editsSkipped++; continue; } entryBuffers.appendEntry(entry); editsCount++; int moreWritersFromLastCheck = this.getNumOpenWriters() - numOpenedFilesLastCheck; // If sufficient edits have passed, check if we should report progress. if (editsCount % interval == 0 || moreWritersFromLastCheck > numOpenedFilesBeforeReporting) { numOpenedFilesLastCheck = this.getNumOpenWriters(); String countsStr = (editsCount - (editsSkipped + outputSink.getSkippedEdits())) + " edits, skipped " + editsSkipped + " edits."; status.setStatus("Split " + countsStr); if (reporter != null && !reporter.progress()) { progress_failed = true; return false; } } } } catch (InterruptedException ie) { IOException iie = new InterruptedIOException(); iie.initCause(ie); throw iie; } catch (CorruptedLogFileException e) { LOG.warn("Could not parse, corrupted WAL={}", logPath, e); if (splitLogWorkerCoordination != null) { // Some tests pass in a csm of null. splitLogWorkerCoordination.markCorrupted(rootDir, logfile.getPath().getName(), fs); } else { // for tests only ZKSplitLog.markCorrupted(rootDir, logfile.getPath().getName(), fs); } isCorrupted = true; } catch (IOException e) { e = e instanceof RemoteException ? ((RemoteException) e).unwrapRemoteException() : e; throw e; } finally { LOG.debug("Finishing writing output logs and closing down"); try { if (null != logFileReader) { logFileReader.close(); } } catch (IOException exception) { LOG.warn("Could not close WAL reader", exception); } try { if (outputSinkStarted) { // Set progress_failed to true as the immediate following statement will reset its value // when finishWritingAndClose() throws exception, progress_failed has the right value progress_failed = true; progress_failed = outputSink.finishWritingAndClose() == null; } } finally { String msg = "Processed " + editsCount + " edits across " + outputSink.getNumberOfRecoveredRegions() + " regions; edits skipped=" + editsSkipped + "; log file=" + logPath + ", length=" + logfile.getLen() + // See if length got updated post lease recovery ", corrupted=" + isCorrupted + ", progress failed=" + progress_failed; LOG.info(msg); status.markComplete(msg); } } return !progress_failed; } /** * Completes the work done by splitLogFile by archiving logs *

* It is invoked by SplitLogManager once it knows that one of the * SplitLogWorkers have completed the splitLogFile() part. If the master * crashes then this function might get called multiple times. *

* @param logfile * @param conf * @throws IOException */ public static void finishSplitLogFile(String logfile, Configuration conf) throws IOException { Path rootdir = FSUtils.getWALRootDir(conf); Path oldLogDir = new Path(rootdir, HConstants.HREGION_OLDLOGDIR_NAME); Path logPath; if (FSUtils.isStartingWithPath(rootdir, logfile)) { logPath = new Path(logfile); } else { logPath = new Path(rootdir, logfile); } finishSplitLogFile(rootdir, oldLogDir, logPath, conf); } private static void finishSplitLogFile(Path rootdir, Path oldLogDir, Path logPath, Configuration conf) throws IOException { List processedLogs = new ArrayList<>(); List corruptedLogs = new ArrayList<>(); FileSystem fs; fs = rootdir.getFileSystem(conf); if (ZKSplitLog.isCorrupted(rootdir, logPath.getName(), fs)) { corruptedLogs.add(logPath); } else { processedLogs.add(logPath); } archiveLogs(corruptedLogs, processedLogs, oldLogDir, fs, conf); Path stagingDir = ZKSplitLog.getSplitLogDir(rootdir, logPath.getName()); fs.delete(stagingDir, true); } /** * Moves processed logs to a oldLogDir after successful processing Moves * corrupted logs (any log that couldn't be successfully parsed to corruptDir * (.corrupt) for later investigation * * @param corruptedLogs * @param processedLogs * @param oldLogDir * @param fs * @param conf * @throws IOException */ private static void archiveLogs( final List corruptedLogs, final List processedLogs, final Path oldLogDir, final FileSystem fs, final Configuration conf) throws IOException { final Path corruptDir = new Path(FSUtils.getWALRootDir(conf), HConstants.CORRUPT_DIR_NAME); if (conf.get("hbase.regionserver.hlog.splitlog.corrupt.dir") != null) { LOG.warn("hbase.regionserver.hlog.splitlog.corrupt.dir is deprecated. Default to {}", corruptDir); } if (!fs.mkdirs(corruptDir)) { LOG.info("Unable to mkdir {}", corruptDir); } fs.mkdirs(oldLogDir); // this method can get restarted or called multiple times for archiving // the same log files. for (Path corrupted : corruptedLogs) { Path p = new Path(corruptDir, corrupted.getName()); if (fs.exists(corrupted)) { if (!fs.rename(corrupted, p)) { LOG.warn("Unable to move corrupted log {} to {}", corrupted, p); } else { LOG.warn("Moved corrupted log {} to {}", corrupted, p); } } } for (Path p : processedLogs) { Path newPath = AbstractFSWAL.getWALArchivePath(oldLogDir, p); if (fs.exists(p)) { if (!FSUtils.renameAndSetModifyTime(fs, p, newPath)) { LOG.warn("Unable to move {} to {}", p, newPath); } else { LOG.info("Archived processed log {} to {}", p, newPath); } } } } /** * Path to a file under RECOVERED_EDITS_DIR directory of the region found in * logEntry named for the sequenceid in the passed * logEntry: e.g. /hbase/some_table/2323432434/recovered.edits/2332. * This method also ensures existence of RECOVERED_EDITS_DIR under the region * creating it if necessary. * @param fs * @param logEntry * @param rootDir HBase root dir. * @param fileNameBeingSplit the file being split currently. Used to generate tmp file name. * @return Path to file into which to dump split log edits. 
* @throws IOException */ @SuppressWarnings("deprecation") @VisibleForTesting static Path getRegionSplitEditsPath(final FileSystem fs, final Entry logEntry, final Path rootDir, String fileNameBeingSplit) throws IOException { Path tableDir = FSUtils.getTableDir(rootDir, logEntry.getKey().getTableName()); String encodedRegionName = Bytes.toString(logEntry.getKey().getEncodedRegionName()); Path regiondir = HRegion.getRegionDir(tableDir, encodedRegionName); Path dir = getRegionDirRecoveredEditsDir(regiondir); if (!fs.exists(regiondir)) { LOG.info("This region's directory does not exist: {}." + "It is very likely that it was already split so it is " + "safe to discard those edits.", regiondir); return null; } if (fs.exists(dir) && fs.isFile(dir)) { Path tmp = new Path("/tmp"); if (!fs.exists(tmp)) { fs.mkdirs(tmp); } tmp = new Path(tmp, HConstants.RECOVERED_EDITS_DIR + "_" + encodedRegionName); LOG.warn("Found existing old file: {}. It could be some " + "leftover of an old installation. It should be a folder instead. " + "So moving it to {}", dir, tmp); if (!fs.rename(dir, tmp)) { LOG.warn("Failed to sideline old file {}", dir); } } if (!fs.exists(dir) && !fs.mkdirs(dir)) { LOG.warn("mkdir failed on {}", dir); } // Append fileBeingSplit to prevent name conflict since we may have duplicate wal entries now. // Append file name ends with RECOVERED_LOG_TMPFILE_SUFFIX to ensure // region's replayRecoveredEdits will not delete it String fileName = formatRecoveredEditsFileName(logEntry.getKey().getSequenceId()); fileName = getTmpRecoveredEditsFileName(fileName + "-" + fileNameBeingSplit); return new Path(dir, fileName); } private static String getTmpRecoveredEditsFileName(String fileName) { return fileName + RECOVERED_LOG_TMPFILE_SUFFIX; } /** * Get the completed recovered edits file path, renaming it to be by last edit * in the file from its first edit. Then we could use the name to skip * recovered edits when doing {@link HRegion#replayRecoveredEditsIfAny}. * @param srcPath * @param maximumEditLogSeqNum * @return dstPath take file's last edit log seq num as the name */ private static Path getCompletedRecoveredEditsFilePath(Path srcPath, long maximumEditLogSeqNum) { String fileName = formatRecoveredEditsFileName(maximumEditLogSeqNum); return new Path(srcPath.getParent(), fileName); } @VisibleForTesting static String formatRecoveredEditsFileName(final long seqid) { return String.format("%019d", seqid); } private static final Pattern EDITFILES_NAME_PATTERN = Pattern.compile("-?[0-9]+"); private static final String RECOVERED_LOG_TMPFILE_SUFFIX = ".temp"; /** * @param regiondir * This regions directory in the filesystem. 
* @return The directory that holds recovered edits files for the region * regiondir */ public static Path getRegionDirRecoveredEditsDir(final Path regiondir) { return new Path(regiondir, HConstants.RECOVERED_EDITS_DIR); } /** * Check whether there is recovered.edits in the region dir * @param fs FileSystem * @param conf conf * @param regionInfo the region to check * @throws IOException IOException * @return true if recovered.edits exist in the region dir */ public static boolean hasRecoveredEdits(final FileSystem fs, final Configuration conf, final RegionInfo regionInfo) throws IOException { // No recovered.edits for non default replica regions if (regionInfo.getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID) { return false; } Path rootDir = FSUtils.getRootDir(conf); //Only default replica region can reach here, so we can use regioninfo //directly without converting it to default replica's regioninfo. Path regionDir = HRegion.getRegionDir(rootDir, regionInfo); NavigableSet files = getSplitEditFilesSorted(fs, regionDir); return files != null && !files.isEmpty(); } /** * Returns sorted set of edit files made by splitter, excluding files * with '.temp' suffix. * * @param fs * @param regiondir * @return Files in passed regiondir as a sorted set. * @throws IOException */ public static NavigableSet getSplitEditFilesSorted(final FileSystem fs, final Path regiondir) throws IOException { NavigableSet filesSorted = new TreeSet<>(); Path editsdir = getRegionDirRecoveredEditsDir(regiondir); if (!fs.exists(editsdir)) { return filesSorted; } FileStatus[] files = FSUtils.listStatus(fs, editsdir, new PathFilter() { @Override public boolean accept(Path p) { boolean result = false; try { // Return files and only files that match the editfile names pattern. // There can be other files in this directory other than edit files. // In particular, on error, we'll move aside the bad edit file giving // it a timestamp suffix. See moveAsideBadEditsFile. Matcher m = EDITFILES_NAME_PATTERN.matcher(p.getName()); result = fs.isFile(p) && m.matches(); // Skip the file whose name ends with RECOVERED_LOG_TMPFILE_SUFFIX, // because it means splitwal thread is writting this file. if (p.getName().endsWith(RECOVERED_LOG_TMPFILE_SUFFIX)) { result = false; } // Skip SeqId Files if (isSequenceIdFile(p)) { result = false; } } catch (IOException e) { LOG.warn("Failed isFile check on {}", p, e); } return result; } }); if (ArrayUtils.isNotEmpty(files)) { Arrays.asList(files).forEach(status -> filesSorted.add(status.getPath())); } return filesSorted; } /** * Move aside a bad edits file. * * @param fs * @param edits * Edits file to move aside. * @return The name of the moved aside file. * @throws IOException */ public static Path moveAsideBadEditsFile(final FileSystem fs, final Path edits) throws IOException { Path moveAsideName = new Path(edits.getParent(), edits.getName() + "." + System.currentTimeMillis()); if (!fs.rename(edits, moveAsideName)) { LOG.warn("Rename failed from {} to {}", edits, moveAsideName); } return moveAsideName; } private static final String SEQUENCE_ID_FILE_SUFFIX = ".seqid"; private static final String OLD_SEQUENCE_ID_FILE_SUFFIX = "_seqid"; private static final int SEQUENCE_ID_FILE_SUFFIX_LENGTH = SEQUENCE_ID_FILE_SUFFIX.length(); /** * Is the given file a region open sequence id file. 
*/ @VisibleForTesting public static boolean isSequenceIdFile(final Path file) { return file.getName().endsWith(SEQUENCE_ID_FILE_SUFFIX) || file.getName().endsWith(OLD_SEQUENCE_ID_FILE_SUFFIX); } private static FileStatus[] getSequenceIdFiles(FileSystem fs, Path regionDir) throws IOException { // TODO: Why are we using a method in here as part of our normal region open where // there is no splitting involved? Fix. St.Ack 01/20/2017. Path editsDir = WALSplitter.getRegionDirRecoveredEditsDir(regionDir); try { FileStatus[] files = fs.listStatus(editsDir, WALSplitter::isSequenceIdFile); return files != null ? files : new FileStatus[0]; } catch (FileNotFoundException e) { return new FileStatus[0]; } } private static long getMaxSequenceId(FileStatus[] files) { long maxSeqId = -1L; for (FileStatus file : files) { String fileName = file.getPath().getName(); try { maxSeqId = Math.max(maxSeqId, Long .parseLong(fileName.substring(0, fileName.length() - SEQUENCE_ID_FILE_SUFFIX_LENGTH))); } catch (NumberFormatException ex) { LOG.warn("Invalid SeqId File Name={}", fileName); } } return maxSeqId; } /** * Get the max sequence id which is stored in the region directory. -1 if none. */ public static long getMaxRegionSequenceId(FileSystem fs, Path regionDir) throws IOException { return getMaxSequenceId(getSequenceIdFiles(fs, regionDir)); } /** * Create a file with name as region's max sequence id */ public static void writeRegionSequenceIdFile(FileSystem fs, Path regionDir, long newMaxSeqId) throws IOException { FileStatus[] files = getSequenceIdFiles(fs, regionDir); long maxSeqId = getMaxSequenceId(files); if (maxSeqId > newMaxSeqId) { throw new IOException("The new max sequence id " + newMaxSeqId + " is less than the old max sequence id " + maxSeqId); } // write a new seqId file Path newSeqIdFile = new Path(WALSplitter.getRegionDirRecoveredEditsDir(regionDir), newMaxSeqId + SEQUENCE_ID_FILE_SUFFIX); if (newMaxSeqId != maxSeqId) { try { if (!fs.createNewFile(newSeqIdFile) && !fs.exists(newSeqIdFile)) { throw new IOException("Failed to create SeqId file:" + newSeqIdFile); } LOG.debug("Wrote file={}, newMaxSeqId={}, maxSeqId={}", newSeqIdFile, newMaxSeqId, maxSeqId); } catch (FileAlreadyExistsException ignored) { // latest hdfs throws this exception. it's all right if newSeqIdFile already exists } } // remove old ones for (FileStatus status : files) { if (!newSeqIdFile.equals(status.getPath())) { fs.delete(status.getPath(), false); } } } /** * Create a new {@link Reader} for reading logs to split. * * @param file * @return A new Reader instance, caller should close * @throws IOException * @throws CorruptedLogFileException */ protected Reader getReader(FileStatus file, boolean skipErrors, CancelableProgressable reporter) throws IOException, CorruptedLogFileException { Path path = file.getPath(); long length = file.getLen(); Reader in; // Check for possibly empty file. With appends, currently Hadoop reports a // zero length even if the file has been sync'd. Revisit if HDFS-376 or // HDFS-878 is committed. if (length <= 0) { LOG.warn("File {} might be still open, length is 0", path); } try { FSUtils.getInstance(fs, conf).recoverFileLease(fs, path, conf, reporter); try { in = getReader(path, reporter); } catch (EOFException e) { if (length <= 0) { // TODO should we ignore an empty, not-last log file if skip.errors // is false? Either way, the caller should decide what to do. E.g. // ignore if this is the last log in sequence. // TODO is this scenario still possible if the log has been // recovered (i.e. 
closed) LOG.warn("Could not open {} for reading. File is empty", path, e); } // EOFException being ignored return null; } } catch (IOException e) { if (e instanceof FileNotFoundException) { // A wal file may not exist anymore. Nothing can be recovered so move on LOG.warn("File {} does not exist anymore", path, e); return null; } if (!skipErrors || e instanceof InterruptedIOException) { throw e; // Don't mark the file corrupted if interrupted, or not skipErrors } CorruptedLogFileException t = new CorruptedLogFileException("skipErrors=true Could not open wal " + path + " ignoring"); t.initCause(e); throw t; } return in; } static private Entry getNextLogLine(Reader in, Path path, boolean skipErrors) throws CorruptedLogFileException, IOException { try { return in.next(); } catch (EOFException eof) { // truncated files are expected if a RS crashes (see HBASE-2643) LOG.info("EOF from wal {}. Continuing.", path); return null; } catch (IOException e) { // If the IOE resulted from bad file format, // then this problem is idempotent and retrying won't help if (e.getCause() != null && (e.getCause() instanceof ParseException || e.getCause() instanceof org.apache.hadoop.fs.ChecksumException)) { LOG.warn("Parse exception from wal {}. Continuing", path, e); return null; } if (!skipErrors) { throw e; } CorruptedLogFileException t = new CorruptedLogFileException("skipErrors=true Ignoring exception" + " while parsing wal " + path + ". Marking as corrupted"); t.initCause(e); throw t; } } /** * Create a new {@link Writer} for writing log splits. * @return a new Writer instance, caller should close */ protected Writer createWriter(Path logfile) throws IOException { return walFactory.createRecoveredEditsWriter(fs, logfile); } /** * Create a new {@link Reader} for reading logs to split. * @return new Reader instance, caller should close */ protected Reader getReader(Path curLogFile, CancelableProgressable reporter) throws IOException { return walFactory.createReader(fs, curLogFile, reporter); } /** * Get current open writers */ private int getNumOpenWriters() { int result = 0; if (this.outputSink != null) { result += this.outputSink.getNumOpenWriters(); } return result; } /** * Contains some methods to control WAL-entries producer / consumer interactions */ public static class PipelineController { // If an exception is thrown by one of the other threads, it will be // stored here. AtomicReference thrown = new AtomicReference<>(); // Wait/notify for when data has been produced by the writer thread, // consumed by the reader thread, or an exception occurred public final Object dataAvailable = new Object(); void writerThreadError(Throwable t) { thrown.compareAndSet(null, t); } /** * Check for errors in the writer threads. If any is found, rethrow it. */ void checkForErrors() throws IOException { Throwable thrown = this.thrown.get(); if (thrown == null) return; if (thrown instanceof IOException) { throw new IOException(thrown); } else { throw new RuntimeException(thrown); } } } /** * Class which accumulates edits and separates them into a buffer per region * while simultaneously accounting RAM usage. Blocks if the RAM usage crosses * a predefined threshold. * * Writer threads then pull region-specific buffers from this class. */ public static class EntryBuffers { PipelineController controller; Map buffers = new TreeMap<>(Bytes.BYTES_COMPARATOR); /* Track which regions are currently in the middle of writing. 
We don't allow an IO thread to pick up bytes from a region if we're already writing data for that region in a different IO thread. */ Set currentlyWriting = new TreeSet<>(Bytes.BYTES_COMPARATOR); long totalBuffered = 0; long maxHeapUsage; boolean splitWriterCreationBounded; public EntryBuffers(PipelineController controller, long maxHeapUsage) { this(controller, maxHeapUsage, false); } public EntryBuffers(PipelineController controller, long maxHeapUsage, boolean splitWriterCreationBounded){ this.controller = controller; this.maxHeapUsage = maxHeapUsage; this.splitWriterCreationBounded = splitWriterCreationBounded; } /** * Append a log entry into the corresponding region buffer. * Blocks if the total heap usage has crossed the specified threshold. * * @throws InterruptedException * @throws IOException */ public void appendEntry(Entry entry) throws InterruptedException, IOException { WALKey key = entry.getKey(); RegionEntryBuffer buffer; long incrHeap; synchronized (this) { buffer = buffers.get(key.getEncodedRegionName()); if (buffer == null) { buffer = new RegionEntryBuffer(key.getTableName(), key.getEncodedRegionName()); buffers.put(key.getEncodedRegionName(), buffer); } incrHeap= buffer.appendEntry(entry); } // If we crossed the chunk threshold, wait for more space to be available synchronized (controller.dataAvailable) { totalBuffered += incrHeap; while (totalBuffered > maxHeapUsage && controller.thrown.get() == null) { LOG.debug("Used {} bytes of buffered edits, waiting for IO threads", totalBuffered); controller.dataAvailable.wait(2000); } controller.dataAvailable.notifyAll(); } controller.checkForErrors(); } /** * @return RegionEntryBuffer a buffer of edits to be written. */ synchronized RegionEntryBuffer getChunkToWrite() { // The core part of limiting opening writers is it doesn't return chunk only if the // heap size is over maxHeapUsage. Thus it doesn't need to create a writer for each // region during splitting. It will flush all the logs in the buffer after splitting // through a threadpool, which means the number of writers it created is under control. if (splitWriterCreationBounded && totalBuffered < maxHeapUsage) { return null; } long biggestSize = 0; byte[] biggestBufferKey = null; for (Map.Entry entry : buffers.entrySet()) { long size = entry.getValue().heapSize(); if (size > biggestSize && (!currentlyWriting.contains(entry.getKey()))) { biggestSize = size; biggestBufferKey = entry.getKey(); } } if (biggestBufferKey == null) { return null; } RegionEntryBuffer buffer = buffers.remove(biggestBufferKey); currentlyWriting.add(biggestBufferKey); return buffer; } void doneWriting(RegionEntryBuffer buffer) { synchronized (this) { boolean removed = currentlyWriting.remove(buffer.encodedRegionName); assert removed; } long size = buffer.heapSize(); synchronized (controller.dataAvailable) { totalBuffered -= size; // We may unblock writers controller.dataAvailable.notifyAll(); } } synchronized boolean isRegionCurrentlyWriting(byte[] region) { return currentlyWriting.contains(region); } public void waitUntilDrained() { synchronized (controller.dataAvailable) { while (totalBuffered > 0) { try { controller.dataAvailable.wait(2000); } catch (InterruptedException e) { LOG.warn("Got interrupted while waiting for EntryBuffers is drained"); Thread.interrupted(); break; } } } } } /** * A buffer of some number of edits for a given region. * This accumulates edits and also provides a memory optimization in order to * share a single byte array instance for the table and region name. 
* Also tracks memory usage of the accumulated edits. */ public static class RegionEntryBuffer implements HeapSize { long heapInBuffer = 0; List entryBuffer; TableName tableName; byte[] encodedRegionName; RegionEntryBuffer(TableName tableName, byte[] region) { this.tableName = tableName; this.encodedRegionName = region; this.entryBuffer = new ArrayList<>(); } long appendEntry(Entry entry) { internify(entry); entryBuffer.add(entry); long incrHeap = entry.getEdit().heapSize() + ClassSize.align(2 * ClassSize.REFERENCE) + // WALKey pointers 0; // TODO linkedlist entry heapInBuffer += incrHeap; return incrHeap; } private void internify(Entry entry) { WALKeyImpl k = entry.getKey(); k.internTableName(this.tableName); k.internEncodedRegionName(this.encodedRegionName); } @Override public long heapSize() { return heapInBuffer; } public byte[] getEncodedRegionName() { return encodedRegionName; } public List getEntryBuffer() { return entryBuffer; } public TableName getTableName() { return tableName; } } public static class WriterThread extends Thread { private volatile boolean shouldStop = false; private PipelineController controller; private EntryBuffers entryBuffers; private OutputSink outputSink = null; WriterThread(PipelineController controller, EntryBuffers entryBuffers, OutputSink sink, int i){ super(Thread.currentThread().getName() + "-Writer-" + i); this.controller = controller; this.entryBuffers = entryBuffers; outputSink = sink; } @Override public void run() { try { doRun(); } catch (Throwable t) { LOG.error("Exiting thread", t); controller.writerThreadError(t); } } private void doRun() throws IOException { LOG.trace("Writer thread starting"); while (true) { RegionEntryBuffer buffer = entryBuffers.getChunkToWrite(); if (buffer == null) { // No data currently available, wait on some more to show up synchronized (controller.dataAvailable) { if (shouldStop && !this.outputSink.flush()) { return; } try { controller.dataAvailable.wait(500); } catch (InterruptedException ie) { if (!shouldStop) { throw new RuntimeException(ie); } } } continue; } assert buffer != null; try { writeBuffer(buffer); } finally { entryBuffers.doneWriting(buffer); } } } private void writeBuffer(RegionEntryBuffer buffer) throws IOException { outputSink.append(buffer); } void finish() { synchronized (controller.dataAvailable) { shouldStop = true; controller.dataAvailable.notifyAll(); } } } /** * The following class is an abstraction class to provide a common interface to support * different ways of consuming recovered edits. 
*/ public static abstract class OutputSink { protected PipelineController controller; protected EntryBuffers entryBuffers; protected ConcurrentHashMap writers = new ConcurrentHashMap<>(); protected final ConcurrentHashMap regionMaximumEditLogSeqNum = new ConcurrentHashMap<>(); protected final List writerThreads = Lists.newArrayList(); /* Set of regions which we've decided should not output edits */ protected final Set blacklistedRegions = Collections .synchronizedSet(new TreeSet<>(Bytes.BYTES_COMPARATOR)); protected boolean closeAndCleanCompleted = false; protected boolean writersClosed = false; protected final int numThreads; protected CancelableProgressable reporter = null; protected AtomicLong skippedEdits = new AtomicLong(); protected List splits = null; public OutputSink(PipelineController controller, EntryBuffers entryBuffers, int numWriters) { numThreads = numWriters; this.controller = controller; this.entryBuffers = entryBuffers; } void setReporter(CancelableProgressable reporter) { this.reporter = reporter; } /** * Start the threads that will pump data from the entryBuffers to the output files. */ public synchronized void startWriterThreads() { for (int i = 0; i < numThreads; i++) { WriterThread t = new WriterThread(controller, entryBuffers, this, i); t.start(); writerThreads.add(t); } } /** * * Update region's maximum edit log SeqNum. */ void updateRegionMaximumEditLogSeqNum(Entry entry) { synchronized (regionMaximumEditLogSeqNum) { String regionName = Bytes.toString(entry.getKey().getEncodedRegionName()); Long currentMaxSeqNum = regionMaximumEditLogSeqNum.get(regionName); if (currentMaxSeqNum == null || entry.getKey().getSequenceId() > currentMaxSeqNum) { regionMaximumEditLogSeqNum.put(regionName, entry.getKey().getSequenceId()); } } } /** * @return the number of currently opened writers */ int getNumOpenWriters() { return this.writers.size(); } long getSkippedEdits() { return this.skippedEdits.get(); } /** * Wait for writer threads to dump all info to the sink * @return true when there is no error * @throws IOException */ protected boolean finishWriting(boolean interrupt) throws IOException { LOG.debug("Waiting for split writer threads to finish"); boolean progress_failed = false; for (WriterThread t : writerThreads) { t.finish(); } if (interrupt) { for (WriterThread t : writerThreads) { t.interrupt(); // interrupt the writer threads. We are stopping now. } } for (WriterThread t : writerThreads) { if (!progress_failed && reporter != null && !reporter.progress()) { progress_failed = true; } try { t.join(); } catch (InterruptedException ie) { IOException iie = new InterruptedIOException(); iie.initCause(ie); throw iie; } } controller.checkForErrors(); LOG.info("{} split writers finished; closing.", this.writerThreads.size()); return (!progress_failed); } public abstract List finishWritingAndClose() throws IOException; /** * @return a map from encoded region ID to the number of edits written out for that region. 
*/ public abstract Map getOutputCounts(); /** * @return number of regions we've recovered */ public abstract int getNumberOfRecoveredRegions(); /** * @param buffer A WAL Edit Entry * @throws IOException */ public abstract void append(RegionEntryBuffer buffer) throws IOException; /** * WriterThread call this function to help flush internal remaining edits in buffer before close * @return true when underlying sink has something to flush */ public boolean flush() throws IOException { return false; } /** * Some WALEdit's contain only KV's for account on what happened to a region. * Not all sinks will want to get all of those edits. * * @return Return true if this sink wants to accept this region-level WALEdit. */ public abstract boolean keepRegionEvent(Entry entry); } /** * Class that manages the output streams from the log splitting process. */ class LogRecoveredEditsOutputSink extends OutputSink { public LogRecoveredEditsOutputSink(PipelineController controller, EntryBuffers entryBuffers, int numWriters) { // More threads could potentially write faster at the expense // of causing more disk seeks as the logs are split. // 3. After a certain setting (probably around 3) the // process will be bound on the reader in the current // implementation anyway. super(controller, entryBuffers, numWriters); } /** * @return null if failed to report progress * @throws IOException */ @Override public List finishWritingAndClose() throws IOException { boolean isSuccessful = false; List result = null; try { isSuccessful = finishWriting(false); } finally { result = close(); List thrown = closeLogWriters(null); if (CollectionUtils.isNotEmpty(thrown)) { throw MultipleIOException.createIOException(thrown); } } if (isSuccessful) { splits = result; } return splits; } // delete the one with fewer wal entries private void deleteOneWithFewerEntries(WriterAndPath wap, Path dst) throws IOException { long dstMinLogSeqNum = -1L; try (WAL.Reader reader = walFactory.createReader(fs, dst)) { WAL.Entry entry = reader.next(); if (entry != null) { dstMinLogSeqNum = entry.getKey().getSequenceId(); } } catch (EOFException e) { LOG.debug("Got EOF when reading first WAL entry from {}, an empty or broken WAL file?", dst, e); } if (wap.minLogSeqNum < dstMinLogSeqNum) { LOG.warn("Found existing old edits file. It could be the result of a previous failed" + " split attempt or we have duplicated wal entries. Deleting " + dst + ", length=" + fs.getFileStatus(dst).getLen()); if (!fs.delete(dst, false)) { LOG.warn("Failed deleting of old {}", dst); throw new IOException("Failed deleting of old " + dst); } } else { LOG.warn("Found existing old edits file and we have less entries. Deleting " + wap.p + ", length=" + fs.getFileStatus(wap.p).getLen()); if (!fs.delete(wap.p, false)) { LOG.warn("Failed deleting of {}", wap.p); throw new IOException("Failed deleting of " + wap.p); } } } /** * Close all of the output streams. * @return the list of paths written. 
*/ List close() throws IOException { Preconditions.checkState(!closeAndCleanCompleted); final List paths = new ArrayList<>(); final List thrown = Lists.newArrayList(); ThreadPoolExecutor closeThreadPool = Threads .getBoundedCachedThreadPool(numThreads, 30L, TimeUnit.SECONDS, new ThreadFactory() { private int count = 1; @Override public Thread newThread(Runnable r) { Thread t = new Thread(r, "split-log-closeStream-" + count++); return t; } }); CompletionService completionService = new ExecutorCompletionService<>(closeThreadPool); boolean progress_failed; try { progress_failed = executeCloseTask(completionService, thrown, paths); } catch (InterruptedException e) { IOException iie = new InterruptedIOException(); iie.initCause(e); throw iie; } catch (ExecutionException e) { throw new IOException(e.getCause()); } finally { closeThreadPool.shutdownNow(); } if (!thrown.isEmpty()) { throw MultipleIOException.createIOException(thrown); } writersClosed = true; closeAndCleanCompleted = true; if (progress_failed) { return null; } return paths; } /** * @param completionService threadPool to execute the closing tasks * @param thrown store the exceptions * @param paths arrayList to store the paths written * @return if close tasks executed successful */ boolean executeCloseTask(CompletionService completionService, List thrown, List paths) throws InterruptedException, ExecutionException { for (final Map.Entry writersEntry : writers.entrySet()) { if (LOG.isTraceEnabled()) { LOG.trace("Submitting close of " + ((WriterAndPath) writersEntry.getValue()).p); } completionService.submit(new Callable() { @Override public Void call() throws Exception { WriterAndPath wap = (WriterAndPath) writersEntry.getValue(); Path dst = closeWriter(writersEntry.getKey(), wap, thrown); paths.add(dst); return null; } }); } boolean progress_failed = false; for (int i = 0, n = this.writers.size(); i < n; i++) { Future future = completionService.take(); future.get(); if (!progress_failed && reporter != null && !reporter.progress()) { progress_failed = true; } } return progress_failed; } Path closeWriter(String encodedRegionName, WriterAndPath wap, List thrown) throws IOException{ if (LOG.isTraceEnabled()) { LOG.trace("Closing " + wap.p); } try { wap.w.close(); } catch (IOException ioe) { LOG.error("Couldn't close log at " + wap.p, ioe); thrown.add(ioe); return null; } if (LOG.isDebugEnabled()) { LOG.debug("Closed wap " + wap.p + " (wrote " + wap.editsWritten + " edits, skipped " + wap.editsSkipped + " edits in " + (wap.nanosSpent / 1000 / 1000) + "ms"); } if (wap.editsWritten == 0) { // just remove the empty recovered.edits file if (fs.exists(wap.p) && !fs.delete(wap.p, false)) { LOG.warn("Failed deleting empty " + wap.p); throw new IOException("Failed deleting empty " + wap.p); } return null; } Path dst = getCompletedRecoveredEditsFilePath(wap.p, regionMaximumEditLogSeqNum.get(encodedRegionName)); try { if (!dst.equals(wap.p) && fs.exists(dst)) { deleteOneWithFewerEntries(wap, dst); } // Skip the unit tests which create a splitter that reads and // writes the data without touching disk. // TestHLogSplit#testThreading is an example. 
if (fs.exists(wap.p)) { if (!fs.rename(wap.p, dst)) { throw new IOException("Failed renaming " + wap.p + " to " + dst); } LOG.info("Rename " + wap.p + " to " + dst); } } catch (IOException ioe) { LOG.error("Couldn't rename " + wap.p + " to " + dst, ioe); thrown.add(ioe); return null; } return dst; } private List closeLogWriters(List thrown) throws IOException { if (writersClosed) { return thrown; } if (thrown == null) { thrown = Lists.newArrayList(); } try { for (WriterThread t : writerThreads) { while (t.isAlive()) { t.shouldStop = true; t.interrupt(); try { t.join(10); } catch (InterruptedException e) { IOException iie = new InterruptedIOException(); iie.initCause(e); throw iie; } } } } finally { WriterAndPath wap = null; for (SinkWriter tmpWAP : writers.values()) { try { wap = (WriterAndPath) tmpWAP; wap.w.close(); } catch (IOException ioe) { LOG.error("Couldn't close log at " + wap.p, ioe); thrown.add(ioe); continue; } LOG.info( "Closed log " + wap.p + " (wrote " + wap.editsWritten + " edits in " + (wap.nanosSpent / 1000 / 1000) + "ms)"); } writersClosed = true; } return thrown; } /** * Get a writer and path for a log starting at the given entry. This function is threadsafe so * long as multiple threads are always acting on different regions. * @return null if this region shouldn't output any logs */ WriterAndPath getWriterAndPath(Entry entry, boolean reusable) throws IOException { byte region[] = entry.getKey().getEncodedRegionName(); String regionName = Bytes.toString(region); WriterAndPath ret = (WriterAndPath) writers.get(regionName); if (ret != null) { return ret; } // If we already decided that this region doesn't get any output // we don't need to check again. if (blacklistedRegions.contains(region)) { return null; } ret = createWAP(region, entry, rootDir); if (ret == null) { blacklistedRegions.add(region); return null; } if(reusable) { writers.put(regionName, ret); } return ret; } /** * @return a path with a write for that path. caller should close. */ WriterAndPath createWAP(byte[] region, Entry entry, Path rootdir) throws IOException { Path regionedits = getRegionSplitEditsPath(fs, entry, rootdir, fileBeingSplit.getPath().getName()); if (regionedits == null) { return null; } if (fs.exists(regionedits)) { LOG.warn("Found old edits file. It could be the " + "result of a previous failed split attempt. Deleting " + regionedits + ", length=" + fs.getFileStatus(regionedits).getLen()); if (!fs.delete(regionedits, false)) { LOG.warn("Failed delete of old {}", regionedits); } } Writer w = createWriter(regionedits); LOG.debug("Creating writer path={}", regionedits); return new WriterAndPath(regionedits, w, entry.getKey().getSequenceId()); } void filterCellByStore(Entry logEntry) { Map maxSeqIdInStores = regionMaxSeqIdInStores.get(Bytes.toString(logEntry.getKey().getEncodedRegionName())); if (MapUtils.isEmpty(maxSeqIdInStores)) { return; } // Create the array list for the cells that aren't filtered. // We make the assumption that most cells will be kept. ArrayList keptCells = new ArrayList<>(logEntry.getEdit().getCells().size()); for (Cell cell : logEntry.getEdit().getCells()) { if (CellUtil.matchingFamily(cell, WALEdit.METAFAMILY)) { keptCells.add(cell); } else { byte[] family = CellUtil.cloneFamily(cell); Long maxSeqId = maxSeqIdInStores.get(family); // Do not skip cell even if maxSeqId is null. Maybe we are in a rolling upgrade, // or the master was crashed before and we can not get the information. 
if (maxSeqId == null || maxSeqId.longValue() < logEntry.getKey().getSequenceId()) { keptCells.add(cell); } } } // Anything in the keptCells array list is still live. // So rather than removing the cells from the array list // which would be an O(n^2) operation, we just replace the list logEntry.getEdit().setCells(keptCells); } @Override public void append(RegionEntryBuffer buffer) throws IOException { appendBuffer(buffer, true); } WriterAndPath appendBuffer(RegionEntryBuffer buffer, boolean reusable) throws IOException{ List entries = buffer.entryBuffer; if (entries.isEmpty()) { LOG.warn("got an empty buffer, skipping"); return null; } WriterAndPath wap = null; long startTime = System.nanoTime(); try { int editsCount = 0; for (Entry logEntry : entries) { if (wap == null) { wap = getWriterAndPath(logEntry, reusable); if (wap == null) { if (LOG.isTraceEnabled()) { // This log spews the full edit. Can be massive in the log. Enable only debugging // WAL lost edit issues. LOG.trace("getWriterAndPath decided we don't need to write edits for {}", logEntry); } return null; } } filterCellByStore(logEntry); if (!logEntry.getEdit().isEmpty()) { wap.w.append(logEntry); this.updateRegionMaximumEditLogSeqNum(logEntry); editsCount++; } else { wap.incrementSkippedEdits(1); } } // Pass along summary statistics wap.incrementEdits(editsCount); wap.incrementNanoTime(System.nanoTime() - startTime); } catch (IOException e) { e = e instanceof RemoteException ? ((RemoteException)e).unwrapRemoteException() : e; LOG.error(HBaseMarkers.FATAL, "Got while writing log entry to log", e); throw e; } return wap; } @Override public boolean keepRegionEvent(Entry entry) { ArrayList cells = entry.getEdit().getCells(); for (Cell cell : cells) { if (WALEdit.isCompactionMarker(cell)) { return true; } } return false; } /** * @return a map from encoded region ID to the number of edits written out for that region. 
*/ @Override public Map getOutputCounts() { TreeMap ret = new TreeMap<>(Bytes.BYTES_COMPARATOR); for (Map.Entry entry : writers.entrySet()) { ret.put(Bytes.toBytes(entry.getKey()), entry.getValue().editsWritten); } return ret; } @Override public int getNumberOfRecoveredRegions() { return writers.size(); } } /** * */ class BoundedLogWriterCreationOutputSink extends LogRecoveredEditsOutputSink { private ConcurrentHashMap regionRecoverStatMap = new ConcurrentHashMap<>(); public BoundedLogWriterCreationOutputSink(PipelineController controller, EntryBuffers entryBuffers, int numWriters) { super(controller, entryBuffers, numWriters); } @Override public List finishWritingAndClose() throws IOException { boolean isSuccessful; List result; try { isSuccessful = finishWriting(false); } finally { result = close(); } if (isSuccessful) { splits = result; } return splits; } @Override boolean executeCloseTask(CompletionService completionService, List thrown, List paths) throws InterruptedException, ExecutionException { for (final Map.Entry buffer : entryBuffers.buffers.entrySet()) { LOG.info("Submitting writeThenClose of {}", Arrays.toString(buffer.getValue().encodedRegionName)); completionService.submit(new Callable() { @Override public Void call() throws Exception { Path dst = writeThenClose(buffer.getValue()); paths.add(dst); return null; } }); } boolean progress_failed = false; for (int i = 0, n = entryBuffers.buffers.size(); i < n; i++) { Future future = completionService.take(); future.get(); if (!progress_failed && reporter != null && !reporter.progress()) { progress_failed = true; } } return progress_failed; } /** * since the splitting process may create multiple output files, we need a map * regionRecoverStatMap to track the output count of each region. * @return a map from encoded region ID to the number of edits written out for that region. 
*/ @Override public Map getOutputCounts() { Map regionRecoverStatMapResult = new HashMap<>(); for(Map.Entry entry: regionRecoverStatMap.entrySet()){ regionRecoverStatMapResult.put(Bytes.toBytes(entry.getKey()), entry.getValue()); } return regionRecoverStatMapResult; } /** * @return the number of recovered regions */ @Override public int getNumberOfRecoveredRegions() { return regionRecoverStatMap.size(); } /** * Append the buffer to a new recovered edits file, then close it after all done * @param buffer contain all entries of a certain region * @throws IOException when closeWriter failed */ @Override public void append(RegionEntryBuffer buffer) throws IOException { writeThenClose(buffer); } private Path writeThenClose(RegionEntryBuffer buffer) throws IOException { WriterAndPath wap = appendBuffer(buffer, false); if(wap != null) { String encodedRegionName = Bytes.toString(buffer.encodedRegionName); Long value = regionRecoverStatMap.putIfAbsent(encodedRegionName, wap.editsWritten); if (value != null) { Long newValue = regionRecoverStatMap.get(encodedRegionName) + wap.editsWritten; regionRecoverStatMap.put(encodedRegionName, newValue); } } Path dst = null; List thrown = new ArrayList<>(); if(wap != null){ dst = closeWriter(Bytes.toString(buffer.encodedRegionName), wap, thrown); } if (!thrown.isEmpty()) { throw MultipleIOException.createIOException(thrown); } return dst; } } /** * Class wraps the actual writer which writes data out and related statistics */ public abstract static class SinkWriter { /* Count of edits written to this path */ long editsWritten = 0; /* Count of edits skipped to this path */ long editsSkipped = 0; /* Number of nanos spent writing to this log */ long nanosSpent = 0; void incrementEdits(int edits) { editsWritten += edits; } void incrementSkippedEdits(int skipped) { editsSkipped += skipped; } void incrementNanoTime(long nanos) { nanosSpent += nanos; } } /** * Private data structure that wraps a Writer and its Path, also collecting statistics about the * data written to this output. */ private final static class WriterAndPath extends SinkWriter { final Path p; final Writer w; final long minLogSeqNum; WriterAndPath(final Path p, final Writer w, final long minLogSeqNum) { this.p = p; this.w = w; this.minLogSeqNum = minLogSeqNum; } } static class CorruptedLogFileException extends Exception { private static final long serialVersionUID = 1L; CorruptedLogFileException(String s) { super(s); } } /** A struct used by getMutationsFromWALEntry */ public static class MutationReplay implements Comparable { public MutationReplay(MutationType type, Mutation mutation, long nonceGroup, long nonce) { this.type = type; this.mutation = mutation; if(this.mutation.getDurability() != Durability.SKIP_WAL) { // using ASYNC_WAL for relay this.mutation.setDurability(Durability.ASYNC_WAL); } this.nonceGroup = nonceGroup; this.nonce = nonce; } public final MutationType type; public final Mutation mutation; public final long nonceGroup; public final long nonce; @Override public int compareTo(final MutationReplay d) { return this.mutation.compareTo(d.mutation); } @Override public boolean equals(Object obj) { if(!(obj instanceof MutationReplay)) { return false; } else { return this.compareTo((MutationReplay)obj) == 0; } } @Override public int hashCode() { return this.mutation.hashCode(); } } /** * This function is used to construct mutations from a WALEntry. 
It also * reconstructs WALKey & WALEdit from the passed in WALEntry * @param entry * @param cells * @param logEntry pair of WALKey and WALEdit instance stores WALKey and WALEdit instances * extracted from the passed in WALEntry. * @return list of Pair<MutationType, Mutation> to be replayed * @throws IOException */ public static List getMutationsFromWALEntry(WALEntry entry, CellScanner cells, Pair logEntry, Durability durability) throws IOException { if (entry == null) { // return an empty array return Collections.emptyList(); } long replaySeqId = (entry.getKey().hasOrigSequenceNumber()) ? entry.getKey().getOrigSequenceNumber() : entry.getKey().getLogSequenceNumber(); int count = entry.getAssociatedCellCount(); List mutations = new ArrayList<>(); Cell previousCell = null; Mutation m = null; WALKeyImpl key = null; WALEdit val = null; if (logEntry != null) { val = new WALEdit(); } for (int i = 0; i < count; i++) { // Throw index out of bounds if our cell count is off if (!cells.advance()) { throw new ArrayIndexOutOfBoundsException("Expected=" + count + ", index=" + i); } Cell cell = cells.current(); if (val != null) val.add(cell); boolean isNewRowOrType = previousCell == null || previousCell.getTypeByte() != cell.getTypeByte() || !CellUtil.matchingRows(previousCell, cell); if (isNewRowOrType) { // Create new mutation if (CellUtil.isDelete(cell)) { m = new Delete(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength()); // Deletes don't have nonces. mutations.add(new MutationReplay( MutationType.DELETE, m, HConstants.NO_NONCE, HConstants.NO_NONCE)); } else { m = new Put(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength()); // Puts might come from increment or append, thus we need nonces. long nonceGroup = entry.getKey().hasNonceGroup() ? entry.getKey().getNonceGroup() : HConstants.NO_NONCE; long nonce = entry.getKey().hasNonce() ? entry.getKey().getNonce() : HConstants.NO_NONCE; mutations.add(new MutationReplay(MutationType.PUT, m, nonceGroup, nonce)); } } if (CellUtil.isDelete(cell)) { ((Delete) m).add(cell); } else { ((Put) m).add(cell); } m.setDurability(durability); previousCell = cell; } // reconstruct WALKey if (logEntry != null) { org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.WALKey walKeyProto = entry.getKey(); List clusterIds = new ArrayList<>(walKeyProto.getClusterIdsCount()); for (HBaseProtos.UUID uuid : entry.getKey().getClusterIdsList()) { clusterIds.add(new UUID(uuid.getMostSigBits(), uuid.getLeastSigBits())); } key = new WALKeyImpl(walKeyProto.getEncodedRegionName().toByteArray(), TableName.valueOf( walKeyProto.getTableName().toByteArray()), replaySeqId, walKeyProto.getWriteTime(), clusterIds, walKeyProto.getNonceGroup(), walKeyProto.getNonce(), null); logEntry.setFirst(key); logEntry.setSecond(val); } return mutations; } }




