All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hbase.mapreduce.WALInputFormat Maven / Gradle / Ivy

There is a newer version: 3.0.0-beta-1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.EOFException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
import org.apache.hadoop.hbase.wal.WALFactory;
import org.apache.hadoop.hbase.wal.WALKey;
import org.apache.hadoop.hbase.wal.WAL.Entry;
import org.apache.hadoop.hbase.wal.WAL.Reader;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * Simple {@link InputFormat} for {@link org.apache.hadoop.hbase.wal.WAL} files.
 */
@InterfaceAudience.Public
public class WALInputFormat extends InputFormat {
  private static final Log LOG = LogFactory.getLog(WALInputFormat.class);

  public static final String START_TIME_KEY = "wal.start.time";
  public static final String END_TIME_KEY = "wal.end.time";

  /**
   * {@link InputSplit} for {@link WAL} files. Each split represent
   * exactly one log file.
   */
  static class WALSplit extends InputSplit implements Writable {
    private String logFileName;
    private long fileSize;
    private long startTime;
    private long endTime;

    /** for serialization */
    public WALSplit() {}

    /**
     * Represent an WALSplit, i.e. a single WAL file.
     * Start- and EndTime are managed by the split, so that WAL files can be
     * filtered before WALEdits are passed to the mapper(s).
     * @param logFileName
     * @param fileSize
     * @param startTime
     * @param endTime
     */
    public WALSplit(String logFileName, long fileSize, long startTime, long endTime) {
      this.logFileName = logFileName;
      this.fileSize = fileSize;
      this.startTime = startTime;
      this.endTime = endTime;
    }

    @Override
    public long getLength() throws IOException, InterruptedException {
      return fileSize;
    }

    @Override
    public String[] getLocations() throws IOException, InterruptedException {
      // TODO: Find the data node with the most blocks for this WAL?
      return new String[] {};
    }

    public String getLogFileName() {
      return logFileName;
    }

    public long getStartTime() {
      return startTime;
    }

    public long getEndTime() {
      return endTime;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
      logFileName = in.readUTF();
      fileSize = in.readLong();
      startTime = in.readLong();
      endTime = in.readLong();
    }

    @Override
    public void write(DataOutput out) throws IOException {
      out.writeUTF(logFileName);
      out.writeLong(fileSize);
      out.writeLong(startTime);
      out.writeLong(endTime);
    }

    @Override
    public String toString() {
      return logFileName + " (" + startTime + ":" + endTime + ") length:" + fileSize;
    }
  }

  /**
   * {@link RecordReader} for an {@link WAL} file.
   * Implementation shared with deprecated HLogInputFormat.
   */
  static abstract class WALRecordReader extends RecordReader {
    private Reader reader = null;
    // visible until we can remove the deprecated HLogInputFormat
    Entry currentEntry = new Entry();
    private long startTime;
    private long endTime;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
      WALSplit hsplit = (WALSplit)split;
      Path logFile = new Path(hsplit.getLogFileName());
      Configuration conf = context.getConfiguration();
      LOG.info("Opening reader for "+split);
      try {
        this.reader = WALFactory.createReader(logFile.getFileSystem(conf), logFile, conf);
      } catch (EOFException x) {
        LOG.info("Ignoring corrupted WAL file: " + logFile
            + " (This is normal when a RegionServer crashed.)");
        this.reader = null;
      }
      this.startTime = hsplit.getStartTime();
      this.endTime = hsplit.getEndTime();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
      if (reader == null) return false;

      Entry temp;
      long i = -1;
      do {
        // skip older entries
        try {
          temp = reader.next(currentEntry);
          i++;
        } catch (EOFException x) {
          LOG.info("Corrupted entry detected. Ignoring the rest of the file."
              + " (This is normal when a RegionServer crashed.)");
          return false;
        }
      }
      while(temp != null && temp.getKey().getWriteTime() < startTime);

      if (temp == null) {
        if (i > 0) LOG.info("Skipped " + i + " entries.");
        LOG.info("Reached end of file.");
        return false;
      } else if (i > 0) {
        LOG.info("Skipped " + i + " entries, until ts: " + temp.getKey().getWriteTime() + ".");
      }
      boolean res = temp.getKey().getWriteTime() <= endTime;
      if (!res) {
        LOG.info("Reached ts: " + temp.getKey().getWriteTime() + " ignoring the rest of the file.");
      }
      return res;
    }

    @Override
    public WALEdit getCurrentValue() throws IOException, InterruptedException {
      return currentEntry.getEdit();
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
      // N/A depends on total number of entries, which is unknown
      return 0;
    }

    @Override
    public void close() throws IOException {
      LOG.info("Closing reader");
      if (reader != null) this.reader.close();
    }
  }

  /**
   * handler for non-deprecated WALKey version. fold into WALRecordReader once we no longer
   * need to support HLogInputFormat.
   */
  static class WALKeyRecordReader extends WALRecordReader {
    @Override
    public WALKey getCurrentKey() throws IOException, InterruptedException {
      return currentEntry.getKey();
    }
  }

  @Override
  public List getSplits(JobContext context) throws IOException,
      InterruptedException {
    return getSplits(context, START_TIME_KEY, END_TIME_KEY);
  }

  /**
   * implementation shared with deprecated HLogInputFormat
   */
  List getSplits(final JobContext context, final String startKey, final String endKey)
      throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    Path inputDir = new Path(conf.get("mapreduce.input.fileinputformat.inputdir"));

    long startTime = conf.getLong(startKey, Long.MIN_VALUE);
    long endTime = conf.getLong(endKey, Long.MAX_VALUE);

    FileSystem fs = inputDir.getFileSystem(conf);
    List files = getFiles(fs, inputDir, startTime, endTime);

    List splits = new ArrayList(files.size());
    for (FileStatus file : files) {
      splits.add(new WALSplit(file.getPath().toString(), file.getLen(), startTime, endTime));
    }
    return splits;
  }

  private List getFiles(FileSystem fs, Path dir, long startTime, long endTime)
      throws IOException {
    List result = new ArrayList();
    LOG.debug("Scanning " + dir.toString() + " for WAL files");

    FileStatus[] files = fs.listStatus(dir);
    if (files == null) return Collections.emptyList();
    for (FileStatus file : files) {
      if (file.isDirectory()) {
        // recurse into sub directories
        result.addAll(getFiles(fs, file.getPath(), startTime, endTime));
      } else {
        String name = file.getPath().toString();
        int idx = name.lastIndexOf('.');
        if (idx > 0) {
          try {
            long fileStartTime = Long.parseLong(name.substring(idx+1));
            if (fileStartTime <= endTime) {
              LOG.info("Found: " + name);
              result.add(file);
            }
          } catch (NumberFormatException x) {
            idx = 0;
          }
        }
        if (idx == 0) {
          LOG.warn("File " + name + " does not appear to be an WAL file. Skipping...");
        }
      }
    }
    return result;
  }

  @Override
  public RecordReader createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException, InterruptedException {
    return new WALKeyRecordReader();
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy