/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.mapreduce;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A base for {@link MultiTableInputFormat}s. Receives a list of {@link Scan} instances that
 * define the input tables, filters, and so on. Subclasses may use other TableRecordReader
 * implementations.
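 * <p>
 * A minimal configuration sketch (the mapper class, table name, and column family below are
 * illustrative, not part of this API):
 *
 * <pre>
 * List&lt;Scan&gt; scans = new ArrayList&lt;&gt;();
 * Scan scan = new Scan();
 * scan.addFamily(Bytes.toBytes("cf"));
 * // Every scan must name its target table through this attribute.
 * scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes("table1"));
 * scans.add(scan);
 * TableMapReduceUtil.initTableMapperJob(scans, MyMapper.class, Text.class, IntWritable.class,
 *   job);
 * </pre>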
*/
@InterfaceAudience.Public
public abstract class MultiTableInputFormatBase
  extends InputFormat<ImmutableBytesWritable, Result> {
private static final Logger LOG = LoggerFactory.getLogger(MultiTableInputFormatBase.class);
/** Holds the set of scans used to define the input. */
  private List<Scan> scans;
/** The reader scanning the table, can be a custom one. */
private TableRecordReader tableRecordReader = null;
/**
* Builds a TableRecordReader. If no TableRecordReader was provided, uses the default.
* @param split The split to work with.
* @param context The current context.
* @return The newly created record reader.
* @throws IOException When creating the reader fails.
* @throws InterruptedException when record reader initialization fails
* @see InputFormat#createRecordReader(InputSplit, TaskAttemptContext)
*/
@Override
  public RecordReader<ImmutableBytesWritable, Result> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException, InterruptedException {
TableSplit tSplit = (TableSplit) split;
LOG.info(MessageFormat.format("Input split length: {0} bytes.", tSplit.getLength()));
if (tSplit.getTable() == null) {
throw new IOException("Cannot create a record reader because of a"
+ " previous error. Please look at the previous logs lines from"
+ " the task's full log for more details.");
}
final Connection connection = ConnectionFactory.createConnection(context.getConfiguration());
Table table = connection.getTable(tSplit.getTable());
if (this.tableRecordReader == null) {
this.tableRecordReader = new TableRecordReader();
}
final TableRecordReader trr = this.tableRecordReader;
try {
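      // Narrow the split's scan to this split's exact key range before handing it to the reader.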
Scan sc = tSplit.getScan();
sc.setStartRow(tSplit.getStartRow());
sc.setStopRow(tSplit.getEndRow());
trr.setScan(sc);
trr.setTable(table);
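      // Wrap the reader in an anonymous RecordReader so that closing it also closes the
      // Connection opened above for this split.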
      return new RecordReader<ImmutableBytesWritable, Result>() {
@Override
public void close() throws IOException {
trr.close();
connection.close();
}
@Override
public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
return trr.getCurrentKey();
}
@Override
public Result getCurrentValue() throws IOException, InterruptedException {
return trr.getCurrentValue();
}
@Override
public float getProgress() throws IOException, InterruptedException {
return trr.getProgress();
}
@Override
public void initialize(InputSplit inputsplit, TaskAttemptContext context)
throws IOException, InterruptedException {
trr.initialize(inputsplit, context);
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
return trr.nextKeyValue();
}
};
} catch (IOException ioe) {
// If there is an exception make sure that all
// resources are closed and released.
trr.close();
connection.close();
throw ioe;
}
}
/**
   * Calculates the splits that will serve as input for the map tasks. One split is created for
   * each region that a {@link Scan} overlaps, across all of the input tables.
* @param context The current job context.
* @return The list of input splits.
* @throws IOException When creating the list of splits fails.
* @see InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
*/
@Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
if (scans.isEmpty()) {
throw new IOException("No scans were provided.");
}
    Map<TableName, List<Scan>> tableMaps = new HashMap<>();
for (Scan scan : scans) {
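      // Group the scans by target table; every scan must declare its table through the
      // SCAN_ATTRIBUTES_TABLE_NAME attribute.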
byte[] tableNameBytes = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME);
if (tableNameBytes == null) throw new IOException("A scan object did not have a table name");
TableName tableName = TableName.valueOf(tableNameBytes);
      List<Scan> scanList = tableMaps.get(tableName);
if (scanList == null) {
scanList = new ArrayList<>();
tableMaps.put(tableName, scanList);
}
scanList.add(scan);
}
    List<InputSplit> splits = new ArrayList<>();
    Iterator<Map.Entry<TableName, List<Scan>>> iter = tableMaps.entrySet().iterator();
// Make a single Connection to the Cluster and use it across all tables.
try (Connection conn = ConnectionFactory.createConnection(context.getConfiguration())) {
while (iter.hasNext()) {
        Map.Entry<TableName, List<Scan>> entry = iter.next();
TableName tableName = entry.getKey();
        List<Scan> scanList = entry.getValue();
try (Table table = conn.getTable(tableName);
RegionLocator regionLocator = conn.getRegionLocator(tableName)) {
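          // Estimate per-region data sizes so each split can report a length for the
          // MapReduce framework to weigh when scheduling map tasks.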
RegionSizeCalculator sizeCalculator =
new RegionSizeCalculator(regionLocator, conn.getAdmin());
          Pair<byte[][], byte[][]> keys = regionLocator.getStartEndKeys();
for (Scan scan : scanList) {
if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
throw new IOException(
"Expecting at least one region for table : " + tableName.getNameAsString());
}
int count = 0;
byte[] startRow = scan.getStartRow();
byte[] stopRow = scan.getStopRow();
for (int i = 0; i < keys.getFirst().length; i++) {
if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
continue;
}
if (
(startRow.length == 0 || keys.getSecond()[i].length == 0
|| Bytes.compareTo(startRow, keys.getSecond()[i]) < 0)
&& (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)
) {
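                // The split covers the intersection of the scan range and the region range:
                // start at the later of the two start keys and stop at the earlier of the two
                // stop keys (an empty key means unbounded).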
byte[] splitStart =
startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0
? keys.getFirst()[i]
: startRow;
byte[] splitStop =
(stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0)
&& keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;
HRegionLocation hregionLocation =
regionLocator.getRegionLocation(keys.getFirst()[i], false);
String regionHostname = hregionLocation.getHostname();
HRegionInfo regionInfo = hregionLocation.getRegionInfo();
String encodedRegionName = regionInfo.getEncodedName();
long regionSize = sizeCalculator.getRegionSize(regionInfo.getRegionName());
TableSplit split = new TableSplit(table.getName(), scan, splitStart, splitStop,
regionHostname, encodedRegionName, regionSize);
splits.add(split);
if (LOG.isDebugEnabled()) {
LOG.debug("getSplits: split -> " + (count++) + " -> " + split);
}
}
}
}
}
}
}
return splits;
}
/**
   * Test if the given region is to be included in the InputSplit while splitting the regions of a
   * table.
   * <p>
   * This optimization is effective when there is a specific reason to exclude an entire region
   * from the M-R job (and hence not contribute an InputSplit), based on the region's start and
   * end keys. It is useful when we need to remember the last-processed top record and
   * continuously revisit the [last, current) interval for M-R processing. Besides reducing the
   * number of InputSplits, it also reduces the load on the region server, due to the ordering of
   * the keys.
   * <p>
   * Note: it is possible that <code>endKey.length == 0</code> for the last (most recent) region
   * of a table.
   * <p>
   * Override this method if you want to bulk-exclude regions from M-R processing. By default, no
   * region is excluded (i.e. all regions are included).
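   * <p>
   * A minimal override sketch (the cutoff row below is illustrative, not part of this API):
   *
   * <pre>
   * &#64;Override
   * protected boolean includeRegionInSplit(final byte[] startKey, final byte[] endKey) {
   *   // Include only regions that start before an application-defined cutoff row.
   *   byte[] cutoff = Bytes.toBytes("row-5000");
   *   return startKey.length == 0 || Bytes.compareTo(startKey, cutoff) &lt; 0;
   * }
   * </pre>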
* @param startKey Start key of the region
* @param endKey End key of the region
* @return true, if this region needs to be included as part of the input (default).
*/
protected boolean includeRegionInSplit(final byte[] startKey, final byte[] endKey) {
return true;
}
/**
* Allows subclasses to get the list of {@link Scan} objects.
*/
  protected List<Scan> getScans() {
return this.scans;
}
/**
* Allows subclasses to set the list of {@link Scan} objects.
* @param scans The list of {@link Scan} used to define the input
*/
  protected void setScans(List<Scan> scans) {
this.scans = scans;
}
/**
* Allows subclasses to set the {@link TableRecordReader}.
* @param tableRecordReader A different {@link TableRecordReader} implementation.
*/
protected void setTableRecordReader(TableRecordReader tableRecordReader) {
this.tableRecordReader = tableRecordReader;
}
}