
org.apache.hadoop.hbase.mapreduce.TableInputFormatBase

/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.Closeable;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import javax.naming.NamingException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Addressing;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.RegionSizeCalculator;
import org.apache.hadoop.hbase.util.Strings;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.net.DNS;
import org.apache.hadoop.util.StringUtils;

/**
 * A base for {@link TableInputFormat}s. Receives a {@link Connection}, a {@link TableName},
 * a {@link Scan} instance that defines the input columns etc. Subclasses may use
 * other TableRecordReader implementations.
 *
 * An example of a subclass:
 * <pre>
 *   class ExampleTIF extends TableInputFormatBase implements JobConfigurable {
 *
 *     public void configure(JobConf job) {
 *       Connection connection =
 *          ConnectionFactory.createConnection(HBaseConfiguration.create(job));
 *       TableName tableName = TableName.valueOf("exampleTable");
 *       // mandatory
 *       initializeTable(connection, tableName);
 *       byte[][] inputColumns = new byte[][] { Bytes.toBytes("cf1:columnA"),
 *         Bytes.toBytes("cf2") };
 *       // mandatory
 *       setInputColumns(inputColumns);
 *       RowFilterInterface exampleFilter = new RegExpRowFilter("keyPrefix.*");
 *       // optional
 *       setRowFilter(exampleFilter);
 *     }
 *
 *     public void validateInput(JobConf job) throws IOException {
 *     }
 *  }
 * </pre>
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class TableInputFormatBase
extends InputFormat<ImmutableBytesWritable, Result> {

  final Log LOG = LogFactory.getLog(TableInputFormatBase.class);

  /** Holds the details for the internal scanner.
   *
   * @see Scan */
  private Scan scan = null;
  /** The {@link Admin}. */
  private Admin admin;
  /** The {@link Table} to scan. */
  private Table table;
  /** The {@link RegionLocator} of the table. */
  private RegionLocator regionLocator;
  /** The reader scanning the table, can be a custom one. */
  private TableRecordReader tableRecordReader = null;

  /** The reverse DNS lookup cache mapping: IPAddress => HostName */
  private HashMap<InetAddress, String> reverseDNSCacheMap =
      new HashMap<InetAddress, String>();

  private Connection connection;

  /**
   * Builds a {@link TableRecordReader}. If no {@link TableRecordReader} was provided, uses
   * the default.
   *
   * @param split  The split to work with.
   * @param context  The current context.
   * @return The newly created record reader.
   * @throws IOException When creating the reader fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
   *   org.apache.hadoop.mapreduce.InputSplit,
   *   org.apache.hadoop.mapreduce.TaskAttemptContext)
   */
  @Override
  public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
      InputSplit split, TaskAttemptContext context)
  throws IOException {
    if (table == null) {
      throw new IOException("Cannot create a record reader because of a" +
          " previous error. Please look at the previous logs lines from" +
          " the task's full log for more details.");
    }
    TableSplit tSplit = (TableSplit) split;
    LOG.info("Input split length: " +
        StringUtils.humanReadableInt(tSplit.getLength()) + " bytes.");
    final TableRecordReader trr =
        this.tableRecordReader != null ? this.tableRecordReader : new TableRecordReader();
    Scan sc = new Scan(this.scan);
    sc.setStartRow(tSplit.getStartRow());
    sc.setStopRow(tSplit.getEndRow());
    trr.setScan(sc);
    trr.setTable(table);
    return new RecordReader<ImmutableBytesWritable, Result>() {

      @Override
      public void close() throws IOException {
        trr.close();
        close(admin, table, regionLocator, connection);
      }

      private void close(Closeable... closables) throws IOException {
        for (Closeable c : closables) {
          if (c != null) {
            c.close();
          }
        }
      }

      @Override
      public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
        return trr.getCurrentKey();
      }

      @Override
      public Result getCurrentValue() throws IOException, InterruptedException {
        return trr.getCurrentValue();
      }

      @Override
      public float getProgress() throws IOException, InterruptedException {
        return trr.getProgress();
      }

      @Override
      public void initialize(InputSplit inputsplit, TaskAttemptContext context)
          throws IOException, InterruptedException {
        trr.initialize(inputsplit, context);
      }

      @Override
      public boolean nextKeyValue() throws IOException, InterruptedException {
        return trr.nextKeyValue();
      }
    };
  }

  protected Pair<byte[][], byte[][]> getStartEndKeys() throws IOException {
    return regionLocator.getStartEndKeys();
  }

  /**
   * Calculates the splits that will serve as input for the map tasks. The
   * number of splits matches the number of regions in a table.
   *
   * @param context  The current job context.
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *   org.apache.hadoop.mapreduce.JobContext)
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    if (table == null) {
      throw new IOException("No table was provided.");
    }

    RegionSizeCalculator sizeCalculator = new RegionSizeCalculator(regionLocator, admin);

    Pair<byte[][], byte[][]> keys = getStartEndKeys();
    if (keys == null || keys.getFirst() == null ||
        keys.getFirst().length == 0) {
      HRegionLocation regLoc = regionLocator.getRegionLocation(
          HConstants.EMPTY_BYTE_ARRAY, false);
      if (null == regLoc) {
        throw new IOException("Expecting at least one region.");
      }
      List<InputSplit> splits = new ArrayList<InputSplit>(1);
      long regionSize = sizeCalculator.getRegionSize(regLoc.getRegionInfo().getRegionName());
      TableSplit split = new TableSplit(table.getName(),
          HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY, regLoc
              .getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0], regionSize);
      splits.add(split);
      return splits;
    }
    List<InputSplit> splits = new ArrayList<InputSplit>(keys.getFirst().length);
    for (int i = 0; i < keys.getFirst().length; i++) {
      if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
        continue;
      }
      HRegionLocation location = regionLocator.getRegionLocation(keys.getFirst()[i], false);
      // The below InetSocketAddress creation does a name resolution.
      InetSocketAddress isa = new InetSocketAddress(location.getHostname(), location.getPort());
      if (isa.isUnresolved()) {
        LOG.warn("Failed resolve " + isa);
      }
      InetAddress regionAddress = isa.getAddress();
      String regionLocation;
      try {
        regionLocation = reverseDNS(regionAddress);
      } catch (NamingException e) {
        LOG.warn("Cannot resolve the host name for " + regionAddress + " because of " + e);
        regionLocation = location.getHostname();
      }

      byte[] startRow = scan.getStartRow();
      byte[] stopRow = scan.getStopRow();
      // determine if the given start and stop keys fall into the region
      if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
          Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
          (stopRow.length == 0 ||
          Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
        byte[] splitStart = startRow.length == 0 ||
            Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
                keys.getFirst()[i] : startRow;
        byte[] splitStop = (stopRow.length == 0 ||
            Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
            keys.getSecond()[i].length > 0 ?
                keys.getSecond()[i] : stopRow;

        byte[] regionName = location.getRegionInfo().getRegionName();
        long regionSize = sizeCalculator.getRegionSize(regionName);
        TableSplit split = new TableSplit(table.getName(),
            splitStart, splitStop, regionLocation, regionSize);
        splits.add(split);
        if (LOG.isDebugEnabled()) {
          LOG.debug("getSplits: split -> " + i + " -> " + split);
        }
      }
    }
    return splits;
  }

  public String reverseDNS(InetAddress ipAddress) throws NamingException, UnknownHostException {
    String hostName = this.reverseDNSCacheMap.get(ipAddress);
    if (hostName == null) {
      String ipAddressString = null;
      try {
        ipAddressString = DNS.reverseDns(ipAddress, null);
      } catch (Exception e) {
        // We can use InetAddress in case the jndi failed to pull up the reverse DNS entry from the
        // name service. Also, in case of ipv6, we need to use the InetAddress since resolving
        // reverse DNS using jndi doesn't work well with ipv6 addresses.
        ipAddressString = InetAddress.getByName(ipAddress.getHostAddress()).getHostName();
      }
      if (ipAddressString == null) {
        throw new UnknownHostException("No host found for " + ipAddress);
      }
      hostName = Strings.domainNamePointerToHostName(ipAddressString);
      this.reverseDNSCacheMap.put(ipAddress, hostName);
    }
    return hostName;
  }

  /**
   * Test if the given region is to be included in the InputSplit while splitting
   * the regions of a table.
   * <p>
   * This optimization is effective when there is a specific reason to exclude an entire
   * region from the M-R job (and hence, not contributing to the InputSplit), given the
   * start and end keys of the same.
   * <br>
   * Useful when we need to remember the last-processed top record and revisit the
   * [last, current) interval for M-R processing, continuously. In addition to reducing
   * InputSplits, it reduces the load on the region server as well, due to the ordering
   * of the keys.
   * <br>
   * Note: It is possible that <code>endKey.length() == 0</code>, for the last (recent) region.
   * <br>
   * Override this method, if you want to bulk exclude regions altogether from M-R.
   * By default, no region is excluded (i.e. all regions are included).
   *
   * @param startKey Start key of the region
   * @param endKey End key of the region
   * @return true, if this region needs to be included as part of the input (default).
   */
  protected boolean includeRegionInSplit(final byte[] startKey, final byte[] endKey) {
    return true;
  }

  /**
   * Allows subclasses to get the {@link HTable}.
   *
   * @deprecated Use {@link #getTable()} instead.
   */
  @Deprecated
  protected HTable getHTable() {
    return (HTable) this.table;
  }

  /**
   * Allows subclasses to get the {@link RegionLocator}.
   */
  protected RegionLocator getRegionLocator() {
    return regionLocator;
  }

  /**
   * Allows subclasses to get the {@link Table}.
   */
  protected Table getTable() {
    return table;
  }

  /**
   * Allows subclasses to get the {@link Admin}.
   */
  protected Admin getAdmin() {
    return admin;
  }

  /**
   * Allows subclasses to set the {@link HTable}.
   *
   * @param table  The table to get the data from.
   * @throws IOException
   * @deprecated Use {@link #initializeTable(Connection, TableName)} instead.
   */
  @Deprecated
  protected void setHTable(HTable table) throws IOException {
    this.table = table;
    this.regionLocator = table;
    this.admin = table.getConnection().getAdmin();
  }

  /**
   * Allows subclasses to initialize the table information.
   *
   * @param connection  The {@link Connection} to the HBase cluster.
   * @param tableName  The {@link TableName} of the table to process.
   * @throws IOException
   */
  protected void initializeTable(Connection connection, TableName tableName) throws IOException {
    this.table = connection.getTable(tableName);
    this.regionLocator = connection.getRegionLocator(tableName);
    this.admin = connection.getAdmin();
    this.connection = connection;
  }

  /**
   * Gets the scan defining the actual details like columns etc.
   *
   * @return The internal scan instance.
   */
  public Scan getScan() {
    if (this.scan == null) this.scan = new Scan();
    return scan;
  }

  /**
   * Sets the scan defining the actual details like columns etc.
   *
   * @param scan  The scan to set.
   */
  public void setScan(Scan scan) {
    this.scan = scan;
  }

  /**
   * Allows subclasses to set the {@link TableRecordReader}.
   *
   * @param tableRecordReader A different {@link TableRecordReader}
   *   implementation.
   */
  protected void setTableRecordReader(TableRecordReader tableRecordReader) {
    this.tableRecordReader = tableRecordReader;
  }
}
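
Usage note: the Javadoc example above targets the legacy JobConfigurable API. As a rough, hedged sketch against this class, a subclass would more typically implement org.apache.hadoop.conf.Configurable and call initializeTable(Connection, TableName) and setScan(Scan) from setConf(), which Hadoop invokes on both the submitting client and the task side; this mirrors what org.apache.hadoop.hbase.mapreduce.TableInputFormat does. The package name, the table name "exampleTable", and the column/family names below are illustrative placeholders, not part of this class.

package org.example.mapreduce;  // hypothetical package for this sketch

import java.io.IOException;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableInputFormatBase;
import org.apache.hadoop.hbase.util.Bytes;

public class ExampleTIF extends TableInputFormatBase implements Configurable {

  private Configuration conf;

  @Override
  public Configuration getConf() {
    return conf;
  }

  // Hadoop calls setConf() right after instantiating the InputFormat, on both the
  // submitting client (before getSplits) and the task (before createRecordReader),
  // so the table is initialized wherever it is needed.
  @Override
  public void setConf(Configuration configuration) {
    this.conf = HBaseConfiguration.create(configuration);
    try {
      Connection connection = ConnectionFactory.createConnection(conf);
      // mandatory: wires up table, regionLocator, admin and connection in the base class
      initializeTable(connection, TableName.valueOf("exampleTable"));  // placeholder table

      Scan scan = new Scan();
      scan.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("columnA"));  // placeholder column
      scan.addFamily(Bytes.toBytes("cf2"));                            // placeholder family
      scan.setCacheBlocks(false);  // commonly disabled for full-table MapReduce scans
      setScan(scan);
    } catch (IOException e) {
      throw new RuntimeException("Failed to initialize HBase table for input", e);
    }
  }
}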




