All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hbase.mapreduce.MultithreadedTableMapper Maven / Gradle / Ivy

Go to download

This module contains implementations of InputFormat, OutputFormat, Mapper, Reducer, etc which are needed for running MR jobs on tables, WALs, HFiles and other HBase specific constructs. It also contains a bunch of tools: RowCounter, ImportTsv, Import, Export, CompactionTool, ExportSnapshot, WALPlayer, etc

There is a newer version: 3.0.0-beta-1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.MapContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.StatusReporter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Multithreaded implementation for @link org.apache.hbase.mapreduce.TableMapper
 * 

* It can be used instead when the Map operation is not CPU bound in order to improve throughput. *

* Mapper implementations using this MapRunnable must be thread-safe. *

* The Map-Reduce job has to be configured with the mapper to use via {@link #setMapperClass} and * the number of thread the thread-pool can use with the {@link #getNumberOfThreads} method. The * default value is 10 threads. *

*/ @InterfaceAudience.Private public class MultithreadedTableMapper extends TableMapper { private static final Logger LOG = LoggerFactory.getLogger(MultithreadedTableMapper.class); private Class> mapClass; private Context outer; private ExecutorService executor; public static final String NUMBER_OF_THREADS = "hbase.mapreduce.multithreadedmapper.threads"; public static final String MAPPER_CLASS = "hbase.mapreduce.multithreadedmapper.mapclass"; /** * The number of threads in the thread pool that will run the map function. * @param job the job * @return the number of threads */ public static int getNumberOfThreads(JobContext job) { return job.getConfiguration().getInt(NUMBER_OF_THREADS, 10); } /** * Set the number of threads in the pool for running maps. * @param job the job to modify * @param threads the new number of threads */ public static void setNumberOfThreads(Job job, int threads) { job.getConfiguration().setInt(NUMBER_OF_THREADS, threads); } /** * Get the application's mapper class. * @param the map's output key type * @param the map's output value type * @param job the job * @return the mapper class to run */ @SuppressWarnings("unchecked") public static Class> getMapperClass(JobContext job) { return (Class>) job.getConfiguration() .getClass(MAPPER_CLASS, Mapper.class); } /** * Set the application's mapper class. * @param the map output key type * @param the map output value type * @param job the job to modify * @param cls the class to use as the mapper */ public static void setMapperClass(Job job, Class> cls) { if (MultithreadedTableMapper.class.isAssignableFrom(cls)) { throw new IllegalArgumentException( "Can't have recursive " + "MultithreadedTableMapper instances."); } job.getConfiguration().setClass(MAPPER_CLASS, cls, Mapper.class); } /** * Run the application's maps using a thread pool. */ @Override public void run(Context context) throws IOException, InterruptedException { outer = context; int numberOfThreads = getNumberOfThreads(context); mapClass = getMapperClass(context); if (LOG.isDebugEnabled()) { LOG.debug("Configuring multithread runner to use " + numberOfThreads + " threads"); } executor = Executors.newFixedThreadPool(numberOfThreads); for (int i = 0; i < numberOfThreads; ++i) { MapRunner thread = new MapRunner(context); executor.execute(thread); } executor.shutdown(); while (!executor.isTerminated()) { // wait till all the threads are done Thread.sleep(1000); } } private class SubMapRecordReader extends RecordReader { private ImmutableBytesWritable key; private Result value; private Configuration conf; @Override public void close() throws IOException { } @Override public float getProgress() throws IOException, InterruptedException { return 0; } @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { conf = context.getConfiguration(); } @Override public boolean nextKeyValue() throws IOException, InterruptedException { synchronized (outer) { if (!outer.nextKeyValue()) { return false; } key = ReflectionUtils.copy(outer.getConfiguration(), outer.getCurrentKey(), key); value = ReflectionUtils.copy(conf, outer.getCurrentValue(), value); return true; } } public ImmutableBytesWritable getCurrentKey() { return key; } @Override public Result getCurrentValue() { return value; } } private class SubMapRecordWriter extends RecordWriter { @Override public void close(TaskAttemptContext context) throws IOException, InterruptedException { } @Override public void write(K2 key, V2 value) throws IOException, InterruptedException { synchronized (outer) { outer.write(key, value); } } } private class SubMapStatusReporter extends StatusReporter { @Override public Counter getCounter(Enum name) { return outer.getCounter(name); } @Override public Counter getCounter(String group, String name) { return outer.getCounter(group, name); } @Override public void progress() { outer.progress(); } @Override public void setStatus(String status) { outer.setStatus(status); } public float getProgress() { return 0; } } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "REC_CATCH_EXCEPTION", justification = "Don't understand why FB is complaining about this one." + " We do throw exception") private class MapRunner implements Runnable { private Mapper mapper; private Context subcontext; @SuppressWarnings({ "rawtypes", "unchecked" }) MapRunner(Context context) throws IOException, InterruptedException { mapper = ReflectionUtils.newInstance(mapClass, context.getConfiguration()); try { Constructor c = context.getClass().getConstructor(Mapper.class, Configuration.class, TaskAttemptID.class, RecordReader.class, RecordWriter.class, OutputCommitter.class, StatusReporter.class, InputSplit.class); c.setAccessible(true); subcontext = (Context) c.newInstance(mapper, outer.getConfiguration(), outer.getTaskAttemptID(), new SubMapRecordReader(), new SubMapRecordWriter(), context.getOutputCommitter(), new SubMapStatusReporter(), outer.getInputSplit()); } catch (Exception e) { try { Constructor c = Class.forName("org.apache.hadoop.mapreduce.task.MapContextImpl") .getConstructor(Configuration.class, TaskAttemptID.class, RecordReader.class, RecordWriter.class, OutputCommitter.class, StatusReporter.class, InputSplit.class); c.setAccessible(true); MapContext mc = (MapContext) c.newInstance(outer.getConfiguration(), outer.getTaskAttemptID(), new SubMapRecordReader(), new SubMapRecordWriter(), context.getOutputCommitter(), new SubMapStatusReporter(), outer.getInputSplit()); Class wrappedMapperClass = Class.forName("org.apache.hadoop.mapreduce.lib.map.WrappedMapper"); Method getMapContext = wrappedMapperClass.getMethod("getMapContext", MapContext.class); subcontext = (Context) getMapContext .invoke(wrappedMapperClass.getDeclaredConstructor().newInstance(), mc); } catch (Exception ee) { // FindBugs: REC_CATCH_EXCEPTION // rethrow as IOE throw new IOException(e); } } } @Override public void run() { try { mapper.run(subcontext); } catch (Throwable ie) { LOG.error("Problem in running map.", ie); } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy