/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.hbase.thirdparty.com.google.common.base.Splitter;

/**
 * Sample Uploader MapReduce
 * <p>
 * This is EXAMPLE code. You will need to change it to work for your context.
 * <p>
 * Uses {@link TableReducer} to put the data into HBase. Change the InputFormat to suit your data.
 * In this example, we are importing a CSV file.
 * <p>
 *
 * <pre>
 * row,family,qualifier,value
 * </pre>
 * <p>
 * The table and columnfamily we're to insert into must preexist.
 * <p>
 * There is no reducer in this example as it is not necessary and adds significant overhead. If you
 * need to do any massaging of data before inserting into HBase, you can do this in the map as well.
 * <p>
 * Do the following to start the MR job:
 *
 * <pre>
 * ./bin/hadoop org.apache.hadoop.hbase.mapreduce.SampleUploader /tmp/input.csv TABLE_NAME
 * </pre>
 * <p>
 * This code was written against HBase 0.21 trunk.
 */
@InterfaceAudience.Private
public class SampleUploader extends Configured implements Tool {
  private static final String NAME = "SampleUploader";

  static class Uploader extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
    private long checkpoint = 100;
    private long count = 0;

    @Override
    public void map(LongWritable key, Text line, Context context) throws IOException {
      // Input is a CSV file
      // Each map() call handles a single line; the key is the byte offset of the line
      // Each line is comma-delimited; row,family,qualifier,value
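      // A line might look like this (hypothetical sample values):
      //   row1,cf,q1,value1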
      // Split CSV line
      List<String> values = Splitter.on(',').splitToList(line.toString());
      if (values.size() != 4) {
        return;
      }
      Iterator<String> i = values.iterator();
      // Extract each value
      byte[] row = Bytes.toBytes(i.next());
      byte[] family = Bytes.toBytes(i.next());
      byte[] qualifier = Bytes.toBytes(i.next());
      byte[] value = Bytes.toBytes(i.next());
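      // Note: Bytes.toBytes(String) encodes each field as UTF-8; if your row keys or
      // values are not plain text, parse them differently here.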
      // Create Put
      Put put = new Put(row);
      put.addColumn(family, qualifier, value);
      // Uncomment below to disable writing to the WAL. This will improve performance but
      // means you will experience data loss in the case of a RegionServer crash.
      // (Requires importing org.apache.hadoop.hbase.client.Durability.)
      // put.setDurability(Durability.SKIP_WAL);
      try {
        context.write(new ImmutableBytesWritable(row), put);
      } catch (InterruptedException e) {
        // Preserve the interrupt status and fail the task rather than silently dropping the Put.
        Thread.currentThread().interrupt();
        throw new IOException(e);
      }
      // Set status every checkpoint lines
      if (++count % checkpoint == 0) {
        context.setStatus("Emitting Put " + count);
      }
    }
  }

  /**
   * Job configuration.
   */
  public static Job configureJob(Configuration conf, String[] args) throws IOException {
    Path inputPath = new Path(args[0]);
    String tableName = args[1];
    Job job = Job.getInstance(conf, NAME + "_" + tableName);
    job.setJarByClass(Uploader.class);
    FileInputFormat.setInputPaths(job, inputPath);
    // The input is a plain-text CSV file, so use TextInputFormat, which feeds the
    // Uploader mapper a LongWritable byte offset and a Text line.
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Uploader.class);
    // No reducers. Just write straight to table. Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    job.setNumReduceTasks(0);
    return job;
  }

  /**
   * Main entry point.
   * @param otherArgs The command line parameters after ToolRunner handles standard options.
   * @throws Exception When running the job fails.
   */
  @Override
  public int run(String[] otherArgs) throws Exception {
    if (otherArgs.length != 2) {
      System.err.println("Wrong number of arguments: " + otherArgs.length);
      System.err.println("Usage: " + NAME + " <input> <tablename>");
      return -1;
    }
    Job job = configureJob(getConf(), otherArgs);
    return (job.waitForCompletion(true) ? 0 : 1);
  }

  public static void main(String[] args) throws Exception {
    int status = ToolRunner.run(HBaseConfiguration.create(), new SampleUploader(), args);
    System.exit(status);
  }
}