/**
* Copyright 2012-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file
* except in compliance with the License. A copy of the License is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "LICENSE.TXT" file accompanying this file. This file is distributed on an "AS IS"
* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under the License.
*/

package org.apache.hadoop.dynamodb.tools;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.dynamodb.DynamoDBItemWritable;
import org.apache.hadoop.dynamodb.exportformat.ExportFileFlusher;
import org.apache.hadoop.dynamodb.exportformat.ExportOutputFormat;
import org.apache.hadoop.dynamodb.util.TimeSource;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import java.io.IOException;
import java.util.UUID;
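
/**
 * Mapper for the DynamoDB export job. Each input item is written to an export file via
 * ExportOutputFormat; a new file is started every MAX_ITEM_COUNT_PER_FILE items. For every file
 * opened, the mapper emits the file name keyed by its reversed name so that downstream consumers
 * pick the files up in a shuffled order.
 */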
class ExportMapper extends MapReduceBase
    implements Mapper<Text, DynamoDBItemWritable, Text, Text> {

  private static final Log log = LogFactory.getLog(ExportMapper.class);
  private static final int MAX_ITEM_COUNT_PER_FILE = 100000;

  private final OutputFormat<NullWritable, DynamoDBItemWritable> outputFormat =
      new ExportOutputFormat();
  private final TimeSource time = new TimeSource();
  private final ExportFileFlusher flusher = new ExportFileFlusher(time);
  private long itemCount = 0;
  private JobConf jobConf;
  private RecordWriter<NullWritable, DynamoDBItemWritable> recordWriter;
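
  /**
   * Writes the item to the current export file, rotating to a freshly named file every
   * MAX_ITEM_COUNT_PER_FILE items and reporting each new file name to the output collector.
   */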
  @Override
  public void map(Text key, DynamoDBItemWritable value, OutputCollector<Text, Text> output,
      Reporter reporter) throws IOException {
    // Rotate output file if needed
    if (itemCount % MAX_ITEM_COUNT_PER_FILE == 0) {
      long start = time.getNanoTime();
      if (recordWriter != null) {
        flusher.close(recordWriter, reporter);
      }

      String newOutputFilename = generateFilename();
      recordWriter = outputFormat.getRecordWriter(null, jobConf, newOutputFilename, reporter);
      long duration = time.getTimeSinceMs(start);
      log.info("Rotated over to file: " + newOutputFilename + " in " + (duration / 1000.0)
          + " seconds.");

      // When the reducer collects these filenames we want them to be
      // shuffled around - both to increase write spread on DynamoDB and
      // read spread on S3 when we later consume the data. We achieve this
      // by providing the reverse of the filename as the key in the mapper
      // output.
      String sortKey = new StringBuilder(newOutputFilename).reverse().toString();
      output.collect(new Text(sortKey), new Text(newOutputFilename));
      reporter.incrCounter(Counters.OUTPUT_FILES, 1);
    }

    // Write item to output file
    recordWriter.write(NullWritable.get(), value);
    reporter.incrCounter(Counters.DYNAMODB_ITEMS_READ, 1);
    itemCount++;
  }
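
  /**
   * Closes the record writer for the last open export file and lets the flusher finish any
   * outstanding file closes before the task ends.
   */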
  @Override
  public void close() throws IOException {
    if (recordWriter != null) {
      flusher.close(recordWriter, Reporter.NULL);
    }
    flusher.sync();
  }
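
  // Keep the job configuration around; it is needed to open new record writers during map().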
  @Override
  public void configure(JobConf job) {
    jobConf = job;
  }
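
  // Random UUIDs keep file names unique across rotations and across mapper tasks.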
  private String generateFilename() {
    return UUID.randomUUID().toString();
  }
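
  // Counters reported to the job: items read from DynamoDB and export files created.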
  private enum Counters {
    DYNAMODB_ITEMS_READ, OUTPUT_FILES,
  }
}
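
// A minimal sketch, not taken from this file, of how an old-API (mapred) driver might wire
// ExportMapper into a job. The input format, output path and reduce side are omitted because
// they are not shown here; treat the configuration below as an illustrative assumption only.
//
//   JobConf job = new JobConf();
//   job.setJobName("dynamodb-export");
//   job.setMapperClass(ExportMapper.class);
//   job.setMapOutputKeyClass(Text.class);
//   job.setMapOutputValueClass(Text.class);
//   job.setOutputKeyClass(Text.class);
//   job.setOutputValueClass(Text.class);
//   JobClient.runJob(job);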