com.mongodb.hadoop.output.MongoOutputCommitter (mongo-hadoop-core)
The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.
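As background for the committer below, a job typically selects MongoDB output through MongoOutputFormat and points it at a collection with MongoConfigUtil. The following driver is a minimal sketch under those assumptions; the class name, job name, and collection URI are hypothetical, and the mapper/reducer wiring is elided:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.util.MongoConfigUtil;

public class MongoOutputJobSketch {
    public static void main(final String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical output collection URI; replace with a real one.
        MongoConfigUtil.setOutputURI(conf, "mongodb://localhost:27017/demo.output");
        Job job = Job.getInstance(conf, "mongo-output-sketch");
        job.setJarByClass(MongoOutputJobSketch.class);
        // MongoOutputFormat supplies the MongoOutputCommitter shown below.
        job.setOutputFormatClass(MongoOutputFormat.class);
        // ... configure input format, mapper, reducer, and key/value classes here.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}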
/*
* Copyright 2011-2013 10gen Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.mongodb.hadoop.output;

import com.mongodb.BasicDBObject;
import com.mongodb.BulkUpdateRequestBuilder;
import com.mongodb.BulkWriteOperation;
import com.mongodb.BulkWriteRequestBuilder;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoException;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.io.MongoUpdateWritable;
import com.mongodb.hadoop.io.MongoWritableTypes;
import com.mongodb.hadoop.util.CompatUtils;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;

public class MongoOutputCommitter extends OutputCommitter {
public static final String TEMP_DIR_NAME = "_MONGO_OUT_TEMP";
private static final Log LOG = LogFactory.getLog(MongoOutputCommitter.class);
private DBCollection collection;
public MongoOutputCommitter() {}
@Override
public void setupJob(final JobContext jobContext) {
LOG.info("Setting up job.");
}
@Override
public void setupTask(final TaskAttemptContext taskContext) {
LOG.info("Setting up task.");
}
@Override
public boolean needsTaskCommit(final TaskAttemptContext taskContext)
throws IOException {
return needsTaskCommit(CompatUtils.getTaskAttemptContext(taskContext));
}
@Override
public void commitTask(final TaskAttemptContext taskContext)
throws IOException {
commitTask(CompatUtils.getTaskAttemptContext(taskContext));
}
@Override
public void abortTask(final TaskAttemptContext taskContext)
throws IOException {
abortTask(CompatUtils.getTaskAttemptContext(taskContext));
}
public boolean needsTaskCommit(
final CompatUtils.TaskAttemptContext taskContext) throws IOException {
try {
FileSystem fs = FileSystem.get(taskContext.getConfiguration());
// Commit is only necessary if there was any output.
return fs.exists(getTaskAttemptPath(taskContext));
} catch (IOException e) {
LOG.error("Could not open filesystem", e);
throw e;
}
}
public void commitTask(
final CompatUtils.TaskAttemptContext taskContext) throws IOException {
LOG.info("Committing task.");
collection =
MongoConfigUtil.getOutputCollection(taskContext.getConfiguration());
// Get temporary file.
Path tempFilePath = getTaskAttemptPath(taskContext);
LOG.info("Committing from temporary file: " + tempFilePath.toString());
long filePos = 0, fileLen;
FSDataInputStream inputStream = null;
try {
FileSystem fs = FileSystem.get(taskContext.getConfiguration());
inputStream = fs.open(tempFilePath);
fileLen = fs.getFileStatus(tempFilePath).getLen();
} catch (IOException e) {
LOG.error("Could not open temporary file for committing", e);
cleanupAfterCommit(inputStream, taskContext);
throw e;
}
int maxDocs = MongoConfigUtil.getBatchSize(
taskContext.getConfiguration());
int curBatchSize = 0;
BulkWriteOperation bulkOp;
if (MongoConfigUtil.isBulkOrdered(taskContext.getConfiguration())) {
bulkOp = collection.initializeOrderedBulkOperation();
} else {
bulkOp = collection.initializeUnorderedBulkOperation();
}
// Read Writables out of the temporary file.
BSONWritable bw = new BSONWritable();
MongoUpdateWritable muw = new MongoUpdateWritable();
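// Each record in the temporary file is an int type tag
// (MongoWritableTypes.BSON_WRITABLE or MONGO_UPDATE_WRITABLE)
// followed by that Writable's serialized fields.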
while (filePos < fileLen) {
try {
// Determine writable type, and perform corresponding operation
// on MongoDB.
int mwType = inputStream.readInt();
if (MongoWritableTypes.BSON_WRITABLE == mwType) {
bw.readFields(inputStream);
bulkOp.insert(new BasicDBObject(bw.getDoc().toMap()));
} else if (MongoWritableTypes.MONGO_UPDATE_WRITABLE == mwType) {
muw.readFields(inputStream);
DBObject query = new BasicDBObject(muw.getQuery().toMap());
DBObject modifiers =
new BasicDBObject(muw.getModifiers().toMap());
BulkWriteRequestBuilder writeBuilder = bulkOp.find(query);
if (muw.isReplace()) {
writeBuilder.replaceOne(modifiers);
} else if (muw.isUpsert()) {
BulkUpdateRequestBuilder updateBuilder =
writeBuilder.upsert();
if (muw.isMultiUpdate()) {
updateBuilder.update(modifiers);
} else {
updateBuilder.updateOne(modifiers);
}
} else {
// No-upsert update.
if (muw.isMultiUpdate()) {
writeBuilder.update(modifiers);
} else {
writeBuilder.updateOne(modifiers);
}
}
} else {
throw new IOException("Unrecognized type: " + mwType);
}
filePos = inputStream.getPos();
// Write to MongoDB if the batch is full, or if this is the last
// operation to be performed for the Task.
if (++curBatchSize >= maxDocs || filePos >= fileLen) {
try {
bulkOp.execute();
} catch (MongoException e) {
LOG.error("Could not write to MongoDB", e);
throw e;
}
// Re-initialize the bulk operation, preserving the configured ordering.
bulkOp = MongoConfigUtil.isBulkOrdered(taskContext.getConfiguration())
? collection.initializeOrderedBulkOperation()
: collection.initializeUnorderedBulkOperation();
curBatchSize = 0;
// Signal progress back to Hadoop framework so that we
// don't time out.
taskContext.progress();
}
} catch (IOException e) {
LOG.error("Error reading from temporary file", e);
throw e;
}
}
cleanupAfterCommit(inputStream, taskContext);
}
public void abortTask(final CompatUtils.TaskAttemptContext taskContext)
throws IOException {
LOG.info("Aborting task.");
cleanupResources(taskContext);
}
/**
* Helper method to close MongoClients and FSDataInputStream and clean up
* any files still left around from map/reduce tasks.
*
* @param inputStream the FSDataInputStream to close.
* @param context the TaskAttemptContext of the task being cleaned up.
*/
private void cleanupAfterCommit(
final FSDataInputStream inputStream,
final CompatUtils.TaskAttemptContext context)
throws IOException {
if (inputStream != null) {
try {
inputStream.close();
} catch (IOException e) {
LOG.error("Could not close input stream", e);
throw e;
}
}
cleanupResources(context);
}
private void cleanupResources(
final CompatUtils.TaskAttemptContext taskContext)
throws IOException {
Path currentPath = getTaskAttemptPath(taskContext);
Path tempDirectory = getTempDirectory(taskContext.getConfiguration());
FileSystem fs = FileSystem.get(taskContext.getConfiguration());
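// Delete the task attempt's output path and each parent directory,
// walking upward until the shared base temporary directory is reached.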
while (!currentPath.equals(tempDirectory)) {
try {
fs.delete(currentPath, true);
} catch (IOException e) {
LOG.error("Could not delete temporary file: " + currentPath, e);
throw e;
}
currentPath = currentPath.getParent();
}
if (collection != null) {
MongoConfigUtil.close(collection.getDB().getMongo());
}
}
private static Path getTempDirectory(final Configuration config) {
String basePath = config.get(
"mapreduce.task.tmp.dir",
config.get(
"mapred.child.tmp",
config.get("hadoop.tmp.dir", "/tmp")));
return new Path(basePath);
}
/**
* Get the Path where temporary files should be stored for the
* TaskAttempt whose TaskAttemptContext is provided.
*
* @param context the TaskAttemptContext.
* @return the Path to the temporary file for the TaskAttempt.
*/
public static Path getTaskAttemptPath(
final CompatUtils.TaskAttemptContext context) {
Configuration config = context.getConfiguration();
// Try to use the following base temporary directories, in this order:
// 1. New-style option for task tmp dir
// 2. Old-style option for task tmp dir
// 3. Hadoop system-wide tmp dir
// 4. /tmp
// Hadoop Paths always use "/" as a directory separator.
return new Path(
String.format("%s/%s/%s/_out",
getTempDirectory(config),
context.getTaskAttemptID().toString(), TEMP_DIR_NAME));
}
}
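For orientation, here is a minimal sketch of a reducer whose output exercises the BSON_WRITABLE branch of commitTask above. It assumes the usual MongoOutputFormat arrangement, in which the record writer spools each emitted value into the temporary file that this committer later replays; the class name and the word-count shape are hypothetical.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import com.mongodb.BasicDBObject;
import com.mongodb.hadoop.io.BSONWritable;

public class CountToMongoReducerSketch
        extends Reducer<Text, IntWritable, NullWritable, BSONWritable> {
    @Override
    protected void reduce(final Text key, final Iterable<IntWritable> values,
                          final Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // One document per key; emitted values are later bulk-inserted
        // by MongoOutputCommitter.commitTask().
        BasicDBObject doc = new BasicDBObject("_id", key.toString())
                .append("count", sum);
        context.write(NullWritable.get(), new BSONWritable(doc));
    }
}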