com.mongodb.hadoop.output.MongoOutputCommitter Maven / Gradle / Ivy
The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.
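For context on how this committer is reached in practice, here is a minimal, hypothetical job-driver sketch that writes to MongoDB through the connector; it assumes the connector's MongoOutputFormat and MongoConfigUtil.setOutputURI helpers, and the class name, job name, and output URI are placeholders. The source of MongoOutputCommitter follows.

import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class MongoOutputJobSketch {
    public static void main(final String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder output URI: database "db", collection "out".
        MongoConfigUtil.setOutputURI(conf, "mongodb://localhost:27017/db.out");

        Job job = Job.getInstance(conf, "mongo-output-sketch");
        job.setJarByClass(MongoOutputJobSketch.class);
        // Mapper/Reducer classes are omitted here; they would emit
        // (Text, BSONWritable) pairs as the job output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BSONWritable.class);
        // With this committer, task output is spooled to a temporary file and
        // replayed into MongoDB when the task commits (see commitTask below).
        job.setOutputFormatClass(MongoOutputFormat.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}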
/*
* Copyright 2011-2013 10gen Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mongodb.hadoop.output;
import com.mongodb.BasicDBObject;
import com.mongodb.BulkWriteOperation;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoException;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.io.MongoUpdateWritable;
import com.mongodb.hadoop.io.MongoWritableTypes;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
import java.util.List;
public class MongoOutputCommitter extends OutputCommitter {
public static final String TEMP_DIR_NAME = "_MONGO_OUT_TEMP";
private static final Log LOG = LogFactory.getLog(MongoOutputCommitter.class);
private List<DBCollection> collections;
private int numberOfHosts;
private int roundRobinCounter = 0;
public MongoOutputCommitter() {
}
/**
* @deprecated Use the zero-args constructor instead.
* @param collections the MongoDB output collections.
*/
@Deprecated
public MongoOutputCommitter(final List<DBCollection> collections) {
this();
}
/**
* Get the Path where temporary files should be stored for the
* TaskAttempt described by the given TaskAttemptContext.
*
* @param context the TaskAttemptContext.
* @return the Path to the temporary file for the TaskAttempt.
*/
public static Path getTaskAttemptPath(final TaskAttemptContext context) {
Configuration config = context.getConfiguration();
// Try to use the following base temporary directories, in this order:
// 1. New-style option for task tmp dir
// 2. Old-style option for task tmp dir
// 3. Hadoop system-wide tmp dir
// 4. /tmp
String basePath = config.get(
"mapreduce.task.tmp.dir",
config.get(
"mapred.child.tmp",
config.get("hadoop.tmp.dir", "/tmp")));
// Hadoop Paths always use "/" as a directory separator.
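// For example, with basePath "/tmp" and a (hypothetical) attempt ID, the
// result would be:
//   /tmp/attempt_201501010000_0001_m_000000_0/_MONGO_OUT_TEMP/_out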
return new Path(
String.format("%s/%s/%s/_out",
basePath, context.getTaskAttemptID().toString(), TEMP_DIR_NAME));
}
@Override
public void setupJob(final JobContext jobContext) {
LOG.info("Setting up job.");
}
@Override
public void setupTask(final TaskAttemptContext taskContext) {
LOG.info("Setting up task.");
}
@Override
public boolean needsTaskCommit(final TaskAttemptContext taskContext)
throws IOException {
try {
FileSystem fs = FileSystem.get(taskContext.getConfiguration());
// Commit is only necessary if there was any output.
return fs.exists(getTaskAttemptPath(taskContext));
} catch (IOException e) {
LOG.error("Could not open filesystem", e);
throw e;
}
}
@Override
public void commitTask(final TaskAttemptContext taskContext)
throws IOException {
LOG.info("Committing task.");
collections =
MongoConfigUtil.getOutputCollections(taskContext.getConfiguration());
numberOfHosts = collections.size();
// Get temporary file.
Path tempFilePath = getTaskAttemptPath(taskContext);
LOG.info("Committing from temporary file: " + tempFilePath.toString());
long filePos = 0, fileLen;
FSDataInputStream inputStream = null;
try {
FileSystem fs = FileSystem.get(taskContext.getConfiguration());
inputStream = fs.open(tempFilePath);
fileLen = fs.getFileStatus(tempFilePath).getLen();
} catch (IOException e) {
LOG.error("Could not open temporary file for committing", e);
cleanupAfterCommit(inputStream, taskContext);
throw e;
}
int maxDocs = MongoConfigUtil.getBatchSize(
taskContext.getConfiguration());
int curBatchSize = 0;
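// Operations are accumulated into ordered bulk writes of at most maxDocs
// entries; each completed batch is sent to the next output collection,
// chosen round-robin across the configured collections.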
DBCollection coll = getDbCollectionByRoundRobin();
BulkWriteOperation bulkOp = coll.initializeOrderedBulkOperation();
// Read Writables out of the temporary file.
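// Each record in the temporary file is an int type tag (one of the
// MongoWritableTypes constants) followed by the serialized fields of the
// corresponding Writable.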
BSONWritable bw = new BSONWritable();
MongoUpdateWritable muw = new MongoUpdateWritable();
while (filePos < fileLen) {
try {
// Determine writable type, and perform corresponding operation
// on MongoDB.
int mwType = inputStream.readInt();
if (MongoWritableTypes.BSON_WRITABLE == mwType) {
bw.readFields(inputStream);
bulkOp.insert(new BasicDBObject(bw.getDoc().toMap()));
} else if (MongoWritableTypes.MONGO_UPDATE_WRITABLE == mwType) {
muw.readFields(inputStream);
DBObject query = new BasicDBObject(muw.getQuery().toMap());
DBObject modifiers =
new BasicDBObject(muw.getModifiers().toMap());
if (muw.isMultiUpdate()) {
if (muw.isUpsert()) {
bulkOp.find(query).upsert().update(modifiers);
} else {
bulkOp.find(query).update(modifiers);
}
} else {
if (muw.isUpsert()) {
bulkOp.find(query).upsert().updateOne(modifiers);
} else {
bulkOp.find(query).updateOne(modifiers);
}
}
} else {
throw new IOException("Unrecognized type: " + mwType);
}
filePos = inputStream.getPos();
// Write to MongoDB if the batch is full, or if this is the last
// operation to be performed for the Task.
if (++curBatchSize >= maxDocs || filePos >= fileLen) {
try {
bulkOp.execute();
} catch (MongoException e) {
LOG.error("Could not write to MongoDB", e);
throw e;
}
coll = getDbCollectionByRoundRobin();
bulkOp = coll.initializeOrderedBulkOperation();
curBatchSize = 0;
// Signal progress back to Hadoop framework so that we
// don't time out.
taskContext.progress();
}
} catch (IOException e) {
LOG.error("Error reading from temporary file", e);
throw e;
}
}
cleanupAfterCommit(inputStream, taskContext);
}
@Override
public void abortTask(final TaskAttemptContext taskContext)
throws IOException {
LOG.info("Aborting task.");
cleanupResources(taskContext);
}
/**
* Helper method to close MongoClients and FSDataInputStream and clean up
* any files still left around from map/reduce tasks.
*
* @param inputStream the FSDataInputStream to close.
* @param context     the TaskAttemptContext of the task being cleaned up.
*/
private void cleanupAfterCommit(
final FSDataInputStream inputStream, final TaskAttemptContext context)
throws IOException {
if (inputStream != null) {
try {
inputStream.close();
} catch (IOException e) {
LOG.error("Could not close input stream", e);
throw e;
}
}
cleanupResources(context);
}
private void cleanupResources(final TaskAttemptContext taskContext)
throws IOException {
Path tempPath = getTaskAttemptPath(taskContext);
try {
FileSystem fs = FileSystem.get(taskContext.getConfiguration());
fs.delete(tempPath, true);
} catch (IOException e) {
LOG.error("Could not delete temporary file " + tempPath, e);
throw e;
}
if (collections != null) {
for (DBCollection collection : collections) {
MongoConfigUtil.close(collection.getDB().getMongo());
}
}
}
private DBCollection getDbCollectionByRoundRobin() {
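// Masking with 0x7FFFFFFF keeps the index non-negative even after the
// counter overflows Integer.MAX_VALUE.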
int hostIndex = (roundRobinCounter++ & 0x7FFFFFFF) % numberOfHosts;
return collections.get(hostIndex);
}
}