
com.mongodb.hadoop.output.MongoOutputCommitter Maven / Gradle / Ivy


The MongoDB Connector for Hadoop is a plugin that lets Hadoop jobs use MongoDB as an input source and/or an output destination.
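As context for the committer source below, here is a minimal sketch of wiring the connector into a Hadoop job as an output destination. The output URI, job name, and key/value classes are illustrative assumptions, not taken from this page.

import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class MongoOutputJobSketch {
    public static Job configure() throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical output collection URI; point this at your own deployment.
        MongoConfigUtil.setOutputURI(conf, "mongodb://localhost:27017/demo.out");
        Job job = Job.getInstance(conf, "mongo-output-sketch");
        // The connector's record writer spools output to a temporary file,
        // which MongoOutputCommitter.commitTask() later replays into MongoDB.
        job.setOutputFormatClass(MongoOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BSONWritable.class);
        return job;
    }
}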

/*
 * Copyright 2011-2013 10gen Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.mongodb.hadoop.output;

import com.mongodb.BasicDBObject;
import com.mongodb.BulkUpdateRequestBuilder;
import com.mongodb.BulkWriteOperation;
import com.mongodb.BulkWriteRequestBuilder;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoException;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.io.MongoUpdateWritable;
import com.mongodb.hadoop.io.MongoWritableTypes;
import com.mongodb.hadoop.util.CompatUtils;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;

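/**
 * OutputCommitter that commits a task's output to MongoDB. Records produced
 * during the task are spooled to a temporary file under TEMP_DIR_NAME;
 * commitTask() replays that file, turning each BSONWritable into an insert
 * and each MongoUpdateWritable into a replace or update, and flushes the
 * operations to the output collection in bulk batches of the configured size.
 */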
public class MongoOutputCommitter extends OutputCommitter {

    public static final String TEMP_DIR_NAME = "_MONGO_OUT_TEMP";
    private static final Log LOG = LogFactory.getLog(MongoOutputCommitter.class);
    private DBCollection collection;

    public MongoOutputCommitter() {}

    @Override
    public void setupJob(final JobContext jobContext) {
        LOG.info("Setting up job.");
    }

    @Override
    public void setupTask(final TaskAttemptContext taskContext) {
        LOG.info("Setting up task.");
    }

    @Override
    public boolean needsTaskCommit(final TaskAttemptContext taskContext)
      throws IOException {
        return needsTaskCommit(CompatUtils.getTaskAttemptContext(taskContext));
    }

    @Override
    public void commitTask(final TaskAttemptContext taskContext)
      throws IOException {
        commitTask(CompatUtils.getTaskAttemptContext(taskContext));
    }

    @Override
    public void abortTask(final TaskAttemptContext taskContext)
      throws IOException {
        abortTask(CompatUtils.getTaskAttemptContext(taskContext));
    }

    public boolean needsTaskCommit(
      final CompatUtils.TaskAttemptContext taskContext) throws IOException {
        try {
            FileSystem fs = FileSystem.get(taskContext.getConfiguration());
            // Commit is only necessary if there was any output.
            return fs.exists(getTaskAttemptPath(taskContext));
        } catch (IOException e) {
            LOG.error("Could not open filesystem", e);
            throw e;
        }
    }

    public void commitTask(
      final CompatUtils.TaskAttemptContext taskContext) throws IOException {
        LOG.info("Committing task.");

        collection =
          MongoConfigUtil.getOutputCollection(taskContext.getConfiguration());

        // Get temporary file.
        Path tempFilePath = getTaskAttemptPath(taskContext);
        LOG.info("Committing from temporary file: " + tempFilePath.toString());
        long filePos = 0, fileLen;
        FSDataInputStream inputStream = null;
        try {
            FileSystem fs = FileSystem.get(taskContext.getConfiguration());
            inputStream = fs.open(tempFilePath);
            fileLen = fs.getFileStatus(tempFilePath).getLen();
        } catch (IOException e) {
            LOG.error("Could not open temporary file for committing", e);
            cleanupAfterCommit(inputStream, taskContext);
            throw e;
        }

        int maxDocs = MongoConfigUtil.getBatchSize(
          taskContext.getConfiguration());
        int curBatchSize = 0;

        BulkWriteOperation bulkOp;
        if (MongoConfigUtil.isBulkOrdered(taskContext.getConfiguration())) {
            bulkOp = collection.initializeOrderedBulkOperation();
        } else {
            bulkOp = collection.initializeUnorderedBulkOperation();
        }

        // Read Writables out of the temporary file.
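        // Each record is an int type marker (one of MongoWritableTypes)
        // followed by the corresponding serialized Writable.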
        BSONWritable bw = new BSONWritable();
        MongoUpdateWritable muw = new MongoUpdateWritable();
        while (filePos < fileLen) {
            try {
                // Determine writable type, and perform corresponding operation
                // on MongoDB.
                int mwType = inputStream.readInt();
                if (MongoWritableTypes.BSON_WRITABLE == mwType) {
                    bw.readFields(inputStream);
                    bulkOp.insert(new BasicDBObject(bw.getDoc().toMap()));
                } else if (MongoWritableTypes.MONGO_UPDATE_WRITABLE == mwType) {
                    muw.readFields(inputStream);
                    DBObject query = new BasicDBObject(muw.getQuery().toMap());
                    DBObject modifiers =
                        new BasicDBObject(muw.getModifiers().toMap());
                    BulkWriteRequestBuilder writeBuilder = bulkOp.find(query);
                    if (muw.isReplace()) {
                        writeBuilder.replaceOne(modifiers);
                    } else if (muw.isUpsert()) {
                        BulkUpdateRequestBuilder updateBuilder =
                          writeBuilder.upsert();
                        if (muw.isMultiUpdate()) {
                            updateBuilder.update(modifiers);
                        } else {
                            updateBuilder.updateOne(modifiers);
                        }
                    } else {
                        // No-upsert update.
                        if (muw.isMultiUpdate()) {
                            writeBuilder.update(modifiers);
                        } else {
                            writeBuilder.updateOne(modifiers);
                        }
                    }
                } else {
                    throw new IOException("Unrecognized type: " + mwType);
                }
                filePos = inputStream.getPos();
                // Write to MongoDB if the batch is full, or if this is the last
                // operation to be performed for the Task.
                if (++curBatchSize >= maxDocs || filePos >= fileLen) {
                    try {
                        bulkOp.execute();
                    } catch (MongoException e) {
                        LOG.error("Could not write to MongoDB", e);
                        throw e;
                    }
                    // Start a new batch, preserving the configured ordering.
                    if (MongoConfigUtil.isBulkOrdered(
                          taskContext.getConfiguration())) {
                        bulkOp = collection.initializeOrderedBulkOperation();
                    } else {
                        bulkOp = collection.initializeUnorderedBulkOperation();
                    }
                    curBatchSize = 0;

                    // Signal progress back to Hadoop framework so that we
                    // don't time out.
                    taskContext.progress();
                }
            } catch (IOException e) {
                LOG.error("Error reading from temporary file", e);
                throw e;
            }
        }

        cleanupAfterCommit(inputStream, taskContext);
    }

    public void abortTask(final CompatUtils.TaskAttemptContext taskContext)
      throws IOException {
        LOG.info("Aborting task.");
        cleanupResources(taskContext);
    }

    /**
     * Helper method to close the FSDataInputStream and MongoClient and clean
     * up any temporary files left around by map/reduce tasks.
     *
     * @param inputStream the FSDataInputStream to close; may be null.
     * @param context     the TaskAttemptContext whose temporary files should
     *                    be removed.
     */
    private void cleanupAfterCommit(
        final FSDataInputStream inputStream,
        final CompatUtils.TaskAttemptContext context)
        throws IOException {
        if (inputStream != null) {
            try {
                inputStream.close();
            } catch (IOException e) {
                LOG.error("Could not close input stream", e);
                throw e;
            }
        }
        cleanupResources(context);
    }

    private void cleanupResources(
      final CompatUtils.TaskAttemptContext taskContext)
        throws IOException {
        Path currentPath = getTaskAttemptPath(taskContext);
        Path tempDirectory = getTempDirectory(taskContext.getConfiguration());
        FileSystem fs = FileSystem.get(taskContext.getConfiguration());
        while (!currentPath.equals(tempDirectory)) {
            try {
                fs.delete(currentPath, true);
            } catch (IOException e) {
                LOG.error("Could not delete temporary file: " + currentPath, e);
                throw e;
            }
            currentPath = currentPath.getParent();
        }

        if (collection != null) {
            MongoConfigUtil.close(collection.getDB().getMongo());
        }
    }

    private static Path getTempDirectory(final Configuration config) {
        String basePath = config.get(
          "mapreduce.task.tmp.dir",
          config.get(
            "mapred.child.tmp",
            config.get("hadoop.tmp.dir", "/tmp")));
        return new Path(basePath);
    }

    /**
     * Get the Path where temporary files for a TaskAttempt should be stored,
     * given that attempt's TaskAttemptContext.
     *
     * @param context the TaskAttemptContext.
     * @return the Path to the temporary file for the TaskAttempt.
     */
    public static Path getTaskAttemptPath(
      final CompatUtils.TaskAttemptContext context) {
        Configuration config = context.getConfiguration();
        // Try to use the following base temporary directories, in this order:
        // 1. New-style option for task tmp dir
        // 2. Old-style option for task tmp dir
        // 3. Hadoop system-wide tmp dir
        // 4. /tmp
        // Hadoop Paths always use "/" as a directory separator.
        return new Path(
          String.format("%s/%s/%s/_out",
            getTempDirectory(config),
            context.getTaskAttemptID().toString(), TEMP_DIR_NAME));
    }

}



