com.mongodb.hadoop.splitter.ShardChunkMongoSplitter (mongo-hadoop-core)
The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.
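For context before the source below, here is a minimal sketch of wiring the connector into a Hadoop job. The class and method names (MongoConfigUtil, MongoInputFormat, MongoOutputFormat) come from com.mongodb.hadoop; the URIs, database, and collection names are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.mongodb.hadoop.MongoInputFormat;
import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.util.MongoConfigUtil;

public class MongoJobSetup {
    public static Job buildJob() throws Exception {
        Configuration conf = new Configuration();
        // Both URIs are placeholders; point them at your own cluster.
        MongoConfigUtil.setInputURI(conf, "mongodb://mongos1:27017/demo.in");
        MongoConfigUtil.setOutputURI(conf, "mongodb://mongos1:27017/demo.out");

        Job job = Job.getInstance(conf, "mongo-hadoop-example");
        job.setInputFormatClass(MongoInputFormat.class);
        job.setOutputFormatClass(MongoOutputFormat.class);
        // Mapper and reducer classes are omitted; this only shows the MongoDB wiring.
        return job;
    }
}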
/*
* Copyright 2010-2013 10gen Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mongodb.hadoop.splitter;

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClientURI;
import com.mongodb.hadoop.input.MongoInputSplit;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This class is an implementation of MongoSplitter that can be used on sharded collections. It reads the chunk
 * information from the cluster's config server and produces one split for each chunk.
*/
public class ShardChunkMongoSplitter extends MongoCollectionSplitter {
private static final Log LOG = LogFactory.getLog(ShardChunkMongoSplitter.class);
public ShardChunkMongoSplitter() {
}
public ShardChunkMongoSplitter(final Configuration conf) {
super(conf);
}
/**
* Get a list of InputSplits based on a list of MongoDB shard chunks, the shard key, and a
* mapping of shard names to host names. This is used internally by {@link #calculateSplits()}.
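     * <p>
     * A chunk document from config.chunks is shaped roughly like the following
     * (field values are illustrative):
     * <pre>
     * { "ns"    : "db.collection",
     *   "min"   : { "shardKey" : { "$minKey" : 1 } },
     *   "max"   : { "shardKey" : 42 },
     *   "shard" : "shard0000" }
     * </pre>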
*
* @param chunks Chunk documents from the config.chunks collection.
     * @param shardsMap A map of shard name -> a list of host names for that shard.
     * @return A list of InputSplits.
     * @throws SplitFailedException if the job configuration is inconsistent or a shard cannot be resolved.
*/
    List<InputSplit> calculateSplitsFromChunks(
        final List<DBObject> chunks, final Map<String, List<String>> shardsMap)
        throws SplitFailedException {
boolean targetShards = MongoConfigUtil.canReadSplitsFromShards(getConfiguration());
        List<String> mongosHostNames = MongoConfigUtil.getInputMongosHosts(getConfiguration());
MongoClientURI inputURI = MongoConfigUtil.getInputURI(getConfiguration());
if (targetShards && mongosHostNames.size() > 0) {
throw new SplitFailedException("Setting both mongo.input.split.read_from_shards and mongo.input.mongos_hosts"
+ " does not make sense. ");
}
        Map<String, String> mongosMap = null;
if (mongosHostNames.size() > 0) {
// Build a map of host -> mongos host string (incl. port)
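            // e.g. an entry "mongos1.example.com:27017" yields the key
            // "mongos1.example.com" (host names here are illustrative).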
            mongosMap = new HashMap<String, String>();
for (String mongosHostName : mongosHostNames) {
String[] hostAndPort = mongosHostName.split(":");
mongosMap.put(hostAndPort[0], mongosHostName);
}
}
        List<InputSplit> splits = new ArrayList<InputSplit>(chunks.size());
for (DBObject chunk : chunks) {
BasicDBObject chunkLowerBound = (BasicDBObject) chunk.get("min");
BasicDBObject chunkUpperBound = (BasicDBObject) chunk.get("max");
MongoInputSplit chunkSplit = createSplitFromBounds(chunkLowerBound, chunkUpperBound);
chunkSplit.setInputURI(inputURI);
String shard = (String) chunk.get("shard");
if (targetShards) {
                // The job is configured to target shards directly, so replace the
                // mongos host name with the hosts of the shard's servers.
                List<String> shardHosts = shardsMap.get(shard);
if (shardHosts == null) {
throw new SplitFailedException(
"Couldn't find shard ID: " + shard + " in config.shards.");
}
MongoClientURI newURI = rewriteURI(inputURI, shardHosts);
chunkSplit.setInputURI(newURI);
} else if (mongosMap != null) {
// Try to use a mongos collocated with one of the shard hosts for the input
// split. If the user has their Hadoop/MongoDB clusters configured correctly,
// this will allow for reading without having to transfer data over a network.
// Note that MongoInputSplit.getLocations() just returns the hostnames from its
// input URI.
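                // Illustrative example: if the chunk's shard reports host
                // "node3.example.com:27018" and mongo.input.mongos_hosts lists
                // "node3.example.com:27017", the split reads through that
                // co-located mongos (host names are illustrative).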
                List<String> chunkHosts = shardsMap.get(shard);
String mongosHost = null;
for (String chunkHost : chunkHosts) {
String[] hostAndPort = chunkHost.split(":");
mongosHost = mongosMap.get(hostAndPort[0]);
if (mongosHost != null) {
break;
}
}
if (null == mongosHost) {
// Fall back just to using the given input URI.
chunkSplit.setInputURI(inputURI);
} else {
LOG.info("Will read split " + chunkSplit + " from mongos " + mongosHost);
chunkSplit.setInputURI(rewriteURI(inputURI, mongosHost));
}
}
            // Set the key field on the split, then add it to the list of splits.
chunkSplit.setKeyField(MongoConfigUtil.getInputKey(getConfiguration()));
splits.add(chunkSplit);
}
if (MongoConfigUtil.isFilterEmptySplitsEnabled(getConfiguration())) {
return filterEmptySplits(splits);
}
return splits;
}
// Generate one split per chunk.
@Override
    public List<InputSplit> calculateSplits() throws SplitFailedException {
DB configDB = getConfigDB();
DBCollection chunksCollection = configDB.getCollection("chunks");
        Map<String, List<String>> shardsMap;
try {
shardsMap = getShardsMap();
} catch (Exception e) {
            // Something went wrong while reading the shards data from the
            // config server, so abort the splitting.
throw new SplitFailedException("Couldn't get shards information from config server", e);
}
return calculateSplitsFromChunks(chunksCollection.find().toArray(), shardsMap);
}
}
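For reference, the two mutually exclusive options checked in calculateSplitsFromChunks can be set directly on the job's Configuration. A minimal sketch, using the property names quoted in the exception message above; the host names are placeholders, and the multi-value format for mongo.input.mongos_hosts is assumed to follow Hadoop's comma-separated setStrings convention.

import org.apache.hadoop.conf.Configuration;

public class ShardSplitTuning {
    public static Configuration configure() {
        Configuration conf = new Configuration();
        // Option 1: read each split directly from the shard that owns the chunk.
        conf.setBoolean("mongo.input.split.read_from_shards", true);
        // Option 2 (mutually exclusive with option 1): route each split through a
        // mongos co-located with the shard host. Use instead of option 1.
        // conf.setStrings("mongo.input.mongos_hosts", "node1:27017", "node2:27017");
        return conf;
    }
}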