com.mongodb.hadoop.splitter.ShardChunkMongoSplitter (mongo-hadoop-core)
The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.
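For context before the source below, here is a minimal sketch of wiring the connector into a Hadoop job. The class and method names (MongoConfigUtil, MongoInputFormat, MongoOutputFormat) come from com.mongodb.hadoop; the URIs, database, and collection names are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.mongodb.hadoop.MongoInputFormat;
import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.util.MongoConfigUtil;

public class MongoJobSetup {
    public static Job buildJob() throws Exception {
        Configuration conf = new Configuration();
        // Both URIs are placeholders; point them at your own cluster.
        MongoConfigUtil.setInputURI(conf, "mongodb://mongos1:27017/demo.in");
        MongoConfigUtil.setOutputURI(conf, "mongodb://mongos1:27017/demo.out");

        Job job = Job.getInstance(conf, "mongo-hadoop-example");
        job.setInputFormatClass(MongoInputFormat.class);
        job.setOutputFormatClass(MongoOutputFormat.class);
        // Mapper and reducer classes are omitted; this only shows the MongoDB wiring.
        return job;
    }
}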
/*
* Copyright 2010-2013 10gen Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mongodb.hadoop.splitter;

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClientURI;
import com.mongodb.hadoop.input.MongoInputSplit;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This class is an implementation of MongoSplitter that can be used on sharded collections. It reads the chunk
 * information from the cluster's config server and produces one split for each chunk.
*/
public class ShardChunkMongoSplitter extends MongoCollectionSplitter {
private static final Log LOG = LogFactory.getLog(ShardChunkMongoSplitter.class);
public ShardChunkMongoSplitter() {
}
public ShardChunkMongoSplitter(final Configuration conf) {
super(conf);
}
/**
* Get a list of InputSplits based on a list of MongoDB shard chunks, the shard key, and a
* mapping of shard names to host names. This is used internally by {@link #calculateSplits()}.
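     * <p>
     * A chunk document from config.chunks is shaped roughly like the following
     * (field values are illustrative):
     * <pre>
     * { "ns"    : "db.collection",
     *   "min"   : { "shardKey" : { "$minKey" : 1 } },
     *   "max"   : { "shardKey" : 42 },
     *   "shard" : "shard0000" }
     * </pre>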
*
* @param chunks Chunk documents from the config.chunks collection.
     * @param shardsMap A map of shard name -> a list of host names for that shard.
     * @return A list of InputSplits.
     * @throws SplitFailedException if the job configuration is inconsistent or a shard cannot be resolved.
*/
    List<InputSplit> calculateSplitsFromChunks(
        final List<DBObject> chunks, final Map<String, List<String>> shardsMap)
        throws SplitFailedException {
boolean targetShards = MongoConfigUtil.canReadSplitsFromShards(getConfiguration());
        List<String> mongosHostNames = MongoConfigUtil.getInputMongosHosts(getConfiguration());
MongoClientURI inputURI = MongoConfigUtil.getInputURI(getConfiguration());
if (targetShards && mongosHostNames.size() > 0) {
throw new SplitFailedException("Setting both mongo.input.split.read_from_shards and mongo.input.mongos_hosts"
+ " does not make sense. ");
}
        Map<String, String> mongosMap = null;
if (mongosHostNames.size() > 0) {
// Build a map of host -> mongos host string (incl. port)
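            // e.g. an entry "mongos1.example.com:27017" yields the key
            // "mongos1.example.com" (host names here are illustrative).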
            mongosMap = new HashMap<String, String>();
for (String mongosHostName : mongosHostNames) {
String[] hostAndPort = mongosHostName.split(":");
mongosMap.put(hostAndPort[0], mongosHostName);
}
}
        List<InputSplit> splits = new ArrayList<InputSplit>(chunks.size());
for (DBObject chunk : chunks) {
BasicDBObject chunkLowerBound = (BasicDBObject) chunk.get("min");
BasicDBObject chunkUpperBound = (BasicDBObject) chunk.get("max");
MongoInputSplit chunkSplit = createSplitFromBounds(chunkLowerBound, chunkUpperBound);
chunkSplit.setInputURI(inputURI);
String shard = (String) chunk.get("shard");
if (targetShards) {
                // The job is configured to target shards directly, so replace the
                // mongos host name with the hosts of the shard's servers.
                List<String> shardHosts = shardsMap.get(shard);
if (shardHosts == null) {
throw new SplitFailedException(
"Couldn't find shard ID: " + shard + " in config.shards.");
}
MongoClientURI newURI = rewriteURI(inputURI, shardHosts);
chunkSplit.setInputURI(newURI);
} else if (mongosMap != null) {
// Try to use a mongos collocated with one of the shard hosts for the input
// split. If the user has their Hadoop/MongoDB clusters configured correctly,
// this will allow for reading without having to transfer data over a network.
// Note that MongoInputSplit.getLocations() just returns the hostnames from its
// input URI.
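                // Illustrative example: if the chunk's shard reports host
                // "node3.example.com:27018" and mongo.input.mongos_hosts lists
                // "node3.example.com:27017", the split reads through that
                // co-located mongos (host names are illustrative).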
                List<String> chunkHosts = shardsMap.get(shard);
String mongosHost = null;
for (String chunkHost : chunkHosts) {
String[] hostAndPort = chunkHost.split(":");
mongosHost = mongosMap.get(hostAndPort[0]);
if (mongosHost != null) {
break;
}
}
if (null == mongosHost) {
// Fall back just to using the given input URI.
chunkSplit.setInputURI(inputURI);
} else {
LOG.info("Will read split " + chunkSplit + " from mongos " + mongosHost);
chunkSplit.setInputURI(rewriteURI(inputURI, mongosHost));
}
}
            // Set the key field on the split, then add it to the list of splits.
chunkSplit.setKeyField(MongoConfigUtil.getInputKey(getConfiguration()));
splits.add(chunkSplit);
}
if (MongoConfigUtil.isFilterEmptySplitsEnabled(getConfiguration())) {
return filterEmptySplits(splits);
}
return splits;
}
// Generate one split per chunk.
@Override
    public List<InputSplit> calculateSplits() throws SplitFailedException {
DB configDB = getConfigDB();
DBCollection chunksCollection = configDB.getCollection("chunks");
        Map<String, List<String>> shardsMap;
try {
shardsMap = getShardsMap();
} catch (Exception e) {
            // Something went wrong while reading the shards data from the
            // config server, so abort the splitting.
throw new SplitFailedException("Couldn't get shards information from config server", e);
}
return calculateSplitsFromChunks(chunksCollection.find().toArray(), shardsMap);
}
}
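For reference, the two mutually exclusive options checked in calculateSplitsFromChunks can be set directly on the job's Configuration. A minimal sketch, using the property names quoted in the exception message above; the host names are placeholders, and the multi-value format for mongo.input.mongos_hosts is assumed to follow Hadoop's comma-separated setStrings convention.

import org.apache.hadoop.conf.Configuration;

public class ShardSplitTuning {
    public static Configuration configure() {
        Configuration conf = new Configuration();
        // Option 1: read each split directly from the shard that owns the chunk.
        conf.setBoolean("mongo.input.split.read_from_shards", true);
        // Option 2 (mutually exclusive with option 1): route each split through a
        // mongos co-located with the shard host. Use instead of option 1.
        // conf.setStrings("mongo.input.mongos_hosts", "node1:27017", "node2:27017");
        return conf;
    }
}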