
// com.mongodb.hadoop.util.MongoSplitter — Maven / Gradle / Ivy (the newest version)
// NOTE: artifact-browser page banner retained as a comment so the file compiles.
package com.mongodb.hadoop.util;
import com.mongodb.*;
import com.mongodb.hadoop.*;
import com.mongodb.hadoop.input.*;
import org.apache.commons.logging.*;
import org.apache.hadoop.mapreduce.*;
import org.bson.types.MinKey;
import org.bson.types.MaxKey;
import java.net.UnknownHostException;
import java.util.*;
/**
* Copyright (c) 2010, 2011 10gen, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
public class MongoSplitter {
public static List calculateSplits( MongoConfig conf ){
if ( conf.getLimit() > 0 || conf.getSkip() > 0 ){
/**
* TODO - If they specify skip or limit we create only one input
* split
*/
throw new IllegalArgumentException(
"skip() and limit() is not currently supported due to input split issues." );
}
/**
* On the jobclient side we want *ONLY* the min and max ids for each
* split; Actual querying will be done on the individual mappers.
*/
MongoURI uri = conf.getInputURI();
DBCollection coll = MongoConfigUtil.getCollection(uri);
DB db = coll.getDB();
Mongo mongo = db.getMongo();
if( conf.getAuthURI() != null ){
MongoURI authURI = conf.getAuthURI();
if(authURI.getUsername() != null &&
authURI.getPassword() != null &&
!authURI.getDatabase().equals(db.getName()))
{
DB authTargetDB = mongo.getDB(authURI.getDatabase());
authTargetDB.authenticate(authURI.getUsername(),
authURI.getPassword());
}
}
final CommandResult stats = coll.getStats();
final boolean isSharded = stats.getBoolean( "sharded", false );
//connecting to the individual backend mongods is not safe, do not do so by default
final boolean useShards = conf.canReadSplitsFromShards();
final boolean useChunks = conf.isShardChunkedSplittingEnabled();
final boolean slaveOk = conf.canReadSplitsFromSecondary();
final boolean useRangeQuery = conf.isRangeQueryEnabled();
log.info("MongoSplitter calculating splits");
log.info("use shards: " + useShards);
log.info("use chunks: " + useChunks);
log.info("collection sharded: " + isSharded);
log.info("use range queries: " + useRangeQuery);
List retVal;
if (conf.createInputSplits()) {
log.info( "Creation of Input Splits is enabled." );
if (isSharded && (useShards || useChunks)){
log.info( "Sharding mode calculation entering." );
retVal = calculateShardedSplits( conf, useShards, useChunks, slaveOk, uri, mongo );
}
else {
// perfectly ok for sharded setups to run with a normally calculated split.
// May even be more efficient for some cases
log.info( "Using Unsharded Split mode (Calculating multiple splits though)" );
retVal = calculateUnshardedSplits( conf, slaveOk, uri, coll );
}
} else {
log.info( "Creation of Input Splits is disabled; Non-Split mode calculation entering." );
retVal = calculateSingleSplit( conf );
}
if(retVal == null){
log.info("MongoSplitter returning null InputSplits.");
}else{
log.info("MongoSplitter found " + retVal.size() + " splits.");
}
return retVal;
}
private static List calculateUnshardedSplits( MongoConfig conf, boolean slaveOk,
MongoURI uri, DBCollection coll ){
final List splits = new ArrayList();
final DBObject splitKey = conf.getInputSplitKey(); // a bit slower but forces validation of the JSON
final int splitSize = conf.getSplitSize(); // in MB
final String ns = coll.getFullName();
final DBObject q = conf.getQuery();
log.info( "Calculating unsharded input splits on namespace '" + ns + "' with Split Key '" + splitKey.toString() + "' and a split size of '" + splitSize + "'mb per" );
final DBObject cmd = BasicDBObjectBuilder.start("splitVector", ns).
add( "keyPattern", splitKey ).
add( "force", false ). // force:True is misbehaving it seems
add( "maxChunkSize", splitSize ).get();
log.trace( "Issuing Command: " + cmd );
CommandResult data = coll.getDB().command( cmd );
if ( data.containsField( "$err" ) )
throw new IllegalArgumentException( "Error calculating splits: " + data );
else if ( (Double) data.get( "ok" ) != 1.0 )
throw new IllegalArgumentException( "Unable to calculate input splits: " + ( (String) data.get( "errmsg" ) ) );
// Comes in a format where "min" and "max" are implicit and each entry is just a boundary key; not ranged
BasicDBList splitData = (BasicDBList) data.get( "splitKeys" );
if (splitData.size() <= 1) {
if (splitData.size() < 1)
log.warn( "WARNING: No Input Splits were calculated by the split code. "
+ "Proceeding with a *single* split. Data may be too small, try lowering 'mongo.input.split_size' "
+ "if this is undesirable." );
splits.add( _split( conf, q, null, null ) ); // no splits really. Just do the whole thing data is likely small
}
else {
log.info( "Calculated " + splitData.size() + " splits." );
DBObject lastKey = (DBObject) splitData.get( 0 );
splits.add( _split( conf, q, null, lastKey ) ); // first "min" split
for (int i = 1; i < splitData.size(); i++ ) {
final DBObject _tKey = (DBObject) splitData.get( i );
splits.add( _split( conf, q, lastKey, _tKey) );
lastKey = _tKey;
}
splits.add( _split( conf, q, lastKey, null ) ); // last "max" split
}
return splits;
}
private static MongoInputSplit _split( MongoConfig conf, DBObject q, DBObject min, DBObject max ) {
BasicDBObject query = new BasicDBObject();
query.putAll(q);
//BasicDBObjectBuilder b = BasicDBObjectBuilder.start( "$query", q );
//final DBObject query = b.get();
//log.trace( "Assembled Query: " + query );
return new MongoInputSplit( conf.getInputURI(), conf.getInputKey(), query, conf.getFields(),
conf.getSort(), min, max, conf.getLimit(), conf.getSkip(), conf.isNoTimeout() );
}
private static List calculateSingleSplit( MongoConfig conf ){
final List splits = new ArrayList( 1 );
// no splits, no sharding
splits.add( new MongoInputSplit( conf.getInputURI(), conf.getInputKey(), conf.getQuery(),
conf.getFields(), conf.getSort(), null, null, conf.getLimit(), conf.getSkip(),
conf.isNoTimeout() ) );
log.info( "Calculated " + splits.size() + " split objects." );
log.debug( "Dump of calculated splits ... " );
for ( InputSplit split : splits ) {
log.debug("\t Split: " + split.toString());
}
return splits;
}
private static List calculateShardedSplits(MongoConfig conf, boolean useShards, boolean useChunks, boolean slaveOk, MongoURI uri, Mongo mongo) {
final List splits;
try {
if ( useChunks )
splits = fetchSplitsViaChunks( conf, uri, mongo, useShards, slaveOk );
else if ( useShards ){
log.warn( "Fetching Input Splits directly from shards is potentially dangerous for data "
+ "consistency should migrations occur during the retrieval." );
splits = fetchSplitsFromShards( conf, uri, mongo, slaveOk );
}
else throw new IllegalStateException( "Neither useChunks nor useShards enabled; failed to pick a valid state. " );
if ( splits == null )
throw new IllegalStateException( "Failed to create/calculate Input Splits from Shard Chunks; final splits content is 'null'." );
if ( log.isDebugEnabled() ){
log.debug( "Calculated splits and returning them - splits: " + splits );
}
return splits;
}
catch ( Exception e ) {
log.error( "Could not get splits (use_shards: " + useShards + ", use_chunks: " + useChunks + ")", e );
throw new IllegalStateException(e);
}
}
/**
* This gets the URIs to the backend {@code mongod}s and returns splits that connect directly to those backends (one
* split for each backend). There are two potential problems with this: - clients that can connect to {@code
* mongos} can't necessarily connect to the individual {@code mongod}s.
- there concurrency issues (if chunks are
* in the process of getting moved around).
*/
private static List fetchSplitsFromShards( final MongoConfig conf,
MongoURI uri,
Mongo mongo,
Boolean slaveOk ){
log.warn( "WARNING getting splits that connect directly to the backend mongods"
+ " is risky and might not produce correct results" );
DB configDb = mongo.getDB( "config" );
DBCollection shardsColl = configDb.getCollection( "shards" );
Set shardSet = new java.util.HashSet();
DBCursor cur = shardsColl.find();
try {
while ( cur.hasNext() ){
final BasicDBObject row = (BasicDBObject) cur.next();
String host = row.getString( "host" );
int slashIndex = host.indexOf( '/' );
if ( slashIndex > 0 )
host = host.substring( slashIndex + 1 );
shardSet.add( host );
}
}
finally {
if ( cur != null )
cur.close();
cur = null;
}
final List splits = new ArrayList( shardSet.size() );
//todo: using stats only get the shards that actually host data for this collection
for ( String host : shardSet ){
MongoURI thisUri = getNewURI( uri, host, slaveOk );
splits.add( new MongoInputSplit( thisUri, conf.getInputKey(), conf.getQuery(), conf.getFields(),
conf.getSort(), null, null, conf.getLimit(), conf.getSkip(),
conf.isNoTimeout() ) ); // TODO - Should the input Key be the shard key?
}
return splits;
}
/**
* This constructs splits using the chunk boundaries.
*/
private static List fetchSplitsViaChunks( final MongoConfig conf,
MongoURI uri,
Mongo mongo,
boolean useShards,
Boolean slaveOk ){
DBObject originalQuery = conf.getQuery();
if ( useShards )
log.warn( "WARNING getting splits that connect directly to the backend mongods"
+ " is risky and might not produce correct results" );
if ( conf.isRangeQueryEnabled() ){
log.warn( "WARNING using range queries can produce incorrect results if values"
+ " stored under the splitting key have different types.");
}
if ( log.isDebugEnabled() ){
log.debug( "getSplitsUsingChunks(): originalQuery: " + originalQuery );
}
DB configDB = mongo.getDB( "config" );
Map shardMap = null; //key: shardname, value: host
if ( useShards ){
shardMap = new HashMap();
DBCollection shardsCollection = configDB.getCollection( "shards" );
DBCursor cur = shardsCollection.find();
try {
while ( cur.hasNext() ){
final BasicDBObject row = (BasicDBObject) cur.next();
String host = row.getString( "host" );
// for replica sets host will look like: "setname/localhost:20003,localhost:20004"
int slashIndex = host.indexOf( '/' );
if ( slashIndex > 0 )
host = host.substring( slashIndex + 1 );
shardMap.put( (String) row.get( "_id" ), host );
}
}
finally {
if ( cur != null )
cur.close();
}
}
if ( log.isDebugEnabled() ){
log.debug( "MongoInputFormat.getSplitsUsingChunks(): shard map is: " + shardMap );
}
DBCollection chunksCollection = configDB.getCollection( "chunks" );
/* Chunks looks like:
{ "_id" : "test.lines-_id_ObjectId('4d60b839874a8ad69ad8adf6')", "lastmod" : { "t" : 3000, "i" : 1 }, "ns" : "test.lines", "min" : { "_id" : ObjectId("4d60b839874a8ad69ad8adf6") }, "max" : { "_id" : ObjectId("4d60b83a874a8ad69ad8d1a9") }, "shard" : "shard0000" }
{ "_id" : "test.lines-_id_ObjectId('4d60b848874a8ad69ada8756')", "lastmod" : { "t" : 3000, "i" : 19 }, "ns" : "test.lines", "min" : { "_id" : ObjectId("4d60b848874a8ad69ada8756") }, "max" : { "_id" : { $maxKey : 1 } }, "shard" : "shard0002" }
*/
BasicDBObject query = new BasicDBObject();
query.put( "ns", uri.getDatabase() + "." + uri.getCollection() );
DBCursor cur = chunksCollection.find( query );
try {
int numChunks = 0;
final int numExpectedChunks = cur.size();
final List splits = new ArrayList( numExpectedChunks );
while ( cur.hasNext() ){
numChunks++;
final BasicDBObject row = (BasicDBObject) cur.next();
DBObject minObj = ( (DBObject) row.get( "min" ) );
DBObject shardKeyQuery = new BasicDBObject();
BasicDBObject min = new BasicDBObject();
BasicDBObject max = new BasicDBObject();
for ( String keyName : minObj.keySet() ){
Object tMin = minObj.get( keyName );
Object tMax = ( (DBObject) row.get( "max" ) ).get( keyName );
/** The shard key can be of any possible type, so this must be kept as Object */
if ( !( tMin == SplitFriendlyDBCallback.MIN_KEY_TYPE || tMin.equals( "MinKey" ) ) )
min.put( keyName, tMin );
if ( !( tMax == SplitFriendlyDBCallback.MAX_KEY_TYPE || tMax.equals( "MaxKey" ) ) )
max.put( keyName, tMax );
}
/** We have to put something for $query or we'll fail; if no original query use an empty DBObj */
if ( originalQuery == null )
originalQuery = new BasicDBObject();
DBObject splitQuery = originalQuery;
boolean useMinMax = true;
if( conf.isRangeQueryEnabled() ){
Map.Entry minKey = min.size() == 1 ?
min.entrySet().iterator().next() : null;
Map.Entry maxKey = max.size() == 1 ?
max.entrySet().iterator().next() : null;
if(minKey == null && maxKey == null ){
throw new IllegalArgumentException("Range query is enabled but one or more split boundaries contains a compound key:\n" +
"minKey: " + min.toString() + "\n" +
"maxKey: " + max.toString());
}
if( (minKey != null && originalQuery.containsKey(minKey.getKey())) ||
(maxKey != null && originalQuery.containsKey(maxKey.getKey())) ){
throw new IllegalArgumentException("Range query is enabled but split key conflicts with query filter:\n" +
"minKey: " + min.toString() + "\n" +
"maxKey: " + max.toString() + "\n" +
"query: " + originalQuery.toString());
}
BasicDBObject rangeObj = new BasicDBObject();
if( minKey!=null )//&& !SplitFriendlyDBCallback.MIN_KEY_TYPE.equals(minKey.getValue())){
rangeObj.put("$gte", minKey.getValue());
//}
if( maxKey!=null )//&& !SplitFriendlyDBCallback.MAX_KEY_TYPE.equals(maxKey.getValue())){
rangeObj.put("$lt", maxKey.getValue());
//}
splitQuery = new BasicDBObject();
splitQuery.putAll(originalQuery);
splitQuery.put(minKey.getKey(), rangeObj);
useMinMax = false;
}
shardKeyQuery.put( "$query", originalQuery );
if ( log.isDebugEnabled() ){
log.debug( "[" + numChunks + "/" + numExpectedChunks + "] new query is: " + shardKeyQuery );
}
MongoURI inputURI = conf.getInputURI();
if ( useShards ){
final String shardname = row.getString( "shard" );
String host = shardMap.get( shardname );
inputURI = getNewURI( inputURI, host, slaveOk );
}
if(useMinMax){
MongoInputSplit split = new MongoInputSplit( inputURI,
conf.getInputKey(),
splitQuery,
conf.getFields(),
conf.getSort(), // TODO - should inputKey be the shard key?
min,
max,
conf.getLimit(),
conf.getSkip(),
conf.isNoTimeout());
splits.add(split);
}else{
MongoInputSplit split = new MongoInputSplit( inputURI,
conf.getInputKey(),
splitQuery,
conf.getFields(),
conf.getSort(), // TODO - should inputKey be the shard key?
null,
null,
conf.getLimit(),
conf.getSkip(),
conf.isNoTimeout());
splits.add(split);
}
}
if ( log.isDebugEnabled() ){
log.debug( "MongoInputFormat.getSplitsUsingChunks(): There were "
+ numChunks
+ " chunks, returning "
+ splits.size()
+ " splits: " + splits );
}
return splits;
}
finally {
if ( cur != null )
cur.close();
}
}
private static MongoURI getNewURI( MongoURI originalUri, String newServerUri, Boolean slaveok ){
String originalUriString = originalUri.toString();
originalUriString = originalUriString.substring( MongoURI.MONGODB_PREFIX.length() );
// uris look like: mongodb://fred:foobar@server1[,server2]/path?options
int serverEnd = -1;
int serverStart = 0;
int idx = originalUriString.lastIndexOf( "/" );
if ( idx < 0 ){
serverEnd = originalUriString.length();
}
else{
serverEnd = idx;
}
idx = originalUriString.indexOf( "@" );
if ( idx > 0 ){
serverStart = idx + 1;
}
StringBuilder sb = new StringBuilder( originalUriString );
sb.replace( serverStart, serverEnd, newServerUri );
if ( slaveok != null ){
//If uri already contains options append option to end of uri.
//This will override any slaveok option already in the uri
if ( originalUriString.contains( "?" ) )
sb.append( "&slaveok=" ).append( slaveok );
else
sb.append( "?slaveok=" ).append( slaveok );
}
String ans = MongoURI.MONGODB_PREFIX + sb.toString();
log.debug( "getNewURI(): original " + originalUri + " new uri: " + ans );
return new MongoURI( ans );
}
private static final Log log = LogFactory.getLog( MongoSplitter.class );
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy (site footer; extraction artifact, commented out)