All downloads are free. Search and download functionality uses the official Maven repository.

com.mongodb.hadoop.util.MongoSplitter Maven / Gradle / Ivy

The newest version!
package com.mongodb.hadoop.util;

import com.mongodb.*;
import com.mongodb.hadoop.*;
import com.mongodb.hadoop.input.*;
import org.apache.commons.logging.*;
import org.apache.hadoop.mapreduce.*;
import org.bson.types.MinKey;
import org.bson.types.MaxKey;

import java.net.UnknownHostException;
import java.util.*;

/**
 * Copyright (c) 2010, 2011 10gen, Inc. 
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */

public class MongoSplitter {

    public static List calculateSplits( MongoConfig conf ){

        if ( conf.getLimit() > 0 || conf.getSkip() > 0 ){
            /**
             * TODO - If they specify skip or limit we create only one input
             * split
             */
            throw new IllegalArgumentException(
                    "skip() and limit() is not currently supported due to input split issues." );
        }

        /**
         * On the jobclient side we want *ONLY* the min and max ids for each
         * split; Actual querying will be done on the individual mappers.
         */
        MongoURI uri = conf.getInputURI();
        DBCollection coll = MongoConfigUtil.getCollection(uri);
        DB db = coll.getDB(); 
        Mongo mongo = db.getMongo();

        if( conf.getAuthURI() != null ){
            MongoURI authURI = conf.getAuthURI();
            if(authURI.getUsername() != null &&
               authURI.getPassword() != null &&
               !authURI.getDatabase().equals(db.getName()))
            {
                DB authTargetDB = mongo.getDB(authURI.getDatabase());
                authTargetDB.authenticate(authURI.getUsername(),
                                          authURI.getPassword());
            }
        }
        
        final CommandResult stats = coll.getStats();
        
        final boolean isSharded = stats.getBoolean( "sharded", false );

        //connecting to the individual backend mongods is not safe, do not do so by default
        final boolean useShards = conf.canReadSplitsFromShards();

        final boolean useChunks = conf.isShardChunkedSplittingEnabled();

        final boolean slaveOk = conf.canReadSplitsFromSecondary();

        final boolean useRangeQuery = conf.isRangeQueryEnabled();

        log.info("MongoSplitter calculating splits");
        log.info("use shards: " + useShards);
        log.info("use chunks: " + useChunks);
        log.info("collection sharded: " + isSharded);
        log.info("use range queries: " + useRangeQuery);

        List retVal;
        if (conf.createInputSplits()) {
            log.info( "Creation of Input Splits is enabled." );
            if (isSharded && (useShards || useChunks)){
                log.info( "Sharding mode calculation entering." );
                retVal = calculateShardedSplits( conf, useShards, useChunks, slaveOk, uri, mongo );
            }
            else {
                // perfectly ok for sharded setups to run with a normally calculated split.
                // May even be more efficient for some cases
                log.info( "Using Unsharded Split mode (Calculating multiple splits though)" );
                retVal = calculateUnshardedSplits( conf, slaveOk, uri, coll );
            }
        } else {
            log.info( "Creation of Input Splits is disabled; Non-Split mode calculation entering." );
            retVal = calculateSingleSplit( conf );
        }
        if(retVal == null){
            log.info("MongoSplitter returning null InputSplits.");
        }else{
            log.info("MongoSplitter found " + retVal.size() + " splits.");
        }
        return retVal;

    }

    private static List calculateUnshardedSplits( MongoConfig conf, boolean slaveOk, 
                                                              MongoURI uri, DBCollection coll ){
        final List splits = new ArrayList();
        final DBObject splitKey = conf.getInputSplitKey(); // a bit slower but forces validation of the JSON
        final int splitSize = conf.getSplitSize(); // in MB
        final String ns = coll.getFullName();
        final DBObject q = conf.getQuery();

        log.info( "Calculating unsharded input splits on namespace '" + ns + "' with Split Key '" + splitKey.toString() + "' and a split size of '" + splitSize + "'mb per" );

        final DBObject cmd = BasicDBObjectBuilder.start("splitVector", ns).
                                          add( "keyPattern", splitKey ).
                                          add( "force", false ). // force:True is misbehaving it seems
                                          add( "maxChunkSize", splitSize ).get();
        
        log.trace( "Issuing Command: " + cmd );
        CommandResult data = coll.getDB().command( cmd );

        if ( data.containsField( "$err" ) )
            throw new IllegalArgumentException( "Error calculating splits: " + data );
        else if ( (Double) data.get( "ok" ) != 1.0 )
            throw new IllegalArgumentException( "Unable to calculate input splits: " + ( (String) data.get( "errmsg" ) ) );
        
        // Comes in a format where "min" and "max" are implicit and each entry is just a boundary key; not ranged
        BasicDBList splitData = (BasicDBList) data.get( "splitKeys" );
        
        if (splitData.size() <= 1) {
            if (splitData.size() < 1)
                log.warn( "WARNING: No Input Splits were calculated by the split code. "
                          + "Proceeding with a *single* split. Data may be too small, try lowering 'mongo.input.split_size' "
                          + "if this is undesirable." );
            splits.add( _split( conf, q, null, null ) ); // no splits really. Just do the whole thing data is likely small
        }
        else {
            log.info( "Calculated " + splitData.size() + " splits." );

            DBObject lastKey = (DBObject) splitData.get( 0 );

            splits.add( _split( conf, q, null, lastKey ) ); // first "min" split

            for (int i = 1; i < splitData.size(); i++ ) {
                final DBObject _tKey = (DBObject) splitData.get( i );
                splits.add( _split( conf, q, lastKey, _tKey) );
                lastKey = _tKey;
            }

            splits.add( _split( conf, q, lastKey, null ) ); // last "max" split
        }

        return splits;

    }

    private static MongoInputSplit _split( MongoConfig conf, DBObject q, DBObject min, DBObject max ) {
        BasicDBObject query = new BasicDBObject();
        query.putAll(q);
        //BasicDBObjectBuilder b = BasicDBObjectBuilder.start( "$query", q );
        //final DBObject query = b.get();
        //log.trace( "Assembled Query: " + query );

        return new MongoInputSplit( conf.getInputURI(), conf.getInputKey(), query, conf.getFields(), 
                                    conf.getSort(), min, max, conf.getLimit(), conf.getSkip(), conf.isNoTimeout() );
    }
    
    private static List calculateSingleSplit( MongoConfig conf ){
        final List splits = new ArrayList( 1 );
        // no splits, no sharding
        splits.add( new MongoInputSplit( conf.getInputURI(), conf.getInputKey(), conf.getQuery(), 
                                         conf.getFields(), conf.getSort(), null, null, conf.getLimit(), conf.getSkip(),
                                         conf.isNoTimeout() ) );


        log.info( "Calculated " + splits.size() + " split objects." );
        log.debug( "Dump of calculated splits ... " );
        for ( InputSplit split : splits ) {
            log.debug("\t Split: " + split.toString());
        }

        return splits;
    }

    private static List calculateShardedSplits(MongoConfig conf, boolean useShards, boolean useChunks, boolean slaveOk, MongoURI uri, Mongo mongo) {
        final List splits;
        try {
            if ( useChunks )
                splits = fetchSplitsViaChunks( conf, uri, mongo, useShards, slaveOk );
            else if ( useShards ){
                log.warn( "Fetching Input Splits directly from shards is potentially dangerous for data "
                          + "consistency should migrations occur during the retrieval." );
                splits = fetchSplitsFromShards( conf, uri, mongo, slaveOk );
            }
            else throw new IllegalStateException( "Neither useChunks nor useShards enabled; failed to pick a valid state. " );

            if ( splits == null )
                throw new IllegalStateException( "Failed to create/calculate Input Splits from Shard Chunks; final splits content is 'null'." );

            if ( log.isDebugEnabled() ){
                log.debug( "Calculated splits and returning them - splits: " + splits );
            }

            return splits;
        }
        catch ( Exception e ) {
            log.error( "Could not get splits (use_shards: " + useShards + ", use_chunks: " + useChunks + ")", e );
            throw new IllegalStateException(e);
        }
    }

    /**
     * This gets the URIs to the backend {@code mongod}s and returns splits that connect directly to those backends (one
     * split for each backend). There are two potential problems with this: 
  1. clients that can connect to {@code * mongos} can't necessarily connect to the individual {@code mongod}s.
  2. there concurrency issues (if chunks are * in the process of getting moved around).
*/ private static List fetchSplitsFromShards( final MongoConfig conf, MongoURI uri, Mongo mongo, Boolean slaveOk ){ log.warn( "WARNING getting splits that connect directly to the backend mongods" + " is risky and might not produce correct results" ); DB configDb = mongo.getDB( "config" ); DBCollection shardsColl = configDb.getCollection( "shards" ); Set shardSet = new java.util.HashSet(); DBCursor cur = shardsColl.find(); try { while ( cur.hasNext() ){ final BasicDBObject row = (BasicDBObject) cur.next(); String host = row.getString( "host" ); int slashIndex = host.indexOf( '/' ); if ( slashIndex > 0 ) host = host.substring( slashIndex + 1 ); shardSet.add( host ); } } finally { if ( cur != null ) cur.close(); cur = null; } final List splits = new ArrayList( shardSet.size() ); //todo: using stats only get the shards that actually host data for this collection for ( String host : shardSet ){ MongoURI thisUri = getNewURI( uri, host, slaveOk ); splits.add( new MongoInputSplit( thisUri, conf.getInputKey(), conf.getQuery(), conf.getFields(), conf.getSort(), null, null, conf.getLimit(), conf.getSkip(), conf.isNoTimeout() ) ); // TODO - Should the input Key be the shard key? } return splits; } /** * This constructs splits using the chunk boundaries. 
*/ private static List fetchSplitsViaChunks( final MongoConfig conf, MongoURI uri, Mongo mongo, boolean useShards, Boolean slaveOk ){ DBObject originalQuery = conf.getQuery(); if ( useShards ) log.warn( "WARNING getting splits that connect directly to the backend mongods" + " is risky and might not produce correct results" ); if ( conf.isRangeQueryEnabled() ){ log.warn( "WARNING using range queries can produce incorrect results if values" + " stored under the splitting key have different types."); } if ( log.isDebugEnabled() ){ log.debug( "getSplitsUsingChunks(): originalQuery: " + originalQuery ); } DB configDB = mongo.getDB( "config" ); Map shardMap = null; //key: shardname, value: host if ( useShards ){ shardMap = new HashMap(); DBCollection shardsCollection = configDB.getCollection( "shards" ); DBCursor cur = shardsCollection.find(); try { while ( cur.hasNext() ){ final BasicDBObject row = (BasicDBObject) cur.next(); String host = row.getString( "host" ); // for replica sets host will look like: "setname/localhost:20003,localhost:20004" int slashIndex = host.indexOf( '/' ); if ( slashIndex > 0 ) host = host.substring( slashIndex + 1 ); shardMap.put( (String) row.get( "_id" ), host ); } } finally { if ( cur != null ) cur.close(); } } if ( log.isDebugEnabled() ){ log.debug( "MongoInputFormat.getSplitsUsingChunks(): shard map is: " + shardMap ); } DBCollection chunksCollection = configDB.getCollection( "chunks" ); /* Chunks looks like: { "_id" : "test.lines-_id_ObjectId('4d60b839874a8ad69ad8adf6')", "lastmod" : { "t" : 3000, "i" : 1 }, "ns" : "test.lines", "min" : { "_id" : ObjectId("4d60b839874a8ad69ad8adf6") }, "max" : { "_id" : ObjectId("4d60b83a874a8ad69ad8d1a9") }, "shard" : "shard0000" } { "_id" : "test.lines-_id_ObjectId('4d60b848874a8ad69ada8756')", "lastmod" : { "t" : 3000, "i" : 19 }, "ns" : "test.lines", "min" : { "_id" : ObjectId("4d60b848874a8ad69ada8756") }, "max" : { "_id" : { $maxKey : 1 } }, "shard" : "shard0002" } */ BasicDBObject query = new 
BasicDBObject(); query.put( "ns", uri.getDatabase() + "." + uri.getCollection() ); DBCursor cur = chunksCollection.find( query ); try { int numChunks = 0; final int numExpectedChunks = cur.size(); final List splits = new ArrayList( numExpectedChunks ); while ( cur.hasNext() ){ numChunks++; final BasicDBObject row = (BasicDBObject) cur.next(); DBObject minObj = ( (DBObject) row.get( "min" ) ); DBObject shardKeyQuery = new BasicDBObject(); BasicDBObject min = new BasicDBObject(); BasicDBObject max = new BasicDBObject(); for ( String keyName : minObj.keySet() ){ Object tMin = minObj.get( keyName ); Object tMax = ( (DBObject) row.get( "max" ) ).get( keyName ); /** The shard key can be of any possible type, so this must be kept as Object */ if ( !( tMin == SplitFriendlyDBCallback.MIN_KEY_TYPE || tMin.equals( "MinKey" ) ) ) min.put( keyName, tMin ); if ( !( tMax == SplitFriendlyDBCallback.MAX_KEY_TYPE || tMax.equals( "MaxKey" ) ) ) max.put( keyName, tMax ); } /** We have to put something for $query or we'll fail; if no original query use an empty DBObj */ if ( originalQuery == null ) originalQuery = new BasicDBObject(); DBObject splitQuery = originalQuery; boolean useMinMax = true; if( conf.isRangeQueryEnabled() ){ Map.Entry minKey = min.size() == 1 ? min.entrySet().iterator().next() : null; Map.Entry maxKey = max.size() == 1 ? 
max.entrySet().iterator().next() : null; if(minKey == null && maxKey == null ){ throw new IllegalArgumentException("Range query is enabled but one or more split boundaries contains a compound key:\n" + "minKey: " + min.toString() + "\n" + "maxKey: " + max.toString()); } if( (minKey != null && originalQuery.containsKey(minKey.getKey())) || (maxKey != null && originalQuery.containsKey(maxKey.getKey())) ){ throw new IllegalArgumentException("Range query is enabled but split key conflicts with query filter:\n" + "minKey: " + min.toString() + "\n" + "maxKey: " + max.toString() + "\n" + "query: " + originalQuery.toString()); } BasicDBObject rangeObj = new BasicDBObject(); if( minKey!=null )//&& !SplitFriendlyDBCallback.MIN_KEY_TYPE.equals(minKey.getValue())){ rangeObj.put("$gte", minKey.getValue()); //} if( maxKey!=null )//&& !SplitFriendlyDBCallback.MAX_KEY_TYPE.equals(maxKey.getValue())){ rangeObj.put("$lt", maxKey.getValue()); //} splitQuery = new BasicDBObject(); splitQuery.putAll(originalQuery); splitQuery.put(minKey.getKey(), rangeObj); useMinMax = false; } shardKeyQuery.put( "$query", originalQuery ); if ( log.isDebugEnabled() ){ log.debug( "[" + numChunks + "/" + numExpectedChunks + "] new query is: " + shardKeyQuery ); } MongoURI inputURI = conf.getInputURI(); if ( useShards ){ final String shardname = row.getString( "shard" ); String host = shardMap.get( shardname ); inputURI = getNewURI( inputURI, host, slaveOk ); } if(useMinMax){ MongoInputSplit split = new MongoInputSplit( inputURI, conf.getInputKey(), splitQuery, conf.getFields(), conf.getSort(), // TODO - should inputKey be the shard key? min, max, conf.getLimit(), conf.getSkip(), conf.isNoTimeout()); splits.add(split); }else{ MongoInputSplit split = new MongoInputSplit( inputURI, conf.getInputKey(), splitQuery, conf.getFields(), conf.getSort(), // TODO - should inputKey be the shard key? 
null, null, conf.getLimit(), conf.getSkip(), conf.isNoTimeout()); splits.add(split); } } if ( log.isDebugEnabled() ){ log.debug( "MongoInputFormat.getSplitsUsingChunks(): There were " + numChunks + " chunks, returning " + splits.size() + " splits: " + splits ); } return splits; } finally { if ( cur != null ) cur.close(); } } private static MongoURI getNewURI( MongoURI originalUri, String newServerUri, Boolean slaveok ){ String originalUriString = originalUri.toString(); originalUriString = originalUriString.substring( MongoURI.MONGODB_PREFIX.length() ); // uris look like: mongodb://fred:foobar@server1[,server2]/path?options int serverEnd = -1; int serverStart = 0; int idx = originalUriString.lastIndexOf( "/" ); if ( idx < 0 ){ serverEnd = originalUriString.length(); } else{ serverEnd = idx; } idx = originalUriString.indexOf( "@" ); if ( idx > 0 ){ serverStart = idx + 1; } StringBuilder sb = new StringBuilder( originalUriString ); sb.replace( serverStart, serverEnd, newServerUri ); if ( slaveok != null ){ //If uri already contains options append option to end of uri. //This will override any slaveok option already in the uri if ( originalUriString.contains( "?" ) ) sb.append( "&slaveok=" ).append( slaveok ); else sb.append( "?slaveok=" ).append( slaveok ); } String ans = MongoURI.MONGODB_PREFIX + sb.toString(); log.debug( "getNewURI(): original " + originalUri + " new uri: " + ans ); return new MongoURI( ans ); } private static final Log log = LogFactory.getLog( MongoSplitter.class ); }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy