All downloads are free. Search and download functionality uses the official Maven repository.

com.mongodb.hadoop.splitter.MultiMongoCollectionSplitter Maven / Gradle / Ivy

Go to download

The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.

There is a newer version: 2.0.2
Show newest version
/*
 * Copyright 2010-2013 10gen Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.mongodb.hadoop.splitter;

import com.mongodb.DBObject;
import com.mongodb.MongoClientURI;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.util.ReflectionUtils;

import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import static java.lang.String.format;

/**
 * Splitter that handles several input collections at once: it builds one
 * delegate {@link MongoSplitter} per input URI, asks each delegate for its
 * splits, and returns all of them concatenated into a single list.
 *
 * <p>The input URIs come from one of two places:
 * <ul>
 *   <li>the list of URIs stored under {@link MongoConfigUtil#INPUT_URI}, in
 *       which case the settings of this splitter's configuration are copied
 *       for every URI; or</li>
 *   <li>if that list is empty, a JSON array stored under
 *       {@link #MULTI_COLLECTION_CONF_KEY}, where every element is an object
 *       holding the options for one URI.</li>
 * </ul>
 */
public class MultiMongoCollectionSplitter extends MongoSplitter {

    /** Configuration key holding the JSON array of per-URI option objects. */
    public static final String MULTI_COLLECTION_CONF_KEY = "mongo.input.multi_uri.json";
    private static final Log LOG = LogFactory.getLog(MultiMongoCollectionSplitter.class);

    /** Creates a splitter without a configuration. */
    public MultiMongoCollectionSplitter() {
    }

    /**
     * Creates a splitter that uses the given configuration.
     *
     * @param conf the configuration passed on to the superclass
     */
    public MultiMongoCollectionSplitter(final Configuration conf) {
        super(conf);
    }

    /**
     * Builds a delegate splitter for every configured input URI and collects
     * the splits of all delegates into one list. All delegates are created
     * before any of them is asked to calculate splits.
     *
     * @return the splits of every delegate, in the order the URIs were configured
     * @throws SplitFailedException if a delegate splitter fails
     * @throws IllegalArgumentException if the JSON stored under
     *         {@link #MULTI_COLLECTION_CONF_KEY} is not an array of objects, or
     *         if one entry names this class as its splitter
     */
    @Override
    public List<InputSplit> calculateSplits() throws SplitFailedException {
        final List<MongoClientURI> inputURIs =
            MongoConfigUtil.getMongoURIs(getConfiguration(), MongoConfigUtil.INPUT_URI);

        final List<MongoSplitter> splitters;
        if (!inputURIs.isEmpty()) {
            splitters = createSplittersFromGlobalSettings(inputURIs);
        } else {
            //Otherwise the user has set options per-collection.
            splitters = createSplittersFromJsonSettings();
        }

        //Now we have splitters for all the input collections.
        //Run through all of them and compile their splits into one list.
        final List<InputSplit> returnVal = new LinkedList<InputSplit>();
        for (MongoSplitter splitter : splitters) {
            returnVal.addAll(splitter.calculateSplits());
        }
        return returnVal;
    }

    /**
     * Creates one splitter per URI, each with a copy of this splitter's
     * configuration in which only the input URI and splitter class differ.
     */
    private List<MongoSplitter> createSplittersFromGlobalSettings(final List<MongoClientURI> inputURIs)
        throws SplitFailedException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Using global split settings for multiple URIs specified.");
        }
        final List<MongoSplitter> splitters = new LinkedList<MongoSplitter>();
        for (MongoClientURI uri : inputURIs) {
            final Configuration confForThisUri = new Configuration(getConfiguration());
            MongoConfigUtil.setInputURI(confForThisUri, uri);
            //Splitter class is ignored here, since it would just be referring
            //to MultiMongoCollectionSplitter anyway. If the user needs to
            //customize the splitter class, they should use the JSON key for
            //the configuration instead.
            confForThisUri.set(MongoConfigUtil.MONGO_SPLITTER_CLASS, "");
            splitters.add(MongoSplitterFactory.getSplitterByStats(uri, confForThisUri));
        }
        return splitters;
    }

    /**
     * Creates one splitter per element of the JSON array stored under
     * {@link #MULTI_COLLECTION_CONF_KEY}.
     */
    private List<MongoSplitter> createSplittersFromJsonSettings() throws SplitFailedException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Loading multiple input URIs from JSON stored in " + MULTI_COLLECTION_CONF_KEY);
        }
        final DBObject multiUriConfig = MongoConfigUtil.getDBObject(getConfiguration(), MULTI_COLLECTION_CONF_KEY);

        //instanceof is also false for null, so a missing value is rejected here too.
        if (!(multiUriConfig instanceof List)) {
            throw new IllegalArgumentException("Invalid JSON format in multi uri config key: Must be an array where each element "
                                               + "is an object describing the URI and config options for each split.");
        }
        final List<MongoSplitter> splitters = new LinkedList<MongoSplitter>();
        for (Object entry : (List<?>) multiUriConfig) {
            splitters.add(createSplitterForEntry(entry));
        }
        return splitters;
    }

    /**
     * Creates the splitter described by a single element of the JSON array:
     * an instance of the entry's custom splitter class if one is configured,
     * otherwise whatever {@code MongoSplitterFactory.getSplitterByStats} selects.
     */
    private MongoSplitter createSplitterForEntry(final Object entry) throws SplitFailedException {
        final String invalidEntryMessage = "Invalid JSON format in multi uri config key: each config item must be an "
                                           + "object with keys/values describing options for each URI.";
        //Rejects null entries as well, which would otherwise fail later with
        //a NullPointerException instead of a descriptive error.
        if (!(entry instanceof Map)) {
            throw new IllegalArgumentException(invalidEntryMessage);
        }
        //Key/value types cannot be checked at runtime.
        //NOTE(review): assumes buildConfiguration accepts Map<String, Object> - confirm.
        @SuppressWarnings("unchecked")
        final Map<String, Object> configMap = (Map<String, Object>) entry;
        if (LOG.isDebugEnabled()) {
            LOG.debug("building config from "
                + configMap.toString());
        }

        final Configuration confForThisUri;
        final MongoClientURI inputURI;
        try {
            confForThisUri = MongoConfigUtil.buildConfiguration(configMap);
            inputURI = MongoConfigUtil.getInputURI(confForThisUri);
        } catch (ClassCastException e) {
            //Keep the cause so the offending key/value can be located.
            throw new IllegalArgumentException(invalidEntryMessage, e);
        }

        final Class<?> splitterClass = MongoConfigUtil.getSplitterClass(confForThisUri);
        if (splitterClass == null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(format("Fetching collection stats on namespace: %s.%s; hosts: %s to choose splitter implementation.",
                      inputURI.getDatabase(), inputURI.getCollection(), inputURI.getHosts()));
            }
            //No class was specified, so choose one by looking at
            //collection stats.
            return MongoSplitterFactory.getSplitterByStats(inputURI, confForThisUri);
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug(format("Using custom Splitter class for namespace: %s.%s; hosts: %s",
                inputURI.getDatabase(), inputURI.getCollection(), inputURI.getHosts()));
        }
        //User wants to use a specific custom class for this URI.
        //Make sure that the custom class isn't this one.
        if (splitterClass == MultiMongoCollectionSplitter.class) {
            throw new IllegalArgumentException("Can't nest uses of MultiMongoCollectionSplitter");
        }
        //All clear.
        final MongoCollectionSplitter collectionSplitter =
            (MongoCollectionSplitter) ReflectionUtils.newInstance(splitterClass, confForThisUri);
        //Set the per-URI configuration explicitly on the new instance.
        //NOTE(review): only the configuration is set here, not the input URI;
        //presumably the splitter reads the URI from its configuration - confirm.
        collectionSplitter.setConfiguration(confForThisUri);
        return collectionSplitter;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy