// MongoConfigUtil.java
/*
 * Copyright 2010 10gen Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.mongodb.hadoop.util;

import com.mongodb.*;
import com.mongodb.util.*;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.fs.PathFilter;

import java.util.*;

/**
 * Configuration helper tool for MongoDB-related Map/Reduce jobs
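 *
 * <p>A minimal usage sketch (the URIs, query, and database/collection names below are illustrative
 * placeholders, not defaults of this class):
 * <pre>{@code
 * Configuration conf = new Configuration();
 * MongoConfigUtil.setInputURI( conf, "mongodb://localhost/demo.in" );   // hypothetical input collection
 * MongoConfigUtil.setOutputURI( conf, "mongodb://localhost/demo.out" ); // hypothetical output collection
 * MongoConfigUtil.setQuery( conf, "{ \"x\": { \"$gt\": 5 } }" );        // any valid JSON query document
 * MongoConfigUtil.setCreateInputSplits( conf, true );
 * }</pre>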
 */

public class MongoConfigUtil {
    private static final Log log = LogFactory.getLog( MongoConfigUtil.class );

    private static final Mongo.Holder _mongos = new Mongo.Holder();

    /**
     * The JOB_* values are entirely optional and are disregarded unless you use the MongoTool base toolset;
     * if you don't, feel free to ignore them.
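     *
     * For example, when using MongoTool (MyMapper and MyReducer are placeholder class names, not part
     * of this library):
     * <pre>{@code
     * MongoConfigUtil.setMapper( conf, MyMapper.class );
     * MongoConfigUtil.setReducer( conf, MyReducer.class );
     * MongoConfigUtil.setMapperOutputKey( conf, org.apache.hadoop.io.Text.class );
     * MongoConfigUtil.setMapperOutputValue( conf, org.apache.hadoop.io.IntWritable.class );
     * }</pre>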
     */
    public static final String JOB_VERBOSE = "mongo.job.verbose";
    public static final String JOB_BACKGROUND = "mongo.job.background";

    public static final String JOB_MAPPER = "mongo.job.mapper";
    public static final String JOB_COMBINER = "mongo.job.combiner";
    public static final String JOB_PARTITIONER = "mongo.job.partitioner";
    public static final String JOB_REDUCER = "mongo.job.reducer";
    public static final String JOB_SORT_COMPARATOR = "mongo.job.sort_comparator";

    public static final String JOB_MAPPER_OUTPUT_KEY = "mongo.job.mapper.output.key";
    public static final String JOB_MAPPER_OUTPUT_VALUE = "mongo.job.mapper.output.value";

    public static final String JOB_INPUT_FORMAT = "mongo.job.input.format";
    public static final String JOB_OUTPUT_FORMAT = "mongo.job.output.format";

    public static final String JOB_OUTPUT_KEY = "mongo.job.output.key";
    public static final String JOB_OUTPUT_VALUE = "mongo.job.output.value";

    public static final String INPUT_URI = "mongo.input.uri";
    public static final String OUTPUT_URI = "mongo.output.uri";


    /**
     * The MongoDB field to read from for the Mapper Input.
     *
     * This will be fed to your mapper as the "Key" for the input.
     *
     * Defaults to {@code _id}
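     *
     * For example (the field name is illustrative only):
     * <pre>{@code
     * MongoConfigUtil.setInputKey( conf, "customerId" );
     * }</pre>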
     */
    public static final String INPUT_KEY = "mongo.input.key";
    public static final String INPUT_NOTIMEOUT = "mongo.input.notimeout";
    public static final String INPUT_QUERY = "mongo.input.query";
    public static final String INPUT_FIELDS = "mongo.input.fields";
    public static final String INPUT_SORT = "mongo.input.sort";
    public static final String INPUT_LIMIT = "mongo.input.limit";
    public static final String INPUT_SKIP = "mongo.input.skip";
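
    /*
     * A hedged illustration of how these input options might be set (the query, projection, and sort
     * documents below are arbitrary examples, not defaults):
     *
     *   MongoConfigUtil.setQuery( conf, "{ \"status\": \"active\" }" );
     *   MongoConfigUtil.setFields( conf, "{ \"name\": 1, \"status\": 1 }" );
     *   MongoConfigUtil.setSort( conf, "{ \"name\": 1 }" );
     *   MongoConfigUtil.setLimit( conf, 1000 );
     *   MongoConfigUtil.setSkip( conf, 0 );
     */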


    //Settings specific to bson reading/writing.
    public static final String BSON_READ_SPLITS = "bson.split.read_splits";
    public static final String BSON_WRITE_SPLITS = "bson.split.write_splits";
    public static final String BSON_OUTPUT_BUILDSPLITS = "bson.output.build_splits";
    public static final String BSON_PATHFILTER = "bson.pathfilter.class";
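
    /*
     * A hedged illustration of the BSON settings (MyBsonPathFilter is a placeholder class name, not
     * part of this library):
     *
     *   MongoConfigUtil.setBSONReadSplits( conf, true );
     *   MongoConfigUtil.setBSONPathFilter( conf, MyBsonPathFilter.class );
     */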
 

    /**
     * An authentication URI, supplying a username and password to use.
     *
     * This is necessary when running jobs with a sharded cluster, as
     * access to the config database is needed to read the chunk information used for splitting.
     *
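     * A sketch of how it might be set (host, credentials, and database are placeholders):
     * <pre>{@code
     * MongoConfigUtil.setAuthURI( conf, "mongodb://user:password@config-host:27017/admin" );
     * }</pre>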
     */
    public static final String AUTH_URI = "mongo.auth.uri";


    /**
     * When *not* using 'read_from_shards' or 'read_shard_chunks', this is the
     * number of megabytes per split to create for the input data.
     *
     * Currently defaults to 8MB, tweak it as necessary for your code.
     *
     * This default will likely change as we research better options.
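     *
     * For example, to request roughly 16 MB splits instead of the 8 MB default:
     * <pre>{@code
     * MongoConfigUtil.setSplitSize( conf, 16 );
     * }</pre>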
     */
    public static final String INPUT_SPLIT_SIZE = "mongo.input.split_size";

    public static final int DEFAULT_SPLIT_SIZE = 8; // 8 mb per manual (non-sharding) split

    /**
     * If CREATE_INPUT_SPLITS is true but SPLITS_USE_CHUNKS is false, Mongo-Hadoop will attempt
     * to create custom input splits for you.  By default it will split on {@code _id}, which is a
     * reasonable/sane default.
     *
     * If you want to customize that split point for efficiency reasons (such as different distribution)
     * you may set this to any valid field name. The restrictions on this key name are the *exact same rules*
     * as when sharding an existing MongoDB Collection.  You must have an index on the field, and follow the other
     * rules outlined in the docs.
     *
     * This must be a JSON document, and not just a field name!
     *
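     * For example (the field name is illustrative; it must be indexed, per the sharding rules above):
     * <pre>{@code
     * MongoConfigUtil.setInputSplitKeyPattern( conf, "{ \"customerId\": 1 }" );
     * }</pre>
     *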
     * @see <a href="http://www.mongodb.org/display/DOCS/Sharding+Introduction#ShardingIntroduction-ShardKeys">Sharding Introduction: Shard Keys</a>
     */
    public static final String INPUT_SPLIT_KEY_PATTERN = "mongo.input.split.split_key_pattern";

    /**
     * If {@code true}, the driver will attempt to split the MongoDB Input data (if reading from Mongo) into
     * multiple InputSplits to allow parallelism/concurrency in processing within Hadoop.  That is to say,
     * Hadoop will assign one InputSplit per mapper.
     * 
     * This is {@code true} by default now, but if {@code false}, only one InputSplit (your whole collection) will be
     * assigned to Hadoop – severely reducing parallel mapping.
     */
    public static final String CREATE_INPUT_SPLITS = "mongo.input.split.create_input_splits";

    /**
     * If {@code true} in a sharded setup splits will be made to connect to individual backend {@code mongod}s.  This
     * can be unsafe. If {@code mongos} is moving chunks around you might see duplicate data, or miss some data
     * entirely. Defaults to {@code false}
     */
    public static final String SPLITS_USE_SHARDS = "mongo.input.split.read_from_shards";
    /**
     * If {@code true} have one split = one shard chunk.  If {@link #SPLITS_USE_SHARDS} is not true splits will still
     * use chunks, but will connect through {@code mongos} instead of the individual backend {@code mongod}s (the safe
     * thing to do). If {@link #SPLITS_USE_SHARDS} is {@code true} but this is {@code false} one split will be made for
     * each backend shard. THIS IS UNSAFE and may result in data being run multiple times.
     *
     * Defaults to {@code true}
     */
    public static final String SPLITS_USE_CHUNKS = "mongo.input.split.read_shard_chunks";

    /**
     * If {@code true} and the shards are replica sets, queries for splits are allowed to run against
     * the secondaries (slaves). If set, this will override any option passed on the URI.
     *
     * Defaults to {@code false}
     */
    public static final String SPLITS_SLAVE_OK = "mongo.input.split.allow_read_from_secondaries";

    /**
     * If true then queries for splits will be constructed using $lt/$gt instead of $min and $max.
     *
     * Defaults to {@code false}
     */
    public static final String SPLITS_USE_RANGEQUERY = "mongo.input.split.use_range_queries";

    public static boolean isJobVerbose( Configuration conf ){
        return conf.getBoolean( JOB_VERBOSE, false );
    }

    public static void setJobVerbose( Configuration conf, boolean val ){
        conf.setBoolean( JOB_VERBOSE, val );
    }

    public static boolean isJobBackground( Configuration conf ){
        return conf.getBoolean( JOB_BACKGROUND, false );
    }

    public static void setJobBackground( Configuration conf, boolean val ){
        conf.setBoolean( JOB_BACKGROUND, val );
    }

    // TODO - In light of key/value specifics should we have a base MongoMapper class?
    public static Class getMapper( Configuration conf ){
        /** TODO - Support multiple inputs via getClasses ? **/
        return conf.getClass( JOB_MAPPER, null, Mapper.class );
    }

    public static void setMapper( Configuration conf, Class val ){
        conf.setClass( JOB_MAPPER, val, Mapper.class );
    }

    public static Class getMapperOutputKey( Configuration conf ){
        return conf.getClass( JOB_MAPPER_OUTPUT_KEY, null );
    }

    public static void setMapperOutputKey( Configuration conf, Class val ){
        conf.setClass( JOB_MAPPER_OUTPUT_KEY, val, Object.class );
    }

    public static Class getMapperOutputValue( Configuration conf ){
        return conf.getClass( JOB_MAPPER_OUTPUT_VALUE, null );
    }

    public static void setMapperOutputValue( Configuration conf, Class val ){
        conf.setClass( JOB_MAPPER_OUTPUT_VALUE, val, Object.class );
    }

    public static Class getCombiner( Configuration conf ){
        return conf.getClass( JOB_COMBINER, null, Reducer.class );
    }

    public static void setCombiner( Configuration conf, Class val ){
        conf.setClass( JOB_COMBINER, val, Reducer.class );
    }

    // TODO - In light of key/value specifics should we have a base MongoReducer class?
    public static Class getReducer( Configuration conf ){
        /** TODO - Support multiple outputs via getClasses ? **/
        return conf.getClass( JOB_REDUCER, null, Reducer.class );
    }

    public static void setReducer( Configuration conf, Class val ){
        conf.setClass( JOB_REDUCER, val, Reducer.class );
    }

    public static Class getPartitioner( Configuration conf ){
        return conf.getClass( JOB_PARTITIONER, null, Partitioner.class );
    }

    public static void setPartitioner( Configuration conf, Class val ){
        conf.setClass( JOB_PARTITIONER, val, Partitioner.class );
    }

    public static Class getSortComparator( Configuration conf ){
        return conf.getClass( JOB_SORT_COMPARATOR, null, RawComparator.class );
    }

    public static void setSortComparator( Configuration conf, Class val ){
        conf.setClass( JOB_SORT_COMPARATOR, val, RawComparator.class );
    }

    public static Class getOutputFormat( Configuration conf ){
        return conf.getClass( JOB_OUTPUT_FORMAT, null, OutputFormat.class );
    }

    public static void setOutputFormat( Configuration conf, Class val ){
        conf.setClass( JOB_OUTPUT_FORMAT, val, OutputFormat.class );
    }

    public static Class getOutputKey( Configuration conf ){
        return conf.getClass( JOB_OUTPUT_KEY, null );
    }

    public static void setOutputKey( Configuration conf, Class val ){
        conf.setClass( JOB_OUTPUT_KEY, val, Object.class );
    }

    public static Class getOutputValue( Configuration conf ){
        return conf.getClass( JOB_OUTPUT_VALUE, null );
    }

    public static void setOutputValue( Configuration conf, Class val ){
        conf.setClass( JOB_OUTPUT_VALUE, val, Object.class );
    }

    public static Class getInputFormat( Configuration conf ){
        return conf.getClass( JOB_INPUT_FORMAT, null, InputFormat.class );
    }

    public static void setInputFormat( Configuration conf, Class val ){
        conf.setClass( JOB_INPUT_FORMAT, val, InputFormat.class );
    }

    public static List getMongoURIs( Configuration conf, String key ){
        final String raw = conf.get( key );
        if ( raw != null && !raw.trim().isEmpty() ){
            List result = new LinkedList();
            String[] split = StringUtils.split( raw );
            for ( String mongoURI : split ){
                result.add( new MongoURI( mongoURI ) );
            }
            return result;
        }
        else
            return Collections.emptyList();
    }

    public static MongoURI getMongoURI( Configuration conf, String key ){
        final String raw = conf.get( key );
        if ( raw != null && !raw.trim().isEmpty() )
            return new MongoURI( raw );
        else
            return null;
    }

    public static MongoURI getInputURI( Configuration conf ){
        return getMongoURI( conf, INPUT_URI );
    }

    public static MongoURI getAuthURI( Configuration conf ){
        return getMongoURI( conf, AUTH_URI );
    }

    public static List getCollections( List uris ){
        List dbCollections = new LinkedList();
        for ( MongoURI uri : uris ){
            dbCollections.add( getCollection( uri ) );
        }
        return dbCollections;
    }

    public static DBCollection getCollection( MongoURI uri ){
        try {
            Mongo mongo = _mongos.connect( uri );
            DB myDb = mongo.getDB( uri.getDatabase() );
            // if there's a username and password, authenticate before connecting to the collection
            if ( uri.getUsername() != null && uri.getPassword() != null && !myDb.isAuthenticated() ){
                boolean auth = myDb.authenticate( uri.getUsername(), uri.getPassword() );
                if ( auth ){
                    log.info( "Successfully authenticated with collection." );
                }
                else {
                    throw new IllegalArgumentException( "Unable to connect to collection." );
                }
            }
            return uri.connectCollection( mongo );
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException( "Unable to connect to collection."
                                                + e.getMessage(), e );
        }
    }

    public static DBCollection getOutputCollection( Configuration conf ){
        try {
            final MongoURI _uri = getOutputURI( conf );
            return getCollection( _uri );
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException( "Unable to connect to MongoDB Output Collection.", e );
        }
    }

    public static List getOutputCollections( Configuration conf ){
        try {
            final List _uris = getOutputURIs( conf );
            return getCollections( _uris );
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException( "Unable to connect to MongoDB Output Collection.", e );
        }
    }

    public static DBCollection getInputCollection( Configuration conf ){
        try {
            final MongoURI _uri = getInputURI( conf );
            return getCollection( _uri );
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException(
                    "Unable to connect to MongoDB Input Collection at '" + getInputURI( conf ) + "'", e );
        }
    }

    public static void setMongoURI( Configuration conf, String key, MongoURI value ){
        conf.set( key, value.toString() ); // todo - verify you can toString a URI object
    }

    public static void setMongoURIString( Configuration conf, String key, String value ){
        try {
            final MongoURI uri = new MongoURI( value );
            setMongoURI( conf, key, uri );
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException( "Invalid Mongo URI '" + value + "' for Input URI", e );
        }
    }

    public static void setAuthURI( Configuration conf, String uri ){
        setMongoURIString( conf, AUTH_URI, uri );
    }

    public static void setInputURI( Configuration conf, String uri ){
        setMongoURIString( conf, INPUT_URI, uri );
    }

    public static void setInputURI( Configuration conf, MongoURI uri ){
        setMongoURI( conf, INPUT_URI, uri );
    }

    public static List getOutputURIs( Configuration conf ){
        return getMongoURIs( conf, OUTPUT_URI );
    }

    public static MongoURI getOutputURI( Configuration conf ){
        return getMongoURI( conf, OUTPUT_URI );
    }

    public static void setOutputURI( Configuration conf, String uri ){
        setMongoURIString( conf, OUTPUT_URI, uri );
    }

    public static void setOutputURI( Configuration conf, MongoURI uri ){
        setMongoURI( conf, OUTPUT_URI, uri );
    }

    /**
     * Set JSON but first validate it's parseable into a DBObject
     */
    public static void setJSON( Configuration conf, String key, String value ){
        try {
            final Object dbObj = JSON.parse( value );
            setDBObject( conf, key, (DBObject) dbObj );
        }
        catch ( final Exception e ) {
            log.error( "Cannot parse JSON...", e );
            throw new IllegalArgumentException( "Provided JSON String is not representable/parseable as a DBObject.", e );
        }
    }

    public static DBObject getDBObject( Configuration conf, String key ){
        try {
            final String json = conf.get( key );
            final DBObject obj = (DBObject) JSON.parse( json );
            if ( obj == null )
                return new BasicDBObject();
            else
                return obj;
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException( "Provided JSON String is not representable/parseable as a DBObject.", e );
        }
    }

    public static void setDBObject( Configuration conf, String key, DBObject value ){
        conf.set( key, JSON.serialize( value ) );
    }

    public static void setQuery( Configuration conf, String query ){
        setJSON( conf, INPUT_QUERY, query );
    }

    public static void setQuery( Configuration conf, DBObject query ){
        setDBObject( conf, INPUT_QUERY, query );
    }

    /**
     * Returns the configured query as a DBObject... If you want a string call toString() on the returned object,
     * or use JSON.serialize().
     */
    public static DBObject getQuery( Configuration conf ){
        return getDBObject( conf, INPUT_QUERY );
    }

    public static void setFields( Configuration conf, String fields ){
        setJSON( conf, INPUT_FIELDS, fields );
    }

    public static void setFields( Configuration conf, DBObject fields ){
        setDBObject( conf, INPUT_FIELDS, fields );
    }

    /**
     * Returns the configured fields as a DBObject... If you want a string call toString() on the returned object,
     * or use JSON.serialize().
     */
    public static DBObject getFields( Configuration conf ){
        return getDBObject( conf, INPUT_FIELDS );
    }

    public static void setSort( Configuration conf, String sort ){
        setJSON( conf, INPUT_SORT, sort );
    }

    public static void setSort( Configuration conf, DBObject sort ){
        setDBObject( conf, INPUT_SORT, sort );
    }

    /**
     * Returns the configured sort as a DBObject... If you want a string call toString() on the returned object,
     * or use JSON.serialize().
     */
    public static DBObject getSort( Configuration conf ){
        return getDBObject( conf, INPUT_SORT );
    }

    public static int getLimit( Configuration conf ){
        return conf.getInt( INPUT_LIMIT, 0 );
    }

    public static void setLimit( Configuration conf, int limit ){
        conf.setInt( INPUT_LIMIT, limit );
    }

    public static int getSkip( Configuration conf ){
        return conf.getInt( INPUT_SKIP, 0 );
    }

    public static void setSkip( Configuration conf, int skip ){
        conf.setInt( INPUT_SKIP, skip );
    }

    public static int getSplitSize( Configuration conf ){
        return conf.getInt( INPUT_SPLIT_SIZE, DEFAULT_SPLIT_SIZE );
    }

    public static void setSplitSize( Configuration conf, int value ){
        conf.setInt( INPUT_SPLIT_SIZE, value );
    }

    /**
     * If TRUE, splits will be queried using $lt/$gt instead of $max and $min.
     * This allows the database's query optimizer to choose the best index,
     * instead of being forced to use the one in the $max/$min keys.
     * This will only work if the key used for splitting is *not* a compound key.
     * Make sure that all values under the splitting key are of the same type, or
     * this will cause incomplete results.
     */
    public static boolean isRangeQueryEnabled( Configuration conf ){
        return conf.getBoolean( SPLITS_USE_RANGEQUERY, false );
    }

    public static void setRangeQueryEnabled( Configuration conf, boolean value ){
        conf.setBoolean( SPLITS_USE_RANGEQUERY, value );
    }

    /**
     * If TRUE, splits will be read by connecting to the individual shard servers.
     * Only use this if you understand the risks
     * (the issue has to do with chunks moving / relocating during balancing phases).
     */
    public static boolean canReadSplitsFromShards( Configuration conf ){
        return conf.getBoolean( SPLITS_USE_SHARDS, false );
    }

    public static void setReadSplitsFromShards( Configuration conf, boolean value ){
        conf.setBoolean( SPLITS_USE_SHARDS, value );
    }

    /**
     * If sharding is enabled, use the sharding configured chunks to split up data.
     */
    public static boolean isShardChunkedSplittingEnabled( Configuration conf ){
        return conf.getBoolean( SPLITS_USE_CHUNKS, true );
    }

    public static void setShardChunkSplittingEnabled( Configuration conf, boolean value ){
        conf.setBoolean( SPLITS_USE_CHUNKS, value );
    }

    public static boolean canReadSplitsFromSecondary( Configuration conf ){
        return conf.getBoolean( SPLITS_SLAVE_OK, false );
    }

    public static void setReadSplitsFromSecondary( Configuration conf, boolean value ){
        conf.setBoolean( SPLITS_SLAVE_OK, value );
    }

    public static boolean createInputSplits( Configuration conf ){
        return conf.getBoolean( CREATE_INPUT_SPLITS, true );
    }

    public static void setCreateInputSplits( Configuration conf, boolean value ){
        conf.setBoolean( CREATE_INPUT_SPLITS, value );
    }

    public static void setInputSplitKeyPattern( Configuration conf, String pattern ){
        setJSON( conf, INPUT_SPLIT_KEY_PATTERN, pattern );
    }

    public static void setInputSplitKey( Configuration conf, DBObject key ){
        setDBObject( conf, INPUT_SPLIT_KEY_PATTERN, key );
    }

    public static String getInputSplitKeyPattern( Configuration conf ){
        return conf.get( INPUT_SPLIT_KEY_PATTERN, "{ \"_id\": 1 }" );
    }

    public static DBObject getInputSplitKey( Configuration conf ){
        try {
            final String json = getInputSplitKeyPattern( conf );
            final DBObject obj = (DBObject) JSON.parse( json );
            if ( obj == null )
                return new BasicDBObject( "_id", 1 );
            else
                return obj;
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException( "Provided JSON String is not representable/parseable as a DBObject.", e );
        }
    }

    public static void setInputKey( Configuration conf, String fieldName ){
        // TODO (bwm) - validate key rules?
        conf.set( INPUT_KEY, fieldName );
    }

    public static String getInputKey( Configuration conf ){
        return conf.get( INPUT_KEY, "_id" );
    }

    public static void setNoTimeout( Configuration conf, boolean value ){
        conf.setBoolean( INPUT_NOTIMEOUT, value );
    }

    public static boolean isNoTimeout( Configuration conf ){
        return conf.getBoolean( INPUT_NOTIMEOUT, false );
    }

    // BSON-specific config functions.

    public static boolean getBSONReadSplits( Configuration conf ){
        return conf.getBoolean( BSON_READ_SPLITS, true );
    }

    public static void setBSONReadSplits( Configuration conf, boolean val ){
        conf.setBoolean( BSON_READ_SPLITS, val );
    }

    public static boolean getBSONWriteSplits( Configuration conf ){
        return conf.getBoolean( BSON_WRITE_SPLITS, true );
    }

    public static void setBSONWriteSplits( Configuration conf, boolean val ){
        conf.setBoolean( BSON_WRITE_SPLITS, val );
    }

    public static boolean getBSONOutputBuildSplits( Configuration conf ){
        return conf.getBoolean( BSON_OUTPUT_BUILDSPLITS, false );
    }

    public static void setBSONOutputBuildSplits( Configuration conf, boolean val ){
        conf.setBoolean( BSON_OUTPUT_BUILDSPLITS, val );
    }

    public static void setBSONPathFilter( Configuration conf, Class val ){
        conf.setClass( BSON_PATHFILTER, val, PathFilter.class );
    }

    public static Class getBSONPathFilter( Configuration conf ){
        return conf.getClass( BSON_PATHFILTER, null );
    }
}




