
com.mongodb.hadoop.util.MongoConfigUtil

The MongoDB Connector for Hadoop is a Hadoop plugin that lets jobs use MongoDB as an input source and/or an output destination.

/*
 * Copyright 2010-2013 10gen Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package com.mongodb.hadoop.util;

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.MongoURI;
import com.mongodb.hadoop.splitter.MongoSplitter;
import com.mongodb.hadoop.splitter.SampleSplitter;
import com.mongodb.util.JSON;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;

/**
 * Configuration helper tool for MongoDB-related Map/Reduce jobs.
 */
public final class MongoConfigUtil {
    private static final Log LOG = LogFactory.getLog(MongoConfigUtil.class);

    /**
     * The JOB_* values are entirely optional and are ignored unless you use the MongoTool base toolset; if you
     * don't, feel free to ignore them.
     */
    public static final String JOB_VERBOSE = "mongo.job.verbose";
    public static final String JOB_BACKGROUND = "mongo.job.background";

    public static final String JOB_MAPPER = "mongo.job.mapper";
    public static final String JOB_COMBINER = "mongo.job.combiner";
    public static final String JOB_PARTITIONER = "mongo.job.partitioner";
    public static final String JOB_REDUCER = "mongo.job.reducer";
    public static final String JOB_SORT_COMPARATOR = "mongo.job.sort_comparator";

    public static final String JOB_MAPPER_OUTPUT_KEY = "mongo.job.mapper.output.key";
    public static final String JOB_MAPPER_OUTPUT_VALUE = "mongo.job.mapper.output.value";

    public static final String JOB_INPUT_FORMAT = "mongo.job.input.format";
    public static final String JOB_OUTPUT_FORMAT = "mongo.job.output.format";

    public static final String JOB_OUTPUT_KEY = "mongo.job.output.key";
    public static final String JOB_OUTPUT_VALUE = "mongo.job.output.value";

    public static final String INPUT_URI = "mongo.input.uri";
    public static final String INPUT_MONGOS_HOSTS = "mongo.input.mongos_hosts";
    public static final String OUTPUT_URI = "mongo.output.uri";
    public static final String OUTPUT_BATCH_SIZE = "mongo.output.batch.size";
    public static final String OUTPUT_BULK_ORDERED = "mongo.output.bulk.ordered";

    public static final String MONGO_SPLITTER_CLASS = "mongo.splitter.class";
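    /* Editor's note - illustrative usage sketch, not part of the original source. The URIs are
     * hypothetical placeholders; any MongoSplitter implementation could be named here.
     *
     *     Configuration conf = new Configuration();
     *     MongoConfigUtil.setInputURI(conf, "mongodb://localhost:27017/demo.in");   // INPUT_URI
     *     MongoConfigUtil.setOutputURI(conf, "mongodb://localhost:27017/demo.out"); // OUTPUT_URI
     *     MongoConfigUtil.setSplitterClass(conf,
     *         com.mongodb.hadoop.splitter.MongoPaginatingSplitter.class);           // MONGO_SPLITTER_CLASS
     */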

    /**
     * The MongoDB field to read from for the Mapper Input.
     * <p>
     * This will be fed to your mapper as the "Key" for the input.
     * <p>
     * Defaults to {@code _id}.
     */
    public static final String INPUT_KEY = "mongo.input.key";
    public static final String INPUT_NOTIMEOUT = "mongo.input.notimeout";
    public static final String INPUT_QUERY = "mongo.input.query";
    public static final String INPUT_FIELDS = "mongo.input.fields";
    public static final String INPUT_SORT = "mongo.input.sort";
    public static final String INPUT_LIMIT = "mongo.input.limit";
    public static final String INPUT_SKIP = "mongo.input.skip";
    public static final String INPUT_LAZY_BSON = "mongo.input.lazy_bson";

    // Settings specific to BSON reading/writing.
    public static final String BSON_SPLITS_PATH = "bson.split.splits_path";
    public static final String BSON_READ_SPLITS = "bson.split.read_splits";
    public static final String BSON_WRITE_SPLITS = "bson.split.write_splits";
    public static final String BSON_OUTPUT_BUILDSPLITS = "bson.output.build_splits";
    public static final String BSON_PATHFILTER = "bson.pathfilter.class";

    // Settings specific to reading from GridFS.
    public static final String GRIDFS_DELIMITER_PATTERN = "mongo.gridfs.delimiter.pattern";
    public static final String GRIDFS_DEFAULT_DELIMITER = "(\n|\r\n)";
    public static final String GRIDFS_WHOLE_FILE_SPLIT = "mongo.gridfs.whole_file";
    public static final String GRIDFS_READ_BINARY = "mongo.gridfs.read_binary";

    /**
     * A username and password to use.
     * <p>
     * This is necessary when running jobs with a sharded cluster, as access to the config database is needed to get
     * split information.
     */
    public static final String AUTH_URI = "mongo.auth.uri";

    /**
     * When *not* using 'read_from_shards' or 'read_shard_chunks', the number of megabytes per split to create for
     * the input data.
     * <p>
     * Currently defaults to 8MB; tweak it as necessary for your code.
     * <p>
     * This default will likely change as we research better options.
     */
    public static final String INPUT_SPLIT_SIZE = "mongo.input.split_size";
    public static final int DEFAULT_SPLIT_SIZE = 8; // 8 MB per manual (non-sharding) split

    /**
     * When {@code true}, MongoSplitter implementations will check for and remove empty splits before returning them
     * from {@code calculateSplits}. This requires pulling a small amount of data from MongoDB but avoids starting
     * tasks that don't have any data to process.
     * <p>
     * This option is useful when providing a query to {@link #INPUT_QUERY}, and that query selects a subset of the
     * documents in the collection that are clustered in the same range of the index described by
     * {@link #INPUT_SPLIT_KEY_PATTERN}. If the query selects documents throughout the index, consider using
     * {@link com.mongodb.hadoop.splitter.MongoPaginatingSplitter} and setting {@link #INPUT_SPLIT_MIN_DOCS} instead.
     */
    public static final String ENABLE_FILTER_EMPTY_SPLITS = "mongo.input.splits.filter_empty";

    /**
     * When {@link #SPLITS_USE_RANGEQUERY} is enabled, this option sets the minimum number of documents to be
     * contained in each MongoInputSplit (does not apply to BSON). This option only applies when using
     * {@link com.mongodb.hadoop.splitter.MongoPaginatingSplitter} as the splitter implementation.
     * <p>
     * This value defaults to {@link #DEFAULT_INPUT_SPLIT_MIN_DOCS}.
     */
    public static final String INPUT_SPLIT_MIN_DOCS = "mongo.input.splits.min_docs";
    public static final int DEFAULT_INPUT_SPLIT_MIN_DOCS = 1000;
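    /* Editor's note - illustrative sketch (not part of the original source) of tuning the split
     * settings above; the numbers are arbitrary examples.
     *
     *     MongoConfigUtil.setSplitSize(conf, 16);            // 16 MB per split instead of 8
     *     MongoConfigUtil.setEnableFilterEmptySplits(conf, true);
     *     MongoConfigUtil.setInputSplitMinDocs(conf, 5000);  // with MongoPaginatingSplitter
     */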

    /**
     * If CREATE_INPUT_SPLITS is true but SPLITS_USE_CHUNKS is false, Mongo-Hadoop will attempt to create custom
     * input splits for you. By default it will split on {@code _id}, which is a reasonable/sane default.
     * <p>
     * If you want to customize that split point for efficiency reasons (such as a different distribution), you may
     * set this to any valid field name. The restrictions on this key name are the *exact same rules* as when
     * sharding an existing MongoDB collection: you must have an index on the field, and follow the other rules
     * outlined in the docs.
     * <p>
     * To customize the range of the index that is used to create splits, see the {@link #INPUT_SPLIT_KEY_MIN} and
     * {@link #INPUT_SPLIT_KEY_MAX} settings.
     * <p>
     * This must be a JSON document, and not just a field name!
     *
     * @see Shard Keys
     */
    public static final String INPUT_SPLIT_KEY_PATTERN = "mongo.input.split.split_key_pattern";

    /**
     * Lower-bound for splits created using the index described by {@link #INPUT_SPLIT_KEY_PATTERN}. This value must
     * be set to a JSON string that describes a point in the index. This setting must be used in conjunction with
     * {@code INPUT_SPLIT_KEY_PATTERN} and {@link #INPUT_SPLIT_KEY_MAX}.
     */
    public static final String INPUT_SPLIT_KEY_MIN = "mongo.input.split.split_key_min";

    /**
     * Upper-bound for splits created using the index described by {@link #INPUT_SPLIT_KEY_PATTERN}. This value must
     * be set to a JSON string that describes a point in the index. This setting must be used in conjunction with
     * {@code INPUT_SPLIT_KEY_PATTERN} and {@link #INPUT_SPLIT_KEY_MIN}.
     */
    public static final String INPUT_SPLIT_KEY_MAX = "mongo.input.split.split_key_max";
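    /* Editor's note - illustrative sketch, not part of the original source. The field name
     * "customer_id" is a hypothetical example of a non-default, indexed split key with bounds.
     *
     *     MongoConfigUtil.setInputSplitKeyPattern(conf, "{ \"customer_id\": 1 }");
     *     MongoConfigUtil.setMinSplitKey(conf, "{ \"customer_id\": 0 }");
     *     MongoConfigUtil.setMaxSplitKey(conf, "{ \"customer_id\": 100000 }");
     */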

    /**
     * If {@code true}, the driver will attempt to split the MongoDB input data (if reading from Mongo) into
     * multiple InputSplits to allow parallelism/concurrency in processing within Hadoop. That is to say, Hadoop will
     * assign one InputSplit per mapper.
     * <p>
     * This is {@code true} by default now, but if {@code false}, only one InputSplit (your whole collection) will be
     * assigned to Hadoop, severely reducing parallel mapping.
     */
    public static final String CREATE_INPUT_SPLITS = "mongo.input.split.create_input_splits";

    /**
     * If {@code true}, in a sharded setup splits will be made to connect to individual backend {@code mongod}s.
     * This can be unsafe: if {@code mongos} is moving chunks around, you might see duplicate data or miss some data
     * entirely. Defaults to {@code false}.
     */
    public static final String SPLITS_USE_SHARDS = "mongo.input.split.read_from_shards";

    /**
     * If {@code true}, one split is created per shard chunk. If {@link #SPLITS_USE_SHARDS} is not true, splits will
     * still use chunks, but will connect through {@code mongos} instead of the individual backend {@code mongod}s
     * (the safe thing to do). If {@link #SPLITS_USE_SHARDS} is {@code true} but this is {@code false}, one split
     * will be made for each backend shard. THIS IS UNSAFE and may result in data being run multiple times.
     * <p>
     * Defaults to {@code true}.
     */
    public static final String SPLITS_USE_CHUNKS = "mongo.input.split.read_shard_chunks";
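    /* Editor's note - illustrative sketch, not part of the original source, spelling out the safe
     * defaults for reading from a sharded cluster.
     *
     *     MongoConfigUtil.setCreateInputSplits(conf, true);          // CREATE_INPUT_SPLITS
     *     MongoConfigUtil.setShardChunkSplittingEnabled(conf, true); // SPLITS_USE_CHUNKS
     *     MongoConfigUtil.setReadSplitsFromShards(conf, false);      // SPLITS_USE_SHARDS (unsafe if true)
     */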

    /**
     * If {@code true} and the shards are replica sets, queries for splits will be run against the slaves
     * (secondaries). If set, this will override any option passed on the URI.
     * <p>
     * Defaults to {@code false}.
     */
    public static final String SPLITS_SLAVE_OK = "mongo.input.split.allow_read_from_secondaries";

    /**
     * If {@code true}, queries for splits will be constructed using {@code $lt}/{@code $gt} instead of {@code $min}
     * and {@code $max}.
     * <p>
     * Defaults to {@code false}.
     */
    public static final String SPLITS_USE_RANGEQUERY = "mongo.input.split.use_range_queries";

    /**
     * One client per thread.
     */
    private static final ThreadLocal<Map<MongoClientURI, MongoClient>> CLIENTS =
        new ThreadLocal<Map<MongoClientURI, MongoClient>>() {
            @Override
            public Map<MongoClientURI, MongoClient> initialValue() {
                return new HashMap<MongoClientURI, MongoClient>();
            }
        };

    private static final ThreadLocal<Map<MongoClient, MongoClientURI>> URI_MAP =
        new ThreadLocal<Map<MongoClient, MongoClientURI>>() {
            @Override
            public Map<MongoClient, MongoClientURI> initialValue() {
                return new HashMap<MongoClient, MongoClientURI>();
            }
        };

    private MongoConfigUtil() {
    }

    public static boolean isJobVerbose(final Configuration conf) {
        return conf.getBoolean(JOB_VERBOSE, false);
    }

    public static void setJobVerbose(final Configuration conf, final boolean val) {
        conf.setBoolean(JOB_VERBOSE, val);
    }

    public static boolean isJobBackground(final Configuration conf) {
        return conf.getBoolean(JOB_BACKGROUND, false);
    }

    public static void setJobBackground(final Configuration conf, final boolean val) {
        conf.setBoolean(JOB_BACKGROUND, val);
    }

    // TODO - In light of key/value specifics should we have a base MongoMapper class?
    public static Class<? extends Mapper> getMapper(final Configuration conf) {
        // TODO - Support multiple inputs via getClasses?
        return conf.getClass(JOB_MAPPER, null, Mapper.class);
    }

    public static void setMapper(final Configuration conf, final Class<? extends Mapper> val) {
        conf.setClass(JOB_MAPPER, val, Mapper.class);
    }

    public static Class<?> getMapperOutputKey(final Configuration conf) {
        return conf.getClass(JOB_MAPPER_OUTPUT_KEY, null);
    }

    public static void setMapperOutputKey(final Configuration conf, final Class<?> val) {
        conf.setClass(JOB_MAPPER_OUTPUT_KEY, val, Object.class);
    }

    public static Class<?> getMapperOutputValue(final Configuration conf) {
        return conf.getClass(JOB_MAPPER_OUTPUT_VALUE, null);
    }

    public static void setMapperOutputValue(final Configuration conf, final Class<?> val) {
        conf.setClass(JOB_MAPPER_OUTPUT_VALUE, val, Object.class);
    }

    public static Class<? extends Reducer> getCombiner(final Configuration conf) {
        return conf.getClass(JOB_COMBINER, null, Reducer.class);
    }

    public static void setCombiner(final Configuration conf, final Class<? extends Reducer> val) {
        conf.setClass(JOB_COMBINER, val, Reducer.class);
    }
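    /* Editor's note - illustrative sketch, not part of the original source. MyMapper is a
     * hypothetical job class; the setters simply store class names in the Configuration.
     *
     *     MongoConfigUtil.setMapper(conf, MyMapper.class);
     *     MongoConfigUtil.setMapperOutputKey(conf, org.apache.hadoop.io.Text.class);
     *     MongoConfigUtil.setMapperOutputValue(conf, org.apache.hadoop.io.IntWritable.class);
     */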

    // TODO - In light of key/value specifics should we have a base MongoReducer class?
    public static Class<? extends Reducer> getReducer(final Configuration conf) {
        // TODO - Support multiple outputs via getClasses?
        return conf.getClass(JOB_REDUCER, null, Reducer.class);
    }

    public static void setReducer(final Configuration conf, final Class<? extends Reducer> val) {
        conf.setClass(JOB_REDUCER, val, Reducer.class);
    }

    public static Class<? extends Partitioner> getPartitioner(final Configuration conf) {
        return conf.getClass(JOB_PARTITIONER, null, Partitioner.class);
    }

    public static void setPartitioner(final Configuration conf, final Class<? extends Partitioner> val) {
        conf.setClass(JOB_PARTITIONER, val, Partitioner.class);
    }

    public static Class<? extends RawComparator> getSortComparator(final Configuration conf) {
        return conf.getClass(JOB_SORT_COMPARATOR, null, RawComparator.class);
    }

    public static void setSortComparator(final Configuration conf, final Class<? extends RawComparator> val) {
        conf.setClass(JOB_SORT_COMPARATOR, val, RawComparator.class);
    }

    public static Class<? extends OutputFormat> getOutputFormat(final Configuration conf) {
        return conf.getClass(JOB_OUTPUT_FORMAT, null, OutputFormat.class);
    }

    public static void setOutputFormat(final Configuration conf, final Class<? extends OutputFormat> val) {
        conf.setClass(JOB_OUTPUT_FORMAT, val, OutputFormat.class);
    }

    public static Class<?> getOutputKey(final Configuration conf) {
        return conf.getClass(JOB_OUTPUT_KEY, null);
    }

    public static void setOutputKey(final Configuration conf, final Class<?> val) {
        conf.setClass(JOB_OUTPUT_KEY, val, Object.class);
    }

    public static Class<?> getOutputValue(final Configuration conf) {
        return conf.getClass(JOB_OUTPUT_VALUE, null);
    }

    public static void setOutputValue(final Configuration conf, final Class<?> val) {
        conf.setClass(JOB_OUTPUT_VALUE, val, Object.class);
    }

    public static Class<? extends InputFormat> getInputFormat(final Configuration conf) {
        return conf.getClass(JOB_INPUT_FORMAT, null, InputFormat.class);
    }

    public static void setInputFormat(final Configuration conf, final Class<? extends InputFormat> val) {
        conf.setClass(JOB_INPUT_FORMAT, val, InputFormat.class);
    }

    public static List<MongoClientURI> getMongoURIs(final Configuration conf, final String key) {
        String raw = conf.get(key);
        List<MongoClientURI> result = new LinkedList<MongoClientURI>();
        if (raw != null && !raw.trim().isEmpty()) {
            for (String connectionString : raw.split("mongodb://")) {
                // Try to be forgiving with formatting.
                connectionString = StringUtils.strip(connectionString, ", ");
                if (!connectionString.isEmpty()) {
                    result.add(new MongoClientURI("mongodb://" + connectionString));
                }
            }
        }
        return result;
    }

    /**
     * @deprecated use {@link #getMongoClientURI(Configuration, String)} instead
     * @param conf the Configuration
     * @param key  the key for the setting
     * @return the MongoURI stored for the given key
     */
    @Deprecated
    @SuppressWarnings("deprecation")
    public static MongoURI getMongoURI(final Configuration conf, final String key) {
        final String raw = conf.get(key);
        if (raw != null && !raw.trim().isEmpty()) {
            return new MongoURI(raw);
        } else {
            return null;
        }
    }

    /**
     * Retrieve a setting as a {@code MongoClientURI}.
     * @param conf the Configuration
     * @param key  the key for the setting
     * @return the MongoClientURI stored for the given key
     */
    public static MongoClientURI getMongoClientURI(final Configuration conf, final String key) {
        final String raw = conf.get(key);
        return raw != null && !raw.trim().isEmpty() ? new MongoClientURI(raw) : null;
    }
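    /* Editor's note - illustrative sketch, not part of the original source. getMongoURIs splits a
     * raw value on the "mongodb://" prefix, so several URIs can be packed into one setting:
     *
     *     conf.set(MongoConfigUtil.OUTPUT_URI,
     *              "mongodb://host1:27017/db.a, mongodb://host2:27017/db.b");
     *     List<MongoClientURI> uris = MongoConfigUtil.getMongoURIs(conf, MongoConfigUtil.OUTPUT_URI);
     */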

    public static MongoClientURI getInputURI(final Configuration conf) {
        return getMongoClientURI(conf, INPUT_URI);
    }

    public static MongoClientURI getAuthURI(final Configuration conf) {
        return getMongoClientURI(conf, AUTH_URI);
    }

    public static List<DBCollection> getCollections(final List<MongoClientURI> uris, final MongoClientURI authURI) {
        List<DBCollection> dbCollections = new LinkedList<DBCollection>();
        for (MongoClientURI uri : uris) {
            if (authURI != null) {
                dbCollections.add(getCollectionWithAuth(uri, authURI));
            } else {
                dbCollections.add(getCollection(uri));
            }
        }
        return dbCollections;
    }

    /**
     * @deprecated use {@link #getCollection(MongoClientURI)}
     * @param uri the MongoDB URI
     * @return the DBCollection in the URI
     */
    @Deprecated
    public static DBCollection getCollection(final MongoURI uri) {
        return getCollection(new MongoClientURI(uri.toString()));
    }

    /**
     * Retrieve a DBCollection from a MongoDB URI.
     * @param uri the MongoDB URI
     * @return the DBCollection in the URI
     */
    public static DBCollection getCollection(final MongoClientURI uri) {
        try {
            return getMongoClient(uri).getDB(uri.getDatabase()).getCollection(uri.getCollection());
        } catch (Exception e) {
            throw new IllegalArgumentException("Couldn't connect and authenticate to get collection", e);
        }
    }

    /**
     * @deprecated use {@link #getCollectionWithAuth(MongoClientURI, MongoClientURI)} instead
     * @param authURI the URI with which to authenticate
     * @param uri     the MongoDB URI
     * @return the authenticated DBCollection
     */
    @Deprecated
    public static DBCollection getCollectionWithAuth(final MongoURI uri, final MongoURI authURI) {
        return getCollectionWithAuth(new MongoClientURI(uri.toString()), new MongoClientURI(authURI.toString()));
    }

    /**
     * Get an authenticated DBCollection from a MongoDB URI.
     * @param authURI the URI with which to authenticate
     * @param uri     the MongoDB URI
     * @return the authenticated DBCollection
     */
    public static DBCollection getCollectionWithAuth(final MongoClientURI uri, final MongoClientURI authURI) {
        // Make sure the auth URI is valid and actually has a username/password to use.
        if (authURI == null || authURI.getUsername() == null || authURI.getPassword() == null) {
            throw new IllegalArgumentException("auth URI is empty or does not contain a valid username/password combination.");
        }

        DBCollection coll;
        try {
            Mongo mongo = getMongoClient(authURI);
            coll = mongo.getDB(uri.getDatabase()).getCollection(uri.getCollection());
            return coll;
        } catch (Exception e) {
            throw new IllegalArgumentException("Couldn't connect and authenticate to get collection", e);
        }
    }

    public static DBCollection getOutputCollection(final Configuration conf) {
        try {
            return getCollection(getOutputURI(conf));
        } catch (final Exception e) {
            throw new IllegalArgumentException("Unable to connect to MongoDB Output Collection.", e);
        }
    }

    public static DBCollection getInputCollection(final Configuration conf) {
        try {
            return getCollection(getInputURI(conf));
        } catch (final Exception e) {
            throw new IllegalArgumentException(
                "Unable to connect to MongoDB Input Collection at '" + getInputURI(conf) + "'", e);
        }
    }

    /**
     * @deprecated use {@link #setMongoURI(Configuration, String, MongoClientURI)} instead
     * @param conf  the Configuration
     * @param key   the key for the setting
     * @param value the value for the setting
     */
    @Deprecated
    public static void setMongoURI(final Configuration conf, final String key, final MongoURI value) {
        conf.set(key, value.toString()); // todo - verify you can toString a URI object
    }
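    /* Editor's note - illustrative sketch, not part of the original source. Credentials and hosts
     * are hypothetical. The auth URI supplies the username/password used to open the connection.
     *
     *     MongoClientURI data = new MongoClientURI("mongodb://host:27017/db.coll");
     *     MongoClientURI auth = new MongoClientURI("mongodb://user:pass@host:27017/admin");
     *     DBCollection coll = MongoConfigUtil.getCollectionWithAuth(data, auth);
     */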

    /**
     * Helper for providing a {@code MongoClientURI} as the value for a setting.
     * @param conf  the Configuration
     * @param key   the key for the setting
     * @param value the value for the setting
     */
    public static void setMongoURI(final Configuration conf, final String key, final MongoClientURI value) {
        conf.set(key, value.toString()); // todo - verify you can toString a URI object
    }

    public static void setMongoURIString(final Configuration conf, final String key, final String value) {
        setMongoURI(conf, key, new MongoClientURI(value));
    }

    public static void setAuthURI(final Configuration conf, final String uri) {
        setMongoURIString(conf, AUTH_URI, uri);
    }

    public static void setInputURI(final Configuration conf, final String uri) {
        setMongoURIString(conf, INPUT_URI, uri);
    }

    /**
     * @deprecated use {@link #setInputURI(Configuration, MongoClientURI)} instead
     * @param conf the Configuration
     * @param uri  the MongoURI
     */
    @Deprecated
    @SuppressWarnings("deprecation")
    public static void setInputURI(final Configuration conf, final MongoURI uri) {
        setMongoURI(conf, INPUT_URI, uri);
    }

    /**
     * Set the input URI for the job.
     * @param conf the Configuration
     * @param uri  the MongoDB URI
     */
    public static void setInputURI(final Configuration conf, final MongoClientURI uri) {
        setMongoURI(conf, INPUT_URI, uri);
    }

    public static List<MongoClientURI> getOutputURIs(final Configuration conf) {
        return getMongoURIs(conf, OUTPUT_URI);
    }

    public static MongoClientURI getOutputURI(final Configuration conf) {
        return getMongoClientURI(conf, OUTPUT_URI);
    }

    public static void setOutputURI(final Configuration conf, final String uri) {
        setMongoURIString(conf, OUTPUT_URI, uri);
    }

    /**
     * @deprecated use {@link #setOutputURI(Configuration, MongoClientURI)} instead
     * @param conf the Configuration
     * @param uri  the MongoDB URI
     */
    @Deprecated
    @SuppressWarnings("deprecation")
    public static void setOutputURI(final Configuration conf, final MongoURI uri) {
        setMongoURI(conf, OUTPUT_URI, uri);
    }

    /**
     * Set the output URI for the job.
     * @param conf the Configuration
     * @param uri  the MongoDB URI
     */
    public static void setOutputURI(final Configuration conf, final MongoClientURI uri) {
        setMongoURI(conf, OUTPUT_URI, uri);
    }

    /**
     * Get the maximum number of documents that should be loaded into memory and sent in a batch to MongoDB as the
     * output of a job.
     * @param conf the Configuration
     * @return the number of documents
     */
    public static int getBatchSize(final Configuration conf) {
        return conf.getInt(OUTPUT_BATCH_SIZE, 1000);
    }

    /**
     * Get whether or not bulk writes will be ordered and sent in a batch to MongoDB as the output of a job.
     * @param conf the Configuration
     * @return true if bulk writes will be ordered, false otherwise
     */
    public static boolean isBulkOrdered(final Configuration conf) {
        return conf.getBoolean(OUTPUT_BULK_ORDERED, true);
    }

    /**
     * Set whether or not bulk writes should be ordered.
     * @param conf    the Configuration
     * @param ordered true if bulk writes should be ordered, false otherwise
     */
    public static void setBulkOrdered(final Configuration conf, final boolean ordered) {
        conf.setBoolean(OUTPUT_BULK_ORDERED, ordered);
    }

    /**
     * Set the maximum number of documents that should be loaded into memory and sent in a batch to MongoDB as the
     * output of a job.
     * @param conf the Configuration
     * @param size the number of documents
     */
    public static void setBatchSize(final Configuration conf, final int size) {
        conf.setInt(OUTPUT_BATCH_SIZE, size);
    }
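    /* Editor's note - illustrative sketch, not part of the original source; the values are
     * arbitrary. Batching trades memory for fewer round trips on output.
     *
     *     MongoConfigUtil.setBatchSize(conf, 500);      // default is 1000
     *     MongoConfigUtil.setBulkOrdered(conf, false);  // default is true (ordered)
     */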

    /**
     * Helper for providing a JSON string as a value for a setting.
     * @param conf  the Configuration
     * @param key   the key for the setting
     * @param value the JSON string value
     */
    public static void setJSON(final Configuration conf, final String key, final String value) {
        try {
            final Object dbObj = JSON.parse(value);
            setDBObject(conf, key, (DBObject) dbObj);
        } catch (final Exception e) {
            LOG.error("Cannot parse JSON...", e);
            throw new IllegalArgumentException("Provided JSON String is not representable/parseable as a DBObject.", e);
        }
    }

    public static DBObject getDBObject(final Configuration conf, final String key) {
        try {
            final String json = conf.get(key);
            final DBObject obj = (DBObject) JSON.parse(json);
            if (obj == null) {
                return new BasicDBObject();
            } else {
                return obj;
            }
        } catch (final Exception e) {
            throw new IllegalArgumentException("Provided JSON String is not representable/parseable as a DBObject.", e);
        }
    }

    public static void setDBObject(final Configuration conf, final String key, final DBObject value) {
        conf.set(key, JSON.serialize(value));
    }

    public static void setQuery(final Configuration conf, final String query) {
        setJSON(conf, INPUT_QUERY, query);
    }

    /**
     * Set the query for the job using a DBObject.
     * @param conf  the Configuration
     * @param query the query
     */
    public static void setQuery(final Configuration conf, final DBObject query) {
        setDBObject(conf, INPUT_QUERY, query);
    }

    public static DBObject getQuery(final Configuration conf) {
        return getDBObject(conf, INPUT_QUERY);
    }

    public static void setFields(final Configuration conf, final String fields) {
        setJSON(conf, INPUT_FIELDS, fields);
    }

    /**
     * Specify a projection document for documents retrieved from MongoDB.
     * @param conf   the Configuration
     * @param fields a projection document
     */
    public static void setFields(final Configuration conf, final DBObject fields) {
        setDBObject(conf, INPUT_FIELDS, fields);
    }

    public static DBObject getFields(final Configuration conf) {
        return getDBObject(conf, INPUT_FIELDS);
    }

    public static void setSort(final Configuration conf, final String sort) {
        setJSON(conf, INPUT_SORT, sort);
    }
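    /* Editor's note - illustrative sketch, not part of the original source; field names are
     * hypothetical. Each JSON string is parsed into a DBObject and stored in the Configuration.
     *
     *     MongoConfigUtil.setQuery(conf, "{ \"status\": \"active\" }");   // INPUT_QUERY
     *     MongoConfigUtil.setFields(conf, "{ \"name\": 1, \"age\": 1 }"); // projection
     *     MongoConfigUtil.setSort(conf, "{ \"age\": -1 }");               // INPUT_SORT
     */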

    /**
     * Specify the sort order as a DBObject.
     * @param conf the Configuration
     * @param sort the sort document
     */
    public static void setSort(final Configuration conf, final DBObject sort) {
        setDBObject(conf, INPUT_SORT, sort);
    }

    public static DBObject getSort(final Configuration conf) {
        return getDBObject(conf, INPUT_SORT);
    }

    public static int getLimit(final Configuration conf) {
        return conf.getInt(INPUT_LIMIT, 0);
    }

    public static void setLimit(final Configuration conf, final int limit) {
        conf.setInt(INPUT_LIMIT, limit);
    }

    public static int getSkip(final Configuration conf) {
        return conf.getInt(INPUT_SKIP, 0);
    }

    public static void setSkip(final Configuration conf, final int skip) {
        conf.setInt(INPUT_SKIP, skip);
    }

    public static boolean getLazyBSON(final Configuration conf) {
        return conf.getBoolean(INPUT_LAZY_BSON, false);
    }

    public static void setLazyBSON(final Configuration conf, final boolean lazy) {
        conf.setBoolean(INPUT_LAZY_BSON, lazy);
    }

    public static int getSplitSize(final Configuration conf) {
        return conf.getInt(INPUT_SPLIT_SIZE, DEFAULT_SPLIT_SIZE);
    }

    public static void setSplitSize(final Configuration conf, final int value) {
        conf.setInt(INPUT_SPLIT_SIZE, value);
    }

    public static void setEnableFilterEmptySplits(final Configuration conf, final boolean value) {
        conf.setBoolean(ENABLE_FILTER_EMPTY_SPLITS, value);
    }

    public static boolean isFilterEmptySplitsEnabled(final Configuration conf) {
        return conf.getBoolean(ENABLE_FILTER_EMPTY_SPLITS, false);
    }

    public static void setInputSplitMinDocs(final Configuration conf, final int value) {
        if (value <= 0) {
            throw new IllegalArgumentException(INPUT_SPLIT_MIN_DOCS + " must be greater than 0.");
        }
        conf.setInt(INPUT_SPLIT_MIN_DOCS, value);
    }

    public static int getInputSplitMinDocs(final Configuration conf) {
        return conf.getInt(INPUT_SPLIT_MIN_DOCS, DEFAULT_INPUT_SPLIT_MIN_DOCS);
    }

    public static boolean isRangeQueryEnabled(final Configuration conf) {
        return conf.getBoolean(SPLITS_USE_RANGEQUERY, false);
    }

    /**
     * Enable using {@code $lt} and {@code $gt} to define InputSplits rather than {@code $min} and {@code $max}.
     * This allows the database's query optimizer to choose the best index instead of using the one in the $max/$min
     * keys. This will only work if the key used for splitting is *not* a compound key. Make sure that all values
     * under the splitting key are of the same type, or this will cause incomplete results.
     * @param conf  the Configuration
     * @param value enables using {@code $lt} and {@code $gt}
     */
    public static void setRangeQueryEnabled(final Configuration conf, final boolean value) {
        conf.setBoolean(SPLITS_USE_RANGEQUERY, value);
    }
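    /* Editor's note - illustrative sketch, not part of the original source. Range queries only
     * work with a non-compound split key whose values all share one type (see javadoc above).
     *
     *     MongoConfigUtil.setRangeQueryEnabled(conf, true); // SPLITS_USE_RANGEQUERY
     */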

    public static boolean canReadSplitsFromShards(final Configuration conf) {
        return conf.getBoolean(SPLITS_USE_SHARDS, false);
    }

    /**
     * Set whether reading directly from shards is enabled.
     * <p>
     * When {@code true}, splits are read directly from shards. By default, splits are read through a mongos router
     * when connected to a sharded cluster. Note that reading directly from shards can lead to bizarre results when
     * there are orphaned documents or if the balancer is running.
     * @param conf  the Configuration
     * @param value enables reading from shards directly
     * @see Sharding Balancing
     * @see cleanupOrphaned command
     */
    public static void setReadSplitsFromShards(final Configuration conf, final boolean value) {
        conf.setBoolean(SPLITS_USE_SHARDS, value);
    }

    public static boolean isShardChunkedSplittingEnabled(final Configuration conf) {
        return conf.getBoolean(SPLITS_USE_CHUNKS, true);
    }

    public static int getSamplesPerSplit(final Configuration conf) {
        return conf.getInt(SampleSplitter.SAMPLES_PER_SPLIT, SampleSplitter.DEFAULT_SAMPLES_PER_SPLIT);
    }

    public static void setSamplesPerSplit(final Configuration conf, final int samples) {
        conf.setInt(SampleSplitter.SAMPLES_PER_SPLIT, samples);
    }

    /**
     * Set whether using shard chunk splits as InputSplits is enabled.
     * @param conf  the Configuration
     * @param value enables using shard chunk splits as InputSplits.
     */
    public static void setShardChunkSplittingEnabled(final Configuration conf, final boolean value) {
        conf.setBoolean(SPLITS_USE_CHUNKS, value);
    }

    public static boolean canReadSplitsFromSecondary(final Configuration conf) {
        return conf.getBoolean(SPLITS_SLAVE_OK, false);
    }

    public static void setReadSplitsFromSecondary(final Configuration conf, final boolean value) {
        conf.setBoolean(SPLITS_SLAVE_OK, value);
    }

    public static boolean createInputSplits(final Configuration conf) {
        return conf.getBoolean(CREATE_INPUT_SPLITS, true);
    }

    public static void setCreateInputSplits(final Configuration conf, final boolean value) {
        conf.setBoolean(CREATE_INPUT_SPLITS, value);
    }

    public static void setInputSplitKeyPattern(final Configuration conf, final String pattern) {
        setJSON(conf, INPUT_SPLIT_KEY_PATTERN, pattern);
    }

    public static void setInputSplitKey(final Configuration conf, final DBObject key) {
        setDBObject(conf, INPUT_SPLIT_KEY_PATTERN, key);
    }

    public static String getInputSplitKeyPattern(final Configuration conf) {
        return conf.get(INPUT_SPLIT_KEY_PATTERN, "{ \"_id\": 1 }");
    }

    public static DBObject getInputSplitKey(final Configuration conf) {
        try {
            final String json = getInputSplitKeyPattern(conf);
            final DBObject obj = (DBObject) JSON.parse(json);
            if (obj == null) {
                return new BasicDBObject("_id", 1);
            } else {
                return obj;
            }
        } catch (final Exception e) {
            throw new IllegalArgumentException("Provided JSON String is not representable/parsable as a DBObject.", e);
        }
    }

    public static DBObject getMinSplitKey(final Configuration configuration) {
        return getDBObject(configuration, INPUT_SPLIT_KEY_MIN);
    }

    public static DBObject getMaxSplitKey(final Configuration configuration) {
        return getDBObject(configuration, INPUT_SPLIT_KEY_MAX);
    }

    public static void setMinSplitKey(final Configuration conf, final String string) {
        conf.set(INPUT_SPLIT_KEY_MIN, string);
    }

    public static void setMaxSplitKey(final Configuration conf, final String string) {
        conf.set(INPUT_SPLIT_KEY_MAX, string);
    }

    public static void setInputKey(final Configuration conf, final String fieldName) {
        // TODO (bwm) - validate key rules?
        conf.set(INPUT_KEY, fieldName);
    }

    public static String getInputKey(final Configuration conf) {
        return conf.get(INPUT_KEY, "_id");
    }

    public static String getGridFSDelimiterPattern(final Configuration conf) {
        return conf.get(GRIDFS_DELIMITER_PATTERN, GRIDFS_DEFAULT_DELIMITER);
    }

    public static void setGridFSDelimiterPattern(final Configuration conf, final String pattern) {
        conf.set(GRIDFS_DELIMITER_PATTERN, pattern);
    }

    public static boolean isGridFSWholeFileSplit(final Configuration conf) {
        return conf.getBoolean(GRIDFS_WHOLE_FILE_SPLIT, false);
    }

    public static void setGridFSWholeFileSplit(final Configuration conf, final boolean split) {
        conf.setBoolean(GRIDFS_WHOLE_FILE_SPLIT, split);
    }

    public static boolean isGridFSReadBinary(final Configuration conf) {
        return conf.getBoolean(GRIDFS_READ_BINARY, false);
    }

    public static void setGridFSReadBinary(final Configuration conf, final boolean readBinary) {
        conf.setBoolean(GRIDFS_READ_BINARY, readBinary);
    }

    public static void setNoTimeout(final Configuration conf, final boolean value) {
        conf.setBoolean(INPUT_NOTIMEOUT, value);
    }

    public static boolean isNoTimeout(final Configuration conf) {
        return conf.getBoolean(INPUT_NOTIMEOUT, false);
    }

    // BSON-specific config functions.
    public static boolean getBSONReadSplits(final Configuration conf) {
        return conf.getBoolean(BSON_READ_SPLITS, true);
    }

    public static void setBSONReadSplits(final Configuration conf, final boolean val) {
        conf.setBoolean(BSON_READ_SPLITS, val);
    }

    public static boolean getBSONWriteSplits(final Configuration conf) {
        return conf.getBoolean(BSON_WRITE_SPLITS, true);
    }

    public static void setBSONWriteSplits(final Configuration conf, final boolean val) {
        conf.setBoolean(BSON_WRITE_SPLITS, val);
    }

    public static boolean getBSONOutputBuildSplits(final Configuration conf) {
        return conf.getBoolean(BSON_OUTPUT_BUILDSPLITS, false);
    }

    public static void setBSONOutputBuildSplits(final Configuration conf, final boolean val) {
        conf.setBoolean(BSON_OUTPUT_BUILDSPLITS, val);
    }

    public static void setBSONPathFilter(final Configuration conf, final Class<? extends PathFilter> val) {
        conf.setClass(BSON_PATHFILTER, val, PathFilter.class);
    }

    public static Class<?> getBSONPathFilter(final Configuration conf) {
        return conf.getClass(BSON_PATHFILTER, null);
    }

    public static String getBSONSplitsPath(final Configuration conf) {
        return conf.get(BSON_SPLITS_PATH);
    }

    public static void setBSONSplitsPath(final Configuration conf, final String path) {
        conf.set(BSON_SPLITS_PATH, path);
    }

    public static Class<? extends MongoSplitter> getSplitterClass(final Configuration conf) {
        return conf.getClass(MONGO_SPLITTER_CLASS, null, MongoSplitter.class);
    }

    public static void setSplitterClass(final Configuration conf, final Class<? extends MongoSplitter> val) {
        conf.setClass(MONGO_SPLITTER_CLASS, val, MongoSplitter.class);
    }

    public static List<String> getInputMongosHosts(final Configuration conf) {
        String raw = conf.get(INPUT_MONGOS_HOSTS, null);
        if (raw == null || raw.length() == 0) {
            return Collections.emptyList(); // empty list - no mongos specified
        }
        // List of hostnames delimited by whitespace
        return Arrays.asList(StringUtils.split(raw));
    }

    public static void setInputMongosHosts(final Configuration conf, final List<String> hostnames) {
        String raw = "";
        if (hostnames != null) {
            raw = StringUtils.join(hostnames, ' ');
        }
        conf.set(INPUT_MONGOS_HOSTS, raw);
    }

    /**
     * Fetch a class by its name, rather than by a key name in the Configuration properties. The Configuration class
     * is used for its internal cache of class names and to ensure that the same ClassLoader is used across all keys.
     * @param conf      the Configuration
     * @param className the name of the class
     * @param xface     an interface or superclass of the expected class
     * @param <T>       the type of xface
     * @return the class or {@code null} if not found
     */
    public static <T> Class<? extends T> getClassByName(final Configuration conf, final String className,
                                                        final Class<T> xface) {
        if (className == null) {
            return null;
        }
        try {
            Class<?> theClass = conf.getClassByName(className);
            if (theClass != null && !xface.isAssignableFrom(theClass)) {
                throw new RuntimeException(theClass + " not " + xface.getName());
            } else if (theClass != null) {
                return theClass.asSubclass(xface);
            } else {
                return null;
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    public static Configuration buildConfiguration(final Map<String, Object> data) {
        Configuration newConf = new Configuration();
        for (Entry<String, Object> entry : data.entrySet()) {
            String key = entry.getKey();
            Object val = entry.getValue();
            if (val instanceof String) {
                newConf.set(key, (String) val);
            } else if (val instanceof Boolean) {
                newConf.setBoolean(key, (Boolean) val);
            } else if (val instanceof Integer) {
                newConf.setInt(key, (Integer) val);
            } else if (val instanceof Float) {
                newConf.setFloat(key, (Float) val);
            } else if (val instanceof DBObject) {
                setDBObject(newConf, key, (DBObject) val);
            } else {
                throw new RuntimeException("can't convert " + val.getClass() + " into any type for Configuration");
            }
        }
        return newConf;
    }

    public static void close(final Mongo client) {
        MongoClientURI uri = URI_MAP.get().remove(client);
        if (uri != null) {
            MongoClient remove = CLIENTS.get().remove(uri);
            if (remove != client) {
                throw new IllegalStateException("different clients found");
            }
            client.close();
        }
    }

    private static MongoClient getMongoClient(final MongoClientURI uri) throws UnknownHostException {
        MongoClient mongoClient = CLIENTS.get().get(uri);
        if (mongoClient == null) {
            mongoClient = new MongoClient(uri);
            CLIENTS.get().put(uri, mongoClient);
            URI_MAP.get().put(mongoClient, uri);
        }
        return mongoClient;
    }

    /**
     * Load Properties stored in a .properties file.
     * @param conf     the Configuration
     * @param filename the path to the properties file
     * @return the Properties in the file
     * @throws IOException if there was a problem reading the properties file
     */
    public static Properties readPropertiesFromFile(final Configuration conf, final String filename)
        throws IOException {
        Path propertiesFilePath = new Path(filename);
        FileSystem fs = FileSystem.get(URI.create(filename), conf);
        if (!fs.exists(propertiesFilePath)) {
            throw new IOException("Properties file does not exist: " + filename);
        }
        FSDataInputStream inputStream = fs.open(propertiesFilePath);
        Properties properties = new Properties();
        try {
            properties.load(inputStream);
        } finally {
            inputStream.close();
        }
        return properties;
    }
}
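
/* Editor's note - illustrative end-to-end sketch, not part of the original source. All names and
 * values are hypothetical; buildConfiguration converts a plain Map into a Hadoop Configuration.
 *
 *     Map<String, Object> settings = new HashMap<String, Object>();
 *     settings.put(MongoConfigUtil.INPUT_URI, "mongodb://localhost:27017/demo.in");
 *     settings.put(MongoConfigUtil.OUTPUT_URI, "mongodb://localhost:27017/demo.out");
 *     settings.put(MongoConfigUtil.INPUT_NOTIMEOUT, true);
 *     settings.put(MongoConfigUtil.INPUT_SPLIT_SIZE, 16);
 *     Configuration conf = MongoConfigUtil.buildConfiguration(settings);
 */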