// MongoConfigUtil.java
/*
 * Copyright 2010 10gen Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.mongodb.hadoop.util;

import com.mongodb.*;
import com.mongodb.util.*;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.fs.PathFilter;

import java.util.*;

/**
 * Configuration helper tool for MongoDB-related Map/Reduce jobs
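 *
 * <p>A minimal usage sketch (the URIs, query, and database/collection names below are illustrative
 * placeholders, not defaults of this class):
 * <pre>{@code
 * Configuration conf = new Configuration();
 * MongoConfigUtil.setInputURI( conf, "mongodb://localhost/demo.in" );   // hypothetical input collection
 * MongoConfigUtil.setOutputURI( conf, "mongodb://localhost/demo.out" ); // hypothetical output collection
 * MongoConfigUtil.setQuery( conf, "{ \"x\": { \"$gt\": 5 } }" );        // any valid JSON query document
 * MongoConfigUtil.setCreateInputSplits( conf, true );
 * }</pre>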
 */

public class MongoConfigUtil {
    private static final Log log = LogFactory.getLog( MongoConfigUtil.class );

    private static final Mongo.Holder _mongos = new Mongo.Holder();

    /**
     * The JOB_* values are entirely optional and are disregarded unless you use the MongoTool base toolset;
     * if you don't, feel free to ignore them.
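     *
     * For example, when using MongoTool (MyMapper and MyReducer are placeholder class names, not part
     * of this library):
     * <pre>{@code
     * MongoConfigUtil.setMapper( conf, MyMapper.class );
     * MongoConfigUtil.setReducer( conf, MyReducer.class );
     * MongoConfigUtil.setMapperOutputKey( conf, org.apache.hadoop.io.Text.class );
     * MongoConfigUtil.setMapperOutputValue( conf, org.apache.hadoop.io.IntWritable.class );
     * }</pre>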
     */
    public static final String JOB_VERBOSE = "mongo.job.verbose";
    public static final String JOB_BACKGROUND = "mongo.job.background";

    public static final String JOB_MAPPER = "mongo.job.mapper";
    public static final String JOB_COMBINER = "mongo.job.combiner";
    public static final String JOB_PARTITIONER = "mongo.job.partitioner";
    public static final String JOB_REDUCER = "mongo.job.reducer";
    public static final String JOB_SORT_COMPARATOR = "mongo.job.sort_comparator";

    public static final String JOB_MAPPER_OUTPUT_KEY = "mongo.job.mapper.output.key";
    public static final String JOB_MAPPER_OUTPUT_VALUE = "mongo.job.mapper.output.value";

    public static final String JOB_INPUT_FORMAT = "mongo.job.input.format";
    public static final String JOB_OUTPUT_FORMAT = "mongo.job.output.format";

    public static final String JOB_OUTPUT_KEY = "mongo.job.output.key";
    public static final String JOB_OUTPUT_VALUE = "mongo.job.output.value";

    public static final String INPUT_URI = "mongo.input.uri";
    public static final String OUTPUT_URI = "mongo.output.uri";


    /**
     * The MongoDB field to read from for the Mapper Input.
     *
     * This will be fed to your mapper as the "Key" for the input.
     *
     * Defaults to {@code _id}
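     *
     * For example (the field name is illustrative only):
     * <pre>{@code
     * MongoConfigUtil.setInputKey( conf, "customerId" );
     * }</pre>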
     */
    public static final String INPUT_KEY = "mongo.input.key";
    public static final String INPUT_NOTIMEOUT = "mongo.input.notimeout";
    public static final String INPUT_QUERY = "mongo.input.query";
    public static final String INPUT_FIELDS = "mongo.input.fields";
    public static final String INPUT_SORT = "mongo.input.sort";
    public static final String INPUT_LIMIT = "mongo.input.limit";
    public static final String INPUT_SKIP = "mongo.input.skip";
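
    /*
     * A hedged illustration of how these input options might be set (the query, projection, and sort
     * documents below are arbitrary examples, not defaults):
     *
     *   MongoConfigUtil.setQuery( conf, "{ \"status\": \"active\" }" );
     *   MongoConfigUtil.setFields( conf, "{ \"name\": 1, \"status\": 1 }" );
     *   MongoConfigUtil.setSort( conf, "{ \"name\": 1 }" );
     *   MongoConfigUtil.setLimit( conf, 1000 );
     *   MongoConfigUtil.setSkip( conf, 0 );
     */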


    //Settings specific to bson reading/writing.
    public static final String BSON_READ_SPLITS = "bson.split.read_splits";
    public static final String BSON_WRITE_SPLITS = "bson.split.write_splits";
    public static final String BSON_OUTPUT_BUILDSPLITS = "bson.output.build_splits";
    public static final String BSON_PATHFILTER = "bson.pathfilter.class";
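
    /*
     * A hedged illustration of the BSON settings (MyBsonPathFilter is a placeholder class name, not
     * part of this library):
     *
     *   MongoConfigUtil.setBSONReadSplits( conf, true );
     *   MongoConfigUtil.setBSONPathFilter( conf, MyBsonPathFilter.class );
     */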
 

    /**
     * An authentication URI, supplying a username and password to use.
     *
     * This is necessary when running jobs with a sharded cluster, as
     * access to the config database is needed to read the chunk information used for splitting.
     *
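     * A sketch of how it might be set (host, credentials, and database are placeholders):
     * <pre>{@code
     * MongoConfigUtil.setAuthURI( conf, "mongodb://user:password@config-host:27017/admin" );
     * }</pre>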
     */
    public static final String AUTH_URI = "mongo.auth.uri";


    /**
     * When *not* using 'read_from_shards' or 'read_shard_chunks', this is the
     * number of megabytes per split to create for the input data.
     *
     * Currently defaults to 8MB, tweak it as necessary for your code.
     *
     * This default will likely change as we research better options.
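     *
     * For example, to request roughly 16 MB splits instead of the 8 MB default:
     * <pre>{@code
     * MongoConfigUtil.setSplitSize( conf, 16 );
     * }</pre>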
     */
    public static final String INPUT_SPLIT_SIZE = "mongo.input.split_size";

    public static final int DEFAULT_SPLIT_SIZE = 8; // 8 mb per manual (non-sharding) split

    /**
     * If CREATE_INPUT_SPLITS is true but SPLITS_USE_CHUNKS is false, Mongo-Hadoop will attempt
     * to create custom input splits for you.  By default it will split on {@code _id}, which is a
     * reasonable/sane default.
     *
     * If you want to customize that split point for efficiency reasons (such as different distribution)
     * you may set this to any valid field name. The restrictions on this key name are the *exact same rules*
     * as when sharding an existing MongoDB Collection.  You must have an index on the field, and follow the other
     * rules outlined in the docs.
     *
     * This must be a JSON document, and not just a field name!
     *
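     * For example (the field name is illustrative; it must be indexed, per the sharding rules above):
     * <pre>{@code
     * MongoConfigUtil.setInputSplitKeyPattern( conf, "{ \"customerId\": 1 }" );
     * }</pre>
     *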
     * @see <a href="http://www.mongodb.org/display/DOCS/Sharding+Introduction#ShardingIntroduction-ShardKeys">Sharding Introduction: Shard Keys</a>
     */
    public static final String INPUT_SPLIT_KEY_PATTERN = "mongo.input.split.split_key_pattern";

    /**
     * If {@code true}, the driver will attempt to split the MongoDB Input data (if reading from Mongo) into
     * multiple InputSplits to allow parallelism/concurrency in processing within Hadoop.  That is to say,
     * Hadoop will assign one InputSplit per mapper.
     * 
     * This is {@code true} by default now, but if {@code false}, only one InputSplit (your whole collection) will be
     * assigned to Hadoop – severely reducing parallel mapping.
     */
    public static final String CREATE_INPUT_SPLITS = "mongo.input.split.create_input_splits";

    /**
     * If {@code true} in a sharded setup splits will be made to connect to individual backend {@code mongod}s.  This
     * can be unsafe. If {@code mongos} is moving chunks around you might see duplicate data, or miss some data
     * entirely. Defaults to {@code false}
     */
    public static final String SPLITS_USE_SHARDS = "mongo.input.split.read_from_shards";
    /**
     * If {@code true} have one split = one shard chunk.  If {@link #SPLITS_USE_SHARDS} is not true splits will still
     * use chunks, but will connect through {@code mongos} instead of the individual backend {@code mongod}s (the safe
     * thing to do). If {@link #SPLITS_USE_SHARDS} is {@code true} but this is {@code false} one split will be made for
     * each backend shard. THIS IS UNSAFE and may result in data being run multiple times.
     *
     * Defaults to {@code true}
     */
    public static final String SPLITS_USE_CHUNKS = "mongo.input.split.read_shard_chunks";

    /**
     * If {@code true} and the shards are replica sets, queries for splits are allowed to run against
     * the secondaries (slaves). If set, this will override any option passed on the URI.
     *
     * Defaults to {@code false}
     */
    public static final String SPLITS_SLAVE_OK = "mongo.input.split.allow_read_from_secondaries";

    /**
     * If true then queries for splits will be constructed using $lt/$gt instead of $min and $max.
     *
     * Defaults to {@code false}
     */
    public static final String SPLITS_USE_RANGEQUERY = "mongo.input.split.use_range_queries";

    public static boolean isJobVerbose( Configuration conf ){
        return conf.getBoolean( JOB_VERBOSE, false );
    }

    public static void setJobVerbose( Configuration conf, boolean val ){
        conf.setBoolean( JOB_VERBOSE, val );
    }

    public static boolean isJobBackground( Configuration conf ){
        return conf.getBoolean( JOB_BACKGROUND, false );
    }

    public static void setJobBackground( Configuration conf, boolean val ){
        conf.setBoolean( JOB_BACKGROUND, val );
    }

    // TODO - In light of key/value specifics should we have a base MongoMapper class?
    public static Class getMapper( Configuration conf ){
        /** TODO - Support multiple inputs via getClasses ? **/
        return conf.getClass( JOB_MAPPER, null, Mapper.class );
    }

    public static void setMapper( Configuration conf, Class val ){
        conf.setClass( JOB_MAPPER, val, Mapper.class );
    }

    public static Class getMapperOutputKey( Configuration conf ){
        return conf.getClass( JOB_MAPPER_OUTPUT_KEY, null );
    }

    public static void setMapperOutputKey( Configuration conf, Class val ){
        conf.setClass( JOB_MAPPER_OUTPUT_KEY, val, Object.class );
    }

    public static Class getMapperOutputValue( Configuration conf ){
        return conf.getClass( JOB_MAPPER_OUTPUT_VALUE, null );
    }

    public static void setMapperOutputValue( Configuration conf, Class val ){
        conf.setClass( JOB_MAPPER_OUTPUT_VALUE, val, Object.class );
    }

    public static Class getCombiner( Configuration conf ){
        return conf.getClass( JOB_COMBINER, null, Reducer.class );
    }

    public static void setCombiner( Configuration conf, Class val ){
        conf.setClass( JOB_COMBINER, val, Reducer.class );
    }

    // TODO - In light of key/value specifics should we have a base MongoReducer class?
    public static Class getReducer( Configuration conf ){
        /** TODO - Support multiple outputs via getClasses ? **/
        return conf.getClass( JOB_REDUCER, null, Reducer.class );
    }

    public static void setReducer( Configuration conf, Class val ){
        conf.setClass( JOB_REDUCER, val, Reducer.class );
    }

    public static Class getPartitioner( Configuration conf ){
        return conf.getClass( JOB_PARTITIONER, null, Partitioner.class );
    }

    public static void setPartitioner( Configuration conf, Class val ){
        conf.setClass( JOB_PARTITIONER, val, Partitioner.class );
    }

    public static Class getSortComparator( Configuration conf ){
        return conf.getClass( JOB_SORT_COMPARATOR, null, RawComparator.class );
    }

    public static void setSortComparator( Configuration conf, Class val ){
        conf.setClass( JOB_SORT_COMPARATOR, val, RawComparator.class );
    }

    public static Class getOutputFormat( Configuration conf ){
        return conf.getClass( JOB_OUTPUT_FORMAT, null, OutputFormat.class );
    }

    public static void setOutputFormat( Configuration conf, Class val ){
        conf.setClass( JOB_OUTPUT_FORMAT, val, OutputFormat.class );
    }

    public static Class getOutputKey( Configuration conf ){
        return conf.getClass( JOB_OUTPUT_KEY, null );
    }

    public static void setOutputKey( Configuration conf, Class val ){
        conf.setClass( JOB_OUTPUT_KEY, val, Object.class );
    }

    public static Class getOutputValue( Configuration conf ){
        return conf.getClass( JOB_OUTPUT_VALUE, null );
    }

    public static void setOutputValue( Configuration conf, Class val ){
        conf.setClass( JOB_OUTPUT_VALUE, val, Object.class );
    }

    public static Class getInputFormat( Configuration conf ){
        return conf.getClass( JOB_INPUT_FORMAT, null, InputFormat.class );
    }

    public static void setInputFormat( Configuration conf, Class val ){
        conf.setClass( JOB_INPUT_FORMAT, val, InputFormat.class );
    }

    public static List getMongoURIs( Configuration conf, String key ){
        final String raw = conf.get( key );
        if ( raw != null && !raw.trim().isEmpty() ){
            List result = new LinkedList();
            String[] split = StringUtils.split( raw );
            for ( String mongoURI : split ){
                result.add( new MongoURI( mongoURI ) );
            }
            return result;
        }
        else
            return Collections.emptyList();
    }

    public static MongoURI getMongoURI( Configuration conf, String key ){
        final String raw = conf.get( key );
        if ( raw != null && !raw.trim().isEmpty() )
            return new MongoURI( raw );
        else
            return null;
    }

    public static MongoURI getInputURI( Configuration conf ){
        return getMongoURI( conf, INPUT_URI );
    }

    public static MongoURI getAuthURI( Configuration conf ){
        return getMongoURI( conf, AUTH_URI );
    }

    public static List getCollections( List uris ){
        List dbCollections = new LinkedList();
        for ( MongoURI uri : uris ){
            dbCollections.add( getCollection( uri ) );
        }
        return dbCollections;
    }

    public static DBCollection getCollection( MongoURI uri ){
        try {
            Mongo mongo = _mongos.connect( uri );
            DB myDb = mongo.getDB( uri.getDatabase() );
            // if there's a username and password, authenticate before connecting to the collection
            if ( uri.getUsername() != null && uri.getPassword() != null && !myDb.isAuthenticated() ){
                boolean auth = myDb.authenticate( uri.getUsername(), uri.getPassword() );
                if ( auth ){
                    log.info( "Successfully authenticated with collection." );
                }
                else {
                    throw new IllegalArgumentException( "Unable to connect to collection." );
                }
            }
            return uri.connectCollection( mongo );
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException( "Unable to connect to collection."
                                                + e.getMessage(), e );
        }
    }

    public static DBCollection getOutputCollection( Configuration conf ){
        try {
            final MongoURI _uri = getOutputURI( conf );
            return getCollection( _uri );
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException( "Unable to connect to MongoDB Output Collection.", e );
        }
    }

    public static List getOutputCollections( Configuration conf ){
        try {
            final List _uris = getOutputURIs( conf );
            return getCollections( _uris );
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException( "Unable to connect to MongoDB Output Collection.", e );
        }
    }

    public static DBCollection getInputCollection( Configuration conf ){
        try {
            final MongoURI _uri = getInputURI( conf );
            return getCollection( _uri );
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException(
                    "Unable to connect to MongoDB Input Collection at '" + getInputURI( conf ) + "'", e );
        }
    }

    public static void setMongoURI( Configuration conf, String key, MongoURI value ){
        conf.set( key, value.toString() ); // todo - verify you can toString a URI object
    }

    public static void setMongoURIString( Configuration conf, String key, String value ){
        try {
            final MongoURI uri = new MongoURI( value );
            setMongoURI( conf, key, uri );
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException( "Invalid Mongo URI '" + value + "' for Input URI", e );
        }
    }

    public static void setAuthURI( Configuration conf, String uri ){
        setMongoURIString( conf, AUTH_URI, uri );
    }

    public static void setInputURI( Configuration conf, String uri ){
        setMongoURIString( conf, INPUT_URI, uri );
    }

    public static void setInputURI( Configuration conf, MongoURI uri ){
        setMongoURI( conf, INPUT_URI, uri );
    }

    public static List getOutputURIs( Configuration conf ){
        return getMongoURIs( conf, OUTPUT_URI );
    }

    public static MongoURI getOutputURI( Configuration conf ){
        return getMongoURI( conf, OUTPUT_URI );
    }

    public static void setOutputURI( Configuration conf, String uri ){
        setMongoURIString( conf, OUTPUT_URI, uri );
    }

    public static void setOutputURI( Configuration conf, MongoURI uri ){
        setMongoURI( conf, OUTPUT_URI, uri );
    }

    /**
     * Set JSON but first validate it's parseable into a DBObject
     */
    public static void setJSON( Configuration conf, String key, String value ){
        try {
            final Object dbObj = JSON.parse( value );
            setDBObject( conf, key, (DBObject) dbObj );
        }
        catch ( final Exception e ) {
            log.error( "Cannot parse JSON...", e );
            throw new IllegalArgumentException( "Provided JSON String is not representable/parseable as a DBObject.", e );
        }
    }

    public static DBObject getDBObject( Configuration conf, String key ){
        try {
            final String json = conf.get( key );
            final DBObject obj = (DBObject) JSON.parse( json );
            if ( obj == null )
                return new BasicDBObject();
            else
                return obj;
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException( "Provided JSON String is not representable/parseable as a DBObject.", e );
        }
    }

    public static void setDBObject( Configuration conf, String key, DBObject value ){
        conf.set( key, JSON.serialize( value ) );
    }

    public static void setQuery( Configuration conf, String query ){
        setJSON( conf, INPUT_QUERY, query );
    }

    public static void setQuery( Configuration conf, DBObject query ){
        setDBObject( conf, INPUT_QUERY, query );
    }

    /**
     * Returns the configured query as a DBObject... If you want a string call toString() on the returned object,
     * or use JSON.serialize().
     */
    public static DBObject getQuery( Configuration conf ){
        return getDBObject( conf, INPUT_QUERY );
    }

    public static void setFields( Configuration conf, String fields ){
        setJSON( conf, INPUT_FIELDS, fields );
    }

    public static void setFields( Configuration conf, DBObject fields ){
        setDBObject( conf, INPUT_FIELDS, fields );
    }

    /**
     * Returns the configured fields as a DBObject... If you want a string call toString() on the returned object,
     * or use JSON.serialize().
     */
    public static DBObject getFields( Configuration conf ){
        return getDBObject( conf, INPUT_FIELDS );
    }

    public static void setSort( Configuration conf, String sort ){
        setJSON( conf, INPUT_SORT, sort );
    }

    public static void setSort( Configuration conf, DBObject sort ){
        setDBObject( conf, INPUT_SORT, sort );
    }

    /**
     * Returns the configured sort as a DBObject... If you want a string call toString() on the returned object,
     * or use JSON.serialize().
     */
    public static DBObject getSort( Configuration conf ){
        return getDBObject( conf, INPUT_SORT );
    }

    public static int getLimit( Configuration conf ){
        return conf.getInt( INPUT_LIMIT, 0 );
    }

    public static void setLimit( Configuration conf, int limit ){
        conf.setInt( INPUT_LIMIT, limit );
    }

    public static int getSkip( Configuration conf ){
        return conf.getInt( INPUT_SKIP, 0 );
    }

    public static void setSkip( Configuration conf, int skip ){
        conf.setInt( INPUT_SKIP, skip );
    }

    public static int getSplitSize( Configuration conf ){
        return conf.getInt( INPUT_SPLIT_SIZE, DEFAULT_SPLIT_SIZE );
    }

    public static void setSplitSize( Configuration conf, int value ){
        conf.setInt( INPUT_SPLIT_SIZE, value );
    }

    /**
     * If TRUE, splits will be queried using $lt/$gt instead of $max and $min.
     * This allows the database's query optimizer to choose the best index,
     * instead of being forced to use the one in the $max/$min keys.
     * This will only work if the key used for splitting is *not* a compound key.
     * Make sure that all values under the splitting key are of the same type, or
     * this will cause incomplete results.
     */
    public static boolean isRangeQueryEnabled( Configuration conf ){
        return conf.getBoolean( SPLITS_USE_RANGEQUERY, false );
    }

    public static void setRangeQueryEnabled( Configuration conf, boolean value ){
        conf.setBoolean( SPLITS_USE_RANGEQUERY, value );
    }

    /**
     * If TRUE, splits will be read by connecting to the individual shard servers.
     * Only use this if you understand the risks
     * (the issue has to do with chunks moving / relocating during balancing phases).
     */
    public static boolean canReadSplitsFromShards( Configuration conf ){
        return conf.getBoolean( SPLITS_USE_SHARDS, false );
    }

    public static void setReadSplitsFromShards( Configuration conf, boolean value ){
        conf.setBoolean( SPLITS_USE_SHARDS, value );
    }

    /**
     * If sharding is enabled, use the sharding configured chunks to split up data.
     */
    public static boolean isShardChunkedSplittingEnabled( Configuration conf ){
        return conf.getBoolean( SPLITS_USE_CHUNKS, true );
    }

    public static void setShardChunkSplittingEnabled( Configuration conf, boolean value ){
        conf.setBoolean( SPLITS_USE_CHUNKS, value );
    }

    public static boolean canReadSplitsFromSecondary( Configuration conf ){
        return conf.getBoolean( SPLITS_SLAVE_OK, false );
    }

    public static void setReadSplitsFromSecondary( Configuration conf, boolean value ){
        conf.setBoolean( SPLITS_SLAVE_OK, value );
    }

    public static boolean createInputSplits( Configuration conf ){
        return conf.getBoolean( CREATE_INPUT_SPLITS, true );
    }

    public static void setCreateInputSplits( Configuration conf, boolean value ){
        conf.setBoolean( CREATE_INPUT_SPLITS, value );
    }

    public static void setInputSplitKeyPattern( Configuration conf, String pattern ){
        setJSON( conf, INPUT_SPLIT_KEY_PATTERN, pattern );
    }

    public static void setInputSplitKey( Configuration conf, DBObject key ){
        setDBObject( conf, INPUT_SPLIT_KEY_PATTERN, key );
    }

    public static String getInputSplitKeyPattern( Configuration conf ){
        return conf.get( INPUT_SPLIT_KEY_PATTERN, "{ \"_id\": 1 }" );
    }

    public static DBObject getInputSplitKey( Configuration conf ){
        try {
            final String json = getInputSplitKeyPattern( conf );
            final DBObject obj = (DBObject) JSON.parse( json );
            if ( obj == null )
                return new BasicDBObject( "_id", 1 );
            else
                return obj;
        }
        catch ( final Exception e ) {
            throw new IllegalArgumentException( "Provided JSON String is not representable/parseable as a DBObject.", e );
        }
    }

    public static void setInputKey( Configuration conf, String fieldName ){
        // TODO (bwm) - validate key rules?
        conf.set( INPUT_KEY, fieldName );
    }

    public static String getInputKey( Configuration conf ){
        return conf.get( INPUT_KEY, "_id" );
    }

    public static void setNoTimeout( Configuration conf, boolean value ){
        conf.setBoolean( INPUT_NOTIMEOUT, value );
    }

    public static boolean isNoTimeout( Configuration conf ){
        return conf.getBoolean( INPUT_NOTIMEOUT, false );
    }

    // BSON-specific config functions.

    public static boolean getBSONReadSplits( Configuration conf ){
        return conf.getBoolean( BSON_READ_SPLITS, true );
    }

    public static void setBSONReadSplits( Configuration conf, boolean val ){
        conf.setBoolean( BSON_READ_SPLITS, val );
    }

    public static boolean getBSONWriteSplits( Configuration conf ){
        return conf.getBoolean( BSON_WRITE_SPLITS, true );
    }

    public static void setBSONWriteSplits( Configuration conf, boolean val ){
        conf.setBoolean( BSON_WRITE_SPLITS, val );
    }

    public static boolean getBSONOutputBuildSplits( Configuration conf ){
        return conf.getBoolean( BSON_OUTPUT_BUILDSPLITS, false );
    }

    public static void setBSONOutputBuildSplits( Configuration conf, boolean val ){
        conf.setBoolean( BSON_OUTPUT_BUILDSPLITS, val );
    }

    public static void setBSONPathFilter( Configuration conf, Class val ){
        conf.setClass( BSON_PATHFILTER, val, PathFilter.class );
    }

    public static Class getBSONPathFilter( Configuration conf ){
        return conf.getClass( BSON_PATHFILTER, null );
    }
}




