// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/*
* Copyright 2010-2013 10gen Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mongodb.hadoop.util;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.MongoURI;
import com.mongodb.hadoop.splitter.MongoSplitter;
import com.mongodb.util.JSON;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
/**
* Configuration helper tool for MongoDB related Map/Reduce jobs
*/
public final class MongoConfigUtil {
// Class-level logger for MongoConfigUtil.
private static final Log LOG = LogFactory.getLog(MongoConfigUtil.class);
/**
* The JOB_* values are entirely optional and disregarded unless you use the MongoTool base toolset... If you don't, feel free to ignore
* these
*/
// Note: the Javadoc above attaches only to JOB_VERBOSE, but it describes every JOB_* constant in this group.
// Each constant is the name of a configuration property (a string key), not the configured value itself.
public static final String JOB_VERBOSE = "mongo.job.verbose";
public static final String JOB_BACKGROUND = "mongo.job.background";
// NOTE(review): the following keys presumably name the classes to plug into the job
// (mapper, combiner, partitioner, reducer, sort comparator) — confirm against the accessors.
public static final String JOB_MAPPER = "mongo.job.mapper";
public static final String JOB_COMBINER = "mongo.job.combiner";
public static final String JOB_PARTITIONER = "mongo.job.partitioner";
public static final String JOB_REDUCER = "mongo.job.reducer";
public static final String JOB_SORT_COMPARATOR = "mongo.job.sort_comparator";
// NOTE(review): presumably the key/value types emitted by the mapper — confirm.
public static final String JOB_MAPPER_OUTPUT_KEY = "mongo.job.mapper.output.key";
public static final String JOB_MAPPER_OUTPUT_VALUE = "mongo.job.mapper.output.value";
// NOTE(review): presumably the InputFormat / OutputFormat implementations for the job — confirm.
public static final String JOB_INPUT_FORMAT = "mongo.job.input.format";
public static final String JOB_OUTPUT_FORMAT = "mongo.job.output.format";
// NOTE(review): presumably the final output key/value types of the job — confirm.
public static final String JOB_OUTPUT_KEY = "mongo.job.output.key";
public static final String JOB_OUTPUT_VALUE = "mongo.job.output.value";
// Property names for the job's MongoDB input/output endpoints and related settings.
// NOTE(review): value formats and defaults are not visible in this part of the file — confirm against the accessors.
// Property name for the input URI.
public static final String INPUT_URI = "mongo.input.uri";
// NOTE(review): presumably a list of mongos hosts to read through — confirm the expected format.
public static final String INPUT_MONGOS_HOSTS = "mongo.input.mongos_hosts";
// Property name for the output URI.
public static final String OUTPUT_URI = "mongo.output.uri";
// NOTE(review): presumably the number of documents written per batch — confirm unit and default.
public static final String OUTPUT_BATCH_SIZE = "mongo.output.batch.size";
// NOTE(review): presumably the splitter implementation class to use (cf. com.mongodb.hadoop.splitter.MongoSplitter) — confirm.
public static final String MONGO_SPLITTER_CLASS = "mongo.splitter.class";
/**
* The MongoDB field to read from for the Mapper Input.
* <p>
* This will be fed to your mapper as the "Key" for the input.
* <p>
* Defaults to {@code _id}
*/
public static final String INPUT_KEY = "mongo.input.key";
// Property names for options applied when reading input.
// NOTE(review): the semantics below are inferred from the key names only — confirm against the accessors.
// Presumably disables the cursor timeout on the input query.
public static final String INPUT_NOTIMEOUT = "mongo.input.notimeout";
// Presumably the query used to select input documents.
public static final String INPUT_QUERY = "mongo.input.query";
// Presumably the field selection (projection) applied to input documents.
public static final String INPUT_FIELDS = "mongo.input.fields";
// Presumably the sort specification applied to the input.
public static final String INPUT_SORT = "mongo.input.sort";
// Presumably the maximum number of input documents to read.
public static final String INPUT_LIMIT = "mongo.input.limit";
// Presumably the number of input documents to skip.
public static final String INPUT_SKIP = "mongo.input.skip";
// Presumably toggles lazily-decoded BSON objects for input documents.
public static final String INPUT_LAZY_BSON = "mongo.input.lazy_bson";
//Settings specific to bson reading/writing.
// NOTE(review): the descriptions below are inferred from the key names only — confirm against the BSON input/output code.
// Presumably the location where split information for BSON files is kept.
public static final String BSON_SPLITS_PATH = "bson.split.splits_path";
// Presumably toggles reading existing split information.
public static final String BSON_READ_SPLITS = "bson.split.read_splits";
// Presumably toggles writing computed split information.
public static final String BSON_WRITE_SPLITS = "bson.split.write_splits";
// Presumably toggles building split information while producing BSON output.
public static final String BSON_OUTPUT_BUILDSPLITS = "bson.output.build_splits";
// Presumably names a path-filter class (cf. the imported org.apache.hadoop.fs.PathFilter) used to select BSON files.
public static final String BSON_PATHFILTER = "bson.pathfilter.class";
/**
* A username and password to use.
* <p>
* This is necessary when running jobs with a sharded cluster, as access to the config database is needed to get
* information about the cluster.
* <p>
* NOTE(review): the original sentence was cut off after "needed to get"; the ending above is a guess
* (presumably shard/chunk metadata) — confirm and reword. Judging by the key name, the credentials are
* presumably supplied as part of a MongoDB URI.
*/
public static final String AUTH_URI = "mongo.auth.uri";
/**
* The number of megabytes per Split to create for the input data. Applies when *not* using
* 'read_from_shards' ({@link #SPLITS_USE_SHARDS}) or 'read_shard_chunks' ({@link #SPLITS_USE_CHUNKS}).
* <p>
* Currently defaults to 8MB ({@link #DEFAULT_SPLIT_SIZE}), tweak it as necessary for your code.
* <p>
* This default will likely change as we research better options.
*/
public static final String INPUT_SPLIT_SIZE = "mongo.input.split_size";
/** Default value for {@link #INPUT_SPLIT_SIZE}, in megabytes. */
public static final int DEFAULT_SPLIT_SIZE = 8; // 8 mb per manual (non-sharding) split
/**
* When {@code true}, MongoSplitter implementations will check for and
* remove empty splits before returning them from {@code calculateSplits}.
* This requires pulling a small amount of data from MongoDB but avoids
* starting tasks that don't have any data to process.
* <p>
* This option is useful when providing a query to {@link #INPUT_QUERY},
* and that query selects a subset of the documents in the collection
* that are clustered in the same range of the index described by
* {@link #INPUT_SPLIT_KEY_PATTERN}. If the query selects documents
* throughout the index, consider using
* {@link com.mongodb.hadoop.splitter.MongoPaginatingSplitter} and setting
* {@link #INPUT_SPLIT_MIN_DOCS} instead.
*/
public static final String ENABLE_FILTER_EMPTY_SPLITS =
"mongo.input.splits.filter_empty";
/**
* When {@link #SPLITS_USE_RANGEQUERY} is enabled, this option sets the
* minimum number of documents to be contained in each MongoInputSplit
* (does not apply to BSON). This option only applies when using
* {@link com.mongodb.hadoop.splitter.MongoPaginatingSplitter} as the
* splitter implementation.
* <p>
* This value defaults to {@link #DEFAULT_INPUT_SPLIT_MIN_DOCS}.
*/
public static final String INPUT_SPLIT_MIN_DOCS =
"mongo.input.splits.min_docs";
/** Default value for {@link #INPUT_SPLIT_MIN_DOCS}: the minimum number of documents per split. */
public static final int DEFAULT_INPUT_SPLIT_MIN_DOCS = 1000;
/**
* If {@link #CREATE_INPUT_SPLITS} is true but {@link #SPLITS_USE_CHUNKS} is false, Mongo-Hadoop will attempt to create custom input
* splits for you. By default it will split on {@code _id}, which is a reasonable/sane default.
* <p>
* If you want to customize that split point for efficiency reasons (such as different distribution) you may set this to any valid field
* name. The restrictions on this key name are the *exact same rules* as when sharding an existing MongoDB Collection. You must have an
* index on the field, and follow the other rules outlined in the docs.
* <p>
* To customize the range of the index that is used to create splits, see
* the {@link #INPUT_SPLIT_KEY_MIN} and {@link #INPUT_SPLIT_KEY_MAX}
* settings.
* <p>
* This must be a JSON document, and not just a field name!
*
* @see "MongoDB manual: Shard Keys"
*/
public static final String INPUT_SPLIT_KEY_PATTERN = "mongo.input.split.split_key_pattern";
/**
* Lower-bound for splits created using the index described by
* {@link #INPUT_SPLIT_KEY_PATTERN}. This value must be set to a JSON
* string that describes a point in the index. This setting must be used
* in conjunction with {@code INPUT_SPLIT_KEY_PATTERN} and
* {@link #INPUT_SPLIT_KEY_MAX}.
*/
public static final String INPUT_SPLIT_KEY_MIN = "mongo.input.split.split_key_min";
/**
* Upper-bound for splits created using the index described by
* {@link #INPUT_SPLIT_KEY_PATTERN}. This value must be set to a JSON
* string that describes a point in the index. This setting must be used
* in conjunction with {@code INPUT_SPLIT_KEY_PATTERN} and
* {@link #INPUT_SPLIT_KEY_MIN}.
*/
public static final String INPUT_SPLIT_KEY_MAX = "mongo.input.split.split_key_max";
/**
* If {@code true}, the driver will attempt to split the MongoDB Input data (if reading from Mongo) into multiple InputSplits to allow
* parallelism/concurrency in processing within Hadoop. That is to say, Hadoop will assign one InputSplit per mapper.
* <p>
* This is {@code true} by default now, but if {@code false}, only one InputSplit (your whole collection) will be assigned to Hadoop,
* severely reducing parallel mapping.
*/
public static final String CREATE_INPUT_SPLITS = "mongo.input.split.create_input_splits";
/**
* If {@code true} in a sharded setup splits will be made to connect to individual backend {@code mongod}s. This can be unsafe. If
* {@code mongos} is moving chunks around you might see duplicate data, or miss some data entirely. Defaults to {@code false}.
*/
public static final String SPLITS_USE_SHARDS = "mongo.input.split.read_from_shards";
/**
* If {@code true} have one split = one shard chunk. If {@link #SPLITS_USE_SHARDS} is not true splits will still use chunks, but will
* connect through {@code mongos} instead of the individual backend {@code mongod}s (the safe thing to do). If {@link
* #SPLITS_USE_SHARDS} is {@code true} but this is {@code false} one split will be made for each backend shard. THIS IS UNSAFE and may
* result in data being run multiple times.
* <p>
* Defaults to {@code true}.
*/
public static final String SPLITS_USE_CHUNKS = "mongo.input.split.read_shard_chunks";
/**
* If {@code true} and the shards are replica sets, run queries on slaves (secondaries). If set, this will override any option
* passed on the URI.
* <p>
* Defaults to {@code false}
*/
public static final String SPLITS_SLAVE_OK = "mongo.input.split.allow_read_from_secondaries";
/**
* If {@code true} then queries for splits will be constructed using {@code $lt}/{@code $gt} instead of {@code $min} and
* {@code $max}.
* <p>
* Defaults to {@code false}
*/
public static final String SPLITS_USE_RANGEQUERY = "mongo.input.split.use_range_queries";
/**
* One client per thread
*/
private static final ThreadLocal