/*
 * com.mongodb.hadoop.splitter.MongoPaginatingSplitter (artifact: mongo-hadoop-core)
 *
 * The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the
 * ability to use MongoDB as an input source and/or an output destination.
 */
package com.mongodb.hadoop.splitter;
import com.mongodb.BasicDBObject;
import com.mongodb.BasicDBObjectBuilder;
import com.mongodb.Bytes;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
/**
 * MongoPaginatingSplitter is suitable for splitting a collection when given
 * an input query that matches a subset of the documents in the collection,
 * and these documents are spread throughout the index described by
 * {@link com.mongodb.hadoop.util.MongoConfigUtil#INPUT_SPLIT_KEY_PATTERN}.
 * If the documents selected by the query tend to be in a single range of the
 * index, consider using
 * {@link com.mongodb.hadoop.util.MongoConfigUtil#ENABLE_FILTER_EMPTY_SPLITS}
 * instead.
 *
 * Splits produced by this Splitter implementation contain at least
 * {@link com.mongodb.hadoop.util.MongoConfigUtil#INPUT_SPLIT_MIN_DOCS}
 * documents, except for the last split in the collection, which may contain
 * fewer.
 *
 * This Splitter implementation must make several queries to the database in
 * order to build splits: one paging query per split, plus one extra query
 * whenever a run of equal split-key values prevents the paging query from
 * advancing.
 */
public class MongoPaginatingSplitter extends MongoCollectionSplitter {

    /** Creates a splitter without a configuration. */
    public MongoPaginatingSplitter() {}

    /**
     * Creates a splitter that uses the given Hadoop configuration.
     *
     * @param conf the job configuration, passed to the superclass
     */
    public MongoPaginatingSplitter(final Configuration conf) {
        super(conf);
    }

    /**
     * Builds range-query splits by repeatedly paging through the documents
     * matched by the input query, sorted by the split key, and using every
     * {@code minDocs}-th split-key value as a split boundary.
     *
     * @return the computed splits; always contains at least one split
     * @throws IllegalArgumentException if range queries are disabled, or if
     *         the split key pattern is empty or describes a compound key
     * @throws SplitFailedException declared by the splitter contract
     */
    public List<InputSplit> calculateSplits() throws SplitFailedException {
        final Configuration conf = getConfiguration();
        if (!MongoConfigUtil.isRangeQueryEnabled(conf)) {
            throw new IllegalArgumentException(
                "Cannot split using " + getClass().getName() + " when "
                    + MongoConfigUtil.SPLITS_USE_RANGEQUERY + " is disabled.");
        }

        final DBObject splitKeyObj = MongoConfigUtil.getInputSplitKey(conf);
        final Set<String> splitKeys = splitKeyObj.keySet();
        if (splitKeys.isEmpty()) {
            // Fail with a descriptive message instead of a bare
            // NoSuchElementException from iterator().next() below.
            throw new IllegalArgumentException(
                "Cannot split using " + getClass().getName() + " when "
                    + MongoConfigUtil.INPUT_SPLIT_KEY_PATTERN + " is empty.");
        }
        if (splitKeys.size() > 1) {
            throw new IllegalArgumentException(
                "Cannot split using " + getClass().getName() + " when "
                    + MongoConfigUtil.INPUT_SPLIT_KEY_PATTERN + " describes a "
                    + "compound key.");
        }
        final String splitKey = splitKeys.iterator().next();

        // Only the split key is needed from each probe document; _id is
        // returned by default and must be excluded explicitly.
        final DBObject splitKeyProjection = new BasicDBObject(splitKey, 1);
        if (!"_id".equals(splitKey)) {
            splitKeyProjection.put("_id", 0);
        }

        final int minDocs = MongoConfigUtil.getInputSplitMinDocs(conf);
        final DBCollection inputCollection =
            MongoConfigUtil.getInputCollection(conf);
        final DBObject query = MongoConfigUtil.getQuery(conf);
        final List<InputSplit> splits = new ArrayList<InputSplit>();

        Object minBound = null;
        Object maxBound;
        try {
            do {
                // Probe for the split-key value found minDocs documents past
                // the current lower bound; that value closes this split.
                final DBObject pageQuery = minBound == null
                    ? query
                    : withBound(query, splitKey, "$gte", minBound);
                maxBound = readSplitKey(
                    inputCollection.find(pageQuery, splitKeyProjection)
                        .sort(splitKeyObj).skip(minDocs).limit(1)
                        .setOptions(Bytes.QUERYOPTION_NOTIMEOUT),
                    splitKey);

                if (minBound != null && minBound.equals(maxBound)) {
                    // More than minDocs documents share the lower-bound
                    // value, so the paging probe cannot advance. Jump to the
                    // next strictly greater value; otherwise the same query
                    // would be repeated forever.
                    maxBound = readSplitKey(
                        inputCollection
                            .find(withBound(query, splitKey, "$gt", minBound),
                                  splitKeyProjection)
                            .sort(splitKeyObj).limit(1)
                            .setOptions(Bytes.QUERYOPTION_NOTIMEOUT),
                        splitKey);
                }

                final BasicDBObject lowerBound = minBound == null
                    ? null : new BasicDBObject(splitKey, minBound);
                final BasicDBObject upperBound = maxBound == null
                    ? null : new BasicDBObject(splitKey, maxBound);
                splits.add(
                    createRangeQuerySplit(lowerBound, upperBound, query));

                minBound = maxBound;
            } while (maxBound != null);
        } finally {
            MongoConfigUtil.close(inputCollection.getDB().getMongo());
        }
        return splits;
    }

    /**
     * Returns a new query matching {@code query} restricted to documents
     * whose split key satisfies {@code operator bound}. The caller's query
     * object is never modified.
     */
    private static DBObject withBound(final DBObject query,
                                      final String splitKey,
                                      final String operator,
                                      final Object bound) {
        final DBObject range =
            new BasicDBObject(splitKey, new BasicDBObject(operator, bound));
        if (query.containsField(splitKey)) {
            // The query already filters on the split key; merging the two
            // documents would drop one of the conditions, so AND them.
            final List<DBObject> clauses = new ArrayList<DBObject>(2);
            clauses.add(query);
            clauses.add(range);
            return new BasicDBObject("$and", clauses);
        }
        range.putAll(query);
        return range;
    }

    /**
     * Reads the split-key value of the first document produced by
     * {@code cursor}, or {@code null} if the cursor is empty, and always
     * closes the cursor.
     */
    private static Object readSplitKey(final DBCursor cursor,
                                       final String splitKey) {
        try {
            return cursor.hasNext() ? cursor.next().get(splitKey) : null;
        } finally {
            cursor.close();
        }
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy