com.mongodb.hadoop.splitter.MongoPaginatingSplitter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mongo-hadoop-core Show documentation
The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.
There is a newer version: 2.0.2
Show newest version
package com.mongodb.hadoop.splitter;

import com.mongodb.BasicDBObject;
import com.mongodb.BasicDBObjectBuilder;
import com.mongodb.Bytes;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;


/**
 * MongoPaginatingSplitter is suitable for splitting a collection when given
 * an input query that matches a subset of the documents in the collection,
 * and these documents are spread throughout the index described by
 * {@link com.mongodb.hadoop.util.MongoConfigUtil#INPUT_SPLIT_KEY_PATTERN}.
 * If the documents selected by the query tend to be in a single range of the
 * index, consider using
 * {@link com.mongodb.hadoop.util.MongoConfigUtil#ENABLE_FILTER_EMPTY_SPLITS}
 * instead.
 *
 * Splits produced by this Splitter implementation contain at least
 * {@link com.mongodb.hadoop.util.MongoConfigUtil#INPUT_SPLIT_MIN_DOCS}
 * documents, except for the last split in the collection, which may contain
 * fewer.
 *
 * This Splitter implementation must make several queries to the database in
 * order to build splits.
 */
public class MongoPaginatingSplitter extends MongoCollectionSplitter {

    public MongoPaginatingSplitter() {}

    public MongoPaginatingSplitter(final Configuration conf) {
        super(conf);
    }

    public List calculateSplits() throws SplitFailedException {
        Configuration conf = getConfiguration();
        if (!MongoConfigUtil.isRangeQueryEnabled(conf)) {
            throw new IllegalArgumentException(
              "Cannot split using " + getClass().getName() + " when "
              + MongoConfigUtil.SPLITS_USE_RANGEQUERY + " is disabled.");
        }
        DBObject splitKeyObj = MongoConfigUtil.getInputSplitKey(conf);
        Set splitKeys = splitKeyObj.keySet();
        if (splitKeys.size() > 1) {
            throw new IllegalArgumentException(
              "Cannot split using " + getClass().getName() + " when "
                + MongoConfigUtil.INPUT_SPLIT_KEY_PATTERN + " describes a "
                + "compound key.");
        }
        String splitKey = splitKeys.iterator().next();

        DBObject splitKeyProjection = new BasicDBObject(splitKey, 1);
        if (!splitKey.equals("_id")) {
            splitKeyProjection.put("_id", 0);
        }

        int minDocs = MongoConfigUtil.getInputSplitMinDocs(conf);
        DBCollection inputCollection =
          MongoConfigUtil.getInputCollection(conf);
        DBObject query = MongoConfigUtil.getQuery(conf);
        DBObject rangeObj = null;
        List splits = new ArrayList();
        Object minBound = null, maxBound;
        DBCursor cursor;

        try {
            do {
                if (null == minBound) {
                    cursor = inputCollection.find(query, splitKeyProjection);
                } else {
                    if (null == rangeObj) {
                        rangeObj =
                          new BasicDBObjectBuilder()
                            .push(splitKey).add("$gte", minBound).pop().get();
                        rangeObj.putAll(query);
                    } else {
                        ((DBObject) rangeObj.get(splitKey))
                          .put("$gte", minBound);
                    }
                    cursor = inputCollection.find(rangeObj, splitKeyProjection);
                }
                cursor = cursor.sort(splitKeyObj).skip(minDocs).limit(1)
                  .setOptions(Bytes.QUERYOPTION_NOTIMEOUT);

                if (cursor.hasNext()) {
                    maxBound = cursor.next().get(splitKey);
                } else {
                    maxBound = null;
                }

                BasicDBObject lowerBound = null, upperBound = null;
                if (minBound != null) {
                    lowerBound = new BasicDBObject(splitKey, minBound);
                }
                if (maxBound != null) {
                    upperBound = new BasicDBObject(splitKey, maxBound);
                }
                splits.add(
                  createRangeQuerySplit(lowerBound, upperBound, query));

                minBound = maxBound;
            } while (maxBound != null);
        } finally {
            MongoConfigUtil.close(inputCollection.getDB().getMongo());
        }

        return splits;
    }

}