com.mongodb.hadoop.splitter.BSONSplitter (mongo-hadoop-core)
The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.
/*
* Copyright 2010-2013 10gen Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mongodb.hadoop.splitter;
import com.mongodb.BasicDBObjectBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.bson.BSONObject;
import org.bson.BasicBSONCallback;
import org.bson.BasicBSONDecoder;
import org.bson.BasicBSONEncoder;
import org.bson.LazyBSONCallback;
import org.bson.LazyBSONDecoder;
import org.bson.LazyBSONObject;
import java.io.IOException;
import java.util.ArrayList;
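/**
 * Calculates input splits for BSON files. A file is scanned once, document by
 * document, using each document's size header, and documents are grouped into
 * splits of roughly the configured split size. Calculated splits can be
 * persisted as a hidden ".<filename>.splits" file beside the input so that
 * later jobs can load them instead of re-scanning the file.
 */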
public class BSONSplitter extends Configured implements Tool {
private static final Log LOG = LogFactory.getLog(BSONSplitter.class);
private ArrayList<FileSplit> splitsList;
private Path inputPath;
private BasicBSONCallback callback = new BasicBSONCallback();
private LazyBSONCallback lazyCallback = new LazyBSONCallback();
private LazyBSONDecoder lazyDec = new LazyBSONDecoder();
private BasicBSONDecoder bsonDec = new BasicBSONDecoder();
private BasicBSONEncoder bsonEnc = new BasicBSONEncoder();
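/** Thrown when no pre-computed ".splits" file exists for an input file. */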
public static class NoSplitFileException extends Exception {
}
public void setInputPath(final Path p) {
inputPath = p;
}
public ArrayList<FileSplit> getAllSplits() {
if (splitsList == null) {
return new ArrayList<FileSplit>(0);
} else {
return splitsList;
}
}
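/**
 * Reconstructs a FileSplit from a BSON document of the form
 * {"s": start, "l": length}, as produced by writeSplits(). Falls back to a
 * split without host locations if block locations cannot be retrieved.
 */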
public FileSplit createFileSplitFromBSON(final BSONObject obj, final FileSystem fs, final FileStatus inputFile) throws IOException {
long start = (Long) obj.get("s");
long splitLen = (Long) obj.get("l");
try {
BlockLocation[] blkLocations = fs.getFileBlockLocations(inputFile, start, splitLen);
int blockIndex = getLargestBlockIndex(blkLocations);
return new FileSplit(inputFile.getPath(), start, splitLen, blkLocations[blockIndex].getHosts());
} catch (IOException e) {
LOG.warn("Couldn't find block locations when constructing input split from BSON. Using non-block-aware input split; "
+ e.getMessage());
return new FileSplit(inputFile.getPath(), start, splitLen, null);
}
}
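/** Builds a block-location-aware FileSplit for the given byte range of the input file. */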
public FileSplit createFileSplit(final FileStatus inFile, final FileSystem fs, final long splitStart, final long splitLen) {
try {
BlockLocation[] blkLocations = fs.getFileBlockLocations(inFile, splitStart, splitLen);
int blockIndex = getLargestBlockIndex(blkLocations);
return new FileSplit(inFile.getPath(), splitStart, splitLen, blkLocations[blockIndex].getHosts());
} catch (IOException e) {
LOG.warn("Couldn't find block locations when constructing input split from byte offset. Using non-block-aware input split; "
+ e.getMessage());
return new FileSplit(inFile.getPath(), splitStart, splitLen, null);
}
}
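/**
 * Loads pre-computed splits from an existing splits file, one BSON document
 * ({"s": start, "l": length}) per split, and stores them in splitsList.
 * Throws NoSplitFileException when no splits file can be found.
 */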
public void loadSplitsFromSplitFile(final FileStatus inputFile, final Path splitFile) throws NoSplitFileException, IOException {
ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
FileSystem fs = splitFile.getFileSystem(getConf()); // throws IOException
FileStatus splitFileStatus;
try {
splitFileStatus = fs.getFileStatus(splitFile);
LOG.info("Found split file at : " + splitFileStatus);
} catch (Exception e) {
throw new NoSplitFileException();
}
FSDataInputStream fsDataStream = fs.open(splitFile); // throws IOException
try {
while (fsDataStream.getPos() < splitFileStatus.getLen()) {
callback.reset();
bsonDec.decode(fsDataStream, callback);
BSONObject splitInfo = (BSONObject) callback.get();
splits.add(createFileSplitFromBSON(splitInfo, fs, inputFile));
}
} finally {
fsDataStream.close();
}
splitsList = splits;
}
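/**
 * Returns the target split size in bytes: the file's block size (or the
 * configured filesystem block size when no file is given), clamped between
 * mapred.min.split.size and mapred.max.split.size.
 */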
public static long getSplitSize(final Configuration conf, final FileStatus file) {
long minSize = Math.max(1L, conf.getLong("mapred.min.split.size", 1L));
long maxSize = conf.getLong("mapred.max.split.size", Long.MAX_VALUE);
if (file != null) {
long fileBlockSize = file.getBlockSize();
return Math.max(minSize, Math.min(maxSize, fileBlockSize));
} else {
long blockSize = conf.getLong("dfs.blockSize", 64 * 1024 * 1024);
return Math.max(minSize, Math.min(maxSize, blockSize));
}
}
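/**
 * Scans a BSON file document by document, grouping documents into splits of
 * roughly getSplitSize() bytes, then stores the result and persists it via
 * writeSplits(). If bson.split.read_splits is false, the whole file becomes
 * a single split.
 */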
public void readSplitsForFile(final FileStatus file) throws IOException {
Path path = file.getPath();
ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
FileSystem fs = path.getFileSystem(getConf());
long length = file.getLen();
if (!getConf().getBoolean("bson.split.read_splits", true)) {
LOG.info("Reading splits is disabled - constructing single split for " + file);
FileSplit onesplit = createFileSplit(file, fs, 0, length);
splits.add(onesplit);
splitsList = splits;
return;
}
if (length != 0) {
int numDocsRead = 0;
long splitSize = getSplitSize(getConf(), file);
if (LOG.isDebugEnabled()) {
LOG.debug("Generating splits for " + path + " of up to " + splitSize + " bytes.");
}
FSDataInputStream fsDataStream = fs.open(path);
long curSplitLen = 0;
long curSplitStart = 0;
try {
while (fsDataStream.getPos() + 1 < length) {
lazyCallback.reset();
lazyDec.decode(fsDataStream, lazyCallback);
LazyBSONObject bo = (LazyBSONObject) lazyCallback.get();
int bsonDocSize = bo.getBSONSize();
// Close out the current split before this document would push it past
// the target size; skip while the current split is still empty.
if (curSplitLen > 0 && curSplitLen + bsonDocSize >= splitSize) {
FileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
splits.add(split);
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Creating new split (%d) %s", splits.size(), split));
}
curSplitStart = fsDataStream.getPos() - bsonDocSize;
curSplitLen = 0;
}
curSplitLen += bsonDocSize;
numDocsRead++;
if (numDocsRead % 1000 == 0) {
float splitProgress = 100f * ((float) fsDataStream.getPos() / length);
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Read %d docs calculating splits for %s; %3.3f%% complete.",
numDocsRead, file.getPath(), splitProgress));
}
}
}
if (curSplitLen > 0) {
FileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
splits.add(split);
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Final split (%d) %s", splits.size(), split.getPath()));
}
}
splitsList = splits;
if (LOG.isDebugEnabled()) {
LOG.debug("Completed splits calculation for " + file.getPath());
}
writeSplits();
} catch (IOException e) {
LOG.warn("IOException: " + e);
} finally {
fsDataStream.close();
}
} else {
LOG.warn("Zero-length file, skipping split calculation.");
}
}
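/**
 * Persists the calculated splits beside the input file as a hidden
 * ".<filename>.splits" file, one BSON document ({"s": start, "l": length})
 * per split. Disabled by setting bson.split.write_splits to false.
 */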
public void writeSplits() throws IOException {
if (!getConf().getBoolean("bson.split.write_splits", true)) {
LOG.info("bson.split.write_splits is set to false - skipping writing splits to disk.");
return;
} else {
LOG.info("Writing splits to disk.");
}
if (splitsList == null) {
LOG.info("No splits found, skipping write of splits file.");
return;
}
Path outputPath = new Path(inputPath.getParent(), "." + inputPath.getName() + ".splits");
FileSystem pathFileSystem = outputPath.getFileSystem(getConf());
FSDataOutputStream fsDataOut = null;
try {
// Overwrite any existing splits file; run() may invoke writeSplits()
// after readSplitsForFile() has already persisted the splits.
fsDataOut = pathFileSystem.create(outputPath, true);
for (FileSplit inputSplit : splitsList) {
BSONObject splitObj = BasicDBObjectBuilder.start()
.add("s", inputSplit.getStart())
.add("l", inputSplit.getLength()).get();
byte[] encodedObj = bsonEnc.encode(splitObj);
fsDataOut.write(encodedObj, 0, encodedObj.length);
}
} catch (IOException e) {
LOG.error("Could not create splits file: " + e.getMessage());
throw e;
} finally {
if (fsDataOut != null) {
fsDataOut.close();
}
}
}
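/** Calculates splits for the file at the configured input path. */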
public void readSplits() throws IOException {
if (inputPath == null) {
throw new IllegalStateException("Input path has not been set.");
}
splitsList = new ArrayList<FileSplit>();
FileSystem fs = inputPath.getFileSystem(getConf());
FileStatus file = fs.getFileStatus(inputPath);
readSplitsForFile(file);
}
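/** Tool entry point: takes the input path from mapred.input.dir, then calculates and writes splits. */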
public int run(final String[] args) throws Exception {
setInputPath(new Path(getConf().get("mapred.input.dir", "")));
readSplits();
writeSplits();
return 0;
}
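/**
 * Returns the index of the largest block in the given array, or -1 when the
 * array is null or every block has zero length.
 */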
public static int getLargestBlockIndex(final BlockLocation[] blockLocations) {
int retVal = -1;
if (blockLocations == null) {
return retVal;
}
long max = 0;
for (int i = 0; i < blockLocations.length; i++) {
BlockLocation blk = blockLocations[i];
if (blk.getLength() > max) {
max = blk.getLength();
retVal = i;
}
}
return retVal;
}
public static void main(final String[] args) throws Exception {
System.exit(ToolRunner.run(new BSONSplitter(), args));
}
}
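Below is a minimal, hypothetical usage sketch (not part of the original source): it drives the Tool entry point directly to pre-compute splits for a BSON dump. The input path and the 32 MB split-size cap are illustrative assumptions.
// Hypothetical standalone driver, for illustration only. Reuses the
// Configuration and ToolRunner imports already present in this file.
class BSONSplitterExample {
public static void main(final String[] args) throws Exception {
Configuration conf = new Configuration();
// File whose splits should be pre-computed (assumed path).
conf.set("mapred.input.dir", "file:///tmp/dump.bson");
// Optionally cap split size at 32 MB instead of the block size.
conf.setLong("mapred.max.split.size", 32L * 1024 * 1024);
// run() reads mapred.input.dir, scans the file, and leaves a hidden
// .dump.bson.splits file next to the input for later jobs to reuse.
System.exit(ToolRunner.run(conf, new BSONSplitter(), args));
}
}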