com.mongodb.hadoop.input.GridFSSplit

The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.
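For context, a minimal job-configuration sketch (not part of this file): it assumes the connector's GridFSInputFormat and the standard mongo.input.uri configuration key; the host, database, and bucket names are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.mongodb.hadoop.GridFSInputFormat;

public class GridFSJobSetup {
    public static Job configure() throws Exception {
        Configuration conf = new Configuration();
        // mongo.input.uri names the database and the GridFS bucket to read;
        // a collection name of "fs" resolves to fs.files / fs.chunks.
        conf.set("mongo.input.uri", "mongodb://localhost:27017/mydb.fs");
        Job job = Job.getInstance(conf, "gridfs-example");
        job.setInputFormatClass(GridFSInputFormat.class);
        return job;
    }
}

The source of com.mongodb.hadoop.input.GridFSSplit follows.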
package com.mongodb.hadoop.input;

import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClientURI;
import com.mongodb.gridfs.GridFS;
import com.mongodb.gridfs.GridFSDBFile;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.hadoop.mapreduce.InputSplit;
import org.bson.types.ObjectId;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;

public class GridFSSplit extends InputSplit
  implements org.apache.hadoop.mapred.InputSplit {

    private ObjectId fileId;         // _id of the GridFS file
    private int chunkSize;           // chunk size of the file, in bytes
    private long fileLength;         // total length of the file, in bytes
    private int chunkId;             // index of the chunk this split covers
    private MongoClientURI inputURI;
    private GridFS gridFS;           // lazily initialized; see getGridFS()
    private GridFSDBFile file;       // lazily initialized; see getFile()

    // Zero-args constructor required for mapred.InputSplit.
    public GridFSSplit() {}

    public GridFSSplit(
      final MongoClientURI inputURI,
      final ObjectId fileId,
      final int chunkSize,
      final long fileLength) {
        this(inputURI, fileId, chunkSize, fileLength, 0);
    }

    public GridFSSplit(
      final MongoClientURI inputURI,
      final ObjectId fileId,
      final int chunkSize,
      final long fileLength,
      final int chunkId) {
        this.inputURI = inputURI;
        this.fileId = fileId;
        this.chunkSize = chunkSize;
        this.fileLength = fileLength;
        this.chunkId = chunkId;
    }

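    // Lazily open the GridFS bucket backing this split. The collection named
    // in the input URI is treated as the bucket root, so a collection name of
    // "fs" resolves to the "fs.files" and "fs.chunks" collections.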
    private GridFS getGridFS() {
        if (null == gridFS) {
            DBCollection rootCollection =
              MongoConfigUtil.getCollection(inputURI);
            gridFS = new GridFS(
              rootCollection.getDB(), rootCollection.getName());
        }
        return gridFS;
    }

    private GridFSDBFile getFile() throws IOException {
        if (null == file) {
            file = getGridFS().find(fileId);
            if (null == file) {
                throw new IOException("No file found for id " + fileId);
            }
        }
        return file;
    }

    /**
     * Get the value for a key in this file's GridFS document.
     * @param key the key to look up
     * @return the value for the key
     * @throws IOException if the file cannot be retrieved
     */
    public Object get(final String key) throws IOException {
        return getFile().get(key);
    }

    /**
     * Get the metadata associated with this file.
     * @return the metadata as a {@code DBObject}.
     * @throws IOException if the file cannot be retrieved
     */
    public DBObject getMetadata() throws IOException {
        return getFile().getMetaData();
    }

    /**
     * Get the id of the chunk this split represents.
     * @return the chunk id
     */
    public int getChunkId() {
        return chunkId;
    }

    /**
     * Get the size of the chunk this split represents, in bytes.
     * @return the size in bytes of the chunk
     */
    public int getChunkSize() {
        return chunkSize;
    }

    /**
     * Read data from the GridFS chunk represented by this split.
     * @return an input stream positioned at the start of this split's chunk
     * @throws IOException if there is an exception reading the data.
     */
    public InputStream getData() throws IOException {
        InputStream fileStream = getFile().getInputStream();

        // Skip to the start of this split's chunk. GridFSInputStream's skip()
        // will do what we want here. The cast to long avoids int overflow for
        // files larger than 2 GB.
        // noinspection ResultOfMethodCallIgnored
        fileStream.skip((long) chunkSize * chunkId);
        return fileStream;
    }

    @Override
    public long getLength() throws IOException {
        return fileLength;
    }

    @Override
    public String[] getLocations() throws IOException {
        List<String> hosts = inputURI.getHosts();
        return hosts.toArray(new String[hosts.size()]);
    }

    // Implement mapred.InputSplit.
    @Override
    public void write(final DataOutput out) throws IOException {
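        // Field order here must mirror readFields() below exactly.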
        out.writeUTF(inputURI.toString());
        out.write(fileId.toByteArray());
        out.writeInt(chunkId);
        out.writeLong(fileLength);
        out.writeInt(chunkSize);
    }

    @Override
    public void readFields(final DataInput in) throws IOException {
        inputURI = new MongoClientURI(in.readUTF());
        byte[] oidBytes = new byte[12];
        in.readFully(oidBytes);
        fileId = new ObjectId(oidBytes);
        chunkId = in.readInt();
        fileLength = in.readLong();
        chunkSize = in.readInt();
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("GridFSSplit{");
        if (inputURI != null) {
            sb.append("inputURI hosts=")
              .append(inputURI.getHosts())
              .append(", ");
        }
        if (fileId != null) {
            sb.append("fileId=").append(fileId).append(", ");
        }
        return sb
          .append("chunkSize=").append(chunkSize)
          .append(", fileLength=").append(fileLength)
          .append(", chunkId=").append(chunkId).append("}").toString();
    }

    @Override
    public int hashCode() {
        int hash = (null == inputURI ? 0 : inputURI.hashCode());
        hash = hash * 31 + (null == fileId ? 0 : fileId.hashCode());
        return hash * 31 + chunkId;
    }

    @Override
    public boolean equals(final Object obj) {
        if (obj instanceof GridFSSplit) {
            GridFSSplit other = (GridFSSplit) obj;
            return (null == inputURI
              ? null == other.inputURI : inputURI.equals(other.inputURI))
              && (null == fileId
              ? null == other.fileId : fileId.equals(other.fileId))
              && chunkId == other.chunkId;
        }
        return false;
    }
}
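
For illustration, here is a minimal sketch of how a caller (for example, a record reader) might consume exactly one chunk from a split. The class and its readChunk helper are hypothetical, not part of the connector; the sketch relies only on the public accessors above.

import com.mongodb.hadoop.input.GridFSSplit;

import java.io.IOException;
import java.io.InputStream;

public final class GridFSSplitReadExample {

    // Read exactly the bytes belonging to the split's chunk. The last chunk
    // of a file may be shorter than chunkSize, so the length is clamped to
    // what remains of the file.
    static byte[] readChunk(final GridFSSplit split) throws IOException {
        long start = (long) split.getChunkSize() * split.getChunkId();
        int length = (int) Math.min(split.getChunkSize(),
                                    split.getLength() - start);
        byte[] buffer = new byte[length];
        InputStream in = split.getData(); // already positioned at the chunk
        try {
            int read = 0;
            while (read < length) {
                int n = in.read(buffer, read, length - read);
                if (n < 0) {
                    throw new IOException(
                      "Unexpected end of stream at byte " + (start + read));
                }
                read += n;
            }
        } finally {
            in.close();
        }
        return buffer;
    }
}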