com.mongodb.hadoop.GridFSInputFormat Maven / Gradle / Ivy
The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.
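As a quick orientation before the source listing, here is a minimal, hypothetical driver sketch showing how this input format might be wired into a MapReduce job. It assumes the standard mongo-hadoop configuration key mongo.input.uri (read back by MongoConfigUtil.getInputURI() and getInputCollection() in getSplits() below) and the default text read mode; GridFSJobDriver and GridFSWordMapper are illustrative names, not part of the connector.

package com.example.sketch; // hypothetical package; not part of mongo-hadoop

import com.mongodb.hadoop.GridFSInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

public class GridFSJobDriver {
    public static void main(final String[] args) throws Exception {
        Configuration conf = new Configuration();
        // getSplits() opens a GridFS bucket named after the input collection in this
        // URI, so "db.fs" points at the default "fs" bucket of database "db".
        conf.set("mongo.input.uri", "mongodb://localhost:27017/db.fs");

        Job job = Job.getInstance(conf, "gridfs-example");
        job.setJarByClass(GridFSJobDriver.class);
        job.setInputFormatClass(GridFSInputFormat.class);
        // GridFSWordMapper is a hypothetical mapper; see the sketch after the listing.
        job.setMapperClass(GridFSWordMapper.class);
        job.setNumReduceTasks(0); // map-only: tokens go straight to the output format
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(NullOutputFormat.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The full source of com.mongodb.hadoop.GridFSInputFormat follows.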
package com.mongodb.hadoop;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClientURI;
import com.mongodb.gridfs.GridFS;
import com.mongodb.gridfs.GridFSDBFile;
import com.mongodb.hadoop.input.GridFSSplit;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BinaryComparable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.bson.types.ObjectId;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
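/**
 * An InputFormat that reads files stored in MongoDB GridFS. Depending on
 * configuration, getSplits() produces one split per GridFS file or one split
 * per GridFS chunk, and createRecordReader() returns a reader that emits each
 * split either as a single binary blob or as text tokens.
 */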
public class GridFSInputFormat
extends InputFormat<NullWritable, BinaryComparable> {
private static final Log LOG = LogFactory.getLog(GridFSInputFormat.class);
@Override
public List<InputSplit> getSplits(final JobContext context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
DBCollection inputCollection =
MongoConfigUtil.getInputCollection(conf);
MongoClientURI inputURI = MongoConfigUtil.getInputURI(conf);
GridFS gridFS = new GridFS(
inputCollection.getDB(),
inputCollection.getName());
DBObject query = MongoConfigUtil.getQuery(conf);
List<InputSplit> splits = new LinkedList<InputSplit>();
for (GridFSDBFile file : gridFS.find(query)) {
// One split per file.
if (MongoConfigUtil.isGridFSWholeFileSplit(conf)) {
splits.add(
new GridFSSplit(
inputURI,
(ObjectId) file.getId(),
(int) file.getChunkSize(),
file.getLength()));
}
// One split per file chunk.
else {
for (int chunk = 0; chunk < file.numChunks(); ++chunk) {
splits.add(
new GridFSSplit(
inputURI,
(ObjectId) file.getId(),
(int) file.getChunkSize(),
file.getLength(),
chunk));
}
}
}
LOG.debug("Found GridFS splits: " + splits);
return splits;
}
@Override
public RecordReader<NullWritable, BinaryComparable>
createRecordReader(final InputSplit split, final TaskAttemptContext context)
throws IOException, InterruptedException {
if (MongoConfigUtil.isGridFSReadBinary(context.getConfiguration())) {
// Read GridFS files as binary files.
return new GridFSBinaryRecordReader();
} else {
// Read GridFS files as text.
return new GridFSTextRecordReader();
}
}
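/**
 * Reads an entire GridFS split as raw bytes, emitting a single
 * (NullWritable, BytesWritable) record per split.
 */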
static class GridFSBinaryRecordReader
extends RecordReader<NullWritable, BinaryComparable> {
private final BytesWritable bw = new BytesWritable();
private GridFSSplit split;
private InputStream stream;
private boolean readLast;
private byte[] buff;
@Override
public void initialize(
final InputSplit split, final TaskAttemptContext context)
throws IOException, InterruptedException {
this.split = (GridFSSplit) split;
readLast = false;
buff = new byte[1024 * 1024 * 16];
stream = this.split.getData();
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
// Read the whole split once.
if (readLast) {
return false;
}
int totalBytes = 0, bytesRead;
do {
bytesRead = stream.read(
buff, totalBytes, buff.length - totalBytes);
if (bytesRead > 0) {
totalBytes += bytesRead;
}
} while (bytesRead > 0);
bw.set(buff, 0, totalBytes);
readLast = true;
return true;
}
@Override
public NullWritable getCurrentKey()
throws IOException, InterruptedException {
return NullWritable.get();
}
@Override
public BytesWritable getCurrentValue()
throws IOException, InterruptedException {
return bw;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return readLast ? 1.0f : 0.0f;
}
@Override
public void close() throws IOException {
stream.close();
}
}
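/**
 * A lazily-populated CharSequence over the data in a GridFS split. Characters
 * are pulled from the underlying stream into a StringBuilder only as they are
 * requested, which lets a regex Matcher scan the split without reading it all
 * up front.
 */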
static class ChunkReadingCharSequence implements CharSequence, Closeable {
private Reader reader;
private int chunkSize;
private int length;
private StringBuilder builder;
private char[] buff;
public ChunkReadingCharSequence(final GridFSSplit split)
throws IOException {
this.reader = new BufferedReader(
new InputStreamReader(split.getData()));
this.chunkSize = split.getChunkSize();
builder = new StringBuilder();
buff = new char[1024 * 1024 * 16];
// How many more bytes can be read starting from this chunk?
length = (int) split.getLength() - split.getChunkId() * chunkSize;
}
@Override
public int length() {
return length;
}
private void advanceToIndex(final int index) throws IOException {
if (index >= builder.length()) {
while (index >= builder.length()) {
int bytesRead = reader.read(buff);
if (bytesRead > 0) {
builder.append(buff, 0, bytesRead);
} else {
break;
}
}
}
}
@Override
public char charAt(final int index) {
try {
advanceToIndex(index);
} catch (IOException e) {
throw new IndexOutOfBoundsException(
"Could not advance stream to index: "
+ index + "; reason: " + e.getMessage());
}
return builder.charAt(index);
}
@Override
public CharSequence subSequence(final int start, final int end) {
try {
advanceToIndex(end);
} catch (IOException e) {
throw new RuntimeException(e);
}
return builder.subSequence(start, end);
}
/**
* Get the entire contents of this GridFS chunk.
* @return the contents of the chunk as a CharSequence (a String).
*/
public CharSequence chunkContents() {
return subSequence(0, Math.min(chunkSize, length));
}
public CharSequence fileContents() {
return subSequence(0, length);
}
@Override
public void close() throws IOException {
reader.close();
}
}
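/**
 * Reads a GridFS split as text, emitting one (NullWritable, Text) record per
 * token found between matches of the configured delimiter pattern, or a single
 * record containing the whole chunk or file when no delimiter is configured.
 */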
static class GridFSTextRecordReader
extends RecordReader<NullWritable, BinaryComparable> {
private GridFSSplit split;
private final Text text = new Text();
private int totalMatches = 0;
private long chunkSize;
private boolean readLast;
private boolean readWholeFile;
private Pattern delimiterPattern;
private Matcher matcher;
private int previousMatchIndex = 0;
private ChunkReadingCharSequence chunkData;
@Override
public void initialize(final InputSplit split, final TaskAttemptContext context)
throws IOException, InterruptedException {
this.split = (GridFSSplit) split;
Configuration conf = context.getConfiguration();
String patternString =
MongoConfigUtil.getGridFSDelimiterPattern(conf);
chunkSize = this.split.getChunkSize();
chunkData = new ChunkReadingCharSequence(this.split);
readLast = false;
readWholeFile = MongoConfigUtil.isGridFSWholeFileSplit(conf);
if (!(null == patternString || patternString.isEmpty())) {
delimiterPattern = Pattern.compile(patternString);
matcher = delimiterPattern.matcher(chunkData);
// Skip past the first delimiter if this is not the first chunk.
if (this.split.getChunkId() > 0) {
nextToken();
}
}
}
private CharSequence nextToken() {
if (matcher.find()) {
CharSequence slice = chunkData.subSequence(
previousMatchIndex, matcher.start());
// Skip the delimiter.
previousMatchIndex = matcher.end();
return slice;
}
// Last token after the final delimiter.
readLast = true;
return chunkData.subSequence(
previousMatchIndex, chunkData.length());
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (readLast) {
LOG.debug("skipping the rest of this chunk because we've "
+ "read beyond the end: " + previousMatchIndex
+ "; read " + totalMatches + " matches here.");
return false;
}
// No delimiter being used, and we haven't returned anything yet.
if (null == matcher) {
if (readWholeFile) {
text.set(chunkData.fileContents().toString());
} else {
text.set(chunkData.chunkContents().toString());
}
++totalMatches;
readLast = true;
return true;
}
// Delimiter used; do we have more matches?
CharSequence nextToken = nextToken();
if (nextToken != null) {
// Read one more token past the end of the split.
if (!readWholeFile && previousMatchIndex >= chunkSize) {
readLast = true;
}
text.set(nextToken.toString());
++totalMatches;
return true;
} else if (LOG.isDebugEnabled()) {
LOG.debug("Read " + totalMatches + " segments.");
}
// No match.
return false;
}
@Override
public NullWritable getCurrentKey()
throws IOException, InterruptedException {
return NullWritable.get();
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return text;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return (float) Math.min(
previousMatchIndex / (float) chunkSize, 1.0);
}
@Override
public void close() throws IOException {
chunkData.close();
}
}
}
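For reference, here is the companion hypothetical mapper used in the driver sketch above. In the default (non-binary) mode the record reader emits NullWritable keys and Text values, one per delimiter-separated token (or one per chunk or file when no delimiter pattern is configured); the word-count style tokenization here is purely illustrative.

package com.example.sketch; // hypothetical package; not part of mongo-hadoop

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class GridFSWordMapper
    extends Mapper<NullWritable, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(final NullWritable key, final Text value, final Context context)
        throws IOException, InterruptedException {
        // Each value is one token produced by GridFSTextRecordReader: the text between
        // two matches of the configured delimiter pattern, or the whole chunk/file
        // when no delimiter is configured.
        for (String token : value.toString().split("\\s+")) {
            if (!token.isEmpty()) {
                word.set(token);
                context.write(word, ONE);
            }
        }
    }
}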