com.microsoft.azure.documentdb.hadoop.DocumentDBInputFormat Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of azure-documentdb-hadoop Show documentation
Hadoop Connector for Microsoft Azure DocumentDB
//------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
//------------------------------------------------------------
package com.microsoft.azure.documentdb.hadoop;
import java.io.IOException;
import java.util.List;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.commons.logging.Log;
/**
* An input format that can read data from Azure DocumentDB. It sends one Document
* at a time to the mapper.
*/
/**
 * An input format that can read data from Azure DocumentDB. It sends one Document
 * at a time to the mapper.
 *
 * <p>NOTE(review): this class and its methods use raw types ({@code InputFormat},
 * {@code RecordReader}, {@code List}). The file imports {@code LongWritable} without
 * using it, which suggests type parameters were intended; confirm against
 * {@code DocumentDBRecordReader} and {@code DocumentDBInputSplit} before adding them.
 */
public class DocumentDBInputFormat extends InputFormat {
    // Logger is keyed on this class so that messages are attributed to the input format.
    private static final Log LOG = LogFactory.getLog(DocumentDBInputFormat.class);

    /**
     * Creates an instance of DocumentDBRecordReader for the given split.
     *
     * @param split the split to read; it is cast to {@code DocumentDBInputSplit}
     * @param context the task attempt context (not used by this method)
     * @return a new {@code DocumentDBRecordReader} constructed from the split
     */
    @Override
    public RecordReader createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new DocumentDBRecordReader((DocumentDBInputSplit) split);
    }

    /**
     * Gets a list of DocumentDBInputSplit and validates all the required properties
     * to read from documentdb.
     *
     * <p>The endpoint, key, database name and input collection names are read from the
     * job configuration through {@code ConfigurationUtil} and must all be present. The
     * query is read the same way but is passed on without validation.
     *
     * @param context the job context whose configuration supplies the connection settings
     * @return the splits produced by {@code DocumentDBInputSplit.getSplits}
     * @throws IOException if the endpoint, key or database name is {@code null}, or if
     *         the collection names are {@code null} or empty
     */
    @Override
    public List getSplits(JobContext context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        final String endpoint = ConfigurationUtil.getDBEndpoint(conf);
        final String key = ConfigurationUtil.getDBKey(conf);
        final String dbName = ConfigurationUtil.getDBName(conf);
        final String[] collectionNames = ConfigurationUtil.getInputCollectionNames(conf);
        final String query = ConfigurationUtil.getQuery(conf);

        if (endpoint == null) {
            throw new IOException("DB_HOST must be set for the jobconf");
        }
        if (key == null) {
            throw new IOException("DB_KEY must be set for the jobconf");
        }
        if (dbName == null) {
            throw new IOException("DB_NAME must be set for the jobconf");
        }
        // A missing property may surface as null rather than an empty array; report it
        // with the same configuration error instead of a NullPointerException.
        if (collectionNames == null || collectionNames.length < 1) {
            throw new IOException("INPUT_COLLECTION_NAMES must be set for the jobconf as comma separated names");
        }

        return DocumentDBInputSplit.getSplits(conf, endpoint, key, dbName, collectionNames, query);
    }
}