com.microsoft.azure.documentdb.mapred.hadoop.DocumentDBInputFormat Maven / Gradle / Ivy
Hadoop Connector for Microsoft Azure DocumentDB
//------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
//------------------------------------------------------------
package com.microsoft.azure.documentdb.mapred.hadoop;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import com.microsoft.azure.documentdb.hadoop.ConfigurationUtil;
import com.microsoft.azure.documentdb.hadoop.DocumentDBInputSplit;
import com.microsoft.azure.documentdb.hadoop.DocumentDBWritable;
/**
* An input format that can read data from Azure DocumentDB. It sends one Document
* at a time to the mapper.
*/
public class DocumentDBInputFormat implements InputFormat<LongWritable, DocumentDBWritable> {

    /**
     * Creates an instance of DocumentDBRecordReader for the given split.
     */
    @Override
    public RecordReader<LongWritable, DocumentDBWritable> getRecordReader(InputSplit split, JobConf conf,
            Reporter reporter) throws IOException {
        return new DocumentDBRecordReader((WrapperSplit) split);
    }
    /**
     * Builds the array of DocumentDBInputSplits after validating all of the properties
     * required to read from DocumentDB.
     */
    @Override
    public InputSplit[] getSplits(JobConf conf, int numberOfSplits) throws IOException {
        final String endpoint = ConfigurationUtil.getDBEndpoint(conf);
        final String key = ConfigurationUtil.getDBKey(conf);
        final String dbName = ConfigurationUtil.getDBName(conf);
        final String[] collectionNames = ConfigurationUtil.getInputCollectionNames(conf);
        final String query = ConfigurationUtil.getQuery(conf);

        if (endpoint == null) {
            throw new IOException("DB_HOST must be set in the JobConf");
        }
        if (key == null) {
            throw new IOException("DB_KEY must be set in the JobConf");
        }
        if (dbName == null) {
            throw new IOException("DB_NAME must be set in the JobConf");
        }
        if (collectionNames == null || collectionNames.length < 1) {
            throw new IOException("INPUT_COLLECTION_NAMES must be set in the JobConf as comma-separated names");
        }

        // Delegate split creation to the mapreduce-side implementation, then wrap each
        // split so it can be consumed through the old mapred API.
        List<? extends org.apache.hadoop.mapreduce.InputSplit> innerSplits = DocumentDBInputSplit.getSplits(conf,
                endpoint, key, dbName, collectionNames, query);
        Path[] tablePaths = FileInputFormat.getInputPaths(conf);
        List<InputSplit> ret = new ArrayList<InputSplit>();
        for (org.apache.hadoop.mapreduce.InputSplit split : innerSplits) {
            ret.add(new WrapperSplit((DocumentDBInputSplit) split, tablePaths[0], conf));
        }
        return ret.toArray(new InputSplit[0]);
    }
}
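
The class Javadoc notes that this input format hands one document at a time to the mapper, and getSplits() validates the endpoint, key, database, and collection settings before building splits. The sketch below shows how a driver using the old mapred API might wire this up. It is illustrative only: the DocumentCountJob and CountMapper classes are hypothetical, and the configuration property names passed to conf.set() are assumptions, since the real keys are whatever ConfigurationUtil reads (see getDBEndpoint, getDBKey, getDBName, getInputCollectionNames).

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import com.microsoft.azure.documentdb.hadoop.DocumentDBWritable;
import com.microsoft.azure.documentdb.mapred.hadoop.DocumentDBInputFormat;

// Hypothetical driver showing one way to use DocumentDBInputFormat with the mapred API.
public class DocumentCountJob {

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(DocumentCountJob.class);
        conf.setJobName("documentdb-count");
        conf.setInputFormat(DocumentDBInputFormat.class);

        // Assumed property names for illustration; replace with the keys ConfigurationUtil expects.
        conf.set("DocumentDB.endpoint", "https://<account>.documents.azure.com");
        conf.set("DocumentDB.key", "<account-key>");
        conf.set("DocumentDB.db", "<database-id>");
        conf.set("DocumentDB.inputCollections", "collection1,collection2");

        // getSplits() reads FileInputFormat's input paths to build WrapperSplits,
        // so at least one path must be set even though no files are read from it.
        FileInputFormat.setInputPaths(conf, new Path("/tmp/documentdb-placeholder"));

        conf.setMapperClass(CountMapper.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(conf, new Path(args[0]));

        JobClient.runJob(conf);
    }

    // Each call to map() receives one document wrapped in a DocumentDBWritable,
    // matching the "one Document at a time" behavior described in the class Javadoc.
    public static class CountMapper extends MapReduceBase
            implements Mapper<LongWritable, DocumentDBWritable, Text, LongWritable> {

        private static final LongWritable ONE = new LongWritable(1);

        public void map(LongWritable key, DocumentDBWritable value,
                OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
            // Emit a count per document; the writable wraps the DocumentDB document itself.
            output.collect(new Text("documents"), ONE);
        }
    }
}

With no reducer configured, Hadoop's identity reducer simply passes the counts through; a real job would typically sum them in a combiner or reducer.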