com.microsoft.azure.documentdb.pig.DocumentDBLoader
Hadoop Connector for Microsoft Azure DocumentDB. Bumped version by Keedio.
//------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
//------------------------------------------------------------
package com.microsoft.azure.documentdb.pig;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.pig.LoadFunc;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import com.microsoft.azure.documentdb.Document;
import com.microsoft.azure.documentdb.hadoop.ConfigurationUtil;
import com.microsoft.azure.documentdb.hadoop.DocumentDBConnectorUtil;
import com.microsoft.azure.documentdb.hadoop.DocumentDBInputFormat;
import com.microsoft.azure.documentdb.hadoop.DocumentDBWritable;
/**
 * A Pig data loader from DocumentDB.
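 *
 * A minimal Pig usage sketch; the endpoint, key, database, collection names,
 * and query below are placeholders, not values taken from this connector:
 *
 * <pre>
 * REGISTER azure-documentdb-hadoop-&lt;version&gt;.jar;
 *
 * documents = LOAD 'https://&lt;account&gt;.documents.azure.com:443/'
 *     USING com.microsoft.azure.documentdb.pig.DocumentDBLoader(
 *         '&lt;masterkey&gt;', '&lt;database&gt;',
 *         '&lt;collection1,collection2&gt;', 'SELECT * FROM root');
 * </pre>
 *
 * With no schema, each document is loaded as a single-field tuple whose field
 * is a map of the document's JSON key/value pairs.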
 */
public class DocumentDBLoader extends LoadFunc {

    private static final String PIG_LOADER_USERAGENT = " PigConnectorLoader/1.0.0";

    private String masterkey = null;
    private RecordReader reader = null;
    private ResourceFieldSchema[] fields;
    private ResourceSchema schema = null;
    private String dbName;
    private String inputCollections;
    private String query;

    private static TupleFactory tupleFactory = TupleFactory.getInstance();
    private static final Log log = LogFactory.getLog(DocumentDBLoader.class);
    public DocumentDBLoader(String masterkey, String dbName, String inputCollections) {
        this(masterkey, dbName, inputCollections, null);
    }

    public DocumentDBLoader(String masterkey, String dbName, String inputCollections, String query) {
        this.masterkey = masterkey;
        this.fields = null;
        this.dbName = dbName;
        // Comma-separated collection names.
        this.inputCollections = inputCollections;
        this.query = query;

        // Append the Pig loader suffix to the user agent, at most once.
        if (!DocumentDBConnectorUtil.UserAgentSuffix.contains(DocumentDBLoader.PIG_LOADER_USERAGENT)) {
            DocumentDBConnectorUtil.UserAgentSuffix += DocumentDBLoader.PIG_LOADER_USERAGENT;
        }
    }
    /**
     * Returns an instance of DocumentDBInputFormat.
     */
    @Override
    public InputFormat getInputFormat() throws IOException {
        log.debug("getting input format");
        return new DocumentDBInputFormat();
    }
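
    /**
     * Converts a DocumentDBWritable into a Pig Tuple. Without a declared
     * schema, the whole document is emitted as a single map field; with a
     * schema, one tuple field is produced per declared column.
     */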
    private Tuple translate(DocumentDBWritable value) throws IOException {
        Tuple t = null;
        Document doc = value.getDoc();

        if (this.fields == null) {
            // Dynamic schema: output a tuple with one element, a map holding
            // the key/value pairs of the JSON document.
            HashMap properties = doc.getHashMap();
            t = tupleFactory.newTuple(1);
            t.set(0, convertToPigType(properties));
        } else {
            // TODO: handle the case where a schema is specified.
            t = tupleFactory.newTuple(this.fields.length);
            for (int i = 0; i < this.fields.length; i++) {
                String fieldTemp = this.fields[i].getName();
                t.set(i, convertToPigType(doc.getObject(fieldTemp)));
            }
        }

        return t;
    }
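
    /**
     * Recursively converts a JSON value into a Pig-friendly type: lists become
     * tuples, nested objects become maps, and numbers and strings pass through
     * unchanged.
     */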
    private Object convertToPigType(Object o) throws ExecException {
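        // Note: the SDK's Document is backed by org.json, whose JSONObject.NULL
        // sentinel reports equals(null) == true; the check below therefore maps
        // both Java null and JSON null to a Pig null.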
        if (o == null || o.equals(null)) {
            return null;
        } else if (o instanceof Number || o instanceof String) {
            return o;
        } else if (o instanceof ArrayList) {
            ArrayList list = (ArrayList) o;
            Tuple t = tupleFactory.newTuple(list.size());
            for (int i = 0; i < list.size(); i++) {
                t.set(i, convertToPigType(list.get(i)));
            }
            return t;
        } else if (o instanceof Map) {
            // TODO: make this more efficient for lazy objects?
            Map fieldsMap = (Map) o;
            HashMap pigMap = new HashMap(fieldsMap.size());
            for (Map.Entry field : fieldsMap.entrySet()) {
                pigMap.put(field.getKey(), convertToPigType(field.getValue()));
            }
            return pigMap;
        } else {
            return o;
        }
    }
    /**
     * Converts the next DocumentDBWritable read from the input to a Pig Tuple;
     * returns null when the input is exhausted.
     */
    @Override
    public Tuple getNext() throws IOException {
        LongWritable key = null;
        DocumentDBWritable value = null;

        try {
            if (!reader.nextKeyValue()) return null;

            key = (LongWritable) reader.getCurrentKey();
            value = (DocumentDBWritable) reader.getCurrentValue();
        } catch (InterruptedException e) {
            throw new IOException("Error reading in key/value", e);
        }

        if (key == null || value == null) {
            return null;
        }

        return this.translate(value);
    }
    /**
     * {@inheritDoc}
     */
    @Override
    public void prepareToRead(RecordReader reader, PigSplit split)
            throws IOException {
        this.reader = reader;
    }
    /**
     * Sets the DocumentDB connector input configuration properties. The
     * location argument of the Pig LOAD statement is used as the DocumentDB
     * endpoint (host).
     */
    @Override
    public void setLocation(String location, Job job) throws IOException {
        Configuration conf = job.getConfiguration();
        conf.set(ConfigurationUtil.DB_HOST, location);
        conf.set(ConfigurationUtil.DB_KEY, this.masterkey);
        conf.set(ConfigurationUtil.DB_NAME, this.dbName);
        conf.set(ConfigurationUtil.INPUT_COLLECTION_NAMES, this.inputCollections);

        if (this.query != null) {
            conf.set(ConfigurationUtil.QUERY, this.query);
        }
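
        // Give the connector's bundled dependencies precedence over the
        // cluster's copies on the task classpath.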
        conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, true);
    }
    /**
     * {@inheritDoc}
     */
    @Override
    public String relativeToAbsolutePath(final String location, final Path currentDir) {
        // The location is a DocumentDB endpoint, not a file system path, so it
        // is returned unchanged.
        return location;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void setUDFContextSignature(String signature) {
        // No per-UDF state is stored in the UDFContext.
    }
}