All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.mongodb.hadoop.pig.MongoLoader Maven / Gradle / Ivy

package com.mongodb.hadoop.pig;

import com.mongodb.hadoop.MongoInputFormat;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.util.Utils;
import org.bson.BSONObject;

import java.io.IOException;

public class MongoLoader extends LoadFunc implements LoadMetadata {
    private static final Log LOG = LogFactory.getLog(MongoStorage.class);
    private static TupleFactory tupleFactory = TupleFactory.getInstance();
    // Pig specific settings
    private ResourceSchema schema = null;
    private RecordReader in = null;
    private final MongoInputFormat inputFormat = new MongoInputFormat();
    private ResourceFieldSchema[] fields;
    private String idAlias = null;

    @Override
    public void setUDFContextSignature(final String signature) {
    }

    public MongoLoader() {
        LOG.info("Initializing MongoLoader in dynamic schema mode.");
        schema = null;
        fields = null;
    }

    public ResourceFieldSchema[] getFields() {
        return fields;
    }

    public MongoLoader(final String userSchema, final String idAlias) {
        this.idAlias = idAlias;
        try {
            schema = new ResourceSchema(Utils.getSchemaFromString(userSchema));
            fields = schema.getFields();
        } catch (Exception e) {
            throw new IllegalArgumentException("Invalid Schema Format");
        }
    }

    public MongoLoader(final String userSchema) {
        this(userSchema, null);
    }

    @Override
    public void setLocation(final String location, final Job job) throws IOException {
        MongoConfigUtil.setInputURI(job.getConfiguration(), location);

    }

    @Override
    public InputFormat getInputFormat() throws IOException {
        return inputFormat;
    }

    @Override
    public void prepareToRead(final RecordReader reader, final PigSplit split) throws IOException {
        in = reader;
        if (in == null) {
            throw new IOException("Invalid Record Reader");
        }
    }

    @Override
    public Tuple getNext() throws IOException {
        BSONObject val;
        try {
            if (!in.nextKeyValue()) {
                return null;
            }
            val = (BSONObject) in.getCurrentValue();
        } catch (Exception ie) {
            throw new IOException(ie);
        }

        Tuple t;
        if (fields == null) {
            // dynamic schema mode - just output a tuple with a single element,
            // which is a map storing the keys/values in the document
            t = tupleFactory.newTuple(1);
            t.set(0, BSONLoader.convertBSONtoPigType(val));
        } else {
            t = tupleFactory.newTuple(fields.length);
            for (int i = 0; i < fields.length; i++) {
                String fieldTemp = fields[i].getName();
                if (idAlias != null && idAlias.equals(fieldTemp)) {
                    fieldTemp = "_id";
                }
                t.set(i, BSONLoader.readField(val.get(fieldTemp), fields[i]));
            }
        }
        return t;
    }

    @Override
    public String relativeToAbsolutePath(final String location, final Path curDir) throws IOException {
        // This is a mongo URI and has no notion of relative/absolute,
        // thus we want to always use just the same location.
        return location;
    }


    @Override
    public ResourceSchema getSchema(final String location, final Job job) throws IOException {
        if (schema != null) {
            return schema;
        }
        return null;
    }

    @Override
    public ResourceStatistics getStatistics(final String location, final Job job) throws IOException {
        // No statistics available. In the future
        // we could maybe construct something from db.collection.stats() here
        // but the class/API for this is unstable anyway, so this is unlikely
        // to be high priority.
        return null;
    }

    @Override
    public String[] getPartitionKeys(final String location, final Job job) throws IOException {
        // No partition keys.
        return null;
    }

    @Override
    public void setPartitionFilter(final Expression partitionFilter) throws IOException {
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy