
com.mongodb.hadoop.pig.BSONStorage Maven / Gradle / Ivy


The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.

/*
 * Copyright 2011 10gen Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.mongodb.hadoop.pig;

import com.mongodb.BasicDBObjectBuilder;
import com.mongodb.hadoop.BSONFileOutputFormat;
import com.mongodb.hadoop.pig.udf.types.PigBoxedBSONValue;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.pig.LoadFunc;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreFunc;
import org.apache.pig.StoreMetadata;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
import org.joda.time.DateTime;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Properties;

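/**
 * Pig StoreFunc that writes each output tuple as a BSON document through
 * {@link BSONFileOutputFormat}. A typical Pig Latin invocation might look
 * like the following (a sketch; the alias and output path are illustrative):
 *
 * <pre>
 * STORE records INTO 'file:///tmp/output.bson'
 *     USING com.mongodb.hadoop.pig.BSONStorage('id');
 * </pre>
 *
 * The optional constructor argument names the tuple field to be stored as
 * "_id" (see {@link #writeField}).
 */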
public class BSONStorage extends StoreFunc implements StoreMetadata {

    private static final Log LOG = LogFactory.getLog(BSONStorage.class);
    static final String SCHEMA_SIGNATURE = "bson.pig.output.schema";
    //CHECKSTYLE:OFF
    protected ResourceSchema schema = null;
    //CHECKSTYLE:ON
    private RecordWriter out;

    private String udfcSignature = null;
    private String idField = null;

    private final BSONFileOutputFormat outputFormat = new BSONFileOutputFormat();

    public BSONStorage() {
    }

    public BSONStorage(final String idField) {
        this.idField = idField;
    }

    /**
     * Returns an object suitable for BSON storage. The object <code>o</code>
     * corresponds to a field value in Pig.
     *
     * @param o        object representing a Pig type to convert to a BSON-compatible object
     * @param field    schema of the field that <code>o</code> belongs to
     * @param toIgnore name of a field in <code>o</code> to ignore
     * @return an Object that can be stored as BSON.
     * @throws IOException if a tuple value cannot be read
     */
    public static Object getTypeForBSON(final Object o, final ResourceFieldSchema field, final String toIgnore)
      throws IOException {
        byte dataType;
        ResourceSchema fieldInnerSchema = null;
        if (null == o) {
            return null;
        }
        if (null == field || DataType.UNKNOWN == field.getType()) {
            dataType = DataType.findType(o);
        } else {
            dataType = field.getType();
            fieldInnerSchema = field.getSchema();
        }

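        // Pig reports BYTEARRAY for values whose type is unknown (e.g. the
        // values of an untyped map); if the runtime object is actually a Map,
        // treat it as one.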
        if (dataType == DataType.BYTEARRAY && o instanceof Map) {
            dataType = DataType.MAP;
        }

        switch (dataType) {
            case DataType.NULL:
                return null;
            case DataType.INTEGER:
            case DataType.LONG:
            case DataType.FLOAT:
            case DataType.DOUBLE:
                return o;
            case DataType.BYTEARRAY:
                if (o instanceof PigBoxedBSONValue) {
                    return ((PigBoxedBSONValue) o).getObject();
                }
                return o.toString();
            case DataType.CHARARRAY:
                return o;
            case DataType.DATETIME:
                return ((DateTime) o).toDate();
            // Given a TUPLE, create a Map so BSONEncoder will eat it
            case DataType.TUPLE:
                // If there is no inner schema, just return the Tuple.
                // BasicBSONEncoder will consume it as an Iterable.
                if (fieldInnerSchema == null) {
                    return o;
                }

                // If there was an inner schema, create a Map from the Tuple.
                ResourceFieldSchema[] fs = fieldInnerSchema.getFields();
                // Check whether fs[0] is 'unnamed', in which case we create
                // an array of 'inner' elements. For example, {("a"),("b")}
                // becomes ["a","b"] if toIgnore == "t" and the schema for the
                // bag is {<*>:(t:chararray)}. <*> can be any string, since
                // the field name of the tuple in a bag should be ignored.
                if (1 == fs.length && fs[0].getName() != null && fs[0].getName().equals(toIgnore)) {
                    return getTypeForBSON(((Tuple) o).get(0), fs[0], toIgnore);
                }
                // If there is more than one field in the tuple or no fields
                // to ignore, treat the Tuple as a Map.
                Map<String, Object> m = new LinkedHashMap<String, Object>();
                for (int j = 0; j < fs.length; j++) {
                    m.put(fs[j].getName(), getTypeForBSON(((Tuple) o).get(j), fs[j], toIgnore));
                }
                return m;
            // Given a BAG, create an Array so BSONEncoder will eat it.
            case DataType.BAG:
                // If there is no inner schema, just return the Bag.
                // BasicBSONEncoder will consume it as an Iterable.
                if (null == fieldInnerSchema) {
                    return o;
                }
                fs = fieldInnerSchema.getFields();
                ArrayList<Object> bagList = new ArrayList<Object>();
                for (Tuple t : (DataBag) o) {
                    bagList.add(getTypeForBSON(t, fs[0], toIgnore));
                }
                return bagList;
            case DataType.MAP:
                Map map = (Map) o;
                Map<String, Object> out = new HashMap<String, Object>(map.size());
                for (Object key : map.keySet()) {
                    out.put(key.toString(), getTypeForBSON(map.get(key), null, toIgnore));
                }
                return out;
            default:
                return o;
        }
    }
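    // Illustrative conversions performed by getTypeForBSON (values and
    // schemas are hypothetical, not from the original source):
    //   chararray "abc"                                   -> String "abc"
    //   datetime                                          -> java.util.Date
    //   tuple ("x", 1) with schema (a:chararray, b:int)   -> LinkedHashMap {a=x, b=1}
    //   bag {("a"),("b")} with toIgnore == "t"
    //       and schema {<*>:(t:chararray)}                -> ArrayList ["a", "b"]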

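    /**
     * Writes one converted field into the given builder. If the converted
     * value is a Map, its entries are flattened into the builder directly,
     * renaming the configured idField (if any) to "_id"; otherwise the value
     * is added under the field's name (or "value" when no schema is known).
     *
     * @param builder builder for the output BSON document
     * @param field   schema of the field being written, or null if unknown
     * @param d       the raw Pig value
     * @throws IOException if the value cannot be converted
     */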
    @SuppressWarnings("unchecked")
    protected void writeField(final BasicDBObjectBuilder builder, final ResourceFieldSchema field, final Object d) throws IOException {
        Object convertedType = getTypeForBSON(d, field, null);
        String fieldName = field != null ? field.getName() : "value";

        if (convertedType instanceof Map) {
            for (Map.Entry<String, Object> mapentry : ((Map<String, Object>) convertedType).entrySet()) {
                String addKey = mapentry.getKey().equals(idField) ? "_id" : mapentry.getKey();
                builder.add(addKey, mapentry.getValue());
            }
        } else {
            builder.add(fieldName, convertedType);
        }
    }

    @Override
    public void checkSchema(final ResourceSchema schema) throws IOException {
        this.schema = schema;
        UDFContext context = UDFContext.getUDFContext();

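        // Stash the schema string in the UDFContext properties so that
        // back-end tasks can recover it in prepareToWrite().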
        Properties p = context.getUDFProperties(getClass(), new String[]{udfcSignature});
        p.setProperty(SCHEMA_SIGNATURE, schema.toString());
    }

    @Override
    public void storeSchema(final ResourceSchema schema, final String location, final Job job) {
        // not implemented
    }


    @Override
    public void storeStatistics(final ResourceStatistics stats, final String location, final Job job) {
        // not implemented
    }

    @Override
    public void putNext(final Tuple tuple) throws IOException {
        try {
            final BasicDBObjectBuilder builder = BasicDBObjectBuilder.start();
            ResourceFieldSchema[] fields = null;
            if (schema != null) {
                fields = schema.getFields();
            }
            if (fields != null) {
                for (int i = 0; i < fields.length; i++) {
                    writeField(builder, fields[i], tuple.get(i));
                }
            } else {
                for (int i = 0; i < tuple.size(); i++) {
                    writeField(builder, null, tuple.get(i));
                }
            }

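            // Only the value is written; the key slot is unused here.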
            out.write(null, builder.get());
        } catch (Exception e) {
            throw new IOException("Couldn't convert tuple to BSON", e);
        }
    }

    @Override
    public void prepareToWrite(final RecordWriter writer) throws IOException {
        out = writer;
        if (out == null) {
            throw new IOException("Invalid Record Writer");
        }

        UDFContext udfc = UDFContext.getUDFContext();
        Properties p = udfc.getUDFProperties(getClass(), new String[]{udfcSignature});
        String strSchema = p.getProperty(SCHEMA_SIGNATURE);
        if (strSchema == null) {
            LOG.warn("Could not find schema in UDF context! "
                         + "Will attempt to write records without a schema.");
            return;
        }

        try {
            // Parse the schema from the string stored in the properties object.
            schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));
        } catch (Exception e) {
            schema = null;
            LOG.warn("Could not parse schema '" + strSchema + "': " + e.getMessage());
        }
    }

    @Override
    public OutputFormat getOutputFormat() throws IOException {
        return outputFormat;
    }

    @Override
    public String relToAbsPathForStoreLocation(final String location, final Path curDir) throws IOException {
        return LoadFunc.getAbsolutePath(location, curDir);
    }

    @Override
    public void setStoreLocation(final String location, final Job job) throws IOException {
        final Configuration config = job.getConfiguration();
        // Old property.
        config.set("mapred.output.dir", location);
        // Modern property.
        config.set("mapreduce.output.fileoutputformat.outputdir", location);
    }


    @Override
    public void setStoreFuncUDFContextSignature(final String signature) {
        udfcSignature = signature;
    }

}
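
As a quick illustration of the conversion logic above, here is a minimal, self-contained sketch that feeds a schema-described tuple through getTypeForBSON. It assumes pig and mongo-hadoop are on the classpath; the class name GetTypeForBSONDemo and the schema string are illustrative, not part of the connector.

import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.util.Utils;

import com.mongodb.hadoop.pig.BSONStorage;

public class GetTypeForBSONDemo {
    public static void main(String[] args) throws Exception {
        // One tuple-typed field with a named inner schema.
        ResourceSchema schema = new ResourceSchema(
            Utils.getSchemaFromString("t:(name:chararray, count:int)"));
        ResourceFieldSchema tupleField = schema.getFields()[0];

        Tuple inner = TupleFactory.getInstance().newTuple(2);
        inner.set(0, "widget");
        inner.set(1, 42);

        // Because an inner schema is present, the tuple is converted to a
        // LinkedHashMap, e.g. {name=widget, count=42}.
        Object converted = BSONStorage.getTypeForBSON(inner, tupleField, null);
        System.out.println(converted);
    }
}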