
com.mongodb.hadoop.pig.MongoInsertStorage


The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.

/*
 * Copyright 2011 10gen Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.mongodb.hadoop.pig;

import com.mongodb.BasicDBObjectBuilder;
import com.mongodb.MongoClientURI;
import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreFunc;
import org.apache.pig.StoreMetadata;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;

import java.io.IOException;
import java.util.Map;
import java.util.Properties;

@SuppressWarnings("unchecked")
public class MongoInsertStorage extends StoreFunc implements StoreMetadata {

    // Pig specific settings
    static final String SCHEMA_SIGNATURE = "mongoinsert.pig.output.schema";
    private static final Log LOG = LogFactory.getLog(MongoInsertStorage.class);
    private final MongoOutputFormat outputFormat = new MongoOutputFormat();
    //CHECKSTYLE:OFF
    protected ResourceSchema schema = null;
    //CHECKSTYLE:ON
    private RecordWriter out;
    private String udfcSignature = null;
    private String idField = null;

    public MongoInsertStorage() {
    }

    /**
     * @param idField   the field standing in for {@code _id}
     * @param useUpsert this parameter is unused
     * @deprecated useUpsert is unused. Use {@link #MongoInsertStorage(String)}
     * instead.
     */
    @Deprecated
    @SuppressWarnings("UnusedParameters")
    public MongoInsertStorage(final String idField, final String useUpsert) {
        this.idField = idField;
    }

    /**
     * Create a new MongoInsertStorage.
     *
     * @param idField the field standing in for {@code _id}
     */
    public MongoInsertStorage(final String idField) {
        this.idField = idField;
    }

    @Override
    public String relToAbsPathForStoreLocation(final String location, final Path curDir) throws IOException {
        // Don't convert anything - override to keep base from messing with URI
        return location;
    }

    @Override
    public OutputFormat getOutputFormat() throws IOException {
        return outputFormat;
    }

    @Override
    public void setStoreLocation(final String location, final Job job) throws IOException {
        final Configuration config = job.getConfiguration();
        if (!location.startsWith("mongodb://")) {
            throw new IllegalArgumentException("Invalid URI format. URIs must begin with a mongodb:// protocol string.");
        }
        MongoClientURI locURI = new MongoClientURI(location);
        LOG.info(String.format(
            "Store location config: %s; for namespace: %s.%s; hosts: %s",
            config, locURI.getDatabase(), locURI.getCollection(),
            locURI.getHosts()));
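        // Record the output URI in the job configuration so MongoOutputFormat
        // knows which database and collection to write to.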
        MongoConfigUtil.setOutputURI(config, locURI);
    }

    @Override
    public void checkSchema(final ResourceSchema schema) throws IOException {
        this.schema = schema;
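        // Persist the schema string in the UDFContext, keyed by this store
        // function's signature, so prepareToWrite() can recover it later.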
        UDFContext udfc = UDFContext.getUDFContext();

        Properties p = udfc.getUDFProperties(getClass(), new String[]{udfcSignature});
        p.setProperty(SCHEMA_SIGNATURE, schema.toString());
    }

    @Override
    public void prepareToWrite(final RecordWriter writer) throws IOException {
        out = writer;
        if (out == null) {
            throw new IOException("Invalid Record Writer");
        }

        UDFContext udfc = UDFContext.getUDFContext();
        Properties p = udfc.getUDFProperties(getClass(), new String[]{udfcSignature});
        String strSchema = p.getProperty(SCHEMA_SIGNATURE);
        if (strSchema == null) {
            LOG.warn("Could not find schema in UDF context. Interpreting each tuple as containing a single map.");
        } else {
            try {
                // Parse the schema from the string stored in the properties object.
                schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));
            } catch (Exception e) {
                schema = null;
                LOG.warn(e.getMessage());
            }

            if (LOG.isDebugEnabled()) {
                LOG.debug("GOT A SCHEMA " + schema + " " + strSchema);
            }
        }
    }

    @Override
    public void putNext(final Tuple tuple) throws IOException {
        try {
            final BasicDBObjectBuilder builder = BasicDBObjectBuilder.start();
            ResourceFieldSchema[] fields = null;
            if (schema != null) {
                fields = schema.getFields();
            }
            if (fields != null) {
                for (int i = 0; i < fields.length; i++) {
                    writeField(builder, fields[i], tuple.get(i));
                }
            } else {
                // Assume that the tuple contains only a map, as produced by
                // MongoLoader, for example.
                if (tuple.size() != 1) {
                    throw new IOException("Could not retrieve schema, but tuples did not contain a single item: " + tuple);
                }
                Object result = BSONStorage.getTypeForBSON(
                    tuple.get(0), null, null);
                if (!(result instanceof Map)) {
                    throw new IOException("Could not retrieve schema, but tuples contained something other than a Map: " + tuple);
                }
                Map<String, Object> documentMap = (Map<String, Object>) result;
                for (Map.Entry<String, Object> entry : documentMap.entrySet()) {
                    builder.add(entry.getKey(), entry.getValue());
                }
            }

            out.write(null, builder.get());
        } catch (Exception e) {
            throw new IOException("Couldn't convert tuple to bson: ", e);
        }
    }

    @Override
    public void setStoreFuncUDFContextSignature(final String signature) {
        udfcSignature = signature;
    }

    @Override
    public void storeStatistics(final ResourceStatistics stats, final String location, final Job job) {
        // not implemented
    }

    @Override
    public void storeSchema(final ResourceSchema schema, final String location, final Job job) {
        // not implemented
    }

    protected void writeField(final BasicDBObjectBuilder builder,
                              final ResourceFieldSchema field,
                              final Object d) throws IOException {
        Object convertedType = BSONStorage.getTypeForBSON(d, field, null);
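        // Map the user-designated idField to MongoDB's _id key; all other
        // fields are stored under their schema names.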
        if (field.getName() != null && field.getName().equals(idField)) {
            builder.add("_id", convertedType);
        } else {
            builder.add(field.getName(), convertedType);
        }

    }

}
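
A minimal Pig Latin sketch of how this StoreFunc is typically invoked. The jar paths, relation, and field names here are hypothetical; the single 'id' argument maps that field to MongoDB's _id via the one-argument constructor above, and the store location must be a mongodb:// URI, as enforced by setStoreLocation():

REGISTER /path/to/mongo-hadoop-core.jar;
REGISTER /path/to/mongo-hadoop-pig.jar;

-- Load some tab-separated records, then insert each one into the
-- demo.users collection, using the 'id' field as _id.
users = LOAD 'users.tsv' AS (id:chararray, name:chararray, age:int);
STORE users INTO 'mongodb://localhost:27017/demo.users'
    USING com.mongodb.hadoop.pig.MongoInsertStorage('id');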



