com.mongodb.hadoop.hive.input.HiveMongoInputFormat Maven / Gradle / Ivy
The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.
/*
* Copyright 2010-2013 10gen Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mongodb.hadoop.hive.input;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.mongodb.hadoop.hive.BSONSerDe;
import com.mongodb.hadoop.hive.MongoStorageHandler;
import com.mongodb.hadoop.input.MongoInputSplit;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.mapred.input.MongoRecordReader;
import com.mongodb.hadoop.splitter.MongoSplitter;
import com.mongodb.hadoop.splitter.MongoSplitterFactory;
import com.mongodb.hadoop.splitter.SplitFailedException;
import com.mongodb.hadoop.util.MongoConfigUtil;
import com.mongodb.util.JSON;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.bson.BSONObject;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
/**
 * Defines a HiveInputFormat used to read data from MongoDB into a Hive table.
 */
public class HiveMongoInputFormat extends HiveInputFormat<BSONWritable, BSONWritable> {

    private static final String EQUAL_OP = GenericUDFOPEqual.class.getName();
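    // Maps Hive comparison UDF class names to the equivalent MongoDB query
    // operators ($lt, $lte, $gt, $gte). Equality is handled separately via
    // EQUAL_OP above.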
    private static final Map<String, String> MONGO_OPS =
        new HashMap<String, String>() {{
            put(GenericUDFOPLessThan.class.getName(), "$lt");
            put(GenericUDFOPEqualOrLessThan.class.getName(), "$lte");
            put(GenericUDFOPGreaterThan.class.getName(), "$gt");
            put(GenericUDFOPEqualOrGreaterThan.class.getName(), "$gte");
        }};
    private static final Log LOG = LogFactory.getLog(HiveMongoInputFormat.class);
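    /**
     * Builds a MongoDB projection from Hive's column projection and a query
     * from Hive's pushed-down filter predicate (combined with any table-level
     * query via $and), then hands the configured split to a MongoRecordReader.
     */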
    @Override
    public RecordReader<BSONWritable, BSONWritable> getRecordReader(
        final InputSplit split, final JobConf conf, final Reporter reporter)
        throws IOException {
        // split is of type 'MongoHiveInputSplit'
        MongoHiveInputSplit mhis = (MongoHiveInputSplit) split;

        // Get column name mapping.
        Map<String, String> colToMongoNames = columnMapping(conf);

        // Add projection from Hive.
        DBObject mongoProjection = getProjection(conf, colToMongoNames);
        MongoInputSplit delegate = (MongoInputSplit) mhis.getDelegate();
        if (mongoProjection != null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Adding MongoDB projection : " + mongoProjection);
            }
            delegate.setFields(mongoProjection);
        }

        // Filter from Hive.
        DBObject filter = getFilter(conf, colToMongoNames);

        // Combine with filter from table, if there is one.
        if (conf.get(MongoConfigUtil.INPUT_QUERY) != null) {
            DBObject tableFilter = MongoConfigUtil.getQuery(conf);
            if (null == filter) {
                filter = tableFilter;
            } else {
                BasicDBList conditions = new BasicDBList();
                conditions.add(filter);
                conditions.add(tableFilter);
                // Use an $and clause so we don't overwrite any of the
                // table filter.
                filter = new BasicDBObject("$and", conditions);
            }
        }

        if (filter != null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Adding MongoDB query: " + filter);
            }
            delegate.setQuery(filter);
        }

        // Return a MongoRecordReader; the delegate is a 'MongoInputSplit'.
        return new MongoRecordReader(delegate);
    }
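    /**
     * Deserializes the filter expression Hive pushed down into the job
     * configuration and translates the supported comparisons into a MongoDB
     * query document. Returns null if no filter was pushed down.
     */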
    DBObject getFilter(
        final JobConf conf, final Map<String, String> colToMongoNames) {
        String serializedExpr = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
        if (serializedExpr != null) {
            ExprNodeGenericFuncDesc expr =
                Utilities.deserializeExpression(serializedExpr);
            IndexPredicateAnalyzer analyzer =
                IndexPredicateAnalyzer.createAnalyzer(false);

            // Allow all column names.
            String columnNamesStr =
                conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR);
            String[] columnNames =
                StringUtils.split(columnNamesStr, '\\', StringUtils.COMMA);
            for (String colName : columnNames) {
                analyzer.allowColumnName(colName);
            }

            List<IndexSearchCondition> searchConditions =
                new LinkedList<IndexSearchCondition>();
            analyzer.analyzePredicate(expr, searchConditions);

            return getFilter(searchConditions, colToMongoNames);
        }
        return null;
    }
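    /**
     * Translates the analyzed search conditions into a MongoDB query, mapping
     * each Hive column to its MongoDB field name. Unsupported operators are
     * skipped, so the query may match a superset of the rows Hive needs.
     */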
    DBObject getFilter(
        final List<IndexSearchCondition> searchConditions,
        final Map<String, String> colToMongoNames) {
        DBObject filter = new BasicDBObject();
        for (IndexSearchCondition isc : searchConditions) {
            String comparisonName = isc.getComparisonOp();
            Object constant = isc.getConstantDesc().getValue();
            String columnName = isc.getColumnDesc().getColumn();
            String mongoName = resolveMongoName(columnName, colToMongoNames);

            if (EQUAL_OP.equals(comparisonName)) {
                filter.put(mongoName, constant);
            } else {
                String mongoOp = MONGO_OPS.get(comparisonName);
                if (mongoOp != null) {
                    filter.put(
                        mongoName, new BasicDBObject(mongoOp, constant));
                } else {
                    // Log the fact that we don't support this operator.
                    // It's still ok to return a query; we'll just return
                    // a superset of the documents needed by Hive.
                    LOG.warn("unsupported operator type: " + comparisonName);
                }
            }
        }
        return filter;
    }
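    /**
     * Builds a MongoDB projection document from the columns Hive asked to
     * read, or returns null when all columns are required. The _id field is
     * excluded unless it was requested explicitly.
     */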
    DBObject getProjection(
        final JobConf conf, final Map<String, String> colToMongoNames) {
        boolean readAllCols =
            conf.getBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, true);
        DBObject mongoProjection = null;
        if (!readAllCols) {
            String columnNamesStr =
                conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR);
            String[] columnNames =
                StringUtils.split(columnNamesStr, '\\', StringUtils.COMMA);
            boolean foundId = false;
            mongoProjection = new BasicDBObject();
            for (String col : columnNames) {
                String mapped = resolveMongoName(col, colToMongoNames);
                if ("_id".equals(mapped)) {
                    foundId = true;
                }
                mongoProjection.put(mapped, 1);
            }
            // Remove _id unless asked for explicitly.
            if (!foundId) {
                mongoProjection.put("_id", 0);
            }
        }
        return mongoProjection;
    }
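    /**
     * Parses the Hive-column-to-MongoDB-field mapping stored by BSONSerDe in
     * the job configuration, keyed by lower-cased Hive column name, or
     * returns null if no mapping was configured.
     */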
    private Map<String, String> columnMapping(final JobConf conf) {
        String colMapString = conf.get(BSONSerDe.MONGO_COLS);
        if (null == colMapString) {
            return null;
        }
        BSONObject mappingBSON = (BSONObject) JSON.parse(colMapString);
        Map<String, String> mapping = new HashMap<String, String>();
        for (String key : mappingBSON.keySet()) {
            mapping.put(key.toLowerCase(), (String) mappingBSON.get(key));
        }
        return mapping;
    }
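    /**
     * Looks up the MongoDB field name for a Hive column, falling back to the
     * column name itself when there is no mapping.
     */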
    private String resolveMongoName(
        final String colName, final Map<String, String> colNameMapping) {
        if (null == colNameMapping) {
            return colName;
        }
        String mapped = colNameMapping.get(colName);
        if (null == mapped) {
            return colName;
        }
        return mapped;
    }
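    /**
     * Calculates splits with the configured MongoSplitter and wraps each one
     * in a MongoHiveInputSplit so that Hive's unconditional calls to
     * getPath() succeed.
     */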
    @Override
    public FileSplit[] getSplits(final JobConf conf, final int numSplits)
        throws IOException {
        try {
            MongoSplitter splitterImpl = MongoSplitterFactory.getSplitter(conf);
            final List<org.apache.hadoop.mapreduce.InputSplit> splits =
                splitterImpl.calculateSplits();
            InputSplit[] splitIns = splits.toArray(new InputSplit[splits.size()]);

            // Wrap InputSplits in FileSplits so that 'getPath'
            // doesn't produce an error (Hive bug).
            FileSplit[] wrappers = new FileSplit[splitIns.length];
            Path path = new Path(conf.get(MongoStorageHandler.TABLE_LOCATION));
            for (int i = 0; i < wrappers.length; i++) {
                wrappers[i] = new MongoHiveInputSplit(splitIns[i], path);
            }

            return wrappers;
        } catch (SplitFailedException spfe) {
            // The split failed because no namespace was found
            // (so the corresponding collection doesn't exist).
            LOG.error(spfe.getMessage(), spfe);
            throw new IOException(spfe.getMessage(), spfe);
        } catch (Exception e) {
            throw new IOException(e);
        }
    }
    /**
     * MongoHiveInputSplit:
     * wraps a MongoInputSplit (as a delegate) to bypass the Hive bug where
     * 'HiveInputSplit.getPath' is always called.
     */
    public static class MongoHiveInputSplit extends FileSplit {
        private InputSplit delegate;
        private Path path;

        MongoHiveInputSplit() {
            this(new MongoInputSplit());
        }

        MongoHiveInputSplit(final InputSplit delegate) {
            this(delegate, null);
        }

        MongoHiveInputSplit(final InputSplit delegate, final Path path) {
            super(path, 0, 0, (String[]) null);
            this.delegate = delegate;
            this.path = path;
        }

        public InputSplit getDelegate() {
            return delegate;
        }
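        // Report a nominal, non-zero length for the wrapper; the delegate's
        // true size is not exposed through this FileSplit.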
        @Override
        public long getLength() {
            return 1L;
        }
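        // Serialization writes the table path followed by the delegate
        // split; readFields restores both in the same order.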
        @Override
        public void write(final DataOutput out) throws IOException {
            Text.writeString(out, path.toString());
            delegate.write(out);
        }

        @Override
        public void readFields(final DataInput in) throws IOException {
            path = new Path(Text.readString(in));
            delegate.readFields(in);
        }

        @Override
        public String toString() {
            return delegate.toString();
        }

        @Override
        public Path getPath() {
            return path;
        }
    }
}
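For illustration, below is a minimal sketch of driving this input format by hand, outside Hive's normal query path. The connection URI and table location are hypothetical placeholders; in practice MongoStorageHandler populates these properties from the Hive table definition, and filter/projection push-down only happens when Hive plans a query.

import com.mongodb.hadoop.hive.MongoStorageHandler;
import com.mongodb.hadoop.hive.input.HiveMongoInputFormat;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class HiveMongoInputFormatExample {
    public static void main(final String[] args) throws Exception {
        JobConf conf = new JobConf();
        // Hypothetical source collection; normally derived from the Hive
        // table's 'mongo.uri' property by MongoStorageHandler.
        MongoConfigUtil.setInputURI(conf, "mongodb://localhost:27017/test.users");
        // The wrapper splits report this path from getPath(); the value here
        // is a hypothetical warehouse location.
        conf.set(MongoStorageHandler.TABLE_LOCATION, "/user/hive/warehouse/users");

        HiveMongoInputFormat inputFormat = new HiveMongoInputFormat();
        FileSplit[] splits = inputFormat.getSplits(conf, 1);
        for (FileSplit split : splits) {
            RecordReader<BSONWritable, BSONWritable> reader =
                inputFormat.getRecordReader(split, conf, Reporter.NULL);
            BSONWritable key = reader.createKey();
            BSONWritable value = reader.createValue();
            // Iterate the collection's documents as BSONWritable values.
            while (reader.next(key, value)) {
                System.out.println(value);
            }
            reader.close();
        }
    }
}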