Please wait. This can take some minutes ...
Downloading a project requires a lot of resources. Please understand that we have to cover our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it as often as you want.
com.mongodb.hadoop.hive.BSONSerDe Maven / Gradle / Ivy
Go to download
The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.
/*
* Copyright 2010-2013 10gen Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mongodb.hadoop.hive;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.util.JSON;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.Writable;
import org.bson.BSONObject;
import org.bson.BasicBSONObject;
import org.bson.types.BSONTimestamp;
import org.bson.types.BasicBSONList;
import org.bson.types.ObjectId;
import org.bson.types.Symbol;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import static java.lang.String.format;
/**
* The BSONSerDe class deserializes (parses) BSON documents into Hive row objects and serializes Hive rows back into BSON. It is
* initialized with the Hive column names and Hive-recognized types, as well as other config variables mandated by the StorageHandlers.
*/
public class BSONSerDe implements SerDe {
private static final Log LOG = LogFactory.getLog(BSONSerDe.class);

// Name of the table property that holds the mapping of Hive columns to MongoDB fields (a JSON document)
public static final String MONGO_COLS = "mongo.columns.mapping";

// ObjectId should be translated to a struct, these are
// the pre-defined field names and values identifying
// that struct as an ObjectId struct
private static final int BSON_NUM = 8;
private static final String OID = "oid";
private static final String BSON_TYPE = "bsontype";

// Type information and object inspector describing a whole row; both are built in initialize()
private StructTypeInfo docTypeInfo;
private ObjectInspector docOI;

//CHECKSTYLE:OFF
// Hive column names and their types, in table order
public List<String> columnNames;
public List<TypeInfo> columnTypes;

// maps hive columns to fields in a MongoDB collection; stays null when no mapping was configured
public Map<String, String> hiveToMongo;
//CHECKSTYLE:ON

// A row represents a row in the Hive table; the same list is cleared and refilled by every deserialize() call
private final List<Object> row = new ArrayList<Object>();
/**
 * Finds out the information of the table, including the column names and types, and registers the optional
 * mapping of Hive columns to MongoDB fields given in the {@link #MONGO_COLS} table property.
 *
 * @param conf     the Hadoop configuration (not read by this method)
 * @param tblProps the table properties holding the column names, the column types and the optional column mapping
 * @throws SerDeException if the column names or types are missing, if their counts differ, or if the column
 *                        mapping is not a JSON object
 */
@SuppressWarnings("unchecked")
@Override
public void initialize(final Configuration conf, final Properties tblProps) throws SerDeException {
    String colNamesStr = tblProps.getProperty(serdeConstants.LIST_COLUMNS);
    String colTypesStr = tblProps.getProperty(serdeConstants.LIST_COLUMN_TYPES);
    if (colNamesStr == null || colTypesStr == null) {
        throw new SerDeException(format("Table properties %s and %s must both be set",
                                        serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES));
    }

    // Get the table column names; an empty property means "no columns", not one column with an empty name
    colNamesStr = colNamesStr.trim();
    if (colNamesStr.length() == 0) {
        columnNames = new ArrayList<String>();
    } else {
        // regex used to split column names between commas
        columnNames = Arrays.asList(colNamesStr.split("\\s*,\\s*"));
    }

    // Get mappings specified by the user. getProperty() is used (rather than containsKey()) so that
    // values supplied through the Properties defaults are honoured as well.
    String mongoFieldsStr = tblProps.getProperty(MONGO_COLS);
    if (mongoFieldsStr != null) {
        Object rules;
        try {
            rules = JSON.parse(mongoFieldsStr);
        } catch (RuntimeException e) {
            throw new SerDeException("Could not parse " + MONGO_COLS + " as JSON: " + mongoFieldsStr, e);
        }
        if (!(rules instanceof BasicBSONObject)) {
            throw new SerDeException(MONGO_COLS + " must be a JSON object, but was: " + mongoFieldsStr);
        }
        // register the hive field mappings to mongo field mappings
        hiveToMongo = new HashMap<String, String>();
        registerMappings(((BasicBSONObject) rules).toMap());
    }

    // Get the table column types
    columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(colTypesStr);
    if (columnNames.size() != columnTypes.size()) {
        throw new SerDeException("Column Names and Types don't match in size");
    }

    // Get the structure and object inspector
    docTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
    docOI = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(docTypeInfo);
}
/**
 * Takes in the object represented by JSON for Hive to Mongo/BSON mapping. Records these mappings and infers upper level
 * mappings from lower level declarations: a rule {@code "a.b.c" -> "x.y.z"} also registers {@code "a" -> "x"} and
 * {@code "a.b" -> "x.y"}. Hive names are stored lower-cased.
 *
 * @param rules the parsed mapping document, keyed by Hive column path with the MongoDB field path as value
 * @throws SerDeException if a value is not a string, if a Hive path and its MongoDB path differ in depth, or if two
 *                        rules map the same Hive path to different MongoDB paths
 */
private void registerMappings(final Map<String, ?> rules) throws SerDeException {
    for (Entry<String, ?> rule : rules.entrySet()) {
        if (!(rule.getValue() instanceof String)) {
            throw new SerDeException("Mapping for " + rule.getKey() + " must be a string, but was " + rule.getValue());
        }
        // Lower-case once, so that the ambiguity check and the stored key always agree
        String hiveName = rule.getKey().toLowerCase();
        String mongoName = (String) rule.getValue();
        putMapping(hiveName, mongoName);

        if (!hiveName.contains(".")) {
            continue;
        }

        // explode/infer shorter mappings
        String[] hiveParts = hiveName.split("\\.");
        String[] mongoParts = mongoName.split("\\.");
        if (hiveParts.length != mongoParts.length) {
            throw new SerDeException(rule.getKey() + " should be of same depth as " + mongoName);
        }
        StringBuilder hivePrefix = new StringBuilder();
        StringBuilder mongoPrefix = new StringBuilder();
        for (int i = 0; i < hiveParts.length - 1; i++) {
            if (i > 0) {
                hivePrefix.append('.');
                mongoPrefix.append('.');
            }
            hivePrefix.append(hiveParts[i]);
            mongoPrefix.append(mongoParts[i]);
            putMapping(hivePrefix.toString(), mongoPrefix.toString());
        }
    }
}

/**
 * Records a single Hive-to-MongoDB mapping, rejecting it when the Hive name is already mapped to another field.
 */
private void putMapping(final String hiveName, final String mongoName) throws SerDeException {
    Object existing = hiveToMongo.get(hiveName);
    if (existing != null && !existing.equals(mongoName)) {
        throw new SerDeException("Ambiguous rule definition for " + hiveName);
    }
    hiveToMongo.put(hiveName, mongoName);
}
/**
 * Given a Writable object of BSON, turn it into a Hive table row.
 *
 * A column whose value cannot be read or converted is set to null and a warning is logged, so one bad field does not
 * fail the whole row.
 *
 * @param writable the record to convert; must be a BSONWritable
 * @return the row as a list of column values; the same list instance is reused across calls
 * @throws SerDeException if {@code writable} is null or not a BSONWritable
 */
@Override
//CHECKSTYLE:OFF
public Object deserialize(final Writable writable) throws SerDeException {
    //CHECKSTYLE:ON
    row.clear();

    // Make sure it's a BSONWritable object (instanceof also rejects null)
    if (!(writable instanceof BSONWritable)) {
        throw new SerDeException(format("%s requires a BSONWritable object, not %s", getClass(),
                                        writable == null ? null : writable.getClass()));
    }
    BSONObject doc = ((BSONWritable) writable).getDoc();

    // For each field, cast it to a HIVE type and add to the current row
    for (String fieldName : docTypeInfo.getAllStructFieldNames()) {
        Object value;
        try {
            TypeInfo fieldTypeInfo = docTypeInfo.getStructFieldTypeInfo(fieldName);

            // get the corresponding field name in MongoDB
            String mongoMapping = fieldName;
            if (hiveToMongo != null && hiveToMongo.containsKey(fieldName)) {
                mongoMapping = hiveToMongo.get(fieldName);
            }
            value = deserializeField(getValue(doc, mongoMapping), fieldTypeInfo, fieldName);
        } catch (Exception e) {
            // Best effort: keep the row, null the column, but keep the cause visible in the log
            LOG.warn("Could not find the appropriate field for name " + fieldName, e);
            value = null;
        }
        row.add(value);
    }
    return row;
}
/**
 * Looks up a possibly nested field of a document.
 *
 * @param doc          the document to read from; may be null
 * @param mongoMapping the field name, with "." separating the levels of embedded documents
 * @return the value stored at that path, or null when the document, any intermediate level or the field itself is
 *         missing, or when an intermediate level is not an embedded document
 */
private Object getValue(final BSONObject doc, final String mongoMapping) {
    BSONObject current = doc;
    String path = mongoMapping;
    // Walk down one embedded document per path segment until only the leaf name is left
    for (int dot = path.indexOf('.'); dot != -1; dot = path.indexOf('.')) {
        Object child = current == null ? null : current.get(path.substring(0, dot));
        if (!(child instanceof BSONObject)) {
            return null;
        }
        current = (BSONObject) child;
        path = path.substring(dot + 1);
    }
    return current == null ? null : current.get(path);
}
/**
 * Converts a single BSON value into its Hive representation, dispatching on the category of the declared Hive type.
 *
 * Hive map values must all share one type, so an embedded document is normally declared as a struct instead.
 *
 * @param value         the BSON value; null is returned unchanged
 * @param valueTypeInfo the Hive type declared for the value
 * @param ext           the hive mapping(s) seen so far before 'value' is encountered
 * @return the Hive object, or null for a null value, a union type or an unsupported value
 */
public Object deserializeField(final Object value, final TypeInfo valueTypeInfo, final String ext) {
    if (value == null) {
        return null;
    }

    switch (valueTypeInfo.getCategory()) {
        case PRIMITIVE:
            return deserializePrimitive(value, (PrimitiveTypeInfo) valueTypeInfo);
        case STRUCT:
            // Supports both struct and map, but should use struct
            return deserializeStruct(value, (StructTypeInfo) valueTypeInfo, ext);
        case LIST:
            return deserializeList(value, (ListTypeInfo) valueTypeInfo, ext);
        case MAP:
            return deserializeMap(value, (MapTypeInfo) valueTypeInfo, ext);
        case UNION:
            // Mongo also has no union
            LOG.warn("BSONSerDe does not support unions.");
            return null;
        default:
            // Must be an unknown (a Mongo specific type)
            return deserializeMongoType(value);
    }
}
/**
 * Deserialize a List with the same listElemTypeInfo for its elements.
 *
 * The converted elements are written to a new array; the source list is left untouched.
 *
 * @param value         the BSON list to convert
 * @param valueTypeInfo the Hive list type, which supplies the element type
 * @param ext           the hive mapping(s) seen so far before 'value' is encountered
 * @return an Object[] holding the converted elements in their original order
 */
private Object deserializeList(final Object value, final ListTypeInfo valueTypeInfo, final String ext) {
    List<?> list = (List<?>) value;
    TypeInfo listElemTypeInfo = valueTypeInfo.getListElementTypeInfo();

    Object[] converted = new Object[list.size()];
    for (int i = 0; i < converted.length; i++) {
        converted[i] = deserializeField(list.get(i), listElemTypeInfo, ext);
    }
    return converted;
}
/**
 * Deserialize the struct stored in 'value'. An ObjectId is turned into the special ObjectId struct; any other value
 * is treated as an embedded document whose fields are looked up by their mapped MongoDB names.
 *
 * @param value         the ObjectId or embedded document to convert
 * @param valueTypeInfo the Hive struct type declaring the field names and types
 * @param ext           the hive mapping(s) seen so far before 'value' is encountered
 * @return the list of converted field values, in the order the struct declares them
 */
@SuppressWarnings("unchecked")
private Object deserializeStruct(final Object value, final StructTypeInfo valueTypeInfo, final String ext) {
    // ObjectId will be stored in a special struct
    if (value instanceof ObjectId) {
        return deserializeObjectId(value, valueTypeInfo);
    }

    // Read through the BSONObject interface when possible (as getValue does), so a document does not
    // have to be a java.util.Map; plain maps are still accepted.
    BSONObject bson = value instanceof BSONObject ? (BSONObject) value : null;
    Map<String, Object> map = bson == null ? (Map<String, Object>) value : null;

    List<String> structNames = valueTypeInfo.getAllStructFieldNames();
    List<TypeInfo> structTypes = valueTypeInfo.getAllStructFieldTypeInfos();
    List<Object> struct = new ArrayList<Object>(structNames.size());

    for (int i = 0; i < structNames.size(); i++) {
        String fieldName = structNames.get(i);

        // hiveMapping -> prefixed by parent struct names.
        // For example, in {"wife":{"name":{"first":"Sydney"}}},
        // the hiveMapping of "first" is "wife.name.first"
        String hiveMapping = ext.length() == 0 ? fieldName : ext + "." + fieldName;

        // get the corresponding field name in MongoDB
        String mongoMapping = hiveMapping;
        if (hiveToMongo != null) {
            if (hiveToMongo.containsKey(hiveMapping)) {
                mongoMapping = hiveToMongo.get(hiveMapping);
            } else if (ext.length() > 0 && hiveToMongo.containsKey(ext)) {
                // Only the parent is mapped: the field keeps its own name under the parent's MongoDB path
                mongoMapping = hiveToMongo.get(ext) + "." + fieldName;
            }
        }

        String nextFieldTrans = extractMongoField(mongoMapping, hiveMapping, ext);
        Object fieldValue;
        if (nextFieldTrans == null) {
            fieldValue = null;
        } else {
            fieldValue = bson != null ? bson.get(nextFieldTrans) : map.get(nextFieldTrans);
        }
        struct.add(deserializeField(fieldValue, structTypes.get(i), hiveMapping));
    }
    return struct;
}
/**
 * Gets the next field to be extracted in the process of (recursively) mapping fields in
 * MongoDB to Hive struct field names.
 *
 * The Hive path is walked segment by segment; once the prefix walked so far equals {@code ext}, the MongoDB segment
 * that follows that prefix is the field to read from the current document.
 *
 * @param mongoMapping the full, dot-separated MongoDB path of the field
 * @param hiveMapping  the full, dot-separated Hive path of the field
 * @param ext          the Hive path of the enclosing struct; empty when the field is at the top level
 * @return the MongoDB field name to read next, or null when {@code ext} is not a prefix of {@code hiveMapping} within
 *         the depth of {@code mongoMapping}
 */
private String extractMongoField(final String mongoMapping, final String hiveMapping, final String ext) {
    String[] splitMongo = mongoMapping.split("\\.");

    // No enclosing struct: the field sits directly in the current document, so its first segment is the one to read
    if (ext.length() == 0) {
        return splitMongo.length > 0 ? splitMongo[0] : null;
    }

    String[] splitHive = hiveMapping.split("\\.");
    // Never index past either array, even if the two paths differ in depth
    int depth = Math.min(splitMongo.length - 1, splitHive.length);
    StringBuilder hiveSeen = new StringBuilder();
    for (int i = 0; i < depth; i++) {
        if (i > 0) {
            hiveSeen.append('.');
        }
        hiveSeen.append(splitHive[i]);
        if (hiveSeen.toString().equals(ext)) {
            return splitMongo[i + 1];
        }
    }
    return null;
}
/**
 * Also deserialize a Map with the same mapElemTypeInfo.
 *
 * The values are converted on the map obtained from {@code toMap()}, so the source document is not rewritten.
 *
 * @param value         the embedded document to convert
 * @param valueTypeInfo the Hive map type, which supplies the value type
 * @param ext           the hive mapping(s) seen so far before 'value' is encountered
 * @return a map from field name to converted value
 */
@SuppressWarnings("unchecked")
private Object deserializeMap(final Object value, final MapTypeInfo valueTypeInfo, final String ext) {
    TypeInfo mapValueTypeInfo = valueTypeInfo.getMapValueTypeInfo();

    Map<String, Object> converted = ((BasicBSONObject) value).toMap();
    for (Entry<String, Object> entry : converted.entrySet()) {
        entry.setValue(deserializeField(entry.getValue(), mapValueTypeInfo, ext));
    }
    return converted;
}
/**
 * Most primitives are included, but some are specific to Mongo instances.
 *
 * Numeric values are converted to the Java type of the declared Hive column (for example an Integer stored in
 * MongoDB that is read into a bigint column becomes a Long). Values that are not numbers are passed through
 * unchanged for numeric columns.
 *
 * @param value         the non-null BSON value
 * @param valueTypeInfo the declared Hive primitive type
 * @return the value in the Java type matching the Hive type, or null for an unsupported Mongo specific value
 */
private Object deserializePrimitive(final Object value, final PrimitiveTypeInfo valueTypeInfo) {
    final Number number = value instanceof Number ? (Number) value : null;

    switch (valueTypeInfo.getPrimitiveCategory()) {
        case BINARY:
        case BOOLEAN:
            return value;
        case DOUBLE:
            return number == null ? value : Double.valueOf(number.doubleValue());
        case FLOAT:
            return number == null ? value : Float.valueOf(number.floatValue());
        case INT:
            return number == null ? value : Integer.valueOf(number.intValue());
        case LONG:
            return number == null ? value : Long.valueOf(number.longValue());
        case SHORT:
            return number == null ? value : Short.valueOf(number.shortValue());
        case STRING:
            return value.toString();
        case TIMESTAMP:
            if (value instanceof Date) {
                return new Timestamp(((Date) value).getTime());
            } else if (value instanceof BSONTimestamp) {
                // getTime() is multiplied by 1000 to obtain the milliseconds Timestamp expects
                return new Timestamp(((BSONTimestamp) value).getTime() * 1000L);
            } else {
                return value;
            }
        default:
            return deserializeMongoType(value);
    }
}
/**
 * Fallback for Mongo specific types: picks the closest plain Java representation.
 *
 * @param value the non-null value to convert
 * @return the string form of a Symbol; null (after logging an error) for anything else
 */
private Object deserializeMongoType(final Object value) {
    if (!(value instanceof Symbol)) {
        LOG.error("Unable to parse " + value + " for type " + value.getClass());
        return null;
    }
    return value.toString();
}
/**
 * Parses an ObjectId into the corresponding struct declared in Hive.
 *
 * The "oid" field receives the string form of the ObjectId and the "bsontype" field receives the ObjectId marker
 * value. Any other declared field is filled with null so that every value stays at the position of its field.
 *
 * @param value         the ObjectId to convert
 * @param valueTypeInfo the Hive struct type declared for the column
 * @return the struct values, one per declared field and in declaration order
 */
private Object deserializeObjectId(final Object value, final StructTypeInfo valueTypeInfo) {
    List<String> structNames = valueTypeInfo.getAllStructFieldNames();
    List<Object> struct = new ArrayList<Object>(structNames.size());
    for (String structName : structNames) {
        if (OID.equals(structName)) {
            struct.add(value.toString());
        } else if (BSON_TYPE.equals(structName)) {
            // the bson type is an int order type
            // http://docs.mongodb.org.manual/faq/developers/
            struct.add(BSON_NUM);
        } else {
            // Unknown field: keep the positions of the remaining values aligned with the struct definition
            struct.add(null);
        }
    }
    return struct;
}
/**
 * Hands out the object inspector for a whole row.
 *
 * @return the inspector created by initialize(); null if initialize() has not been called yet
 */
@Override
//CHECKSTYLE:OFF
public ObjectInspector getObjectInspector() throws SerDeException {
    //CHECKSTYLE:ON
    return this.docOI;
}
/**
 * This SerDe does not collect statistics.
 *
 * @return always null
 */
@Override
public SerDeStats getSerDeStats() {
    return null;
}
@Override
public Class extends Writable> getSerializedClass() {
return BSONWritable.class;
}
//CHECKSTYLE:OFF
/**
 * Turns a Hive row into a BSON document wrapped in a BSONWritable.
 *
 * @param obj the Hive row
 * @param oi  the inspector for the row; must be a StructObjectInspector
 * @return a new BSONWritable holding the document built from the row
 */
@Override
public Writable serialize(final Object obj, final ObjectInspector oi) throws SerDeException {
    final StructObjectInspector rowInspector = (StructObjectInspector) oi;
    // The empty prefix marks the row itself as the top-level struct
    final BSONObject document = (BSONObject) serializeStruct(obj, rowInspector, "");
    return new BSONWritable(document);
}
//CHECKSTYLE:ON
/**
 * Converts a Hive object into a value that can be stored in a BSON document, dispatching on the category reported
 * by its object inspector.
 *
 * @param obj the Hive object to convert
 * @param oi  the inspector describing {@code obj}
 * @param ext the hive mapping(s) seen so far before 'obj' is encountered
 * @return the BSON-compatible value, or null (after logging an error) for unions and unknown categories
 */
public Object serializeObject(final Object obj, final ObjectInspector oi, final String ext) {
    switch (oi.getCategory()) {
        case LIST:
            return serializeList(obj, (ListObjectInspector) oi, ext);
        case MAP:
            return serializeMap(obj, (MapObjectInspector) oi, ext);
        case PRIMITIVE:
            return serializePrimitive(obj, (PrimitiveObjectInspector) oi);
        case STRUCT:
            return serializeStruct(obj, (StructObjectInspector) oi, ext);
        case UNION:
        default:
            // Report the category that could not be handled, not the object a second time
            LOG.error("Cannot serialize " + obj + " of type " + oi.getCategory());
            return null;
    }
}
/**
 * Turn a Hive list into a BasicBSONList, serializing every element with the list's element inspector.
 *
 * @param obj the Hive list object
 * @param oi  the inspector for the list
 * @param ext the hive mapping(s) seen so far before 'obj' is encountered
 * @return the BSON list; empty when the inspector reports no list for {@code obj}
 */
private Object serializeList(final Object obj, final ListObjectInspector oi, final String ext) {
    BasicBSONList list = new BasicBSONList();
    List<?> field = oi.getList(obj);
    if (field == null) {
        return list;
    }

    ObjectInspector elemOI = oi.getListElementObjectInspector();
    for (Object elem : field) {
        list.add(serializeObject(elem, elemOI, ext));
    }
    return list;
}
/**
* Turn struct obj into a BasicBSONObject
*/
private Object serializeStruct(final Object obj, final StructObjectInspector structOI, final String ext) {
if (ext.length() > 0 && isObjectIdStruct(obj, structOI)) {
String objectIdString = "";
for (StructField s : structOI.getAllStructFieldRefs()) {
if (s.getFieldName().equals(OID)) {
objectIdString = structOI.getStructFieldData(obj, s).toString();
break;
}
}
return new ObjectId(objectIdString);
} else {
BasicBSONObject bsonObject = new BasicBSONObject();
// fields is the list of all variable names and information within the struct obj
List extends StructField> fields = structOI.getAllStructFieldRefs();
for (int i = 0; i < fields.size(); i++) {
StructField field = fields.get(i);
String fieldName, hiveMapping;
// get corresponding mongoDB field
if (ext.length() == 0) {
fieldName = columnNames.get(i);
hiveMapping = fieldName;
} else {
fieldName = field.getFieldName();
hiveMapping = ext + "." + fieldName;
}
ObjectInspector fieldOI = field.getFieldObjectInspector();
Object fieldObj = structOI.getStructFieldData(obj, field);
if (hiveToMongo != null && hiveToMongo.containsKey(hiveMapping)) {
String mongoMapping = hiveToMongo.get(hiveMapping);
int lastDotPos = mongoMapping.lastIndexOf(".");
String lastMapping = lastDotPos == -1 ? mongoMapping : mongoMapping.substring(lastDotPos + 1);
bsonObject.put(lastMapping,
serializeObject(fieldObj, fieldOI, hiveMapping));
} else {
bsonObject.put(fieldName,
serializeObject(fieldObj, fieldOI, hiveMapping));
}
}
return bsonObject;
}
}
/**
* Given a struct, look to se if it contains the fields that a ObjectId struct should contain
*/
private boolean isObjectIdStruct(final Object obj, final StructObjectInspector structOI) {
List extends StructField> fields = structOI.getAllStructFieldRefs();
// If the struct are of incorrect size, then there's no need to create
// a list of names
if (fields.size() != 2) {
return false;
}
boolean hasOID = false;
boolean isBSONType = false;
for (StructField s : fields) {
String fieldName = s.getFieldName();
if (fieldName.equals(OID)) {
hasOID = true;
} else if (fieldName.equals(BSON_TYPE)) {
String num = structOI.getStructFieldData(obj, s).toString();
isBSONType = Integer.parseInt(num) == BSON_NUM;
}
}
return hasOID && isBSONType;
}
/**
* For a map of convert to an embedded document
*/
private Object serializeMap(final Object obj, final MapObjectInspector mapOI, final String ext) {
BasicBSONObject bsonObject = new BasicBSONObject();
ObjectInspector mapValOI = mapOI.getMapValueObjectInspector();
// Each value is guaranteed to be of the same type
for (Entry, ?> entry : mapOI.getMap(obj).entrySet()) {
String field = entry.getKey().toString();
Object value = serializeObject(entry.getValue(), mapValOI, ext);
bsonObject.put(field, value);
}
return bsonObject;
}
/**
 * Converts a Hive primitive into a value Mongo can store. A timestamp becomes a java.util.Date; every other
 * primitive is stored as the Java object reported by its inspector.
 *
 * @param obj the Hive primitive object
 * @param oi  the inspector for the primitive
 * @return the value to store, or null when the primitive is null
 */
private Object serializePrimitive(final Object obj, final PrimitiveObjectInspector oi) {
    final boolean isTimestamp =
        oi.getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMP;
    final Object javaValue = oi.getPrimitiveJavaObject(obj);

    if (!isTimestamp || javaValue == null) {
        return javaValue;
    }
    final Timestamp stamp = (Timestamp) javaValue;
    return new Date(stamp.getTime());
}
}