package com.mongodb.hadoop.pig;

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.hadoop.BSONFileInputFormat;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.pig.LoadFunc;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.util.Utils;

import org.bson.BSONObject;
import org.bson.BasicBSONObject;
import org.bson.types.BasicBSONList;
import org.bson.types.ObjectId;

import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
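
/**
 * Pig {@link LoadFunc} for reading raw BSON files (such as the output of
 * mongodump) through {@link BSONFileInputFormat}, turning each document
 * into a Pig tuple.
 *
 * <p>Illustrative Pig usage; the jar registration and the .bson path below
 * are placeholders, not part of this class:</p>
 *
 * <pre>
 *   -- after REGISTERing the mongo-hadoop jars:
 *   raw  = LOAD 'hdfs:///dump/mydb/mycoll.bson'
 *          USING com.mongodb.hadoop.pig.BSONLoader();
 *
 *   -- with an alias for _id and an explicit schema:
 *   docs = LOAD 'hdfs:///dump/mydb/mycoll.bson'
 *          USING com.mongodb.hadoop.pig.BSONLoader('id', 'id:chararray, name:chararray');
 * </pre>
 */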
public class BSONLoader extends LoadFunc {

    private static TupleFactory tupleFactory = TupleFactory.getInstance();
    private static BagFactory bagFactory = BagFactory.getInstance();
    private static final Log LOG = LogFactory.getLog(BSONLoader.class);

    private final BSONFileInputFormat inputFormat = new BSONFileInputFormat();

    //CHECKSTYLE:OFF
    protected RecordReader in = null;
    private ResourceFieldSchema[] fields;
    protected ResourceSchema schema = null;
    //CHECKSTYLE:ON

    private String idAlias = null;

    public BSONLoader() {
    }
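
    /**
     * @param idAlias    name in the user schema to treat as an alias for the
     *                   BSON {@code _id} key when reading documents
     * @param userSchema Pig schema string describing the fields to extract,
     *                   for example {@code id:chararray, name:chararray}
     */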
    public BSONLoader(final String idAlias, final String userSchema) {
        this.idAlias = idAlias;
        try {
            schema = new ResourceSchema(Utils.getSchemaFromString(userSchema));
            fields = schema.getFields();
        } catch (Exception e) {
            // Chain the cause so the underlying schema parse error isn't lost.
            throw new IllegalArgumentException("Invalid schema format: " + userSchema, e);
        }
    }
    @Override
    public void setLocation(final String location, final Job job) throws IOException {
        BSONFileInputFormat.setInputPaths(job, location);
    }

    @Override
    public InputFormat getInputFormat() {
        return this.inputFormat;
    }

    @Override
    public void prepareToRead(final RecordReader reader, final PigSplit split) {
        this.in = reader;
    }
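
    /**
     * Reads the next document from the underlying record reader. With no
     * user schema, the whole document is emitted as a single map field;
     * with a schema, one tuple field is produced per declared field.
     * Returns null when the input is exhausted.
     */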
    @Override
    public Tuple getNext() throws IOException {
        BSONObject val;
        try {
            if (!in.nextKeyValue()) {
                // End of input.
                return null;
            }
            val = (BSONObject) in.getCurrentValue();

            Tuple t;
            if (this.fields == null) {
                // Dynamic schema mode: output a tuple with a single element,
                // which is a map storing the keys/values of the document.
                t = tupleFactory.newTuple(1);
                t.set(0, BSONLoader.convertBSONtoPigType(val));
            } else {
                t = tupleFactory.newTuple(fields.length);
                for (int i = 0; i < fields.length; i++) {
                    String fieldTemp = fields[i].getName();
                    if (this.idAlias != null && this.idAlias.equals(fieldTemp)) {
                        // The user-supplied alias stands in for the BSON _id key.
                        fieldTemp = "_id";
                    }
                    t.set(i, BSONLoader.readField(val.get(fieldTemp), fields[i]));
                }
            }
            return t;
        } catch (InterruptedException e) {
            // Chain the cause so it shows up in Pig's error output.
            throw new ExecException("Error while reading input", 6018, e);
        }
    }
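
    /**
     * Coerces a single BSON value to the Pig type declared for its field.
     * Scalars are parsed from the value's string form; tuples, bags and maps
     * recurse into their element schemas. On a type mismatch, logs a warning
     * and returns null rather than failing the load.
     */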
    @SuppressWarnings({"rawtypes", "unchecked"})
    protected static Object readField(final Object obj, final ResourceFieldSchema field) throws IOException {
        if (obj == null) {
            return null;
        }
        try {
            if (field == null) {
                // No declared type; pass the value through untyped.
                return obj;
            }
            switch (field.getType()) {
                case DataType.INTEGER:
                    return Integer.parseInt(obj.toString());
                case DataType.LONG:
                    return Long.parseLong(obj.toString());
                case DataType.FLOAT:
                    return Float.parseFloat(obj.toString());
                case DataType.DOUBLE:
                    return Double.parseDouble(obj.toString());
                case DataType.BYTEARRAY:
                    return BSONLoader.convertBSONtoPigType(obj);
                case DataType.CHARARRAY:
                    return obj.toString();
                case DataType.TUPLE:
                    ResourceSchema s = field.getSchema();
                    ResourceFieldSchema[] fs = s.getFields();
                    Tuple t = tupleFactory.newTuple(fs.length);
                    BasicDBObject val = (BasicDBObject) obj;
                    for (int j = 0; j < fs.length; j++) {
                        t.set(j, readField(val.get(fs[j].getName()), fs[j]));
                    }
                    return t;
                case DataType.BAG:
                    // A bag's schema wraps a single tuple field; unwrap it to
                    // reach the schema of the tuples inside the bag.
                    s = field.getSchema();
                    fs = s.getFields();
                    s = fs[0].getSchema();
                    fs = s.getFields();
                    DataBag bag = bagFactory.newDefaultBag();
                    BasicDBList vals = (BasicDBList) obj;
                    for (Object val1 : vals) {
                        t = tupleFactory.newTuple(fs.length);
                        for (int k = 0; k < fs.length; k++) {
                            t.set(k, readField(((BasicDBObject) val1).get(fs[k].getName()), fs[k]));
                        }
                        bag.add(t);
                    }
                    return bag;
                case DataType.MAP:
                    // A map schema declares at most one value type; apply it
                    // to every entry, or pass values through untyped.
                    s = field.getSchema();
                    fs = s != null ? s.getFields() : null;
                    BasicBSONObject inputMap = (BasicBSONObject) obj;
                    Map outputMap = new HashMap();
                    for (String key : inputMap.keySet()) {
                        if (fs != null) {
                            outputMap.put(key, readField(inputMap.get(key), fs[0]));
                        } else {
                            outputMap.put(key, readField(inputMap.get(key), null));
                        }
                    }
                    return outputMap;
                default:
                    LOG.info("No explicit conversion for type of field " + field.getName() + "; using default BSON conversion");
                    return BSONLoader.convertBSONtoPigType(obj);
            }
        } catch (Exception e) {
            String fieldName = field.getName() == null ? "" : field.getName();
            String type = DataType.genTypeToNameMap().get(field.getType());
            LOG.warn("Type " + type + " for field " + fieldName + " cannot be applied to " + obj.getClass().toString());
            return null;
        }
    }
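
    /**
     * Default conversion from a raw BSON value to a Pig-friendly type:
     * numbers and strings pass through, a Date becomes its millisecond
     * timestamp, an ObjectId becomes its string form, a BSON list becomes
     * a tuple, and an embedded document becomes a HashMap, recursively.
     */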
    public static Object convertBSONtoPigType(final Object o) throws ExecException {
        if (o == null) {
            return null;
        } else if (o instanceof Number || o instanceof String) {
            return o;
        } else if (o instanceof Date) {
            return ((Date) o).getTime();
        } else if (o instanceof ObjectId) {
            return o.toString();
        } else if (o instanceof BasicBSONList) {
            BasicBSONList bl = (BasicBSONList) o;
            Tuple t = tupleFactory.newTuple(bl.size());
            for (int i = 0; i < bl.size(); i++) {
                t.set(i, convertBSONtoPigType(bl.get(i)));
            }
            return t;
        } else if (o instanceof Map) {
            //TODO make this more efficient for lazy objects?
            Map fieldsMap = (Map) o;
            HashMap pigMap = new HashMap(fieldsMap.size());
            for (Map.Entry field : fieldsMap.entrySet()) {
                pigMap.put(field.getKey(), convertBSONtoPigType(field.getValue()));
            }
            return pigMap;
        } else {
            return o;
        }
    }
}