
// MongoInputSplit.java
/*
* Copyright 2010 10gen Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mongodb.hadoop.input;

import com.mongodb.*;
import com.mongodb.hadoop.util.*;

import org.apache.commons.logging.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.bson.*;

import java.io.*;
import java.util.*;

public class MongoInputSplit extends InputSplit implements Writable, org.apache.hadoop.mapred.InputSplit {
    public MongoInputSplit( MongoURI inputURI,
                            String keyField,
                            DBObject query,
                            DBObject fields,
                            DBObject sort,
                            Object specialMin,
                            Object specialMax,
                            int limit,
                            int skip,
                            boolean noTimeout ){
        log.debug( "Creating a new MongoInputSplit for MongoURI '"
                   + inputURI + "', keyField: " + keyField + ", query: '" + query + "', fieldSpec: '" + fields
                   + "', sort: '" + sort + "', limit: " + limit + ", skip: " + skip + ", noTimeout? " + noTimeout + "." );
        _mongoURI = inputURI;
        _keyField = keyField;
        _querySpec = query;
        _fieldSpec = fields;
        _sortSpec = sort;
        _limit = limit;
        _skip = skip;
        _notimeout = noTimeout;
        _specialMin = specialMin;
        _specialMax = specialMax;
        // Eagerly initialize the cursor and the BSON codecs so that
        // misconfiguration surfaces here rather than at first read.
        getCursor();
        getBSONDecoder();
        getBSONEncoder();
    }
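
    /*
     * Illustrative sketch (not part of the original source): how an InputFormat
     * might construct a split over a collection. The URI, namespace, and field
     * names here are hypothetical, and because the constructor opens a cursor
     * eagerly, running this requires a reachable mongod.
     */
    static MongoInputSplit exampleSplitSketch(){
        MongoURI uri = new MongoURI( "mongodb://localhost:27017/test.lines" ); // hypothetical namespace
        DBObject query = new BasicDBObject( "processed", false );  // filter evaluated server-side
        DBObject fields = new BasicDBObject();                     // empty projection = all fields
        DBObject sort = new BasicDBObject( "_id", 1 );             // stable iteration order
        return new MongoInputSplit( uri, "_id", query, fields, sort,
                                    null /* specialMin */, null /* specialMax */,
                                    0 /* limit */, 0 /* skip */, true /* noTimeout */ );
    }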
    /**
     * This is supposed to return the size of the split in bytes, but the number
     * of documents (let alone bytes) in a split is not known up front, so we
     * return Long.MAX_VALUE as a placeholder.
     *
     * @return the split length in bytes (always Long.MAX_VALUE here)
     */
    @Override
    public long getLength(){
        return Long.MAX_VALUE;
    }

    @Override
    public String[] getLocations(){
        return _mongoURI.getHosts().toArray( new String[_mongoURI.getHosts().size()] );
    }
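
    /*
     * Note (added for clarity): Hadoop treats the host names returned by
     * getLocations() as locality hints when scheduling map tasks, so returning
     * the MongoDB host list lets tasks run near the data whenever Hadoop
     * workers and mongod instances share machines.
     */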
    /**
     * Serialize the Split instance as a single BSON document.
     */
    public void write( final DataOutput out ) throws IOException{
        BSONEncoder enc = getBSONEncoder();
        BSONObject spec = BasicDBObjectBuilder.start()
            .add( "uri", _mongoURI.toString() )
            .add( "key", _keyField )
            .add( "query", _querySpec )
            .add( "field", _fieldSpec )
            .add( "sort", _sortSpec )
            .add( "limit", _limit )
            .add( "skip", _skip )
            .add( "specialMin", _specialMin )
            .add( "specialMax", _specialMax )
            .add( "notimeout", _notimeout ).get();
        // The encoded document carries its own length prefix, so the raw bytes
        // are all readFields() needs to reconstruct it.
        byte[] buf = enc.encode( spec );
        out.write( buf );
    }
    public void readFields( DataInput in ) throws IOException{
        BSONDecoder dec = getBSONDecoder();
        BSONCallback cb = new BasicBSONCallback();
        BSONObject spec;
        // Read the BSON length from the start of the record. A BSON document is
        // prefixed with its total size as a little-endian int32, and that size
        // includes the four prefix bytes themselves.
        byte[] l = new byte[4];
        try {
            in.readFully( l );
            int dataLen = org.bson.io.Bits.readInt( l );
            if ( log.isDebugEnabled() ) log.debug( "*** Expected DataLen: " + dataLen );
            // dataLen covers the whole document, prefix included, so the buffer
            // needs exactly dataLen bytes: the 4 already read plus dataLen - 4 more.
            byte[] data = new byte[dataLen];
            System.arraycopy( l, 0, data, 0, 4 );
            in.readFully( data, 4, dataLen - 4 );
            dec.decode( data, cb );
            spec = (BSONObject) cb.get();
            if ( log.isTraceEnabled() ) log.trace( "Decoded a BSON Object: " + spec );
        }
        catch ( Exception e ) {
            /* If we can't read another length it's not an error, just return quietly. */
            // TODO - Figure out how to gracefully mark this as an empty split
            log.info( "No length header available: " + e );
            spec = new BasicDBObject();
        }
        _mongoURI = new MongoURI( (String) spec.get( "uri" ) );
        _keyField = (String) spec.get( "key" );
        _querySpec = new BasicDBObject( ((BSONObject) spec.get( "query" )).toMap() );
        _fieldSpec = new BasicDBObject( ((BSONObject) spec.get( "field" )).toMap() );
        _sortSpec = new BasicDBObject( ((BSONObject) spec.get( "sort" )).toMap() );
        _specialMin = spec.get( "specialMin" );
        _specialMax = spec.get( "specialMax" );
        _limit = (Integer) spec.get( "limit" );
        _skip = (Integer) spec.get( "skip" );
        _notimeout = (Boolean) spec.get( "notimeout" );
        getCursor();
        log.info( "Deserialized MongoInputSplit ... { length = " + getLength() + ", locations = "
                  + Arrays.toString( getLocations() ) + ", keyField = " + _keyField + ", query = " + _querySpec
                  + ", fields = " + _fieldSpec + ", sort = " + _sortSpec + ", limit = " + _limit + ", skip = "
                  + _skip + ", noTimeout = " + _notimeout + ", specialMin = " + _specialMin
                  + ", specialMax = " + _specialMax + " }" );
    }
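
    /*
     * Illustrative sketch (not part of the original source): a write()/readFields()
     * round trip over an in-memory stream, showing that the single BSON document is
     * self-delimiting. Note that readFields() ends by calling getCursor(), so this
     * only runs against a reachable mongod.
     */
    static MongoInputSplit roundTripSketch( MongoInputSplit split ) throws IOException{
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        split.write( new DataOutputStream( bytes ) );
        MongoInputSplit copy = new MongoInputSplit();
        copy.readFields( new DataInputStream( new ByteArrayInputStream( bytes.toByteArray() ) ) );
        return copy;
    }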
    public DBCursor getCursor(){
        // Return the cursor with the split's query, etc. already slotted in for them.
        // todo - support limit/skip
        if ( _cursor == null ){
            log.debug( "Reading data from " + _mongoURI );
            _cursor = MongoConfigUtil.getCollection( _mongoURI ).find( _querySpec, _fieldSpec ).sort( _sortSpec );
            if ( _notimeout ) _cursor.setOptions( Bytes.QUERYOPTION_NOTIMEOUT );
            if ( _specialMin != null ) _cursor.addSpecial( "$min", _specialMin );
            if ( _specialMax != null ) _cursor.addSpecial( "$max", _specialMax );
            _cursor.slaveOk();
        }
        return _cursor;
    }
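
    /*
     * Note (added for clarity): $min (inclusive) and $max (exclusive) constrain
     * the cursor to an index key range; this is how a collection is carved into
     * non-overlapping splits. For example, bounds { _id: 100 } and { _id: 200 }
     * restrict this split to documents with 100 <= _id < 200.
     */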
    BSONEncoder getBSONEncoder(){
        if ( _bsonEncoder == null )
            _bsonEncoder = new BasicBSONEncoder();
        return _bsonEncoder;
    }

    BSONDecoder getBSONDecoder(){
        if ( _bsonDecoder == null )
            _bsonDecoder = new BasicBSONDecoder();
        return _bsonDecoder;
    }
    @Override
    public String toString(){
        return "MongoInputSplit{URI=" + _mongoURI.toString()
               + ", keyField=" + _keyField
               + ", min=" + _specialMin + ", max=" + _specialMax
               + ", query=" + _querySpec
               + ", sort=" + _sortSpec
               + ", fields=" + _fieldSpec
               + ", limit=" + _limit
               + ", skip=" + _skip
               + ", notimeout=" + _notimeout + '}';
    }
    public MongoInputSplit(){ } // No-arg constructor: required so Hadoop can instantiate the split before calling readFields().

    public MongoURI getMongoURI(){
        return _mongoURI;
    }

    public DBObject getQuerySpec(){
        return _querySpec;
    }

    public DBObject getFieldSpec(){
        return _fieldSpec;
    }

    public DBObject getSortSpec(){
        return _sortSpec;
    }

    public int getLimit(){
        return _limit;
    }

    public int getSkip(){
        return _skip;
    }

    /**
     * The field to use as the Mapper key.
     */
    public String getKeyField(){
        return _keyField;
    }
    @Override
    public boolean equals( Object o ){
        if ( this == o ) return true;
        if ( o == null || getClass() != o.getClass() ) return false;
        MongoInputSplit that = (MongoInputSplit) o;
        if ( _limit != that._limit ) return false;
        if ( _notimeout != that._notimeout ) return false;
        if ( _skip != that._skip ) return false;
        if ( _fieldSpec != null ? !_fieldSpec.equals( that._fieldSpec ) : that._fieldSpec != null ) return false;
        if ( _keyField != null ? !_keyField.equals( that._keyField ) : that._keyField != null ) return false;
        if ( _mongoURI != null ? !_mongoURI.toString().equals( that._mongoURI.toString() ) : that._mongoURI != null ) return false;
        if ( _querySpec != null ? !_querySpec.equals( that._querySpec ) : that._querySpec != null ) return false;
        if ( _sortSpec != null ? !_sortSpec.equals( that._sortSpec ) : that._sortSpec != null ) return false;
        return true;
    }

    @Override
    public int hashCode(){
        int result = _mongoURI != null ? _mongoURI.hashCode() : 0;
        result = 31 * result + ( _keyField != null ? _keyField.hashCode() : 0 );
        result = 31 * result + ( _querySpec != null ? _querySpec.hashCode() : 0 );
        result = 31 * result + ( _fieldSpec != null ? _fieldSpec.hashCode() : 0 );
        result = 31 * result + ( _sortSpec != null ? _sortSpec.hashCode() : 0 );
        result = 31 * result + ( _notimeout ? 1 : 0 );
        result = 31 * result + _limit;
        result = 31 * result + _skip;
        return result;
    }
    private MongoURI _mongoURI;
    private String _keyField = "_id";
    private Object _specialMin = null;
    private Object _specialMax = null;
    private DBObject _querySpec;
    private DBObject _fieldSpec;
    private DBObject _sortSpec;
    private boolean _notimeout;
    private int _limit = 0;
    private int _skip = 0;
    private long _length = -1; // Note: currently unused; getLength() returns a constant.
    private transient DBCursor _cursor;
    private transient BSONEncoder _bsonEncoder;
    private transient BSONDecoder _bsonDecoder;

    private static final Log log = LogFactory.getLog( MongoInputSplit.class );
}