All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.mongodb.hadoop.input.MongoInputSplit Maven / Gradle / Ivy

The newest version!
// MongoInputSplit.java
/*
 * Copyright 2010 10gen Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.mongodb.hadoop.input;

import com.mongodb.*;
import com.mongodb.hadoop.util.*;
import org.apache.commons.logging.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.bson.*;

import java.io.*;
import java.util.*;

public class MongoInputSplit extends InputSplit implements Writable, org.apache.hadoop.mapred.InputSplit {

    /**
     * Creates a split describing one slice of a MongoDB collection.
     *
     * @param inputURI   Mongo URI of the input collection
     * @param keyField   document field emitted as the Mapper key
     * @param query      query selecting the documents belonging to this split
     * @param fields     projection (field) specification; may be null
     * @param sort       sort specification; may be null
     * @param specialMin value passed as the cursor's {@code $min} bound, or null for none
     * @param specialMax value passed as the cursor's {@code $max} bound, or null for none
     * @param limit      maximum number of documents (0 means unlimited; not yet applied to the cursor)
     * @param skip       number of documents to skip (not yet applied to the cursor)
     * @param noTimeout  if true, the cursor is opened with {@code QUERYOPTION_NOTIMEOUT}
     */
    public MongoInputSplit( MongoURI inputURI,
                            String keyField,
                            DBObject query,
                            DBObject fields,
                            DBObject sort,
                            Object specialMin,
                            Object specialMax,
                            int limit,
                            int skip,
                            boolean noTimeout ){
        log.debug( "Creating a new MongoInputSplit for MongoURI '"
                   + inputURI + "', keyField: " + keyField + ", query: '" + query + "', fieldSpec: '" + fields
                   + "', sort: '" + sort + "', limit: " + limit + ", skip: " + skip + " noTimeout? " + noTimeout + "." );

        _mongoURI = inputURI;
        _keyField = keyField;
        _querySpec = query;
        _fieldSpec = fields;
        _sortSpec = sort;
        _limit = limit;
        _skip = skip;
        _notimeout = noTimeout;
        _specialMin = specialMin;
        _specialMax = specialMax;
        // NOTE(review): getCursor() connects to MongoDB at split-construction
        // time (typically on the job client, before any task runs) — confirm
        // the eager connection is intended rather than lazily deferring to the
        // record reader.
        getCursor();
        getBSONDecoder();
        getBSONEncoder();
    }

    /** No-arg constructor required by the Writable contract for deserialization. */
    public MongoInputSplit(){ }

    /**
     * Supposed to be the size of the split in bytes, but the byte size of a
     * Mongo split is not cheaply knowable, so a constant sentinel is returned.
     * Callers must not use this value for scheduling weight.
     *
     * @return always {@code Long.MAX_VALUE}
     */
    @Override
    public long getLength(){
        return Long.MAX_VALUE;
    }

    /**
     * @return the hosts from the input URI, used by Hadoop as locality hints
     */
    @Override
    public String[] getLocations(){
        // Fetch the host list once instead of twice.
        List<String> hosts = _mongoURI.getHosts();
        return hosts.toArray( new String[hosts.size()] );
    }

    /**
     * Serializes the split as a single BSON document containing every field
     * needed to reconstruct it in {@link #readFields(DataInput)}.
     */
    public void write( final DataOutput out ) throws IOException{
        BSONEncoder enc = getBSONEncoder();

        BSONObject spec = BasicDBObjectBuilder.start().
                                               add( "uri", _mongoURI.toString() ).
                                               add( "key", _keyField ).
                                               add( "query", _querySpec ).
                                               add( "field", _fieldSpec ).
                                               add( "sort", _sortSpec ).
                                               add( "limit", _limit ).
                                               add( "skip", _skip ).
                                               add( "specialMin", _specialMin ).
                                               add( "specialMax", _specialMax ).
                                               add( "notimeout", _notimeout ).get();

        byte[] buf = enc.encode( spec );

        out.write( buf );
    }

    /**
     * Deserializes the split from the BSON document produced by
     * {@link #write(DataOutput)}, then eagerly opens the cursor.
     */
    public void readFields( DataInput in ) throws IOException{
        BSONDecoder dec = getBSONDecoder();
        BSONCallback cb = new BasicBSONCallback();
        BSONObject spec;
        // Read the 4-byte little-endian BSON length header first; per the BSON
        // spec this length counts itself, so the whole document is exactly
        // dataLen bytes.
        byte[] l = new byte[4];
        try {
            in.readFully( l );
            int dataLen = org.bson.io.Bits.readInt( l );
            if ( log.isDebugEnabled() ) log.debug( "*** Expected DataLen: " + dataLen );
            // FIX: the buffer is dataLen bytes total (header + body); the
            // previous dataLen + 4 over-allocated by the header size.
            byte[] data = new byte[dataLen];
            System.arraycopy( l, 0, data, 0, 4 );
            in.readFully( data, 4, dataLen - 4 );
            dec.decode( data, cb );
            spec = (BSONObject) cb.get();
            if ( log.isTraceEnabled() ) log.trace( "Decoded a BSON Object: " + spec );
        }
        catch ( Exception e ) {
            /* If we can't read another length it's not an error, just return quietly. */
            // TODO - Figure out how to gracefully mark this as an empty
            log.info( "No Length Header available." + e );
            spec = new BasicDBObject();
        }

        _mongoURI = new MongoURI((String) spec.get( "uri" ));
        _keyField = (String) spec.get( "key" );
        _querySpec = new BasicDBObject( ((BSONObject) spec.get( "query" )).toMap() );
        _fieldSpec = new BasicDBObject( ((BSONObject) spec.get( "field" )).toMap() ) ;
        _sortSpec = new BasicDBObject( ((BSONObject) spec.get( "sort" )).toMap() );
        _specialMin = spec.get("specialMin");
        _specialMax = spec.get("specialMax");
        _limit = (Integer) spec.get( "limit" );
        _skip = (Integer) spec.get( "skip" );
        _notimeout = (Boolean) spec.get( "notimeout" );
        getCursor();
        log.info( "Deserialized MongoInputSplit ... { length = " + getLength() + ", locations = "
                   + Arrays.toString( getLocations() ) + ", keyField = " + _keyField + ", query = " + _querySpec
                   + ", fields = " + _fieldSpec + ", sort = " + _sortSpec + ", limit = " + _limit + ", skip = "
                   + _skip + ", noTimeout = " + _notimeout + ", specialMin = " + _specialMin
                   + ", specialMax = " + _specialMax + "}" );
    }

    /**
     * Lazily opens (and caches) a cursor over this split's documents with the
     * query, projection, sort, and $min/$max bounds already applied.
     *
     * @return the cached cursor; repeated calls return the same instance
     */
    public DBCursor getCursor(){
        // Return the cursor with the split's query, etc. already slotted in for
        // them.
        // todo - support limit/skip
        if ( _cursor == null ){
            log.debug("reading data from " + _mongoURI);
            _cursor = MongoConfigUtil.getCollection( _mongoURI ).find( _querySpec, _fieldSpec ).sort( _sortSpec );
            if (_notimeout) _cursor.setOptions( Bytes.QUERYOPTION_NOTIMEOUT );
            if (_specialMin != null) _cursor.addSpecial("$min", this._specialMin);
            if (_specialMax != null) _cursor.addSpecial("$max", this._specialMax);
            // Allow reads from secondaries.
            _cursor.slaveOk();
        }

        return _cursor;
    }

    /** @return the lazily-created, cached BSON encoder used by {@link #write}. */
    BSONEncoder getBSONEncoder(){
        if (_bsonEncoder == null)
            _bsonEncoder = new BasicBSONEncoder();
        return _bsonEncoder;
    }

    /** @return the lazily-created, cached BSON decoder used by {@link #readFields}. */
    BSONDecoder getBSONDecoder(){
        if (_bsonDecoder == null)
            _bsonDecoder = new BasicBSONDecoder();
        return _bsonDecoder;
    }

    @Override
    public String toString(){
        return "MongoInputSplit{URI=" + _mongoURI.toString()
             + ", keyField=" + _keyField
             + ", min=" + _specialMin + ", max=" + _specialMax 
             + ", query=" + _querySpec
             + ", sort=" + _sortSpec
             + ", fields=" + _fieldSpec
             + ", limit=" + _limit
             + ", skip=" + _skip
             + ", notimeout=" + _notimeout + '}' ;
    }

    public MongoURI getMongoURI(){
        return _mongoURI;
    }

    public DBObject getQuerySpec(){
        return _querySpec;
    }

    public DBObject getFieldSpec(){
        return _fieldSpec;
    }

    public DBObject getSortSpec(){
        return _sortSpec;
    }

    public int getLimit(){
        return _limit;
    }

    public int getSkip(){
        return _skip;
    }

    /**
     * The field to use as the Mapper Key
     */
    public String getKeyField(){
        return _keyField;
    }

    @Override
    public boolean equals( Object o ){
        if ( this == o ) return true;
        if ( o == null || getClass() != o.getClass() ) return false;

        MongoInputSplit that = (MongoInputSplit) o;

        if ( _limit != that._limit ) return false;
        if ( _notimeout != that._notimeout ) return false;
        if ( _skip != that._skip ) return false;
        if ( _fieldSpec != null ? !_fieldSpec.equals( that._fieldSpec ) : that._fieldSpec != null ) return false;
        if ( _keyField != null ? !_keyField.equals( that._keyField ) : that._keyField != null ) return false;
        if ( _mongoURI != null ? !_mongoURI.toString().equals( that._mongoURI.toString() ) : that._mongoURI != null ) return false;
        if ( _querySpec != null ? !_querySpec.equals( that._querySpec ) : that._querySpec != null ) return false;
        if ( _sortSpec != null ? !_sortSpec.equals( that._sortSpec ) : that._sortSpec != null ) return false;
        // FIX: the $min/$max bounds are serialized state that distinguishes
        // otherwise-identical splits over different key ranges, so they must
        // participate in equality.
        if ( _specialMin != null ? !_specialMin.equals( that._specialMin ) : that._specialMin != null ) return false;
        if ( _specialMax != null ? !_specialMax.equals( that._specialMax ) : that._specialMax != null ) return false;

        return true;
    }

    @Override
    public int hashCode(){
        // FIX: hash the URI's string form so the hash agrees with equals(),
        // which compares _mongoURI.toString() rather than the MongoURI objects.
        int result = _mongoURI != null ? _mongoURI.toString().hashCode() : 0;
        result = 31 * result + ( _keyField != null ? _keyField.hashCode() : 0 );
        result = 31 * result + ( _querySpec != null ? _querySpec.hashCode() : 0 );
        result = 31 * result + ( _fieldSpec != null ? _fieldSpec.hashCode() : 0 );
        result = 31 * result + ( _sortSpec != null ? _sortSpec.hashCode() : 0 );
        // Keep in sync with equals(): include the $min/$max bounds.
        result = 31 * result + ( _specialMin != null ? _specialMin.hashCode() : 0 );
        result = 31 * result + ( _specialMax != null ? _specialMax.hashCode() : 0 );
        result = 31 * result + ( _notimeout ? 1 : 0 );
        result = 31 * result + _limit;
        result = 31 * result + _skip;
        return result;
    }

    private MongoURI _mongoURI;
    private String _keyField = "_id";
    private Object _specialMin = null;      // cursor $min bound, or null
    private Object _specialMax = null;      // cursor $max bound, or null
    private DBObject _querySpec;
    private DBObject _fieldSpec;
    private DBObject _sortSpec;
    private boolean _notimeout;
    private int _limit = 0;
    private int _skip = 0;
    private long _length = -1;              // reserved; getLength() does not use it yet
    private transient DBCursor _cursor;     // transient: rebuilt lazily after deserialization
    private transient BSONEncoder _bsonEncoder;
    private transient BSONDecoder _bsonDecoder;

    private static final Log log = LogFactory.getLog( MongoInputSplit.class );

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy