cascading.flow.hadoop.HadoopCoGroupClosure Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cascading-hadoop2-common Show documentation
An API for data management, analytics, and machine learning on parallel computing clusters.
There is a newer version: 4.1.0
Show newest version
/*
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop;

import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.NoSuchElementException;

import cascading.flow.FlowProcess;
import cascading.flow.hadoop.util.FalseCollection;
import cascading.provider.FactoryLoader;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.Tuples;
import cascading.tuple.collect.Spillable;
import cascading.tuple.collect.SpillableTupleList;
import cascading.tuple.collect.TupleCollectionFactory;
import cascading.tuple.hadoop.collect.HadoopTupleCollectionFactory;
import cascading.tuple.io.IndexTuple;
import cascading.tuple.util.TupleViews;
import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.tuple.collect.TupleCollectionFactory.TUPLE_COLLECTION_FACTORY;

/** Class CoGroupClosure is used internally to represent co-grouping results of multiple tuple streams. */
public class HadoopCoGroupClosure extends HadoopGroupByClosure
  {
  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger( HadoopCoGroupClosure.class );

  public enum Spill
    {
      Num_Spills_Written, Num_Spills_Read, Num_Tuples_Spilled, Duration_Millis_Written
    }

  private class SpillListener implements Spillable.SpillListener
    {
    private final FlowProcess flowProcess;
    private final Fields joinField;

    public SpillListener( FlowProcess flowProcess, Fields joinField )
      {
      this.flowProcess = flowProcess;
      this.joinField = joinField;
      }

    @Override
    public void notifyWriteSpillBegin( Spillable spillable, int spillSize, String spillReason )
      {
      int numFiles = spillable.spillCount();

      if( numFiles % 10 == 0 )
        {
        LOG.info( "spilling group: {}, on grouping: {}, num times: {}, with reason: {}",
          new Object[]{joinField.printVerbose(), spillable.getGrouping().print(), numFiles + 1, spillReason} );

        Runtime runtime = Runtime.getRuntime();
        long freeMem = runtime.freeMemory() / 1024 / 1024;
        long maxMem = runtime.maxMemory() / 1024 / 1024;
        long totalMem = runtime.totalMemory() / 1024 / 1024;

        LOG.info( "mem on spill (mb), free: " + freeMem + ", total: " + totalMem + ", max: " + maxMem );
        }

      LOG.info( "spilling {} tuples in list to file number {}", spillSize, numFiles + 1 );

      flowProcess.increment( Spill.Num_Spills_Written, 1 );
      flowProcess.increment( Spill.Num_Tuples_Spilled, spillSize );
      }

    @Override
    public void notifyWriteSpillEnd( SpillableTupleList spillableTupleList, long duration )
      {
      flowProcess.increment( Spill.Duration_Millis_Written, duration );
      }

    @Override
    public void notifyReadSpillBegin( Spillable spillable )
      {
      flowProcess.increment( Spill.Num_Spills_Read, 1 );
      }
    }

  /** Field groups */
  protected Collection[] collections;
  protected final int numSelfJoins;

  private Tuple[] joinedTuplesArray;
  private final Tuple emptyTuple;
  private TupleBuilder joinedBuilder;
  private Tuple joinedTuple = new Tuple(); // is discarded

  private final TupleCollectionFactory tupleCollectionFactory;

  public HadoopCoGroupClosure( FlowProcess flowProcess, int numSelfJoins, Fields[] groupingFields, Fields[] valueFields )
    {
    super( flowProcess, groupingFields, valueFields );
    this.numSelfJoins = numSelfJoins;

    this.emptyTuple = Tuple.size( groupingFields[ 0 ].size() );

    FactoryLoader loader = FactoryLoader.getInstance();

    this.tupleCollectionFactory = loader.loadFactoryFrom( flowProcess, TUPLE_COLLECTION_FACTORY, HadoopTupleCollectionFactory.class );

    initLists();
    }

  @Override
  public int size()
    {
    return Math.max( joinFields.length, numSelfJoins + 1 );
    }

  @Override
  public Iterator getIterator( int pos )
    {
    if( pos < 0 || pos >= collections.length )
      throw new IllegalArgumentException( "invalid group position: " + pos );

    return makeIterator( pos, collections[ pos ].iterator() );
    }

  @Override
  public Tuple getGroupTuple( Tuple keysTuple )
    {
    Tuples.asModifiable( joinedTuple );

    for( int i = 0; i < collections.length; i++ )
      joinedTuplesArray[ i ] = collections[ i ].isEmpty() ? emptyTuple : keysTuple;

    joinedTuple = joinedBuilder.makeResult( joinedTuplesArray );

    return joinedTuple;
    }

  @Override
  public boolean isEmpty( int pos )
    {
    return collections[ pos ].isEmpty();
    }

  @Override
  public void reset( Tuple grouping, Iterator[] values )
    {
    super.reset( grouping, values );

    build();
    }

  protected void build()
    {
    clearGroups();

    if( collections[ 0 ] instanceof FalseCollection ) // force reset on FalseCollection
      ( (FalseCollection) collections[ 0 ] ).reset( null );

    while( values[ 0 ].hasNext() )
      {
      IndexTuple current = (IndexTuple) values[ 0 ].next();
      int pos = current.getIndex();

      // if this is the first (lhs) co-group, just use values iterator
      // we are guaranteed all the remainder tuples in the iterator are from pos == 0
      if( numSelfJoins == 0 && pos == 0 )
        {
        ( (FalseCollection) collections[ 0 ] ).reset( createIterator( current, values[ 0 ] ) );
        break;
        }

      collections[ pos ].add( current.getTuple() ); // get the value tuple for this cogroup
      }
    }

  protected void clearGroups()
    {
    for( Collection collection : collections )
      {
      collection.clear();

      if( collection instanceof Spillable )
        ( (Spillable) collection ).setGrouping( grouping );
      }
    }

  protected void initLists()
    {
    collections = new Collection[ size() ];

    // handle self joins
    if( numSelfJoins != 0 )
      {
      Arrays.fill( collections, createTupleCollection( joinFields[ 0 ] ) );
      }
    else
      {
      collections[ 0 ] = new FalseCollection(); // we iterate this only once per grouping

      for( int i = 1; i < joinFields.length; i++ )
        collections[ i ] = createTupleCollection( joinFields[ i ] );
      }

    joinedBuilder = makeJoinedBuilder( joinFields );
    joinedTuplesArray = new Tuple[ collections.length ];
    }

  static interface TupleBuilder
    {
    Tuple makeResult( Tuple[] tuples );
    }

  private TupleBuilder makeJoinedBuilder( final Fields[] joinFields )
    {
    final Fields[] fields = isSelfJoin() ? new Fields[ size() ] : joinFields;

    if( isSelfJoin() )
      Arrays.fill( fields, 0, fields.length, joinFields[ 0 ] );

    return new TupleBuilder()
      {
      Tuple result = TupleViews.createComposite( fields );

      @Override
      public Tuple makeResult( Tuple[] tuples )
        {
        return TupleViews.reset( result, tuples );
        }
      };
    }

  protected Collection createTupleCollection( Fields joinField )
    {
    Collection collection = tupleCollectionFactory.create( flowProcess );

    if( collection instanceof Spillable )
      ( (Spillable) collection ).setSpillListener( createListener( joinField ) );

    return collection;
    }

  private Spillable.SpillListener createListener( final Fields joinField )
    {
    return new SpillListener( flowProcess, joinField );
    }

  public Iterator createIterator( final IndexTuple current, final Iterator values )
    {
    return new Iterator()
      {
      IndexTuple value = current;

      @Override
      public boolean hasNext()
        {
        return value != null;
        }

      @Override
      public Tuple next()
        {
        if( value == null && !values.hasNext() )
          throw new NoSuchElementException();

        Tuple result = value.getTuple();

        if( values.hasNext() )
          value = values.next();
        else
          value = null;

        return result;
        }

      @Override
      public void remove()
        {
        // unsupported
        }
      };
    }
  }