All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.cloud.genomics.dataflow.utils.BreakFusionTransform Maven / Gradle / Ivy

There is a newer version: v1-0.8
Show newest version
/*
 * Copyright (C) 2016 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.utils;

import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.Keys;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;

/*
 * Breaks Dataflow fusion by doing GroupByKey/Ungroup that forces materialization of the data,
 * thus preventing Dataflow from fusing steps before and after this transform.
 * This is useful to insert in cases where a series of transforms deal with very small sets of data
 * that act as descriptors of very heavy workloads in subsequent steps (e.g. a collection of file names
 * where each file takes a long time to process).
 * In this case Dataflow might over-eagerly fuse steps dealing with small datasets with the "heavy" 
 * processing steps, which will result in heavy steps being executed on a single worker.
 * If you insert a fusion break transform in between then Dataflow will be able to spin up many 
 * parallel workers to handle the heavy processing.
 * @see https://cloud.google.com/dataflow/service/dataflow-service-desc#Optimization
 * Typical usage:
 *  ...
 *  PCollection<String> fileNames = pipeline.apply(...);
 *  fileNames.apply(new BreakFusionTransform<String>())
 *      .apply(new HeavyFileProcessingTransform())
 *      .....
 */
public class BreakFusionTransform extends PTransform, PCollection> {
  
  public BreakFusionTransform() {
    super("Break Fusion Transform");

  }
  
  @Override
  public PCollection apply(PCollection input) {
    return input
        .apply(
            ParDo.named("Break fusion mapper")
              .of(new DummyMapFn()))  
        .apply(GroupByKey.create())
        .apply(Keys.create());  
  }
  
  
   static class DummyMapFn extends DoFn> {  
    private static final int DUMMY_VALUE = 42;

    @Override
    public void processElement(DoFn>.ProcessContext c) throws Exception {
      c.output( KV.of(c.element(), DUMMY_VALUE));
    }
   } 
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy