com.google.cloud.genomics.dataflow.readers.ReadGroupStreamer Maven / Gradle / Ivy
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.genomics.dataflow.readers;
import java.util.Collections;
import java.util.List;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.genomics.dataflow.utils.ShardOptions;
import com.google.cloud.genomics.utils.OfflineAuth;
import com.google.cloud.genomics.utils.ShardBoundary;
import com.google.cloud.genomics.utils.ShardUtils;
import com.google.cloud.genomics.utils.ShardUtils.SexChromosomeFilter;
import com.google.genomics.v1.Read;
import com.google.genomics.v1.StreamReadsRequest;
/**
* PTransform from a potentially large number of ReadGroupSets to streaming reads via gRPC.
*
* The sharding occurs as a stage of the pipeline, unlike the ReadStreamer PTransform
* where the shards are passed in. This is useful when the number of shards may
* potentially be larger than Dataflow's pipeline creation request size limit.
*/
public class ReadGroupStreamer extends PTransform, PCollection> {
protected final OfflineAuth auth;
protected final ShardBoundary.Requirement shardBoundary;
protected final String fields;
protected final SexChromosomeFilter sexChromosomeFilter;
/**
* Create a streamer that can appropriately shard a potentially large number of ReadGroupSets.
*
* @param auth The OfflineAuth to use for the request.
* @param shardBoundary The shard boundary semantics to enforce.
* @param fields Which fields to include in a partial response or null for all.
* @param sexChromosomeFilter An enum value indicating how sex chromosomes should be
* handled in the result.
*/
public ReadGroupStreamer(OfflineAuth auth, ShardBoundary.Requirement shardBoundary,
String fields, SexChromosomeFilter sexChromosomeFilter) {
this.auth = auth;
this.shardBoundary = shardBoundary;
this.fields = fields;
this.sexChromosomeFilter = sexChromosomeFilter;
}
@Override
public PCollection apply(PCollection readGroupSetIds) {
return readGroupSetIds.apply(ParDo.of(new CreateReadRequests()))
// Force a shuffle operation here to break the fusion of these steps.
// By breaking fusion, the work will be distributed to all available workers.
.apply(GroupByKey.create())
.apply(ParDo.of(new ConvergeStreamReadsRequestList()))
.apply(new ReadStreamer(auth, ShardBoundary.Requirement.STRICT, null));
}
private class CreateReadRequests extends DoFn> {
@Override
public void processElement(DoFn>.ProcessContext c)
throws Exception {
ShardOptions options = c.getPipelineOptions().as(ShardOptions.class);
String readGroupSetId = c.element();
List requests = null;
if (options.isAllReferences()) {
requests = ShardUtils.getReadRequests(readGroupSetId, sexChromosomeFilter, options.getBasesPerShard(), auth);
} else {
requests =
ShardUtils.getReadRequests(Collections.singletonList(readGroupSetId), options.getReferences(), options.getBasesPerShard());
}
for(StreamReadsRequest request : requests) {
c.output(KV.of(request.hashCode(), request));
}
}
}
private class ConvergeStreamReadsRequestList extends DoFn>, StreamReadsRequest> {
@Override
public void processElement(ProcessContext c) {
for (StreamReadsRequest r : c.element().getValue()) {
c.output(r);
}
}
}
}