com.google.cloud.genomics.dataflow.pipelines.ShardedBAMWriting Maven / Gradle / Ivy

Go to download
/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.pipelines;

import com.google.api.client.repackaged.com.google.common.base.Strings;
import com.google.api.services.storage.Storage;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.DelegateCoder;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.util.Transport;
import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.genomics.dataflow.coders.GenericJsonCoder;
import com.google.cloud.genomics.dataflow.readers.ReadStreamer;
import com.google.cloud.genomics.dataflow.readers.bam.HeaderInfo;
import com.google.cloud.genomics.dataflow.readers.bam.ReadBAMTransform;
import com.google.cloud.genomics.dataflow.readers.bam.ReaderOptions;
import com.google.cloud.genomics.dataflow.readers.bam.ShardingPolicy;
import com.google.cloud.genomics.dataflow.utils.GCSOptions;
import com.google.cloud.genomics.dataflow.utils.GCSOutputOptions;
import com.google.cloud.genomics.dataflow.utils.GenomicsOptions;
import com.google.cloud.genomics.dataflow.utils.ShardOptions;
import com.google.cloud.genomics.dataflow.utils.ShardReadsTransform;
import com.google.cloud.genomics.dataflow.writers.WriteBAMTransform;
import com.google.cloud.genomics.utils.Contig;
import com.google.cloud.genomics.utils.OfflineAuth;
import com.google.cloud.genomics.utils.ShardBoundary;
import com.google.cloud.genomics.utils.ShardUtils;
import com.google.cloud.genomics.utils.ShardUtils.SexChromosomeFilter;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.genomics.v1.Read;
import com.google.genomics.v1.StreamReadsRequest;

import htsjdk.samtools.ValidationStringency;

import java.io.IOException;
import java.security.GeneralSecurityException;
import java.util.Collections;
import java.util.List;
import java.util.logging.Logger;

/**
 * Demonstrates loading some Reads, sharding them, writing them to BAM file shards in parallel,
 * then combining the shards and writing an index for the combined BAM file.
 */
public class ShardedBAMWriting {

  static interface Options extends ShardOptions, ShardReadsTransform.Options,
    WriteBAMTransform.Options, GCSOutputOptions {
    @Description("The Google Cloud Storage path to the BAM file to get reads data from" + 
        "This or ReadGroupSetId must be set")
    @Default.String("")
    String getBAMFilePath();

    void setBAMFilePath(String filePath);
    
    @Description("An ID of the Google Genomics ReadGroupSets this " +
        "pipeline is working with. This or BAMFilePath must be set.")
    @Default.String("")
    String getReadGroupSetId();

    void setReadGroupSetId(String readGroupSetId);
    
    public static class Methods {
      public static void validateOptions(Options options) {
        GCSOutputOptions.Methods.validateOptions(options);
        Preconditions.checkArgument(
            !Strings.isNullOrEmpty(options.getReadGroupSetId()) ||
            !Strings.isNullOrEmpty(options.getBAMFilePath()), 
            "Either BAMFilePath or ReadGroupSetId must be specified");
      }
    }
  }

  private static final Logger LOG = Logger.getLogger(ShardedBAMWriting.class.getName());
  private static Options pipelineOptions;
  private static Pipeline pipeline;
  private static OfflineAuth auth;
  private static Iterable contigs;

  public static void main(String[] args) throws GeneralSecurityException, IOException {
    // Register the options so that they show up via --help
    PipelineOptionsFactory.register(Options.class);
    pipelineOptions = PipelineOptionsFactory.fromArgs(args)
        .withValidation().as(Options.class);
    // Option validation is not yet automatic, we make an explicit call here.
    Options.Methods.validateOptions(pipelineOptions);

    auth = GenomicsOptions.Methods.getGenomicsAuth(pipelineOptions);
    pipeline = Pipeline.create(pipelineOptions);
    // Register coders.
    pipeline.getCoderRegistry().setFallbackCoderProvider(GenericJsonCoder.PROVIDER);
    pipeline.getCoderRegistry().registerCoder(Contig.class, CONTIG_CODER);
    // Process options.
    contigs = pipelineOptions.isAllReferences() ? null : 
      Contig.parseContigsFromCommandLine(pipelineOptions.getReferences());
    
    
    // Get the reads and shard them.
    PCollection reads;
    HeaderInfo headerInfo;
    
    final String outputFileName = pipelineOptions.getOutput();
    final GcsPath destPath = GcsPath.fromUri(outputFileName);
    final GcsPath destIdxPath = GcsPath.fromUri(outputFileName + ".bai");
    final Storage.Objects storage = Transport.newStorageClient(
        pipelineOptions
          .as(GCSOptions.class))
          .build()
          .objects();
    LOG.info("Cleaning up output file " + destPath + " and " + destIdxPath);
    try {
      storage.delete(destPath.getBucket(), destPath.getObject()).execute();
    } catch (Exception ignored) {
      // Ignore errors
    }
    try {
      storage.delete(destIdxPath.getBucket(), destIdxPath.getObject()).execute();
    } catch (Exception ignored) {
      // Ignore errors
    }
    
    if (!Strings.isNullOrEmpty(pipelineOptions.getReadGroupSetId())) {
      headerInfo = HeaderInfo.getHeaderFromApi(pipelineOptions.getReadGroupSetId(), auth, contigs);
      reads = getReadsFromAPI();
    } else {
      headerInfo = HeaderInfo.getHeaderFromBAMFile(storage, pipelineOptions.getBAMFilePath(), contigs);
      reads = getReadsFromBAMFile();
    }
    
    final PCollection writtenFiles = WriteBAMTransform.write(
        reads, headerInfo, pipelineOptions.getOutput(), pipeline);
    
    writtenFiles
        .apply(
            TextIO.Write
              .to(pipelineOptions.getOutput() + "-result")
        .named("Write Output Result")
        .withoutSharding());
    pipeline.run();            
  }
      
  private static PCollection getReadsFromBAMFile() throws IOException {
    /**
     * Policy used to shard Reads.
     * By default we are using the default sharding supplied by the policy class.
     * If you want custom sharding, use the following pattern:
     *      *    BAM_FILE_READ_SHARDING_POLICY = new ShardingPolicy() {
     *     @Override
     *     public boolean shardBigEnough(BAMShard shard) {
     *       return shard.sizeInLoci() > 50000000;
     *     }
     *   };
     * 
     */
    final ShardingPolicy BAM_FILE_READ_SHARDING_POLICY = ShardingPolicy.BYTE_SIZE_POLICY;
    
    LOG.info("Sharded reading of " + pipelineOptions.getBAMFilePath());
    
    final ReaderOptions readerOptions = new ReaderOptions(
        ValidationStringency.DEFAULT_STRINGENCY,
        true);
   
    return ReadBAMTransform.getReadsFromBAMFilesSharded(pipeline,
        auth,
        contigs,
        readerOptions,
        pipelineOptions.getBAMFilePath(),
        BAM_FILE_READ_SHARDING_POLICY);
  }
  
  private static PCollection getReadsFromAPI() throws IOException {
    final String  rgsId = pipelineOptions.getReadGroupSetId();
    LOG.info("Sharded reading of ReadGroupSet: " + rgsId);
    
    List requests = Lists.newArrayList();

    if (pipelineOptions.isAllReferences()) {
      requests.addAll(ShardUtils.getReadRequests(rgsId, SexChromosomeFilter.INCLUDE_XY, 
          pipelineOptions.getBasesPerShard(), auth));
    } else {
      requests.addAll(
          ShardUtils.getReadRequests(Collections.singletonList(rgsId), 
              pipelineOptions.getReferences(), pipelineOptions.getBasesPerShard()));
    }
 
    LOG.info("Reading from the API with: " + requests.size() + " shards");
    
    PCollection reads = pipeline.apply(Create.of(requests))
        .apply(new ReadStreamer(auth, ShardBoundary.Requirement.STRICT, null));
    return reads;
  }
  
  static Coder CONTIG_CODER = DelegateCoder.of(
      StringUtf8Coder.of(),
      new DelegateCoder.CodingFunction() {
        @Override
        public String apply(Contig contig) throws Exception {
          return contig.toString();
        }
      },
      new DelegateCoder.CodingFunction() {
        @Override
        public Contig apply(String contigStr) throws Exception {
          return Contig.parseContigsFromCommandLine(contigStr).iterator().next();
        }
      });
}