com.google.cloud.genomics.dataflow.pipelines.ShardedBAMWriting

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.pipelines;

import com.google.api.client.repackaged.com.google.common.base.Strings;
import com.google.api.services.storage.Storage;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.DelegateCoder;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.util.Transport;
import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.genomics.dataflow.coders.GenericJsonCoder;
import com.google.cloud.genomics.dataflow.readers.ReadStreamer;
import com.google.cloud.genomics.dataflow.readers.bam.HeaderInfo;
import com.google.cloud.genomics.dataflow.readers.bam.ReadBAMTransform;
import com.google.cloud.genomics.dataflow.readers.bam.ReaderOptions;
import com.google.cloud.genomics.dataflow.readers.bam.ShardingPolicy;
import com.google.cloud.genomics.dataflow.utils.GCSOptions;
import com.google.cloud.genomics.dataflow.utils.GCSOutputOptions;
import com.google.cloud.genomics.dataflow.utils.GenomicsOptions;
import com.google.cloud.genomics.dataflow.utils.ShardOptions;
import com.google.cloud.genomics.dataflow.utils.ShardReadsTransform;
import com.google.cloud.genomics.dataflow.writers.WriteBAMTransform;
import com.google.cloud.genomics.utils.Contig;
import com.google.cloud.genomics.utils.OfflineAuth;
import com.google.cloud.genomics.utils.ShardBoundary;
import com.google.cloud.genomics.utils.ShardUtils;
import com.google.cloud.genomics.utils.ShardUtils.SexChromosomeFilter;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.genomics.v1.Read;
import com.google.genomics.v1.StreamReadsRequest;

import htsjdk.samtools.ValidationStringency;

import java.io.IOException;
import java.security.GeneralSecurityException;
import java.util.Collections;
import java.util.List;
import java.util.logging.Logger;

/**
 * Demonstrates loading some Reads, sharding them, writing them to BAM file shards in parallel,
 * then combining the shards and writing an index for the combined BAM file.
 */
public class ShardedBAMWriting {

  static interface Options extends ShardOptions, ShardReadsTransform.Options,
    WriteBAMTransform.Options, GCSOutputOptions {
    @Description("The Google Cloud Storage path to the BAM file to get reads data from" + 
        "This or ReadGroupSetId must be set")
    @Default.String("")
    String getBAMFilePath();

    void setBAMFilePath(String filePath);
    
    @Description("An ID of the Google Genomics ReadGroupSets this " +
        "pipeline is working with. This or BAMFilePath must be set.")
    @Default.String("")
    String getReadGroupSetId();

    void setReadGroupSetId(String readGroupSetId);
    
    public static class Methods {
      public static void validateOptions(Options options) {
        GCSOutputOptions.Methods.validateOptions(options);
        Preconditions.checkArgument(
            !Strings.isNullOrEmpty(options.getReadGroupSetId()) ||
            !Strings.isNullOrEmpty(options.getBAMFilePath()), 
            "Either BAMFilePath or ReadGroupSetId must be specified");
      }
    }
  }

  private static final Logger LOG = Logger.getLogger(ShardedBAMWriting.class.getName());
  private static Options pipelineOptions;
  private static Pipeline pipeline;
  private static OfflineAuth auth;
  private static Iterable<Contig> contigs;

  public static void main(String[] args) throws GeneralSecurityException, IOException {
    // Register the options so that they show up via --help
    PipelineOptionsFactory.register(Options.class);
    pipelineOptions = PipelineOptionsFactory.fromArgs(args)
        .withValidation().as(Options.class);
    // Option validation is not yet automatic, we make an explicit call here.
    Options.Methods.validateOptions(pipelineOptions);

    auth = GenomicsOptions.Methods.getGenomicsAuth(pipelineOptions);
    pipeline = Pipeline.create(pipelineOptions);
    // Register coders.
    pipeline.getCoderRegistry().setFallbackCoderProvider(GenericJsonCoder.PROVIDER);
    pipeline.getCoderRegistry().registerCoder(Contig.class, CONTIG_CODER);
    // Process options.
    contigs = pipelineOptions.isAllReferences() ? null : 
      Contig.parseContigsFromCommandLine(pipelineOptions.getReferences());
    
    
    // Get the reads and shard them.
    PCollection<Read> reads;
    HeaderInfo headerInfo;
    
    final String outputFileName = pipelineOptions.getOutput();
    final GcsPath destPath = GcsPath.fromUri(outputFileName);
    final GcsPath destIdxPath = GcsPath.fromUri(outputFileName + ".bai");
    final Storage.Objects storage = Transport.newStorageClient(
        pipelineOptions
          .as(GCSOptions.class))
          .build()
          .objects();
    LOG.info("Cleaning up output file " + destPath + " and " + destIdxPath);
    try {
      storage.delete(destPath.getBucket(), destPath.getObject()).execute();
    } catch (Exception ignored) {
      // Ignore errors
    }
    try {
      storage.delete(destIdxPath.getBucket(), destIdxPath.getObject()).execute();
    } catch (Exception ignored) {
      // Ignore errors
    }
    
    if (!Strings.isNullOrEmpty(pipelineOptions.getReadGroupSetId())) {
      headerInfo = HeaderInfo.getHeaderFromApi(pipelineOptions.getReadGroupSetId(), auth, contigs);
      reads = getReadsFromAPI();
    } else {
      headerInfo = HeaderInfo.getHeaderFromBAMFile(storage, pipelineOptions.getBAMFilePath(), contigs);
      reads = getReadsFromBAMFile();
    }
    
    final PCollection<String> writtenFiles = WriteBAMTransform.write(
        reads, headerInfo, pipelineOptions.getOutput(), pipeline);
    
    writtenFiles
        .apply(
            TextIO.Write
                .to(pipelineOptions.getOutput() + "-result")
                .named("Write Output Result")
                .withoutSharding());
    pipeline.run();            
  }
      
  private static PCollection<Read> getReadsFromBAMFile() throws IOException {
    /**
     * Policy used to shard Reads.
     * By default we are using the default sharding supplied by the policy class.
     * If you want custom sharding, use the following pattern:
     * 
     *    BAM_FILE_READ_SHARDING_POLICY = new ShardingPolicy() {
     *     @Override
     *     public boolean shardBigEnough(BAMShard shard) {
     *       return shard.sizeInLoci() > 50000000;
     *     }
     *   };
     * 
     */
    final ShardingPolicy BAM_FILE_READ_SHARDING_POLICY = ShardingPolicy.BYTE_SIZE_POLICY;

    LOG.info("Sharded reading of " + pipelineOptions.getBAMFilePath());

    final ReaderOptions readerOptions = new ReaderOptions(
        ValidationStringency.DEFAULT_STRINGENCY,
        true);

    return ReadBAMTransform.getReadsFromBAMFilesSharded(pipeline,
        auth,
        contigs,
        readerOptions,
        pipelineOptions.getBAMFilePath(),
        BAM_FILE_READ_SHARDING_POLICY);
  }

  private static PCollection<Read> getReadsFromAPI() throws IOException {
    final String rgsId = pipelineOptions.getReadGroupSetId();
    LOG.info("Sharded reading of ReadGroupSet: " + rgsId);

    // Build one StreamReadsRequest per shard, either across all references
    // or only for the references given on the command line.
    List<StreamReadsRequest> requests = Lists.newArrayList();
    if (pipelineOptions.isAllReferences()) {
      requests.addAll(ShardUtils.getReadRequests(rgsId,
          SexChromosomeFilter.INCLUDE_XY, pipelineOptions.getBasesPerShard(), auth));
    } else {
      requests.addAll(
          ShardUtils.getReadRequests(Collections.singletonList(rgsId),
              pipelineOptions.getReferences(), pipelineOptions.getBasesPerShard()));
    }
    LOG.info("Reading from the API with: " + requests.size() + " shards");

    // Stream the reads for each shard request from the Genomics API.
    PCollection<Read> reads = pipeline.apply(Create.of(requests))
        .apply(new ReadStreamer(auth, ShardBoundary.Requirement.STRICT, null));
    return reads;
  }

  // Encodes a Contig as its string representation and parses it back on decode.
  static Coder<Contig> CONTIG_CODER = DelegateCoder.of(
      StringUtf8Coder.of(),
      new DelegateCoder.CodingFunction<Contig, String>() {
        @Override
        public String apply(Contig contig) throws Exception {
          return contig.toString();
        }
      },
      new DelegateCoder.CodingFunction<String, Contig>() {
        @Override
        public Contig apply(String contigStr) throws Exception {
          return Contig.parseContigsFromCommandLine(contigStr).iterator().next();
        }
      });
}
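
A minimal invocation sketch follows, assuming the flag names that PipelineOptionsFactory derives from the Options getters above plus the standard Dataflow flags for project and staging location; the project id, bucket paths, and reference range are placeholder values, so confirm the exact flag names with --help before running.

// Hypothetical usage sketch. Flag names are assumed from the Options getters
// above and from standard Dataflow options; all values are placeholders.
public class ShardedBAMWritingExample {
  public static void main(String[] args) throws Exception {
    ShardedBAMWriting.main(new String[] {
        "--project=my-gcp-project",                        // assumed Cloud project id
        "--stagingLocation=gs://my-bucket/staging",        // assumed staging bucket
        "--BAMFilePath=gs://my-bucket/input/sample.bam",   // or --readGroupSetId=<id>
        "--references=chr20:0:63025520",                   // assumed contig:start:end format
        "--output=gs://my-bucket/output/sample.bam"
    });
  }
}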



