com.google.cloud.genomics.dataflow.pipelines.AnnotateVariants Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of google-genomics-dataflow Show documentation
Google Genomics Dataflow pipelines.
There is a newer version: v1-0.8
Show newest version
/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.pipelines;

import htsjdk.samtools.util.IntervalTree;
import htsjdk.samtools.util.IntervalTree.Node;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;

import com.google.api.client.util.Strings;
import com.google.api.services.genomics.Genomics;
import com.google.api.services.genomics.model.Annotation;
import com.google.api.services.genomics.model.AnnotationSet;
import com.google.api.services.genomics.model.ListBasesResponse;
import com.google.api.services.genomics.model.QueryRange;
import com.google.api.services.genomics.model.RangePosition;
import com.google.api.services.genomics.model.SearchAnnotationsRequest;
import com.google.api.services.genomics.model.SearchVariantsRequest;
import com.google.api.services.genomics.model.Variant;
import com.google.api.services.genomics.model.VariantAnnotation;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.genomics.dataflow.coders.GenericJsonCoder;
import com.google.cloud.genomics.dataflow.utils.AnnotationUtils;
import com.google.cloud.genomics.dataflow.utils.AnnotationUtils.VariantEffect;
import com.google.cloud.genomics.dataflow.utils.GCSOutputOptions;
import com.google.cloud.genomics.dataflow.utils.GenomicsOptions;
import com.google.cloud.genomics.dataflow.utils.ShardOptions;
import com.google.cloud.genomics.utils.GenomicsFactory;
import com.google.cloud.genomics.utils.OfflineAuth;
import com.google.cloud.genomics.utils.Paginator;
import com.google.cloud.genomics.utils.ShardBoundary;
import com.google.cloud.genomics.utils.ShardUtils;
import com.google.cloud.genomics.utils.VariantUtils;
import com.google.common.base.Joiner;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Maps;
import com.google.common.collect.Range;

/**
 * Demonstrates a simple variant annotation program which takes
 * {@code VariantSet}s and {@code AnnotationSet}s as input and emits
 * {@code VariantAnnotation}s. This program is intended to serve as an example
 * of how a variant annotation program can be structured to run in parallel,
 * operating over Google Genomics input sources; the current output has limited
 * biological significance.
 *
 * Currently only annotates SNPs and uses the following two methods:
 * 
 * Determines the effect of the provided variants on the provided
 *   transcripts, if any. A result is only emitted for a subset of cases where
 *   the variant appears to cause a coding change in the transcript.
 * 
Performs an exact match join on the provided existing variant
 *   annotations, if any.
 * 
 *
 * See http://googlegenomics.readthedocs.org/en/latest/use_cases/annotate_variants/google_genomics_annotation.html
 * for running instructions.
 */
public final class AnnotateVariants extends DoFn> {
  
  public static interface Options extends ShardOptions, GCSOutputOptions {

    @Description("The ID of the Google Genomics variant set this pipeline is accessing. "
        + "Defaults to 1000 Genomes.")
    @Default.String("10473108253681171589")
    String getVariantSetId();

    void setVariantSetId(String variantSetId);

    @Description("The IDs of the Google Genomics call sets this pipeline is working with, comma "
        + "delimited.Defaults to 1000 Genomes HG00261.")
    @Default.String("10473108253681171589-0")
    String getCallSetIds();
    void setCallSetIds(String callSetIds);

    @Description("The IDs of the Google Genomics transcript sets this pipeline is working with, "
        + "comma delimited. Defaults to UCSC refGene (hg19).")
    @Default.String("CIjfoPXj9LqPlAEQ5vnql4KewYuSAQ")
    String getTranscriptSetIds();
    void setTranscriptSetIds(String transcriptSetIds);

    @Description("The IDs of the Google Genomics variant annotation sets this pipeline is working "
        + "with, comma delimited. Defaults to ClinVar (GRCh37).")
    @Default.String("CILSqfjtlY6tHxC0nNH-4cu-xlQ")
    String getVariantAnnotationSetIds();
    void setVariantAnnotationSetIds(String variantAnnotationSetIds);

    public static class Methods {
      public static void validateOptions(Options options) {
        GCSOutputOptions.Methods.validateOptions(options);
      }
    }

  }

  private static final Logger LOG = Logger.getLogger(AnnotateVariants.class.getName());
  private static final int VARIANTS_PAGE_SIZE = 5000;
  private static final String VARIANT_FIELDS
      = "nextPageToken,variants(id,referenceName,start,end,alternateBases,referenceBases)";

  private final OfflineAuth auth;
  private final List callSetIds, transcriptSetIds, variantAnnotationSetIds;
  private final Map, String> refBaseCache;

  public AnnotateVariants(OfflineAuth auth,
      List callSetIds, List transcriptSetIds,
      List variantAnnotationSetIds) {
    this.auth = auth;
    this.callSetIds = callSetIds;
    this.transcriptSetIds = transcriptSetIds;
    this.variantAnnotationSetIds = variantAnnotationSetIds;
    refBaseCache = Maps.newHashMap();
  }

  @Override
  public void processElement(
      DoFn>.ProcessContext c) throws Exception {
    Genomics genomics = GenomicsFactory.builder().build().fromOfflineAuth(auth);

    SearchVariantsRequest request = c.element();
    LOG.info("processing contig " + request);
    Iterable varIter = FluentIterable
        .from(Paginator.Variants.create(genomics, ShardBoundary.Requirement.STRICT).search(
            request
              // TODO: Variants-only retrieval is not well support currently. For now
              // we parameterize by CallSet for performance.
              .setCallSetIds(callSetIds)
              .setPageSize(VARIANTS_PAGE_SIZE),
            VARIANT_FIELDS))
        .filter(VariantUtils.IS_SNP);
    if (!varIter.iterator().hasNext()) {
      LOG.info("region has no variants, skipping");
      return;
    }

    IntervalTree transcripts = retrieveTranscripts(genomics, request);
    ListMultimap, Annotation> variantAnnotations =
        retrieveVariantAnnotations(genomics, request);

    Stopwatch stopwatch = Stopwatch.createStarted();
    int varCount = 0;
    for (Variant variant : varIter) {
      List alleles = ImmutableList.builder()
          .addAll(variant.getAlternateBases())
          .add(variant.getReferenceBases())
          .build();
      Range pos = Range.openClosed(variant.getStart(), variant.getEnd());
      for (String allele : alleles) {
        String outKey = Joiner.on(":").join(
            variant.getReferenceName(), variant.getStart(), allele, variant.getId());
        for (Annotation match : variantAnnotations.get(pos)) {
          if (allele.equals(match.getVariant().getAlternateBases())) {
            // Exact match to a known variant annotation; straightforward join.
            c.output(KV.of(outKey, match.getVariant()));
          }
        }

        Iterator> transcriptIter = transcripts.overlappers(
            pos.lowerEndpoint().intValue(), pos.upperEndpoint().intValue() - 1); // Inclusive.
        while (transcriptIter.hasNext()) {
          // Calculate an effect of this allele on the coding region of the given transcript.
          Annotation transcript = transcriptIter.next().getValue();
          VariantEffect effect = AnnotationUtils.determineVariantTranscriptEffect(
              variant.getStart(), allele, transcript,
              getCachedTranscriptBases(genomics, transcript));
          if (effect != null && !VariantEffect.SYNONYMOUS_SNP.equals(effect)) {
            c.output(KV.of(outKey, new VariantAnnotation()
                .setAlternateBases(allele)
                .setType("SNP")
                .setEffect(effect.toString())
                .setGeneId(transcript.getTranscript().getGeneId())
                .setTranscriptIds(ImmutableList.of(transcript.getId()))));
          }
        }
      }
      varCount++;
      if (varCount%1e3 == 0) {
        LOG.info(String.format("read %d variants (%.2f / s)",
            varCount, (double)varCount / stopwatch.elapsed(TimeUnit.SECONDS)));
      }
    }
    LOG.info("finished reading " + varCount + " variants in " + stopwatch);
  }

  private ListMultimap, Annotation> retrieveVariantAnnotations(
      Genomics genomics, SearchVariantsRequest request) {
    Stopwatch stopwatch = Stopwatch.createStarted();
    ListMultimap, Annotation> annotationMap = ArrayListMultimap.create();
    Iterable annotationIter =
        Paginator.Annotations.create(genomics, ShardBoundary.Requirement.OVERLAPS).search(
            new SearchAnnotationsRequest()
              .setAnnotationSetIds(variantAnnotationSetIds)
              .setRange(new QueryRange()
                .setReferenceName(canonicalizeRefName(request.getReferenceName()))
                .setStart(request.getStart())
                .setEnd(request.getEnd())));
    for (Annotation annotation : annotationIter) {
      RangePosition pos = annotation.getPosition();
      long start = 0;
      if (pos.getStart() != null) {
        start = pos.getStart();
      }
      annotationMap.put(Range.closedOpen(start, pos.getEnd()), annotation);
    }
    LOG.info(String.format("read %d variant annotations in %s (%.2f / s)", annotationMap.size(),
        stopwatch, (double)annotationMap.size() / stopwatch.elapsed(TimeUnit.SECONDS)));
    return annotationMap;
  }

  private IntervalTree retrieveTranscripts(Genomics genomics, SearchVariantsRequest request) {
    Stopwatch stopwatch = Stopwatch.createStarted();
    IntervalTree transcripts = new IntervalTree<>();
    Iterable transcriptIter =
        Paginator.Annotations.create(genomics, ShardBoundary.Requirement.OVERLAPS).search(
            new SearchAnnotationsRequest()
              .setAnnotationSetIds(transcriptSetIds)
              .setRange(new QueryRange()
                .setReferenceName(canonicalizeRefName(request.getReferenceName()))
                .setStart(request.getStart())
                .setEnd(request.getEnd())));
    for (Annotation annotation : transcriptIter) {
      RangePosition pos = annotation.getPosition();
      transcripts.put(pos.getStart().intValue(), pos.getEnd().intValue(), annotation);
    }
    LOG.info(String.format("read %d transcripts in %s (%.2f / s)", transcripts.size(),
        stopwatch, (double)transcripts.size() / stopwatch.elapsed(TimeUnit.SECONDS)));
    return transcripts;
  }

  private String getCachedTranscriptBases(Genomics genomics, Annotation transcript)
      throws IOException {
    RangePosition pos = transcript.getPosition();
    Range rng = Range.closedOpen(pos.getStart(), pos.getEnd());
    if (!refBaseCache.containsKey(rng)) {
      refBaseCache.put(rng, retrieveReferenceBases(genomics, pos));
    }
    return refBaseCache.get(rng);
  }

  private String retrieveReferenceBases(Genomics genomics, RangePosition pos) throws IOException {
    StringBuilder b = new StringBuilder();
    String pageToken = "";
    while (true) {
      // TODO: Support full request parameterization for Paginator.References.Bases.
      ListBasesResponse response = genomics.references().bases()
          .list(pos.getReferenceId())
          .setStart(pos.getStart())
          .setEnd(pos.getEnd())
          .setPageToken(pageToken)
          .execute();
      b.append(response.getSequence());
      pageToken = response.getNextPageToken();
      if (Strings.isNullOrEmpty(pageToken)) {
        break;
      }
    }
    return b.toString();
  }

  private static String canonicalizeRefName(String refName) {
    return refName.replace("chr", "");
  }

  public static void main(String[] args) throws Exception {
    // Register the options so that they show up via --help
    PipelineOptionsFactory.register(Options.class);
    Options options = PipelineOptionsFactory.fromArgs(args)
        .withValidation().as(Options.class);
    // Option validation is not yet automatic, we make an explicit call here.
    Options.Methods.validateOptions(options);
    
    OfflineAuth auth = GenomicsOptions.Methods.getGenomicsAuth(options);
    Genomics genomics = GenomicsFactory.builder().build().fromOfflineAuth(auth);

    List callSetIds = ImmutableList.of();
    if (!Strings.isNullOrEmpty(options.getCallSetIds().trim())) {
      callSetIds = ImmutableList.copyOf(options.getCallSetIds().split(","));
    }
    List transcriptSetIds =
        validateAnnotationSetsFlag(genomics, options.getTranscriptSetIds(), "TRANSCRIPT");
    List variantAnnotationSetIds =
        validateAnnotationSetsFlag(genomics, options.getVariantAnnotationSetIds(), "VARIANT");
    validateRefsetForAnnotationSets(genomics, transcriptSetIds);

    List requests = options.isAllReferences() ?
        ShardUtils.getPaginatedVariantRequests(options.getVariantSetId(), ShardUtils.SexChromosomeFilter.EXCLUDE_XY,
            options.getBasesPerShard(), auth) :
              ShardUtils.getPaginatedVariantRequests(options.getVariantSetId(), options.getReferences(), options.getBasesPerShard());

    Pipeline p = Pipeline.create(options);
    p.getCoderRegistry().setFallbackCoderProvider(GenericJsonCoder.PROVIDER);

    p.begin()
      .apply(Create.of(requests))
      .apply(ParDo.of(new AnnotateVariants(auth, callSetIds, transcriptSetIds, variantAnnotationSetIds)))
      .apply(GroupByKey.create())
      .apply(ParDo.of(new DoFn>, String>() {
        @Override
        public void processElement(ProcessContext c) {
          c.output(c.element().getKey() + ": " + c.element().getValue());
        }
      }))
      .apply(TextIO.Write.to(options.getOutput()));
    p.run();
  }

  private static void validateRefsetForAnnotationSets(
      Genomics genomics, List annosetIds) throws IOException {
    String refsetId = null;
    for (String annosetId : annosetIds) {
      String gotId = genomics.annotationSets().get(annosetId).execute().getReferenceSetId();
      if (refsetId == null) {
        refsetId = gotId;
      } else if (!refsetId.equals(gotId)) {
        throw new IllegalArgumentException("want consistent reference sets across the provided " +
            "annotation sets, got " + refsetId + " and " + gotId);
      }
    }
  }

  private static List validateAnnotationSetsFlag(
      Genomics genomics, String flagValue, String wantType) throws IOException {
    List annosetIds = ImmutableList.copyOf(flagValue.split(","));
    for (String annosetId : annosetIds) {
      AnnotationSet annoset = genomics.annotationSets().get(annosetId).execute();
      if (!wantType.equals(annoset.getType())) {
        throw new IllegalArgumentException("annotation set " + annosetId + " has type " +
            annoset.getType() + ", wanted type " + wantType);
      }
    }
    return annosetIds;
  }
}