All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.cloud.genomics.dataflow.pipelines.AnnotateVariants Maven / Gradle / Ivy

There is a newer version: v1-0.8
Show newest version
/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.pipelines;

import htsjdk.samtools.util.IntervalTree;
import htsjdk.samtools.util.IntervalTree.Node;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;

import com.google.api.client.util.Strings;
import com.google.api.services.genomics.Genomics;
import com.google.api.services.genomics.model.Annotation;
import com.google.api.services.genomics.model.AnnotationSet;
import com.google.api.services.genomics.model.ListBasesResponse;
import com.google.api.services.genomics.model.QueryRange;
import com.google.api.services.genomics.model.RangePosition;
import com.google.api.services.genomics.model.SearchAnnotationsRequest;
import com.google.api.services.genomics.model.SearchVariantsRequest;
import com.google.api.services.genomics.model.Variant;
import com.google.api.services.genomics.model.VariantAnnotation;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.genomics.dataflow.coders.GenericJsonCoder;
import com.google.cloud.genomics.dataflow.utils.AnnotationUtils;
import com.google.cloud.genomics.dataflow.utils.AnnotationUtils.VariantEffect;
import com.google.cloud.genomics.dataflow.utils.GCSOutputOptions;
import com.google.cloud.genomics.dataflow.utils.GenomicsOptions;
import com.google.cloud.genomics.dataflow.utils.ShardOptions;
import com.google.cloud.genomics.utils.GenomicsFactory;
import com.google.cloud.genomics.utils.OfflineAuth;
import com.google.cloud.genomics.utils.Paginator;
import com.google.cloud.genomics.utils.ShardBoundary;
import com.google.cloud.genomics.utils.ShardUtils;
import com.google.cloud.genomics.utils.VariantUtils;
import com.google.common.base.Joiner;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Maps;
import com.google.common.collect.Range;

/**
 * Demonstrates a simple variant annotation program which takes
 * {@code VariantSet}s and {@code AnnotationSet}s as input and emits
 * {@code VariantAnnotation}s. This program is intended to serve as an example
 * of how a variant annotation program can be structured to run in parallel,
 * operating over Google Genomics input sources; the current output has limited
 * biological significance.
 *
 * Currently only annotates SNPs and uses the following two methods:
 * 
    *
  • Determines the effect of the provided variants on the provided * transcripts, if any. A result is only emitted for a subset of cases where * the variant appears to cause a coding change in the transcript. *
  • Performs an exact match join on the provided existing variant * annotations, if any. *
* * See http://googlegenomics.readthedocs.org/en/latest/use_cases/annotate_variants/google_genomics_annotation.html * for running instructions. */ public final class AnnotateVariants extends DoFn> { public static interface Options extends ShardOptions, GCSOutputOptions { @Description("The ID of the Google Genomics variant set this pipeline is accessing. " + "Defaults to 1000 Genomes.") @Default.String("10473108253681171589") String getVariantSetId(); void setVariantSetId(String variantSetId); @Description("The IDs of the Google Genomics call sets this pipeline is working with, comma " + "delimited.Defaults to 1000 Genomes HG00261.") @Default.String("10473108253681171589-0") String getCallSetIds(); void setCallSetIds(String callSetIds); @Description("The IDs of the Google Genomics transcript sets this pipeline is working with, " + "comma delimited. Defaults to UCSC refGene (hg19).") @Default.String("CIjfoPXj9LqPlAEQ5vnql4KewYuSAQ") String getTranscriptSetIds(); void setTranscriptSetIds(String transcriptSetIds); @Description("The IDs of the Google Genomics variant annotation sets this pipeline is working " + "with, comma delimited. Defaults to ClinVar (GRCh37).") @Default.String("CILSqfjtlY6tHxC0nNH-4cu-xlQ") String getVariantAnnotationSetIds(); void setVariantAnnotationSetIds(String variantAnnotationSetIds); public static class Methods { public static void validateOptions(Options options) { GCSOutputOptions.Methods.validateOptions(options); } } } private static final Logger LOG = Logger.getLogger(AnnotateVariants.class.getName()); private static final int VARIANTS_PAGE_SIZE = 5000; private static final String VARIANT_FIELDS = "nextPageToken,variants(id,referenceName,start,end,alternateBases,referenceBases)"; private final OfflineAuth auth; private final List callSetIds, transcriptSetIds, variantAnnotationSetIds; private final Map, String> refBaseCache; public AnnotateVariants(OfflineAuth auth, List callSetIds, List transcriptSetIds, List variantAnnotationSetIds) { this.auth = auth; this.callSetIds = callSetIds; this.transcriptSetIds = transcriptSetIds; this.variantAnnotationSetIds = variantAnnotationSetIds; refBaseCache = Maps.newHashMap(); } @Override public void processElement( DoFn>.ProcessContext c) throws Exception { Genomics genomics = GenomicsFactory.builder().build().fromOfflineAuth(auth); SearchVariantsRequest request = c.element(); LOG.info("processing contig " + request); Iterable varIter = FluentIterable .from(Paginator.Variants.create(genomics, ShardBoundary.Requirement.STRICT).search( request // TODO: Variants-only retrieval is not well support currently. For now // we parameterize by CallSet for performance. .setCallSetIds(callSetIds) .setPageSize(VARIANTS_PAGE_SIZE), VARIANT_FIELDS)) .filter(VariantUtils.IS_SNP); if (!varIter.iterator().hasNext()) { LOG.info("region has no variants, skipping"); return; } IntervalTree transcripts = retrieveTranscripts(genomics, request); ListMultimap, Annotation> variantAnnotations = retrieveVariantAnnotations(genomics, request); Stopwatch stopwatch = Stopwatch.createStarted(); int varCount = 0; for (Variant variant : varIter) { List alleles = ImmutableList.builder() .addAll(variant.getAlternateBases()) .add(variant.getReferenceBases()) .build(); Range pos = Range.openClosed(variant.getStart(), variant.getEnd()); for (String allele : alleles) { String outKey = Joiner.on(":").join( variant.getReferenceName(), variant.getStart(), allele, variant.getId()); for (Annotation match : variantAnnotations.get(pos)) { if (allele.equals(match.getVariant().getAlternateBases())) { // Exact match to a known variant annotation; straightforward join. c.output(KV.of(outKey, match.getVariant())); } } Iterator> transcriptIter = transcripts.overlappers( pos.lowerEndpoint().intValue(), pos.upperEndpoint().intValue() - 1); // Inclusive. while (transcriptIter.hasNext()) { // Calculate an effect of this allele on the coding region of the given transcript. Annotation transcript = transcriptIter.next().getValue(); VariantEffect effect = AnnotationUtils.determineVariantTranscriptEffect( variant.getStart(), allele, transcript, getCachedTranscriptBases(genomics, transcript)); if (effect != null && !VariantEffect.SYNONYMOUS_SNP.equals(effect)) { c.output(KV.of(outKey, new VariantAnnotation() .setAlternateBases(allele) .setType("SNP") .setEffect(effect.toString()) .setGeneId(transcript.getTranscript().getGeneId()) .setTranscriptIds(ImmutableList.of(transcript.getId())))); } } } varCount++; if (varCount%1e3 == 0) { LOG.info(String.format("read %d variants (%.2f / s)", varCount, (double)varCount / stopwatch.elapsed(TimeUnit.SECONDS))); } } LOG.info("finished reading " + varCount + " variants in " + stopwatch); } private ListMultimap, Annotation> retrieveVariantAnnotations( Genomics genomics, SearchVariantsRequest request) { Stopwatch stopwatch = Stopwatch.createStarted(); ListMultimap, Annotation> annotationMap = ArrayListMultimap.create(); Iterable annotationIter = Paginator.Annotations.create(genomics, ShardBoundary.Requirement.OVERLAPS).search( new SearchAnnotationsRequest() .setAnnotationSetIds(variantAnnotationSetIds) .setRange(new QueryRange() .setReferenceName(canonicalizeRefName(request.getReferenceName())) .setStart(request.getStart()) .setEnd(request.getEnd()))); for (Annotation annotation : annotationIter) { RangePosition pos = annotation.getPosition(); long start = 0; if (pos.getStart() != null) { start = pos.getStart(); } annotationMap.put(Range.closedOpen(start, pos.getEnd()), annotation); } LOG.info(String.format("read %d variant annotations in %s (%.2f / s)", annotationMap.size(), stopwatch, (double)annotationMap.size() / stopwatch.elapsed(TimeUnit.SECONDS))); return annotationMap; } private IntervalTree retrieveTranscripts(Genomics genomics, SearchVariantsRequest request) { Stopwatch stopwatch = Stopwatch.createStarted(); IntervalTree transcripts = new IntervalTree<>(); Iterable transcriptIter = Paginator.Annotations.create(genomics, ShardBoundary.Requirement.OVERLAPS).search( new SearchAnnotationsRequest() .setAnnotationSetIds(transcriptSetIds) .setRange(new QueryRange() .setReferenceName(canonicalizeRefName(request.getReferenceName())) .setStart(request.getStart()) .setEnd(request.getEnd()))); for (Annotation annotation : transcriptIter) { RangePosition pos = annotation.getPosition(); transcripts.put(pos.getStart().intValue(), pos.getEnd().intValue(), annotation); } LOG.info(String.format("read %d transcripts in %s (%.2f / s)", transcripts.size(), stopwatch, (double)transcripts.size() / stopwatch.elapsed(TimeUnit.SECONDS))); return transcripts; } private String getCachedTranscriptBases(Genomics genomics, Annotation transcript) throws IOException { RangePosition pos = transcript.getPosition(); Range rng = Range.closedOpen(pos.getStart(), pos.getEnd()); if (!refBaseCache.containsKey(rng)) { refBaseCache.put(rng, retrieveReferenceBases(genomics, pos)); } return refBaseCache.get(rng); } private String retrieveReferenceBases(Genomics genomics, RangePosition pos) throws IOException { StringBuilder b = new StringBuilder(); String pageToken = ""; while (true) { // TODO: Support full request parameterization for Paginator.References.Bases. ListBasesResponse response = genomics.references().bases() .list(pos.getReferenceId()) .setStart(pos.getStart()) .setEnd(pos.getEnd()) .setPageToken(pageToken) .execute(); b.append(response.getSequence()); pageToken = response.getNextPageToken(); if (Strings.isNullOrEmpty(pageToken)) { break; } } return b.toString(); } private static String canonicalizeRefName(String refName) { return refName.replace("chr", ""); } public static void main(String[] args) throws Exception { // Register the options so that they show up via --help PipelineOptionsFactory.register(Options.class); Options options = PipelineOptionsFactory.fromArgs(args) .withValidation().as(Options.class); // Option validation is not yet automatic, we make an explicit call here. Options.Methods.validateOptions(options); OfflineAuth auth = GenomicsOptions.Methods.getGenomicsAuth(options); Genomics genomics = GenomicsFactory.builder().build().fromOfflineAuth(auth); List callSetIds = ImmutableList.of(); if (!Strings.isNullOrEmpty(options.getCallSetIds().trim())) { callSetIds = ImmutableList.copyOf(options.getCallSetIds().split(",")); } List transcriptSetIds = validateAnnotationSetsFlag(genomics, options.getTranscriptSetIds(), "TRANSCRIPT"); List variantAnnotationSetIds = validateAnnotationSetsFlag(genomics, options.getVariantAnnotationSetIds(), "VARIANT"); validateRefsetForAnnotationSets(genomics, transcriptSetIds); List requests = options.isAllReferences() ? ShardUtils.getPaginatedVariantRequests(options.getVariantSetId(), ShardUtils.SexChromosomeFilter.EXCLUDE_XY, options.getBasesPerShard(), auth) : ShardUtils.getPaginatedVariantRequests(options.getVariantSetId(), options.getReferences(), options.getBasesPerShard()); Pipeline p = Pipeline.create(options); p.getCoderRegistry().setFallbackCoderProvider(GenericJsonCoder.PROVIDER); p.begin() .apply(Create.of(requests)) .apply(ParDo.of(new AnnotateVariants(auth, callSetIds, transcriptSetIds, variantAnnotationSetIds))) .apply(GroupByKey.create()) .apply(ParDo.of(new DoFn>, String>() { @Override public void processElement(ProcessContext c) { c.output(c.element().getKey() + ": " + c.element().getValue()); } })) .apply(TextIO.Write.to(options.getOutput())); p.run(); } private static void validateRefsetForAnnotationSets( Genomics genomics, List annosetIds) throws IOException { String refsetId = null; for (String annosetId : annosetIds) { String gotId = genomics.annotationSets().get(annosetId).execute().getReferenceSetId(); if (refsetId == null) { refsetId = gotId; } else if (!refsetId.equals(gotId)) { throw new IllegalArgumentException("want consistent reference sets across the provided " + "annotation sets, got " + refsetId + " and " + gotId); } } } private static List validateAnnotationSetsFlag( Genomics genomics, String flagValue, String wantType) throws IOException { List annosetIds = ImmutableList.copyOf(flagValue.split(",")); for (String annosetId : annosetIds) { AnnotationSet annoset = genomics.annotationSets().get(annosetId).execute(); if (!wantType.equals(annoset.getType())) { throw new IllegalArgumentException("annotation set " + annosetId + " has type " + annoset.getType() + ", wanted type " + wantType); } } return annosetIds; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy