/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.genomics.dataflow.pipelines;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.security.GeneralSecurityException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import com.google.api.client.util.Strings;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.Filter;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.dataflow.sdk.transforms.View;
import com.google.cloud.dataflow.sdk.transforms.join.CoGbkResult;
import com.google.cloud.dataflow.sdk.transforms.join.CoGroupByKey;
import com.google.cloud.dataflow.sdk.transforms.join.KeyedPCollectionTuple;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.genomics.dataflow.coders.GenericJsonCoder;
import com.google.cloud.genomics.dataflow.functions.LikelihoodFn;
import com.google.cloud.genomics.dataflow.model.AlleleFreq;
import com.google.cloud.genomics.dataflow.model.ReadBaseQuality;
import com.google.cloud.genomics.dataflow.model.ReadBaseWithReference;
import com.google.cloud.genomics.dataflow.model.ReadCounts;
import com.google.cloud.genomics.dataflow.model.ReadQualityCount;
import com.google.cloud.genomics.dataflow.pipelines.CalculateCoverage.CheckMatchingReferenceSet;
import com.google.cloud.genomics.dataflow.readers.ReadGroupStreamer;
import com.google.cloud.genomics.dataflow.readers.VariantStreamer;
import com.google.cloud.genomics.dataflow.utils.GCSOutputOptions;
import com.google.cloud.genomics.dataflow.utils.GenomicsOptions;
import com.google.cloud.genomics.dataflow.utils.ReadFunctions;
import com.google.cloud.genomics.dataflow.utils.ShardOptions;
import com.google.cloud.genomics.dataflow.utils.Solver;
import com.google.cloud.genomics.dataflow.utils.VariantFunctions;
import com.google.cloud.genomics.utils.GenomicsUtils;
import com.google.cloud.genomics.utils.OfflineAuth;
import com.google.cloud.genomics.utils.ShardBoundary;
import com.google.cloud.genomics.utils.ShardUtils;
import com.google.cloud.genomics.utils.ShardUtils.SexChromosomeFilter;
import com.google.common.collect.ImmutableMultiset;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
import com.google.genomics.v1.Position;
import com.google.genomics.v1.Read;
import com.google.genomics.v1.StreamVariantsRequest;
import com.google.genomics.v1.Variant;
import com.google.protobuf.ListValue;
/**
* Test a set of reads for contamination.
*
* Takes a set of specified ReadGroupSets of reads to test and statistics on reference allele
* frequencies for SNPs with a single alternative from a specified set of VariantSets.
*
* See http://googlegenomics.readthedocs.org/en/latest/use_cases/perform_quality_control_checks/verify_bam_id.html
* for running instructions.
*
* Uses the sequence data alone approach described in:
* G. Jun, M. Flickinger, K. N. Hetrick, J. M. Romm, K. F. Doheny,
* G. Abecasis, M. Boehnke, and H. M. Kang, Detecting and Estimating
* Contamination of Human DNA Samples in Sequencing and Array-Based Genotype
* Data, American Journal of Human Genetics doi:10.1016/j.ajhg.2012.09.004
* (volume 91 issue 5 pp. 839 - 848)
* http://www.sciencedirect.com/science/article/pii/S0002929712004788
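*
* A hypothetical invocation sketch (Dataflow derives flag names from the getters on the
* Options interface below; --output is assumed to come from GCSOutputOptions):
*
*   java com.google.cloud.genomics.dataflow.pipelines.VerifyBamId \
*     --readGroupSetIds=YOUR_READGROUPSET_ID \
*     --output=gs://YOUR_BUCKET/contamination.txt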
*/
public class VerifyBamId {
/**
* Options required to run this pipeline.
*/
public static interface Options extends ShardOptions, GCSOutputOptions {
@Description("A comma delimited list of the IDs of the Google Genomics ReadGroupSets this "
+ "pipeline is working with. Default (empty) indicates all ReadGroupSets in InputDatasetId."
+ " This or InputDatasetId must be set. InputDatasetId overrides "
+ "ReadGroupSetIds (if InputDatasetId is set, this field will be ignored).")
@Default.String("")
String getReadGroupSetIds();
void setReadGroupSetIds(String readGroupSetId);
@Description("The ID of the Google Genomics Dataset that the pipeline will get its input reads"
+ " from. Default (empty) means to use ReadGroupSetIds and VariantSetIds instead. This or"
+ " ReadGroupSetIds and VariantSetIds must be set. InputDatasetId overrides"
+ " ReadGroupSetIds and VariantSetIds (if this field is set, ReadGroupSetIds and"
+ " VariantSetIds will be ignored).")
@Default.String("")
String getInputDatasetId();
void setInputDatasetId(String inputDatasetId);
String DEFAULT_VARIANTSET = "10473108253681171589";
@Description("The ID of the Google Genomics VariantSet this pipeline is working with."
+ " It assumes the variant set has INFO field 'AF' from which it retrieves the"
+ " allele frequency for the variant, such as 1,000 Genomes phase 1 or phase 3 variants."
+ " Defaults to the 1,000 Genomes phase 1 VariantSet with id " + DEFAULT_VARIANTSET + ".")
@Default.String(DEFAULT_VARIANTSET)
String getVariantSetId();
void setVariantSetId(String variantSetId);
@Description("The minimum allele frequency to use in analysis. Defaults to 0.01.")
@Default.Double(0.01)
double getMinFrequency();
void setMinFrequency(double minFrequency);
@Description("The fraction of positions to check. Defaults to 0.01.")
@Default.Double(0.01)
double getSamplingFraction();
void setSamplingFraction(double samplingFraction);
public static class Methods {
public static void validateOptions(Options options) {
GCSOutputOptions.Methods.validateOptions(options);
}
}
}
private static Pipeline p;
private static Options pipelineOptions;
private static OfflineAuth auth;
/**
* String prefix used for sampling hash function
*/
private static final String HASH_PREFIX = "";
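// Sampling note (assumption, inferred from the MessageDigest/ByteBuffer imports and the
// samplingFraction option): an item is kept when the MD5 hash of HASH_PREFIX plus a
// per-item key, mapped into [0, 1), falls below the sampling fraction.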
// TODO: this value is not quite correct. Test again after
// https://github.com/googlegenomics/utils-java/issues/48
private static final String VARIANT_FIELDS = "variants(start,calls(genotype,callSetName))";
/**
* Run the VerifyBamId algorithm and output the resulting contamination estimate.
*/
public static void main(String[] args) throws GeneralSecurityException, IOException {
// Register the options so that they show up via --help
PipelineOptionsFactory.register(Options.class);
pipelineOptions = PipelineOptionsFactory.fromArgs(args)
.withValidation().as(Options.class);
// Option validation is not yet automatic, so we make an explicit call here.
Options.Methods.validateOptions(pipelineOptions);
auth = GenomicsOptions.Methods.getGenomicsAuth(pipelineOptions);
p = Pipeline.create(pipelineOptions);
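// Fall back to GenericJsonCoder for model classes that have no explicitly registered coder.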
p.getCoderRegistry().setFallbackCoderProvider(GenericJsonCoder.PROVIDER);
if (pipelineOptions.getInputDatasetId().isEmpty() && pipelineOptions.getReadGroupSetIds().isEmpty()) {
throw new IllegalArgumentException("InputDatasetId or ReadGroupSetIds must be specified");
}
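// Resolve the ReadGroupSets to process: either the explicit comma-delimited list,
// or every ReadGroupSet in the given input dataset.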
List<String> rgsIds;
if (pipelineOptions.getInputDatasetId().isEmpty()) {
rgsIds = Lists.newArrayList(pipelineOptions.getReadGroupSetIds().split(","));
} else {
rgsIds = GenomicsUtils.getReadGroupSetIds(pipelineOptions.getInputDatasetId(), auth);
}
// Grab one ReferenceSetId to be used within the pipeline to confirm that all ReadGroupSets
// are associated with the same ReferenceSet.
String referenceSetId = GenomicsUtils.getReferenceSetId(rgsIds.get(0), auth);
if (Strings.isNullOrEmpty(referenceSetId)) {
throw new IllegalArgumentException("No ReferenceSetId associated with ReadGroupSetId "
+ rgsIds.get(0)
+ ". All ReadGroupSets in given input must have an associated ReferenceSet.");
}
// TODO: confirm that variant set also corresponds to the same reference
// https://github.com/googlegenomics/api-client-java/issues/66
// Read in the Reads.
PCollection<Read> reads = p.begin()
.apply(Create.of(rgsIds))
.apply(ParDo.of(new CheckMatchingReferenceSet(referenceSetId, auth)))
.apply(new ReadGroupStreamer(auth, ShardBoundary.Requirement.STRICT, null, SexChromosomeFilter.INCLUDE_XY));
/*
TODO: We can reduce the number of requests needed to be created by doing the following:
1. Stream the Variants first (rather than concurrently with the Reads). Select a subset of
them equal to some threshold (say 50K by default).
2. Create the requests for streaming Reads by running a ParDo over the selected Variants
to get their ranges (we only need to stream Reads that overlap the selected Variants).
3. Stream the Reads from the created requests.
*/
// Read in the Variants. TODO: potentially provide an option to load the Variants from a file.
List<StreamVariantsRequest> variantRequests = pipelineOptions.isAllReferences() ?
ShardUtils.getVariantRequests(pipelineOptions.getVariantSetId(), ShardUtils.SexChromosomeFilter.INCLUDE_XY,
pipelineOptions.getBasesPerShard(), auth) :
ShardUtils.getVariantRequests(pipelineOptions.getVariantSetId(), pipelineOptions.getReferences(), pipelineOptions.getBasesPerShard());
PCollection<Variant> variants = p.apply(Create.of(variantRequests))
.apply(new VariantStreamer(auth, ShardBoundary.Requirement.STRICT, VARIANT_FIELDS));
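// getFreq (defined later in this class) is expected to keep SNPs with a single alternate
// and key the reference allele frequency, taken from the 'AF' INFO field, by position.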
PCollection<KV<Position, AlleleFreq>> refFreq = getFreq(variants, pipelineOptions.getMinFrequency());
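// combineReads (defined later in this class) presumably samples positions via HASH_PREFIX,
// joins read bases against refFreq by position, and tallies per-position ReadCounts.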
PCollection<KV<Position, ReadCounts>> readCountsTable =
combineReads(reads, pipelineOptions.getSamplingFraction(), HASH_PREFIX, refFreq);
// Converts our results to a single Map of Position keys to ReadCounts values.
PCollectionView<Map<Position, ReadCounts>> readCountsView =
readCountsTable.apply(View.asMap());