
com.google.cloud.genomics.dataflow.pipelines.DeleteVariants Maven / Gradle / Ivy
/*
* Copyright (C) 2016 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.genomics.dataflow.pipelines;
import com.google.api.client.util.BackOff;
import com.google.api.client.util.ExponentialBackOff;
import com.google.api.services.genomics.Genomics;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.options.Validation.Required;
import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.Sum;
import com.google.cloud.genomics.dataflow.utils.GCSOutputOptions;
import com.google.cloud.genomics.dataflow.utils.GenomicsOptions;
import com.google.cloud.genomics.utils.GenomicsFactory;
import com.google.cloud.genomics.utils.OfflineAuth;
import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.security.GeneralSecurityException;
/**
* Given a file of variant ids, delete them from a variant set in parallel.
*
* These variant ids may be from the result of a BigQuery query materialized to Cloud Storage
* as CSV or as the result of a Dataflow pipeline such as IdentifyPrivateVariants.
*/
public class DeleteVariants {
public static interface Options extends GCSOutputOptions {
@Description("The filepath to a comma-separated or tab-separated file of variant ids. "
+ "The variant id will be retrieved from the first "
+ "column. Any other columns will be ignored.")
@Required
String getInput();
void setInput(String filepath);
public static class Methods {
public static void validateOptions(Options options) {
GCSOutputOptions.Methods.validateOptions(options);
}
}
}
private static final Logger LOG = LoggerFactory.getLogger(DeleteVariants.class);
public static final class DeleteVariantFn extends DoFn {
private final OfflineAuth auth;
private final Aggregator notFoundVariantCount =
createAggregator("Number of variants not found (previously deleted)", new Sum.SumLongFn());
private final Aggregator deletedVariantCount =
createAggregator("Number of variants deleted", new Sum.SumLongFn());
private Genomics genomics;
public DeleteVariantFn(OfflineAuth auth) {
super();
this.auth = auth;
}
@Override
public void startBundle(Context context) throws IOException, GeneralSecurityException {
genomics = GenomicsFactory.builder().build().fromOfflineAuth(auth);
}
@Override
public void processElement(DoFn.ProcessContext context) throws Exception {
String variantId = context.element();
// Call the deletion operation via exponential backoff so that "Rate Limit Exceeded"
// quota issues do not cause the pipeline to fail.
ExponentialBackOff backoff = new ExponentialBackOff.Builder().build();
while (true) {
try {
genomics.variants().delete(variantId).execute();
deletedVariantCount.addValue(1L);
context.output(1);
return;
} catch (Exception e) {
if (e.getMessage().startsWith("403 Forbidden")) {
long backOffMillis = backoff.nextBackOffMillis();
if (backOffMillis == BackOff.STOP) {
throw e;
}
Thread.sleep(backOffMillis);
} else {
throw e;
}
}
}
}
}
public static void main(String[] args) throws IOException, GeneralSecurityException {
// Register the options so that they show up via --help
PipelineOptionsFactory.register(Options.class);
Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
// Option validation is not yet automatic, we make an explicit call here.
Options.Methods.validateOptions(options);
OfflineAuth auth = GenomicsOptions.Methods.getGenomicsAuth(options);
GenomicsOptions.Methods.requestConfirmation("*** The pipeline will delete variants whose "
+ "ids are listed in: " + options.getInput() + ". ***");
Pipeline p = Pipeline.create(options);
p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
.apply(ParDo.named("ParseVariantIds").of(new DoFn() {
@Override
public void processElement(ProcessContext c) {
String record = c.element();
// The variant id will be retrieved from the first column. Any other columns
// will be ignored.
Iterable fields =
Splitter.on(
CharMatcher.BREAKING_WHITESPACE.or(CharMatcher.is(',')))
.omitEmptyStrings()
.trimResults()
.split(record);
java.util.Iterator iter = fields.iterator();
if (iter.hasNext()) {
c.output(iter.next());
}
}
}))
.apply(ParDo.of(new DeleteVariantFn(auth)))
.apply(Sum.integersGlobally())
.apply(ParDo.named("FormatResults").of(new DoFn() {
@Override
public void processElement(ProcessContext c) {
c.output("Deleted Variant Count: " + c.element());
}
}))
.apply(TextIO.Write.named("Write Count").to(options.getOutput()));
p.run();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy