org.broadinstitute.hellbender.tools.walkers.validation.RemoveNearbyIndels Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.tools.walkers.validation;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantWalker;
import picard.cmdline.programgroups.VariantManipulationProgramGroup;
import java.util.ArrayDeque;
/**
* Remove indels that are close to another indel from a vcf file.
*
*
* This is a preprocessing step for the CRSP sensitivity validation truth data. For any pair of indels that are within
* a minimum allowed distance (given by --min-indel-spacing), both indels are removed, regardless of any intervening
* non-indel variants.
*
*
* Usage example
*
*
* gatk RemoveNearbyIndels \
* -V input.vcf \
* -O output.vcf \
* --min-indel-spacing 20
*
*
* Created by David Benjamin on 1/30/17.
*/
@CommandLineProgramProperties(
summary = "Remove indels that are close to each other from a vcf. For any pair of indels that are within" +
"some minimum allowed distance, both indels are removed, regardless of any intervening non-indel variants.",
oneLineSummary = "(Internal) Remove indels from the VCF file that are close to each other.",
programGroup = VariantManipulationProgramGroup.class
)
@DocumentedFeature
public class RemoveNearbyIndels extends VariantWalker {
public static final String MIN_INDEL_SPACING_NAME = "min-indel-spacing";
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc = "The output filtered VCF file",
optional = false)
private final GATKPath outputVcf = null;
@Argument(fullName = MIN_INDEL_SPACING_NAME,
shortName = MIN_INDEL_SPACING_NAME,
doc = "Minimum spacing between neighboring indels to be emitted",
optional = false)
private int minIndelSpacing = 1;
private VariantContextWriter vcfWriter;
private VariantBuffer variantBuffer = new VariantBuffer();
@Override
public void onTraversalStart() {
final VCFHeader inputHeader = getHeaderForVariants();
final VCFHeader vcfHeader = new VCFHeader(inputHeader.getMetaDataInSortedOrder(), inputHeader.getGenotypeSamples());
getDefaultToolVCFHeaderLines().forEach(vcfHeader::addMetaDataLine);
vcfWriter = createVCFWriter(outputVcf);
vcfWriter.writeHeader(vcfHeader);
}
@Override
public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext refContext, final FeatureContext fc) {
variantBuffer.add(vc);
}
@Override
public Object onTraversalSuccess() {
variantBuffer.emitRemaining();
return "SUCCESS";
}
@Override
public void closeTool() {
if ( vcfWriter != null ) {
vcfWriter.close();
}
}
private final class VariantBuffer {
private VariantContext lastIndel = null;
// INVARIANT: this buffer will contain at most one indel
private final ArrayDeque buffer = new ArrayDeque<>();
public VariantBuffer() { }
public void add(final VariantContext vc) {
if (lastIndel == null && vc.isIndel()) {
buffer.add(vc);
lastIndel = vc;
} else if (nearby(lastIndel, vc)) {
if (vc.isIndel()) {
emitAllNonIndels(); // throw out {@code lastIndel} and {@code vc}
} else {
buffer.add(vc);
}
} else {
emitAllVariants();
buffer.add(vc);
}
lastIndel = vc.isIndel() ? vc : lastIndel;
}
private boolean nearby(final VariantContext left, final VariantContext right) {
return left != null && left.getContig().equals(right.getContig())
&& (right.getStart() - left.getEnd() < minIndelSpacing);
}
private void emitAllNonIndels() {
buffer.stream().filter(vc -> !vc.isIndel()).forEach(vcfWriter::add);
buffer.clear();
}
private void emitAllVariants() {
buffer.forEach(vcfWriter::add);
buffer.clear();
}
public void emitRemaining() {
buffer.stream().filter(vc -> !(vc.isIndel() && nearby(lastIndel, vc)) || vc == lastIndel).forEach(vcfWriter::add);
buffer.clear();
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy