picard.vcf.MakeSitesOnlyVcf Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of picard Show documentation
Show all versions of picard Show documentation
A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF.
package picard.vcf;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.VariantManipulationProgramGroup;
import java.io.File;
import java.util.Set;
import java.util.TreeSet;
/**
* Creates a VCF that contains all the site-level information for all records in the input VCF but no genotype information.
*
* Summary
* This tool reads a VCF/VCF.gz/BCF and removes all genotype information from it while retaining all site level information,
* including annotations based on genotypes (e.g. AN, AF). Output can be any supported variant format including .vcf,
* .vcf.gz or .bcf.
*
* Inputs
*
* - Input VCF or BCF file containing genotype and site-level information.
* - Output VCF or BCF file containing only site-level information.
* - [Optional] Names of one or more samples to include in the output VCF.
*
*
* Usage example:
*
* java -jar picard.jar MakeSitesOnlyVcf \
* INPUT=input_variants.vcf \
* OUTPUT=output_variants.vcf
*
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
summary = MakeSitesOnlyVcf.USAGE_DETAILS,
oneLineSummary = MakeSitesOnlyVcf.USAGE_SUMMARY,
programGroup = VariantManipulationProgramGroup.class)
@DocumentedFeature
public class MakeSitesOnlyVcf extends CommandLineProgram {
static final String USAGE_SUMMARY = "Creates a VCF that contains all the site-level information for all records in the input VCF but no genotype information.";
static final String USAGE_DETAILS = "This tool reads a VCF/VCF.gz/BCF and removes all genotype information from it while retaining" +
"all site level information, including annotations based on genotypes (e.g. AN, AF). Output can be" +
"any supported variant format including .vcf, .vcf.gz or .bcf.
" +
"Usage example:
" +
"" +
"java -jar picard.jar MakeSitesOnlyVcf \\
" +
" INPUT=input_variants.vcf \\
" +
" OUTPUT=output_variants.vcf" +
"
";
@Argument(shortName= StandardOptionDefinitions.INPUT_SHORT_NAME, doc="Input VCF or BCF containing genotype and site-level information.")
public File INPUT;
@Argument(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="Output VCF or BCF file containing only site-level information.")
public File OUTPUT;
@Argument(shortName="S", doc="Names of one or more samples to include in the output VCF.", optional=true)
public Set SAMPLE = new TreeSet();
// Stock main method
public static void main(final String[] args) {
new MakeSitesOnlyVcf().instanceMainWithExit(args);
}
public MakeSitesOnlyVcf() {
CREATE_INDEX = true;
}
@Override
protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
final VCFFileReader reader = new VCFFileReader(INPUT, false);
final VCFHeader inputVcfHeader = new VCFHeader(reader.getFileHeader().getMetaDataInInputOrder());
final SAMSequenceDictionary sequenceDictionary = inputVcfHeader.getSequenceDictionary();
if (CREATE_INDEX && sequenceDictionary == null) {
throw new PicardException("A sequence dictionary must be available (either through the input file or by setting it explicitly) when creating indexed output.");
}
final ProgressLogger progress = new ProgressLogger(Log.getInstance(MakeSitesOnlyVcf.class), 10000);
// Setup the site-only file writer
final VariantContextWriterBuilder builder = new VariantContextWriterBuilder()
.setOutputFile(OUTPUT)
.setReferenceDictionary(sequenceDictionary);
if (CREATE_INDEX)
builder.setOption(Options.INDEX_ON_THE_FLY);
else
builder.unsetOption(Options.INDEX_ON_THE_FLY);
final VariantContextWriter writer = builder.build();
final VCFHeader header = new VCFHeader(inputVcfHeader.getMetaDataInInputOrder(), SAMPLE);
writer.writeHeader(header);
// Go through the input, strip the records and write them to the output
final CloseableIterator iterator = reader.iterator();
while (iterator.hasNext()) {
final VariantContext full = iterator.next();
final VariantContext site = subsetToSamplesWithOriginalAnnotations(full, SAMPLE);
writer.add(site);
progress.record(site.getContig(), site.getStart());
}
CloserUtil.close(iterator);
CloserUtil.close(reader);
writer.close();
return 0;
}
/** Makes a new VariantContext with only the desired samples. */
private static VariantContext subsetToSamplesWithOriginalAnnotations(final VariantContext ctx, final Set samples) {
final VariantContextBuilder builder = new VariantContextBuilder(ctx);
final GenotypesContext newGenotypes = ctx.getGenotypes().subsetToSamples(samples);
builder.alleles(ctx.getAlleles());
return builder.genotypes(newGenotypes).make();
}
}