All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.vcf.MergeVcfs Maven / Gradle / Ivy

Go to download

A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF.

There is a newer version: 3.2.0
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2012 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package picard.vcf;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.MergingIterator;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextComparator;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFUtils;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.VariantManipulationProgramGroup;

import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;

/**
 * Combines multiple variant files into a single variant file.
 *
 * 

Inputs

*
    *
  • One or more input file in VCF format (can be gzipped, i.e. ending in ".vcf.gz", or binary compressed, i.e. ending in ".bcf").
  • *
  • Optionally a sequence dictionary file (typically name ending in .dict) if the input VCF does not contain a * complete contig list and if the output index is to be created (true by default).
  • *
*

* The input variant data must adhere to the following rules: *

    *
  • If there are samples, those must be the same across all input files.
  • *
  • Input file headers must be contain compatible declarations for common annotations (INFO, FORMAT fields) and filters.
  • *
  • Input files variant records must be sorted by their contig and position following the sequence dictionary provided * or the header contig list.
  • *
*

*

* You can either directly specify the list of files by specifying INPUT multiple times, or provide a list * in a file with name ending in ".list" to INPUT. *

* *

Outputs

* A VCF sorted (i) according to the dictionary and (ii) by coordiante. *

Usage examples

*

Example 1:

* We combine several variant files in different formats, where at least one of them contains the contig list in its header. *
 *     java -jar picard.jar MergeVcfs \
 *          I=input_variants.01.vcf \
 *          I=input_variants.02.vcf.gz \
 *          O=output_variants.vcf.gz
 * 
*

Example 2:

* Similar to example 1 but we use an input list file to specify the input files: *
 *     java -jar picard.jar MergeVcfs \
 *          I=input_variant_files.list \
 *          O=output_variants.vcf.gz
 * 
* * @since 1.0.1 */ @CommandLineProgramProperties( oneLineSummary = MergeVcfs.SUMMARY_FIRST_SENTENCE, summary = MergeVcfs.SUMMARY, programGroup = VariantManipulationProgramGroup.class) @DocumentedFeature public class MergeVcfs extends CommandLineProgram { static final String SUMMARY_FIRST_SENTENCE = "Combines multiple variant files into a single variant file"; static final String SUMMARY = "

" + SUMMARY_FIRST_SENTENCE + ".

" + "

Inputs

" + "
    " + "
  • One or more input file in VCF format (can be gzipped, i.e. ending in \".vcf.gz\", or binary compressed, i.e. ending in \".bcf\").
  • " + "
  • Optionally a sequence dictionary file (typically name ending in .dict) if the input VCF does not contain a" + " complete contig list and if the output index is to be created (true by default).
  • " + "
" + "

" + " The input variant data must adhere to the following rules:

" + "
    " + "
  • If there are samples, those must be the same across all input files.
  • " + "
  • Input file headers must be contain compatible declarations for common annotations (INFO, FORMAT fields) and filters.
  • " + "
  • Input files variant records must be sorted by their contig and position following the sequence dictionary provided" + " or the header contig list.
  • " + "
" + "

You can either directly specify the list of files by specifying INPUT multiple times, or provide a list" + " in a file with name ending in \".list\" to INPUT.

" + "

Outputs

" + "

A VCF sorted (i) according to the dictionary and (ii) by coordinate.

" + "

Usage examples

" + "

Example 1:

" + "

We combine several variant files in different formats, where at least one of them contains the contig list in its header.

" + "
java -jar picard.jar MergeVcfs \\\n" + 
			"          I=input_variants.01.vcf \\\n" + 
			"          I=input_variants.02.vcf.gz \\\n" + 
			"          O=output_variants.vcf.gz
" + "

Example 2:

" + "

Similar to example 1 but we use an input list file to specify the input files:

" + "
java -jar picard.jar MergeVcfs \\\n" + 
			"          I=input_variant_files.list \\\n" + 
			"          O=output_variants.vcf.gz

"; @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc="VCF or BCF input files (File format is determined by file extension), or a file having a '.list' suffix containing the path to the files, one per line.", minElements=1) public List INPUT; @Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The merged VCF or BCF file. File format is determined by file extension.") public File OUTPUT; @Argument(shortName = "D", doc = "The index sequence dictionary to use instead of the sequence dictionary in the input files", optional = true) public File SEQUENCE_DICTIONARY; private final static String SEQ_DICT_REQUIRED = "A sequence dictionary must be available (either through the input file or by setting it explicitly)."; private final Log log = Log.getInstance(MergeVcfs.class); public MergeVcfs() { this.CREATE_INDEX = true; } @Override protected int doWork() { final ProgressLogger progress = new ProgressLogger(log, 10000); final List sampleList = new ArrayList(); INPUT = IOUtil.unrollFiles(INPUT, IOUtil.VCF_EXTENSIONS); final Collection> iteratorCollection = new ArrayList>(INPUT.size()); final Collection headers = new HashSet(INPUT.size()); VariantContextComparator variantContextComparator = null; SAMSequenceDictionary sequenceDictionary = null; if (SEQUENCE_DICTIONARY != null) { sequenceDictionary = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(SEQUENCE_DICTIONARY).getFileHeader().getSequenceDictionary(); } for (final File file : INPUT) { IOUtil.assertFileIsReadable(file); final VCFFileReader fileReader = new VCFFileReader(file, false); final VCFHeader fileHeader = fileReader.getFileHeader(); if (fileHeader.getContigLines().isEmpty()) { if (sequenceDictionary == null) { throw new IllegalArgumentException(SEQ_DICT_REQUIRED); } else { fileHeader.setSequenceDictionary(sequenceDictionary); } } if (variantContextComparator == null) { variantContextComparator = fileHeader.getVCFRecordComparator(); } else { if (!variantContextComparator.isCompatible(fileHeader.getContigLines())) { throw new IllegalArgumentException( "The contig entries in input file " + file.getAbsolutePath() + " are not compatible with the others."); } } if (sequenceDictionary == null) sequenceDictionary = fileHeader.getSequenceDictionary(); if (sampleList.isEmpty()) { sampleList.addAll(fileHeader.getSampleNamesInOrder()); } else { if (!sampleList.equals(fileHeader.getSampleNamesInOrder())) { throw new IllegalArgumentException("Input file " + file.getAbsolutePath() + " has sample entries that don't match the other files."); } } headers.add(fileHeader); iteratorCollection.add(fileReader.iterator()); } if (CREATE_INDEX && sequenceDictionary == null) { throw new PicardException(String.format("Index creation failed. %s", SEQ_DICT_REQUIRED)); } final VariantContextWriterBuilder builder = new VariantContextWriterBuilder() .setOutputFile(OUTPUT) .setReferenceDictionary(sequenceDictionary); if (CREATE_INDEX) { builder.setOption(Options.INDEX_ON_THE_FLY); } else { builder.unsetOption(Options.INDEX_ON_THE_FLY); } final VariantContextWriter writer = builder.build(); writer.writeHeader(new VCFHeader(VCFUtils.smartMergeHeaders(headers, false), sampleList)); final MergingIterator mergingIterator = new MergingIterator(variantContextComparator, iteratorCollection); while (mergingIterator.hasNext()) { final VariantContext context = mergingIterator.next(); writer.add(context); progress.record(context.getContig(), context.getStart()); } CloserUtil.close(mergingIterator); writer.close(); return 0; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy