picard.vcf.MergeVcfs Maven / Gradle / Ivy
Show all versions of picard Show documentation
/*
* The MIT License
*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.vcf;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.MergingIterator;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextComparator;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFUtils;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.VariantManipulationProgramGroup;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
/**
* Combines multiple variant files into a single variant file.
*
* Inputs
*
* - One or more input file in VCF format (can be gzipped, i.e. ending in ".vcf.gz", or binary compressed, i.e. ending in ".bcf").
* - Optionally a sequence dictionary file (typically name ending in .dict) if the input VCF does not contain a
* complete contig list and if the output index is to be created (true by default).
*
*
* The input variant data must adhere to the following rules:
*
* - If there are samples, those must be the same across all input files.
* - Input file headers must be contain compatible declarations for common annotations (INFO, FORMAT fields) and filters.
* - Input files variant records must be sorted by their contig and position following the sequence dictionary provided
* or the header contig list.
*
*
*
* You can either directly specify the list of files by specifying INPUT
multiple times, or provide a list
* in a file with name ending in ".list" to INPUT
.
*
*
* Outputs
* A VCF sorted (i) according to the dictionary and (ii) by coordiante.
* Usage examples
* Example 1:
* We combine several variant files in different formats, where at least one of them contains the contig list in its header.
*
* java -jar picard.jar MergeVcfs \
* I=input_variants.01.vcf \
* I=input_variants.02.vcf.gz \
* O=output_variants.vcf.gz
*
* Example 2:
* Similar to example 1 but we use an input list file to specify the input files:
*
* java -jar picard.jar MergeVcfs \
* I=input_variant_files.list \
* O=output_variants.vcf.gz
*
*
* @since 1.0.1
*/
@CommandLineProgramProperties(
oneLineSummary = MergeVcfs.SUMMARY_FIRST_SENTENCE,
summary = MergeVcfs.SUMMARY,
programGroup = VariantManipulationProgramGroup.class)
@DocumentedFeature
public class MergeVcfs extends CommandLineProgram {
static final String SUMMARY_FIRST_SENTENCE = "Combines multiple variant files into a single variant file";
static final String SUMMARY = "" + SUMMARY_FIRST_SENTENCE + ".
" +
"Inputs
" +
"" +
" - One or more input file in VCF format (can be gzipped, i.e. ending in \".vcf.gz\", or binary compressed, i.e. ending in \".bcf\").
" +
" - Optionally a sequence dictionary file (typically name ending in .dict) if the input VCF does not contain a" +
" complete contig list and if the output index is to be created (true by default).
" +
"
" +
" " +
" The input variant data must adhere to the following rules:
" +
" " +
" - If there are samples, those must be the same across all input files.
" +
" - Input file headers must be contain compatible declarations for common annotations (INFO, FORMAT fields) and filters.
" +
" - Input files variant records must be sorted by their contig and position following the sequence dictionary provided" +
" or the header contig list.
" +
"
" +
" You can either directly specify the list of files by specifying INPUT
multiple times, or provide a list" +
" in a file with name ending in \".list\" to INPUT
.
" +
" Outputs
" +
" A VCF sorted (i) according to the dictionary and (ii) by coordinate.
" +
" Usage examples
" +
" Example 1:
" +
" We combine several variant files in different formats, where at least one of them contains the contig list in its header.
" +
" java -jar picard.jar MergeVcfs \\\n" +
" I=input_variants.01.vcf \\\n" +
" I=input_variants.02.vcf.gz \\\n" +
" O=output_variants.vcf.gz
" +
" Example 2:
" +
" Similar to example 1 but we use an input list file to specify the input files:
" +
" java -jar picard.jar MergeVcfs \\\n" +
" I=input_variant_files.list \\\n" +
" O=output_variants.vcf.gz
";
@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
doc="VCF or BCF input files (File format is determined by file extension), or a file having a '.list' suffix containing the path to the files, one per line.", minElements=1)
public List INPUT;
@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The merged VCF or BCF file. File format is determined by file extension.")
public File OUTPUT;
@Argument(shortName = "D", doc = "The index sequence dictionary to use instead of the sequence dictionary in the input files", optional = true)
public File SEQUENCE_DICTIONARY;
private final static String SEQ_DICT_REQUIRED = "A sequence dictionary must be available (either through the input file or by setting it explicitly).";
private final Log log = Log.getInstance(MergeVcfs.class);
public MergeVcfs() {
this.CREATE_INDEX = true;
}
@Override
protected int doWork() {
final ProgressLogger progress = new ProgressLogger(log, 10000);
final List sampleList = new ArrayList();
INPUT = IOUtil.unrollFiles(INPUT, IOUtil.VCF_EXTENSIONS);
final Collection> iteratorCollection = new ArrayList>(INPUT.size());
final Collection headers = new HashSet(INPUT.size());
VariantContextComparator variantContextComparator = null;
SAMSequenceDictionary sequenceDictionary = null;
if (SEQUENCE_DICTIONARY != null) {
sequenceDictionary = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(SEQUENCE_DICTIONARY).getFileHeader().getSequenceDictionary();
}
for (final File file : INPUT) {
IOUtil.assertFileIsReadable(file);
final VCFFileReader fileReader = new VCFFileReader(file, false);
final VCFHeader fileHeader = fileReader.getFileHeader();
if (fileHeader.getContigLines().isEmpty()) {
if (sequenceDictionary == null) {
throw new IllegalArgumentException(SEQ_DICT_REQUIRED);
} else {
fileHeader.setSequenceDictionary(sequenceDictionary);
}
}
if (variantContextComparator == null) {
variantContextComparator = fileHeader.getVCFRecordComparator();
} else {
if (!variantContextComparator.isCompatible(fileHeader.getContigLines())) {
throw new IllegalArgumentException(
"The contig entries in input file " + file.getAbsolutePath() + " are not compatible with the others.");
}
}
if (sequenceDictionary == null) sequenceDictionary = fileHeader.getSequenceDictionary();
if (sampleList.isEmpty()) {
sampleList.addAll(fileHeader.getSampleNamesInOrder());
} else {
if (!sampleList.equals(fileHeader.getSampleNamesInOrder())) {
throw new IllegalArgumentException("Input file " + file.getAbsolutePath() + " has sample entries that don't match the other files.");
}
}
headers.add(fileHeader);
iteratorCollection.add(fileReader.iterator());
}
if (CREATE_INDEX && sequenceDictionary == null) {
throw new PicardException(String.format("Index creation failed. %s", SEQ_DICT_REQUIRED));
}
final VariantContextWriterBuilder builder = new VariantContextWriterBuilder()
.setOutputFile(OUTPUT)
.setReferenceDictionary(sequenceDictionary);
if (CREATE_INDEX) {
builder.setOption(Options.INDEX_ON_THE_FLY);
} else {
builder.unsetOption(Options.INDEX_ON_THE_FLY);
}
final VariantContextWriter writer = builder.build();
writer.writeHeader(new VCFHeader(VCFUtils.smartMergeHeaders(headers, false), sampleList));
final MergingIterator mergingIterator = new MergingIterator(variantContextComparator, iteratorCollection);
while (mergingIterator.hasNext()) {
final VariantContext context = mergingIterator.next();
writer.add(context);
progress.record(context.getContig(), context.getStart());
}
CloserUtil.close(mergingIterator);
writer.close();
return 0;
}
}