All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.vcf.FixVcfHeader Maven / Gradle / Ivy

Go to download

A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF.

There is a newer version: 3.2.0
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2016 Nils Homer
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

package picard.vcf;

import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.VariantManipulationProgramGroup;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Tool for replacing or fixing up a VCF header.
 *
 * @author Nils Homer
 */
@CommandLineProgramProperties(
        summary = FixVcfHeader.USAGE_SUMMARY + FixVcfHeader.USAGE_DETAILS,
        oneLineSummary = FixVcfHeader.USAGE_SUMMARY,
        programGroup = VariantManipulationProgramGroup.class)
@DocumentedFeature
public class FixVcfHeader extends CommandLineProgram {
    static final String USAGE_SUMMARY = "Replaces or fixes a VCF header.";
    static final String USAGE_DETAILS =
            "This tool will either replace the header in the input VCF file (INPUT) with the given VCF header (HEADER) or will attempt to fill " +
            "in any field definitions that are missing in the input header by examining the variants in the input VCF file (INPUT).  In the  " +
            " latter case, this tool will perform two passes over the input VCF, and any INFO and FORMAT fields found in the VCF records but " +
            "not found in the input VCF header will be added to the output VCF header with dummy descriptions.
" + "

Replace header usage example:

" + "
" +
            "java -jar picard.jar FixVcfHeader \\
" + " I=input.vcf \\
" + " O=fixed.vcf \\
" + " HEADER=header.vcf" + "
" + "

Fix header usage example:

" + "
" +
            "java -jar picard.jar FixVcfHeader \\
" + " I=input.vcf \\
" + " O=fixed.vcf \\
" + "
" + "
"; @Argument(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc="The input VCF/BCF file.") public File INPUT; @Argument(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="The output VCF/BCF file.") public File OUTPUT; @Argument(shortName="N", doc="Check only the first N records when searching for missing INFO and FORMAT fields.", optional=true) public int CHECK_FIRST_N_RECORDS = -1; @Argument(shortName="H", doc="The replacement VCF header.", optional=true) public File HEADER = null; @Argument(doc="Enforce that the samples are the same (and in the same order) when replacing the VCF header.", optional=true) public boolean ENFORCE_SAME_SAMPLES = true; private final Log log = Log.getInstance(FixVcfHeader.class); // Stock main method public static void main(final String[] args) { new FixVcfHeader().instanceMainWithExit(args); } @Override protected String[] customCommandLineValidation() { if (HEADER != null && 0 <= CHECK_FIRST_N_RECORDS) return new String[]{"CHECK_FIRST_N_RECORDS should no be specified when HEADER is specified"}; return super.customCommandLineValidation(); } @Override protected int doWork() { IOUtil.assertFileIsReadable(INPUT); if (HEADER != null) IOUtil.assertFileIsReadable(HEADER); IOUtil.assertFileIsWritable(OUTPUT); final VCFFileReader reader = new VCFFileReader(INPUT, false); final VCFHeader outHeader; if (HEADER != null) { // read the header from the file final VCFFileReader headerReader = new VCFFileReader(HEADER, false); if (ENFORCE_SAME_SAMPLES) { outHeader = headerReader.getFileHeader(); enforceSameSamples(reader.getFileHeader(), outHeader); } else { outHeader = new VCFHeader(headerReader.getFileHeader().getMetaDataInInputOrder(), reader.getFileHeader().getSampleNamesInOrder()); } CloserUtil.close(headerReader); outHeader.getInfoHeaderLines() .stream() .filter(infoHeaderLine -> !reader.getFileHeader().hasInfoLine(infoHeaderLine.getID())) .forEach(infoHeaderLine -> log.info("INFO line found in HEADER will be added to OUTPUT: " + infoHeaderLine.getID())); outHeader.getFormatHeaderLines() .stream() .filter(formatHeaderLine -> !reader.getFileHeader().hasInfoLine(formatHeaderLine.getID())) .forEach(formatHeaderLine -> log.info("FORMAT line found in HEADER will be added to OUTPUT: " + formatHeaderLine.getID())); } else { // read the input final ProgressLogger progress = new ProgressLogger(log, 1000000, "read"); final VCFFileReader in = new VCFFileReader(INPUT, false); final VCFHeader inHeader = in.getFileHeader(); final Map infoHeaderLines = new HashMap<>(); final Map formatHeaderLines = new HashMap<>(); log.info("Reading in records to re-build the header."); for (final VariantContext ctx : in) { // INFO for (final Map.Entry attribute : ctx.getAttributes().entrySet()) { final String id = attribute.getKey(); if (!inHeader.hasInfoLine(id) && !infoHeaderLines.containsKey(id)) { log.info("Will add an INFO line with id: " + id); infoHeaderLines.put(id, new VCFInfoHeaderLine(id, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Missing description: this INFO line was added by Picard's FixVCFHeader")); } } // FORMAT for (final Genotype genotype : ctx.getGenotypes()) { for (final Map.Entry attribute : genotype.getExtendedAttributes().entrySet()) { final String id = attribute.getKey(); if (!inHeader.hasFormatLine(id) && !formatHeaderLines.containsKey(id)) { log.info("Will add FORMAT line with id: " + id); formatHeaderLines.put(id, new VCFFormatHeaderLine(id, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Missing description: this FORMAT line was added by Picard's FixVCFHeader")); } } } progress.record(ctx.getContig(), ctx.getStart()); if (0 < CHECK_FIRST_N_RECORDS && CHECK_FIRST_N_RECORDS <= progress.getCount()) break; } CloserUtil.close(in); // create the output header final Set headerLines = new HashSet<>(inHeader.getMetaDataInInputOrder()); VCFStandardHeaderLines.addStandardFormatLines(headerLines, false, Genotype.PRIMARY_KEYS); // This is very frustrating to have to add headerLines.addAll(infoHeaderLines.values()); headerLines.addAll(formatHeaderLines.values()); outHeader = new VCFHeader(headerLines, inHeader.getSampleNamesInOrder()); log.info("VCF header re-built."); } final VariantContextWriter writer = new VariantContextWriterBuilder().setOption(Options.INDEX_ON_THE_FLY) .setOutputFile(OUTPUT).setReferenceDictionary(outHeader.getSequenceDictionary()).build(); writer.writeHeader(outHeader); log.info("Writing the output VCF."); final ProgressLogger progress = new ProgressLogger(log, 1000000, "read"); for (final VariantContext ctx : reader) { writer.add(ctx); progress.record(ctx.getContig(), ctx.getStart()); } writer.close(); CloserUtil.close(reader); return 0; } private void enforceSameSamples(final VCFHeader readerHeader, final VCFHeader inputHeader) { final ArrayList readerSamples = readerHeader.getSampleNamesInOrder(); final ArrayList inputSamples = inputHeader.getSampleNamesInOrder(); if (readerSamples.size() != inputSamples.size()) { throw new PicardException("The input VCF had a different # of samples than the input VCF header."); } for (int i = 0; i < readerSamples.size(); i++) { if (!readerSamples.get(i).equals(inputSamples.get(i))) { throw new PicardException(String.format("Mismatch in the %dth sample: '%s' != '%s'", i, readerSamples.get(i), inputSamples.get(i))); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy