package org.broadinstitute.hellbender.tools;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.seekablestream.SeekableStreamFactory;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.samtools.util.BlockCompressedStreamConstants;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.PeekableIterator;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.FeatureReader;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextComparator;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.runtime.ProgressLogger;
import picard.cmdline.programgroups.VariantManipulationProgramGroup;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Path;
import java.util.EnumSet;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* This tool combines rows of variant calls from multiple VCFs, e.g. those produced by scattering calling
* across genomic intervals, into a single VCF. It is designed to support scatter-gather workflows, e.g. in the cloud,
* and is preferred in such contexts over Picard MergeVcfs or Picard GatherVcfs. The tool also runs locally.
*
* The input files need to have the same set of samples but completely different sets of loci.
* These input files must be supplied in genomic order and must not have events at overlapping positions.
*
* Input
*
* A set of VCF files, each specified in genomic order with the -I option, or a .list text file listing the set of VCFs
* to be merged, one file per line.
*
*
* Output
*
* A single VCF file containing the variant call records from the multiple VCFs.
*
*
* Usage examples
* Specify each VCF file within the command.
*
* gatk GatherVcfsCloud \
* -I cohortA_chr1.vcf.gz \
* -I cohortA_chr2.vcf.gz \
* -O cohortA_chr1chr2.vcf.gz
*
*
* Specify the VCF files using the following input.list:
*
* cohortA_chr1.vcf.gz
* cohortA_chr2.vcf.gz
*
*
*
* gatk GatherVcfsCloud \
* -I input.list \
* -O cohortA_chr1chr2.vcf.gz
*
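* As an illustrative example (not part of the reference docs above), the gather strategy can be forced with the
* --gather-type argument; BLOCK assumes every input and the output are bgzipped:
*
* gatk GatherVcfsCloud \
* --gather-type BLOCK \
* -I cohortA_chr1.vcf.gz \
* -I cohortA_chr2.vcf.gz \
* -O cohortA_chr1chr2.vcf.gz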
*
* @author Tim Fennell
*/
@DocumentedFeature
@CommandLineProgramProperties(
summary = "Gathers multiple VCF files from a scatter operation into a single VCF file. Input files " +
"must be supplied in genomic order and must not have events at overlapping positions.",
oneLineSummary = "Gathers multiple VCF files from a scatter operation into a single VCF file",
programGroup = VariantManipulationProgramGroup.class
)
public final class GatherVcfsCloud extends CommandLineProgram {
public static final String IGNORE_SAFETY_CHECKS_LONG_NAME = "ignore-safety-checks";
public static final String GATHER_TYPE_LONG_NAME = "gather-type";
@Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, doc = "Input VCF file(s).")
public List<String> inputs;
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "Output VCF file.")
public File output;
@Argument(fullName = StandardArgumentDefinitions.CLOUD_PREFETCH_BUFFER_LONG_NAME, shortName = StandardArgumentDefinitions.CLOUD_PREFETCH_BUFFER_SHORT_NAME, doc = "Size of the cloud-only prefetch buffer (in MB; 0 to disable).", optional=true)
public int cloudPrefetchBuffer = 2;
@Argument(fullName=StandardArgumentDefinitions.CREATE_OUTPUT_VARIANT_INDEX_LONG_NAME,
shortName=StandardArgumentDefinitions.CREATE_OUTPUT_VARIANT_INDEX_SHORT_NAME,
doc = "If true, create a VCF index when writing a coordinate-sorted VCF file.", optional=true)
public boolean createIndex = true;
@Argument(fullName = GATHER_TYPE_LONG_NAME, doc = "Choose which method should be used to gather: BLOCK gathering is faster but only " +
"works when both the inputs and the output are bgzipped, while CONVENTIONAL gathering is much slower but should work on all VCF files. " +
"AUTOMATIC chooses BLOCK if possible and CONVENTIONAL otherwise.")
public GatherType gatherType = GatherType.AUTOMATIC;
@Argument(fullName = "progress-logger-frequency",
doc = "The frequency with which the progress is logged to output (i.e. every N records)")
public Integer progressLoggerFrequency = 10000;
@Advanced
@Argument(fullName = IGNORE_SAFETY_CHECKS_LONG_NAME, doc = "Disable sanity checks to improve performance; may result in silently creating corrupted output data.")
public boolean ignoreSafetyChecks = false;
@Advanced
@Argument(fullName = "disable-contig-ordering-check", doc = "Don't check relative ordering of contigs when doing a conventional gather")
public boolean disableContigOrderingCheck = false;
private static final Logger log = LogManager.getLogger();
/** Gather strategy: BLOCK concatenates bgzip blocks directly (fast, but requires bgzipped inputs and output); CONVENTIONAL decodes and rewrites every record (slower, but works on any VCF); AUTOMATIC picks BLOCK when possible and falls back to CONVENTIONAL. */
public enum GatherType { BLOCK, CONVENTIONAL, AUTOMATIC }
@Override
protected Object doWork() {
log.info("Checking inputs.");
final List<Path> inputPaths = inputs.stream().map(IOUtils::getPath).collect(Collectors.toList());
if(!ignoreSafetyChecks) {
for (final Path f : inputPaths) {
IOUtil.assertFileIsReadable(f);
}
}
IOUtil.assertFileIsWritable(output);
final SAMSequenceDictionary sequenceDictionary = getHeader(inputPaths.get(0)).getSequenceDictionary();
if (createIndex && sequenceDictionary == null) {
throw new UserException("In order to index the resulting VCF, the input VCFs must contain ##contig lines.");
}
if( !ignoreSafetyChecks) {
log.info("Checking file headers and first records to ensure compatibility.");
assertSameSamplesAndValidOrdering(inputPaths, disableContigOrderingCheck);
}
if(gatherType == GatherType.AUTOMATIC) {
if ( canBlockCopy(inputPaths, output) ) {
gatherType = GatherType.BLOCK;
} else {
gatherType = GatherType.CONVENTIONAL;
}
}
if (gatherType == GatherType.BLOCK && !canBlockCopy(inputPaths, output)) {
throw new UserException.BadInput(
"Requested block copy but some files are not bgzipped, all inputs and the output must be bgzipped to block copy");
}
switch (gatherType) {
case BLOCK:
log.info("Gathering by copying gzip blocks. Will not be able to validate position non-overlap of files.");
if (createIndex) {
log.warn("Index creation not currently supported when gathering block compressed VCFs.");
}
gatherWithBlockCopying(inputPaths, output, cloudPrefetchBuffer);
break;
case CONVENTIONAL:
log.info("Gathering by conventional means.");
gatherConventionally(sequenceDictionary, createIndex, inputPaths, output, cloudPrefetchBuffer, disableContigOrderingCheck, progressLoggerFrequency);
break;
default:
throw new GATKException.ShouldNeverReachHereException("Invalid gather type: " + gatherType + ". Please report this bug to the developers.");
}
return null;
}
private static boolean canBlockCopy(final List<Path> inputPaths, final File output) {
return areAllBlockCompressed(inputPaths) && areAllBlockCompressed(CollectionUtil.makeList(output.toPath()));
}
private static VCFHeader getHeader(final Path path) {
try (FeatureReader<VariantContext> reader = getReaderFromVCFUri(path, 0)) {
return ((VCFHeader) reader.getHeader());
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile(path, e.getMessage(), e);
}
}
/** Checks (via filename extension) that all paths appear to be block-compressed files, e.g. ending in .gz or .bgz; .bcf files do not count as block compressed. */
@VisibleForTesting
static boolean areAllBlockCompressed(final List<Path> input) {
for (final Path path : input) {
if (path == null){
return false;
}
final String pathString = path.toUri().toString();
if ( pathString.endsWith(".bcf") || !IOUtil.hasBlockCompressedExtension(pathString)){
return false;
}
}
return true;
}
private static FeatureReader<VariantContext> getReaderFromVCFUri(final Path variantPath, final int cloudPrefetchBuffer) {
final String variantURI = variantPath.toUri().toString();
return AbstractFeatureReader.getFeatureReader(variantURI, null, new VCFCodec(), false,
BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer),
Function.identity());
}
/** Validates that all headers contain the same set of genotyped samples and that files are in order by position of first record. */
private static void assertSameSamplesAndValidOrdering(final List<Path> inputFiles, final boolean disableContigOrderingCheck) {
final VCFHeader firstHeader = getHeader(inputFiles.get(0));
final SAMSequenceDictionary dict = firstHeader.getSequenceDictionary();
if ( dict == null) {
throw new UserException.BadInput("The first VCF specified is missing the required sequence dictionary. " +
"This is required to perform validation. You can skip this validation " +
"using --"+IGNORE_SAFETY_CHECKS_LONG_NAME +" but ignoring safety checks " +
"can result in invalid output.");
}
final VariantContextComparator comparator = new VariantContextComparator(dict);
final List<String> samples = firstHeader.getGenotypeSamples();
Path lastFile = null;
VariantContext lastContext = null;
for (final Path f : inputFiles) {
final FeatureReader<VariantContext> in = getReaderFromVCFUri(f, 0);
final VCFHeader header = (VCFHeader)in.getHeader();
dict.assertSameDictionary(header.getSequenceDictionary());
final List<String> theseSamples = header.getGenotypeSamples();
if (!samples.equals(theseSamples)) {
final SortedSet<String> s1 = new TreeSet<>(samples);
final SortedSet<String> s2 = new TreeSet<>(theseSamples);
s1.removeAll(theseSamples);
s2.removeAll(samples);
throw new IllegalArgumentException("VCFs do not have identical sample lists." +
" Samples unique to first file: " + s1 + ". Samples unique to " + f.toUri().toString() + ": " + s2 + ".");
}
try (final CloseableIterator<VariantContext> variantIterator = in.iterator()) {
if (variantIterator.hasNext()) {
final VariantContext currentContext = variantIterator.next();
if (lastContext != null) {
if ( disableContigOrderingCheck ) {
if ( lastContext.getContig().equals(currentContext.getContig()) && lastContext.getStart() >= currentContext.getStart() ) {
throw new IllegalArgumentException(
"First record in file " + f.toUri().toString() + " is not after first record in " +
"previous file " + lastFile.toUri().toString());
}
}
else {
if ( comparator.compare(lastContext, currentContext) >= 0 ) {
throw new IllegalArgumentException(
"First record in file " + f.toUri().toString() + " is not after first record in " +
"previous file " + lastFile.toUri().toString());
}
}
}
lastContext = currentContext;
lastFile = f;
}
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile(f, e.getMessage(), e);
}
CloserUtil.close(in);
}
}
/** Code for gathering multiple VCFs that works regardless of input format and output format, but can be slow. */
private static void gatherConventionally(final SAMSequenceDictionary sequenceDictionary,
final boolean createIndex,
final List<Path> inputFiles,
final File outputFile,
final int cloudPrefetchBuffer,
final boolean disableContigOrderingCheck,
final int progressLoggerFrequency) {
final EnumSet<Options> options = EnumSet.copyOf(VariantContextWriterBuilder.DEFAULT_OPTIONS);
if (createIndex) options.add(Options.INDEX_ON_THE_FLY); else options.remove(Options.INDEX_ON_THE_FLY);
try (final VariantContextWriter out = new VariantContextWriterBuilder().setOutputFile(outputFile)
.setReferenceDictionary(sequenceDictionary).setOptions(options).build()) {
final ProgressLogger progress = new ProgressLogger(log, progressLoggerFrequency);
VariantContext lastContext = null;
Path lastFile = null;
VCFHeader firstHeader = null;
VariantContextComparator comparator = null;
for (final Path f : inputFiles) {
try {
log.debug("Gathering from file: ", f.toUri().toString());
final FeatureReader<VariantContext> variantReader = getReaderFromVCFUri(f, cloudPrefetchBuffer);
final PeekableIterator<VariantContext> variantIterator = new PeekableIterator<>(variantReader.iterator());
final VCFHeader header = (VCFHeader)variantReader.getHeader();
if (firstHeader == null) {
firstHeader = header;
out.writeHeader(firstHeader);
comparator = new VariantContextComparator(firstHeader.getContigLines());
}
if (lastContext != null && variantIterator.hasNext()) {
final VariantContext vc = variantIterator.peek();
if ( disableContigOrderingCheck ) {
// Just check start positions
if ( vc.getContig().equals(lastContext.getContig()) && vc.getStart() <= lastContext.getStart() ) {
throw new IllegalStateException("First variant in file " + f.toUri().toString() + " is at start position " + vc.getStart() +
" but last variant in earlier file " + lastFile.toUri().toString() + " is at start position " + lastContext.getStart());
}
}
else {
// Check contig ordering and start positions
if ( comparator.compare(vc, lastContext) <= 0 ) {
throw new IllegalStateException("First variant in file " + f.toUri().toString() + " is at " + String.format("%s:%d", vc.getContig(), vc.getStart()) +
" but last variant in earlier file " + lastFile.toUri().toString() + " is at " + String.format("%s:%d", lastContext.getContig(), lastContext.getStart()));
}
}
}
while (variantIterator.hasNext()) {
lastContext = variantIterator.next();
out.add(lastContext);
progress.record(lastContext.getContig(), lastContext.getStart());
}
lastFile = f;
CloserUtil.close(variantIterator);
CloserUtil.close(variantReader);
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile(f, e.getMessage(), e);
}
}
}
}
/**
* Assumes that all inputs and outputs are block compressed VCF files and copies them without decompressing and parsing
* most of the gzip blocks. Will decompress and parse blocks up to the one containing the end of the header in each file
* (often the first block) and re-compress any data remaining in that block into a new block in the output file. Subsequent
* blocks (excluding a terminator block if present) are copied directly from input to output.
*/
private static void gatherWithBlockCopying(final List<Path> vcfs, final File output, final int cloudPrefetchBuffer) {
try (final FileOutputStream out = new FileOutputStream(output)) {
boolean isFirstFile = true;
for (final Path f : vcfs) {
log.info("Gathering " + f.toUri());
final Function<SeekableByteChannel, SeekableByteChannel> prefetcher = BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer);
try (final SeekableStream in = SeekableStreamFactory.getInstance().getStreamFor(f.toUri().toString(), prefetcher)) {
// (a) check that the end of the file is a valid BGZF termination, and (b) determine whether there is a terminator block so that it is not copied
final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(f);
if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE) {
throw new UserException.MalformedFile(f.toUri() + " does not have a valid GZIP block at the end of the file.");
}
if (!isFirstFile) {
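// For every file after the first, decompress from the start and scan for the first non-header byte,
// so that the duplicate VCF header carried by this input is not copied into the output.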
final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(in, false);
boolean lastByteNewline = true;
int firstNonHeaderByteIndex = -1;
while (blockIn.available() > 0) {
// Read a block - blockIn.available() is guaranteed to return the bytes remaining in the block that has been
// read, and since we haven't consumed any yet, that is the block size.
final int blockLength = blockIn.available();
final byte[] blockContents = new byte[blockLength];
final int read = blockIn.read(blockContents);
Utils.validate(blockLength > 0 && read == blockLength, "Could not read available bytes from BlockCompressedInputStream.");
// Scan forward within the block to see if we can find the end of the header within this block
firstNonHeaderByteIndex = -1;
for (int i = 0; i < read; ++i) {
final byte b = blockContents[i];
final boolean thisByteNewline = (b == '\n' || b == '\r');
if (lastByteNewline && !thisByteNewline && b != '#') {
// Aha! Found first byte of non-header data in file!
firstNonHeaderByteIndex = i;
break;
}
lastByteNewline = thisByteNewline;
}
// If we found the end of the header then write the remainder of this block out as a
// new gzip block and then break out of the while loop
if (firstNonHeaderByteIndex >= 0) {
final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(out, (Path)null);
blockOut.write(blockContents, firstNonHeaderByteIndex, blockContents.length - firstNonHeaderByteIndex);
blockOut.flush();
// Don't close blockOut because that would also close the underlying output stream
break;
}
}
if( firstNonHeaderByteIndex == -1 ){
log.warn("Scanned the entire file " + f.toUri().toString() + " and found no variants");
}
}
// Copy remainder of input stream into output stream
final long currentPos = in.position();
final long length = in.length();
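// If this input ends with a BGZF terminator block, stop copying just before it; a single terminator
// is appended once after all inputs have been gathered (see below).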
final long skipLast = (term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) ?
BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0;
final long bytesToWrite = length - skipLast - currentPos;
IOUtil.transferByStream(in, out, bytesToWrite);
isFirstFile = false;
}
}
// And lastly add the Terminator block and close up
out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
}
catch (final IOException ioe) {
throw new RuntimeIOException(ioe);
}
}
}