package org.broadinstitute.hellbender.tools;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.seekablestream.SeekableStreamFactory;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.samtools.util.BlockCompressedStreamConstants;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.PeekableIterator;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.FeatureReader;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextComparator;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.runtime.ProgressLogger;
import picard.cmdline.programgroups.VariantManipulationProgramGroup;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Path;
import java.util.EnumSet;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* This tool combines rows of variant calls from multiple VCFs, e.g. those produced by scattering calling
* across genomic intervals, into a single VCF. It is designed to support scatter-gather workflows, e.g. in the cloud,
* and is preferred in such contexts over Picard MergeVcfs or Picard GatherVcfs. The tool also runs locally.
*
* The input files need to have the same set of samples but completely different sets of loci.
* These input files must be supplied in genomic order and must not have events at overlapping positions.
*
* Input
*
* A set of VCF files, each specified in genomic order with the -I option, or a .list text file listing the set of VCFs
* to be merged, one file per line.
*
*
* Output
*
* A single VCF file containing the variant call records from the multiple VCFs.
*
*
* Usage examples
* Specify each VCF file within the command.
*
* gatk GatherVcfsCloud \
* -I cohortA_chr1.vcf.gz \
* -I cohortA_chr2.vcf.gz \
* -O cohortA_chr1chr2.vcf.gz
*
*
* Specify the VCF files using the following input.list:
*
* cohortA_chr1.vcf.gz
* cohortA_chr2.vcf.gz
*
*
*
* gatk GatherVcfsCloud \
* -I input.list \
* -O cohortA_chr1chr2.vcf.gz
*
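* As an illustrative example (not part of the reference docs above), the gather strategy can be forced with the
* --gather-type argument; BLOCK assumes every input and the output are bgzipped:
*
* gatk GatherVcfsCloud \
* --gather-type BLOCK \
* -I cohortA_chr1.vcf.gz \
* -I cohortA_chr2.vcf.gz \
* -O cohortA_chr1chr2.vcf.gz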
*
* @author Tim Fennell
*/
@DocumentedFeature
@CommandLineProgramProperties(
summary = "Gathers multiple VCF files from a scatter operation into a single VCF file. Input files " +
"must be supplied in genomic order and must not have events at overlapping positions.",
oneLineSummary = "Gathers multiple VCF files from a scatter operation into a single VCF file",
programGroup = VariantManipulationProgramGroup.class
)
public final class GatherVcfsCloud extends CommandLineProgram {
public static final String IGNORE_SAFETY_CHECKS_LONG_NAME = "ignore-safety-checks";
public static final String GATHER_TYPE_LONG_NAME = "gather-type";
@Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, doc = "Input VCF file(s).")
public List<String> inputs;
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "Output VCF file.")
public File output;
@Argument(fullName = StandardArgumentDefinitions.CLOUD_PREFETCH_BUFFER_LONG_NAME, shortName = StandardArgumentDefinitions.CLOUD_PREFETCH_BUFFER_SHORT_NAME, doc = "Size of the cloud-only prefetch buffer (in MB; 0 to disable).", optional=true)
public int cloudPrefetchBuffer = 2;
@Argument(fullName=StandardArgumentDefinitions.CREATE_OUTPUT_VARIANT_INDEX_LONG_NAME,
shortName=StandardArgumentDefinitions.CREATE_OUTPUT_VARIANT_INDEX_SHORT_NAME,
doc = "If true, create a VCF index when writing a coordinate-sorted VCF file.", optional=true)
public boolean createIndex = true;
@Argument(fullName = GATHER_TYPE_LONG_NAME, doc = "Choose which method should be used to gather: BLOCK gathering is faster but only " +
"works when both the inputs and the output are bgzipped, while CONVENTIONAL gathering is much slower but should work on all VCF files. " +
"AUTOMATIC chooses BLOCK if possible and CONVENTIONAL otherwise.")
public GatherType gatherType = GatherType.AUTOMATIC;
@Argument(fullName = "progress-logger-frequency",
doc = "The frequency with which the progress is logged to output (i.e. every N records)")
public Integer progressLoggerFrequency = 10000;
@Advanced
@Argument(fullName = IGNORE_SAFETY_CHECKS_LONG_NAME, doc = "Disable sanity checks to improve performance; may result in silently creating corrupted output data.")
public boolean ignoreSafetyChecks = false;
@Advanced
@Argument(fullName = "disable-contig-ordering-check", doc = "Don't check relative ordering of contigs when doing a conventional gather")
public boolean disableContigOrderingCheck = false;
private static final Logger log = LogManager.getLogger();
/** Gather strategy: BLOCK concatenates bgzip blocks directly (fast, but requires bgzipped inputs and output); CONVENTIONAL decodes and rewrites every record (slower, but works on any VCF); AUTOMATIC picks BLOCK when possible and falls back to CONVENTIONAL. */
public enum GatherType { BLOCK, CONVENTIONAL, AUTOMATIC }
@Override
protected Object doWork() {
log.info("Checking inputs.");
final List<Path> inputPaths = inputs.stream().map(IOUtils::getPath).collect(Collectors.toList());
if(!ignoreSafetyChecks) {
for (final Path f : inputPaths) {
IOUtil.assertFileIsReadable(f);
}
}
IOUtil.assertFileIsWritable(output);
final SAMSequenceDictionary sequenceDictionary = getHeader(inputPaths.get(0)).getSequenceDictionary();
if (createIndex && sequenceDictionary == null) {
throw new UserException("In order to index the resulting VCF, the input VCFs must contain ##contig lines.");
}
if( !ignoreSafetyChecks) {
log.info("Checking file headers and first records to ensure compatibility.");
assertSameSamplesAndValidOrdering(inputPaths, disableContigOrderingCheck);
}
if(gatherType == GatherType.AUTOMATIC) {
if ( canBlockCopy(inputPaths, output) ) {
gatherType = GatherType.BLOCK;
} else {
gatherType = GatherType.CONVENTIONAL;
}
}
if (gatherType == GatherType.BLOCK && !canBlockCopy(inputPaths, output)) {
throw new UserException.BadInput(
"Requested block copy but some files are not bgzipped, all inputs and the output must be bgzipped to block copy");
}
switch (gatherType) {
case BLOCK:
log.info("Gathering by copying gzip blocks. Will not be able to validate position non-overlap of files.");
if (createIndex) {
log.warn("Index creation not currently supported when gathering block compressed VCFs.");
}
gatherWithBlockCopying(inputPaths, output, cloudPrefetchBuffer);
break;
case CONVENTIONAL:
log.info("Gathering by conventional means.");
gatherConventionally(sequenceDictionary, createIndex, inputPaths, output, cloudPrefetchBuffer, disableContigOrderingCheck, progressLoggerFrequency);
break;
default:
throw new GATKException.ShouldNeverReachHereException("Invalid gather type: " + gatherType + ". Please report this bug to the developers.");
}
return null;
}
private static boolean canBlockCopy(final List<Path> inputPaths, final File output) {
return areAllBlockCompressed(inputPaths) && areAllBlockCompressed(CollectionUtil.makeList(output.toPath()));
}
private static VCFHeader getHeader(final Path path) {
try (FeatureReader<VariantContext> reader = getReaderFromVCFUri(path, 0)) {
return ((VCFHeader) reader.getHeader());
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile(path, e.getMessage(), e);
}
}
/** Checks (via filename extension) that all paths appear to be block-compressed files, e.g. ending in .gz or .bgz; .bcf files do not count as block compressed. */
@VisibleForTesting
static boolean areAllBlockCompressed(final List<Path> input) {
for (final Path path : input) {
if (path == null){
return false;
}
final String pathString = path.toUri().toString();
if ( pathString.endsWith(".bcf") || !IOUtil.hasBlockCompressedExtension(pathString)){
return false;
}
}
return true;
}
private static FeatureReader<VariantContext> getReaderFromVCFUri(final Path variantPath, final int cloudPrefetchBuffer) {
final String variantURI = variantPath.toUri().toString();
return AbstractFeatureReader.getFeatureReader(variantURI, null, new VCFCodec(), false,
BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer),
Function.identity());
}
/** Validates that all headers contain the same set of genotyped samples and that files are in order by position of first record. */
private static void assertSameSamplesAndValidOrdering(final List<Path> inputFiles, final boolean disableContigOrderingCheck) {
final VCFHeader firstHeader = getHeader(inputFiles.get(0));
final SAMSequenceDictionary dict = firstHeader.getSequenceDictionary();
if ( dict == null) {
throw new UserException.BadInput("The first VCF specified is missing the required sequence dictionary. " +
"This is required to perform validation. You can skip this validation " +
"using --"+IGNORE_SAFETY_CHECKS_LONG_NAME +" but ignoring safety checks " +
"can result in invalid output.");
}
final VariantContextComparator comparator = new VariantContextComparator(dict);
final List<String> samples = firstHeader.getGenotypeSamples();
Path lastFile = null;
VariantContext lastContext = null;
for (final Path f : inputFiles) {
final FeatureReader<VariantContext> in = getReaderFromVCFUri(f, 0);
final VCFHeader header = (VCFHeader)in.getHeader();
dict.assertSameDictionary(header.getSequenceDictionary());
final List<String> theseSamples = header.getGenotypeSamples();
if (!samples.equals(theseSamples)) {
final SortedSet<String> s1 = new TreeSet<>(samples);
final SortedSet<String> s2 = new TreeSet<>(theseSamples);
s1.removeAll(theseSamples);
s2.removeAll(samples);
throw new IllegalArgumentException("VCFs do not have identical sample lists." +
" Samples unique to first file: " + s1 + ". Samples unique to " + f.toUri().toString() + ": " + s2 + ".");
}
try (final CloseableIterator<VariantContext> variantIterator = in.iterator()) {
if (variantIterator.hasNext()) {
final VariantContext currentContext = variantIterator.next();
if (lastContext != null) {
if ( disableContigOrderingCheck ) {
if ( lastContext.getContig().equals(currentContext.getContig()) && lastContext.getStart() >= currentContext.getStart() ) {
throw new IllegalArgumentException(
"First record in file " + f.toUri().toString() + " is not after first record in " +
"previous file " + lastFile.toUri().toString());
}
}
else {
if ( comparator.compare(lastContext, currentContext) >= 0 ) {
throw new IllegalArgumentException(
"First record in file " + f.toUri().toString() + " is not after first record in " +
"previous file " + lastFile.toUri().toString());
}
}
}
lastContext = currentContext;
lastFile = f;
}
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile(f, e.getMessage(), e);
}
CloserUtil.close(in);
}
}
/** Code for gathering multiple VCFs that works regardless of input format and output format, but can be slow. */
private static void gatherConventionally(final SAMSequenceDictionary sequenceDictionary,
final boolean createIndex,
final List<Path> inputFiles,
final File outputFile,
final int cloudPrefetchBuffer,
final boolean disableContigOrderingCheck,
final int progressLoggerFrequency) {
final EnumSet<Options> options = EnumSet.copyOf(VariantContextWriterBuilder.DEFAULT_OPTIONS);
if (createIndex) options.add(Options.INDEX_ON_THE_FLY); else options.remove(Options.INDEX_ON_THE_FLY);
try (final VariantContextWriter out = new VariantContextWriterBuilder().setOutputFile(outputFile)
.setReferenceDictionary(sequenceDictionary).setOptions(options).build()) {
final ProgressLogger progress = new ProgressLogger(log, progressLoggerFrequency);
VariantContext lastContext = null;
Path lastFile = null;
VCFHeader firstHeader = null;
VariantContextComparator comparator = null;
for (final Path f : inputFiles) {
try {
log.debug("Gathering from file: ", f.toUri().toString());
final FeatureReader<VariantContext> variantReader = getReaderFromVCFUri(f, cloudPrefetchBuffer);
final PeekableIterator<VariantContext> variantIterator = new PeekableIterator<>(variantReader.iterator());
final VCFHeader header = (VCFHeader)variantReader.getHeader();
if (firstHeader == null) {
firstHeader = header;
out.writeHeader(firstHeader);
comparator = new VariantContextComparator(firstHeader.getContigLines());
}
if (lastContext != null && variantIterator.hasNext()) {
final VariantContext vc = variantIterator.peek();
if ( disableContigOrderingCheck ) {
// Just check start positions
if ( vc.getContig().equals(lastContext.getContig()) && vc.getStart() <= lastContext.getStart() ) {
throw new IllegalStateException("First variant in file " + f.toUri().toString() + " is at start position " + vc.getStart() +
" but last variant in earlier file " + lastFile.toUri().toString() + " is at start position " + lastContext.getStart());
}
}
else {
// Check contig ordering and start positions
if ( comparator.compare(vc, lastContext) <= 0 ) {
throw new IllegalStateException("First variant in file " + f.toUri().toString() + " is at " + String.format("%s:%d", vc.getContig(), vc.getStart()) +
" but last variant in earlier file " + lastFile.toUri().toString() + " is at " + String.format("%s:%d", lastContext.getContig(), lastContext.getStart()));
}
}
}
while (variantIterator.hasNext()) {
lastContext = variantIterator.next();
out.add(lastContext);
progress.record(lastContext.getContig(), lastContext.getStart());
}
lastFile = f;
CloserUtil.close(variantIterator);
CloserUtil.close(variantReader);
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile(f, e.getMessage(), e);
}
}
}
}
/**
* Assumes that all inputs and outputs are block compressed VCF files and copies them without decompressing and parsing
* most of the gzip blocks. Will decompress and parse blocks up to the one containing the end of the header in each file
* (often the first block) and re-compress any data remaining in that block into a new block in the output file. Subsequent
* blocks (excluding a terminator block if present) are copied directly from input to output.
*/
private static void gatherWithBlockCopying(final List<Path> vcfs, final File output, final int cloudPrefetchBuffer) {
try (final FileOutputStream out = new FileOutputStream(output)) {
boolean isFirstFile = true;
for (final Path f : vcfs) {
log.info("Gathering " + f.toUri());
final Function<SeekableByteChannel, SeekableByteChannel> prefetcher = BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer);
try (final SeekableStream in = SeekableStreamFactory.getInstance().getStreamFor(f.toUri().toString(), prefetcher)) {
// (a) check that the end of the file is a valid BGZF termination, and (b) determine whether there is a terminator block so that it is not copied
final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(f);
if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE) {
throw new UserException.MalformedFile(f.toUri() + " does not have a valid GZIP block at the end of the file.");
}
if (!isFirstFile) {
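// For every file after the first, decompress from the start and scan for the first non-header byte,
// so that the duplicate VCF header carried by this input is not copied into the output.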
final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(in, false);
boolean lastByteNewline = true;
int firstNonHeaderByteIndex = -1;
while (blockIn.available() > 0) {
// Read a block - blockIn.available() is guaranteed to return the bytes remaining in the block that has been
// read, and since we haven't consumed any yet, that is the block size.
final int blockLength = blockIn.available();
final byte[] blockContents = new byte[blockLength];
final int read = blockIn.read(blockContents);
Utils.validate(blockLength > 0 && read == blockLength, "Could not read available bytes from BlockCompressedInputStream.");
// Scan forward within the block to see if we can find the end of the header within this block
firstNonHeaderByteIndex = -1;
for (int i = 0; i < read; ++i) {
final byte b = blockContents[i];
final boolean thisByteNewline = (b == '\n' || b == '\r');
if (lastByteNewline && !thisByteNewline && b != '#') {
// Aha! Found first byte of non-header data in file!
firstNonHeaderByteIndex = i;
break;
}
lastByteNewline = thisByteNewline;
}
// If we found the end of the header then write the remainder of this block out as a
// new gzip block and then break out of the while loop
if (firstNonHeaderByteIndex >= 0) {
final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(out, (Path)null);
blockOut.write(blockContents, firstNonHeaderByteIndex, blockContents.length - firstNonHeaderByteIndex);
blockOut.flush();
// Don't close blockOut because that would also close the underlying output stream
break;
}
}
if( firstNonHeaderByteIndex == -1 ){
log.warn("Scanned the entire file " + f.toUri().toString() + " and found no variants");
}
}
// Copy remainder of input stream into output stream
final long currentPos = in.position();
final long length = in.length();
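// If this input ends with a BGZF terminator block, stop copying just before it; a single terminator
// is appended once after all inputs have been gathered (see below).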
final long skipLast = (term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) ?
BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0;
final long bytesToWrite = length - skipLast - currentPos;
IOUtil.transferByStream(in, out, bytesToWrite);
isFirstFile = false;
}
}
// And lastly add the Terminator block and close up
out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
}
catch (final IOException ioe) {
throw new RuntimeIOException(ioe);
}
}
}