package org.broadinstitute.hellbender.tools.sv;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.tribble.Feature;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.codecs.*;
import java.util.*;
/**
 * Merges locus-sorted files of evidence for structural variation into a single output file.
 * The tool can also subset the inputs to specified genomic intervals, or to a specified list of samples.
 * The evidence types and their file extensions are:
 * <dl>
 *     <dt>BafEvidence</dt>
 *     <dd>The biallelic frequency of a SNP in some sample at some locus.
 *         File extensions are *.baf.txt, *.baf.txt.gz, or *.baf.bci.</dd>
 *     <dt>DepthEvidence</dt>
 *     <dd>The read counts of any number of samples on some interval.
 *         File extensions are *.rd.txt, *.rd.txt.gz, or *.rd.bci.</dd>
 *     <dt>DiscordantPairEvidence</dt>
 *     <dd>Evidence of a read pair that spans a genomic distance that's too large or too small.
 *         File extensions are *.pe.txt, *.pe.txt.gz, or *.pe.bci.</dd>
 *     <dt>SiteDepth</dt>
 *     <dd>The read counts of each base call for some sample at some locus.
 *         File extensions are *.sd.txt, *.sd.txt.gz, or *.sd.bci.</dd>
 *     <dt>SplitReadEvidence</dt>
 *     <dd>The number of chimeric reads in some sample at some locus.
 *         File extensions are *.sr.txt, *.sr.txt.gz, or *.sr.bci.</dd>
 * </dl>
 *
 * <h3>Inputs</h3>
 *
 * <ul>
 *     <li>
 *         One or more evidence files.
 *         These must be locus-sorted, and must all contain the same type of evidence.
 *         Or a file containing a list of evidence files, one per line
 *         (see the second usage example below).
 *     </li>
 *     <li>Optional: A list of sample names to extract.</li>
 * </ul>
 *
 * <h3>Output</h3>
 *
 * <ul>
 *     <li>
 *         An output file containing merged evidence from the inputs.
 *     </li>
 * </ul>
 *
 * <h3>Usage example</h3>
 *
 * <pre>
 *     gatk PrintSVEvidence \
 *       -F file1.baf.txt.gz [-F file2.baf.txt.gz ...] \
 *       -O merged.baf.bci \
 *       --sample-names sample1 [--sample-names sample2 ...]
 * </pre>
*
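 * The evidence file arguments may also be supplied as a list file (one evidence file
 * URI per line), and the output may be subset to genomic intervals with the standard
 * GATK interval arguments. A sketch, assuming a hypothetical evidence_files.list that
 * names gzipped BAF evidence files:
 * <pre>
 *     gatk PrintSVEvidence \
 *       -F evidence_files.list \
 *       -L chr1:1000000-2000000 \
 *       -O subset.baf.txt.gz
 * </pre>
 *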
* @author Ted Sharpe <[email protected]>
*/
@CommandLineProgramProperties(
summary = "Merges multiple sources of SV evidence records of some particular feature type" +
" into a single output file. Inputs must be locus-sorted." +
" Can also subset by regions or samples.",
oneLineSummary = "Merges SV evidence records.",
programGroup = StructuralVariantDiscoveryProgramGroup.class
)
@ExperimentalFeature
public class PrintSVEvidence extends MultiFeatureWalker<SVFeature> {
public static final String EVIDENCE_FILE_NAME = "evidence-file";
public static final String SAMPLE_NAMES_NAME = "sample-names";
public static final String COMPRESSION_LEVEL_NAME = "compression-level";
@Argument(
doc = "Input feature file URI(s) with extension '"
+ SplitReadEvidenceCodec.FORMAT_SUFFIX + "', '"
+ DiscordantPairEvidenceCodec.FORMAT_SUFFIX + "', '"
+ SiteDepthCodec.FORMAT_SUFFIX + "', '"
+ BafEvidenceCodec.FORMAT_SUFFIX + "', or '"
+ DepthEvidenceCodec.FORMAT_SUFFIX + "' (may be gzipped). "
+ "Can also handle bci rather than txt files.",
fullName = EVIDENCE_FILE_NAME,
shortName = StandardArgumentDefinitions.FEATURE_SHORT_NAME
)
private List<FeatureInput<SVFeature>> inputPaths;
@Argument(doc = "List of sample names to extract from the sources (either as a .list file or" +
" as repeated arguments). If not specified, all samples will be merged.",
fullName = SAMPLE_NAMES_NAME, optional = true)
@VisibleForTesting
Set<String> sampleNames = new LinkedHashSet<>();
@Argument(
doc = "Output file for features of a type matching the input. Will be indexed if it " +
"has a block-compressed extension (e.g. '.gz' or '.bci').",
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
)
private GATKPath outputFilePath;
@Argument(
doc = "Output compression level",
fullName = COMPRESSION_LEVEL_NAME,
minValue = 0, maxValue = 9, optional = true
)
private int compressionLevel = 4;
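// set in onTraversalStart when no sample names are specified or discoverable;
// features are then passed through without subsetting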
private boolean noSampleFiltering = false;
private FeatureSink<SVFeature> outputSink;
@Override
@SuppressWarnings("unchecked")
public void onTraversalStart() {
super.onTraversalStart();
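// Infer the output codec, and hence the output feature type, from the output path's extension.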
final FeatureOutputCodec<? extends Feature, ? extends FeatureSink<? extends Feature>> codec =
FeatureOutputCodecFinder.find(outputFilePath);
final Class<? extends Feature> outputClass = codec.getFeatureType();
if ( !SVFeature.class.isAssignableFrom(outputClass) ) {
throw new UserException("Output file " + outputFilePath + " implies Feature subtype " +
outputClass.getSimpleName() + " but this tool requires an SVFeature subtype.");
}
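// Verify that each input's codec produces a feature type assignable to the output type.
// Each codec class is instantiated reflectively just to query its feature type.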
for ( final FeatureInput<SVFeature> input : inputPaths ) {
try {
final Class<? extends Feature> inputClass =
input.getFeatureCodecClass().getDeclaredConstructor().newInstance().getFeatureType();
if ( !outputClass.isAssignableFrom(inputClass) ) {
throw new UserException("Incompatible feature input " + input.getFeaturePath() +
" produces features of type " + inputClass.getSimpleName() +
" rather than features of type " + outputClass.getSimpleName() +
" as dictated by the output path " + outputFilePath);
}
} catch ( final ReflectiveOperationException roe ) {
throw new GATKException("Failed to instantiate codec " +
input.getFeatureCodecClass().getSimpleName(), roe);
}
}
if ( sampleNames.isEmpty() ) {
// use the complete set of sample names we found in the headers of the feature files
sampleNames.addAll(getSampleNames());
if ( sampleNames.isEmpty() ) {
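// no sample names were found in the feature-file headers either, so pass all features through unfiltered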
noSampleFiltering = true;
}
}
// the validity of this cast was checked at the beginning of this method
outputSink = (FeatureSink<SVFeature>)codec.makeSortMerger(outputFilePath,
getDictionary(), new ArrayList<>(sampleNames), compressionLevel);
}
@Override
public void apply( final SVFeature featureArg,
final Object header,
final ReadsContext readsContext,
final ReferenceContext referenceContext ) {
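// Either pass the feature through unchanged, or subset it to the requested samples.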
final SVFeature feature;
if ( noSampleFiltering ) {
feature = featureArg;
} else {
feature = featureArg.extractSamples(sampleNames, header);
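// extractSamples returns null when the feature retains no evidence for the requested samples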
if ( feature == null ) {
return;
}
}
outputSink.write(feature);
}
@Override
public Object onTraversalSuccess() {
super.onTraversalSuccess();
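// close the sink to flush buffered features and finish writing any index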
outputSink.close();
return null;
}
}