org.broadinstitute.hellbender.tools.copynumber.CollectAllelicCounts Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.tools.copynumber;
import com.google.common.collect.ImmutableList;
import htsjdk.samtools.SAMSequenceDictionary;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
import org.broadinstitute.hellbender.engine.AlignmentContext;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.LocusWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.datacollection.AllelicCountCollector;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.Metadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.MetadataUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleLocatableMetadata;
import org.broadinstitute.hellbender.utils.Nucleotide;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
* Collects reference and alternate allele counts at specified sites. The alt count is defined as the
* total count minus the ref count, and the alt nucleotide is defined as the non-ref base with the highest count,
* with ties broken by the order of the bases in {@link AllelicCountCollector#BASES}. Only reads that pass the
* specified read filters and bases that exceed the specified {@code minimum-base-quality} will be counted.
*
* Inputs
*
*
* -
* SAM format read data
*
* -
* Reference FASTA file
*
* -
* Sites at which allelic counts will be collected
*
*
*
*
* Outputs
*
*
* -
* Allelic-counts file.
* This is a tab-separated values (TSV) file with a SAM-style header containing a read group sample name, a sequence dictionary,
* a row specifying the column headers contained in {@link AllelicCountCollection.AllelicCountTableColumn},
* and the corresponding entry rows.
*
*
*
* Usage examples
*
*
* gatk CollectAllelicCounts \
* -I sample.bam \
* -R reference.fa \
* -L sites.interval_list \
* -O sample.allelicCounts.tsv
*
*
* @author Lee Lichtenstein <[email protected]>
* @author Samuel Lee <[email protected]>
*/
@CommandLineProgramProperties(
summary = "Collects reference and alternate allele counts at specified sites",
oneLineSummary = "Collects reference and alternate allele counts at specified sites",
programGroup = CoverageAnalysisProgramGroup.class
)
@DocumentedFeature
public final class CollectAllelicCounts extends LocusWalker {
private static final int DEFAULT_MINIMUM_MAPPING_QUALITY = 30;
static final int DEFAULT_MINIMUM_BASE_QUALITY = 20;
static final List DEFAULT_ADDITIONAL_READ_FILTERS = ImmutableList.of(
ReadFilterLibrary.MAPPED,
ReadFilterLibrary.NON_ZERO_REFERENCE_LENGTH_ALIGNMENT,
ReadFilterLibrary.NOT_DUPLICATE,
new MappingQualityReadFilter(DEFAULT_MINIMUM_MAPPING_QUALITY));
public static final String MINIMUM_BASE_QUALITY_LONG_NAME = "minimum-base-quality";
@Argument(
doc = "Output file for allelic counts.",
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
)
private File outputAllelicCountsFile;
@Argument(
doc = "Minimum base quality. Base calls with lower quality will be filtered out of pileups.",
fullName = MINIMUM_BASE_QUALITY_LONG_NAME,
minValue = 0,
optional = true
)
private int minimumBaseQuality = DEFAULT_MINIMUM_BASE_QUALITY;
private AllelicCountCollector allelicCountCollector;
@Override
public boolean emitEmptyLoci() {
return true;
}
@Override
public boolean requiresReference() {
return true;
}
@Override
public boolean requiresIntervals() {
return true;
}
@Override
public List getDefaultReadFilters() {
final List readFilters = new ArrayList<>(super.getDefaultReadFilters());
readFilters.addAll(DEFAULT_ADDITIONAL_READ_FILTERS);
return readFilters;
}
@Override
public void onTraversalStart() {
validateArguments();
final SampleLocatableMetadata metadata = MetadataUtils.fromHeader(getHeaderForReads(), Metadata.Type.SAMPLE_LOCATABLE);
final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
//this check is currently redundant, since the master dictionary is taken from the reads;
//however, if any other dictionary is added in the future, such a check should be performed
if (!CopyNumberArgumentValidationUtils.isSameDictionary(metadata.getSequenceDictionary(), sequenceDictionary)) {
logger.warn("Sequence dictionary in BAM does not match the master sequence dictionary.");
}
allelicCountCollector = new AllelicCountCollector(metadata);
logger.info("Collecting allelic counts...");
}
private void validateArguments() {
CopyNumberArgumentValidationUtils.validateOutputFiles(outputAllelicCountsFile);
}
@Override
public Object onTraversalSuccess() {
logger.info(String.format("Writing allelic counts to %s...", outputAllelicCountsFile.getAbsolutePath()));
allelicCountCollector.getAllelicCounts().write(outputAllelicCountsFile);
logger.info(String.format("%s complete.", getClass().getSimpleName()));
return null;
}
@Override
public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext) {
final byte refAsByte = referenceContext.getBase();
allelicCountCollector.collectAtLocus(Nucleotide.decode(refAsByte), alignmentContext.getBasePileup(), alignmentContext.getLocation(), minimumBaseQuality);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy