org.broadinstitute.hellbender.engine.AssemblyRegionWalker Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.engine;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.IGVUtils;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.downsampling.PositionalDownsampler;
import org.broadinstitute.hellbender.utils.downsampling.ReadsDownsampler;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* An AssemblyRegionWalker is a tool that processes an entire region of reads at a time, each marked as either "active"
* (containing possible variation) or "inactive" (not likely to contain actual variation). Tool authors must implement
* {@link #assemblyRegionEvaluator} to provide a means of determining whether a site is active or not, as well as
* {@link #apply} to process each region. Authors must also implement methods providing default values for the
* various traversal parameters.
*
* Each region passed to {@link #apply} will come pre-marked as either "active" or "inactive" using the results of the
* configured {@link #assemblyRegionEvaluator}. The {@link #assemblyRegionEvaluator} is used to evaluate the probability
* that each individual locus is active, and these probabilities are used to determine the bounds of each active and
* inactive region.
*
* {@link #apply} will be called once for each active AND inactive region, and it is up to the implementation how to
* handle/process active vs. inactive regions.
*
* Internally, the reads are loaded in chunks called read shards, which are then subdivided into active/inactive regions
* for processing by the tool implementation. One read shard is created per contig.
*/
public abstract class AssemblyRegionWalker extends WalkerBase {
@ArgumentCollection
public final AssemblyRegionArgumentCollection assemblyRegionArgs = new AssemblyRegionArgumentCollection();
/**
* If provided, this walker will write out its assembly regions
* to this file in the IGV formatted TAB-delimited output:
*
* http://www.broadinstitute.org/software/igv/IGV
*
* Intended to make debugging the active region calculations easier
*/
@Argument(fullName = AssemblyRegionArgumentCollection.ASSEMBLY_REGION_OUT_LONG_NAME, doc="Output the assembly region to this IGV formatted file", optional = true)
protected String assemblyRegionOut = null;
private PrintStream assemblyRegionOutStream;
@Override
public final boolean requiresReads() { return true; }
@Override
public final boolean requiresReference() { return true; }
@Override
public String getProgressMeterRecordLabel() { return "regions"; }
private List readShards;
private boolean nonRandomDownsamplingMode = nonRandomDownsamplingMode();
/**
* Initialize data sources for traversal.
*
* Marked final so that tool authors don't override it. Tool authors should override onTraversalStart() instead.
*/
@Override
protected final void onStartup() {
super.onStartup();
assemblyRegionArgs.validate();
final List intervals = hasUserSuppliedIntervals() ? userIntervals : IntervalUtils.getAllIntervalsForReference(getHeaderForReads().getSequenceDictionary());
readShards = makeReadShards(intervals);
initializeAssemblyRegionOutputStreams();
}
/**
* Shard our intervals for traversal into ReadShards, each shard containing all of the
* intervals for one contig.
*
* We pad the intervals within each shard by the same amount as the assembly region padding
* to avoid boundary artifacts.
*
* @param intervals unmodified intervals for traversal
* @return List of {@link MultiIntervalLocalReadShard} objects, sharded and padded as necessary
*/
private List makeReadShards(final List intervals ) {
final List shards = new ArrayList<>();
final List> intervalsGroupedByContig = IntervalUtils.groupIntervalsByContig(intervals);
for ( final List allIntervalsOnContig : intervalsGroupedByContig ) {
shards.add(new MultiIntervalLocalReadShard(allIntervalsOnContig, assemblyRegionArgs.assemblyRegionPadding, reads));
}
return shards;
}
private void initializeAssemblyRegionOutputStreams() {
if ( assemblyRegionOut != null ) {
try {
assemblyRegionOutStream = new PrintStream(assemblyRegionOut);
}
catch ( IOException e ) {
throw new UserException.CouldNotCreateOutputFile(assemblyRegionOut, "Error writing assembly regions to output file", e);
}
logger.info("Writing assembly regions to " + assemblyRegionOut);
IGVUtils.printIGVFormatHeader(assemblyRegionOutStream, "line", "AssemblyRegions");
}
}
/**
* Returns the default list of CommandLineReadFilters that are used for this tool. The filters
* returned by this method are subject to selective enabling/disabling and customization by the
* user via the command line. The default implementation uses the {@link WellformedReadFilter}
* filter with all default options, as well as the {@link ReadFilterLibrary.MappedReadFilter}.
* Subclasses can override to provide alternative filters.
*
* Note: this method is called before command line parsing begins, and thus before a SAMFileHeader is
* available through {link #getHeaderForReads}.
*
* @return List of default filter instances to be applied for this tool.
*/
public List getDefaultReadFilters() {
final List defaultFilters = new ArrayList<>(2);
defaultFilters.add(new WellformedReadFilter());
defaultFilters.add(new ReadFilterLibrary.MappedReadFilter());
return defaultFilters;
}
protected ReadsDownsampler createDownsampler() {
return assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ? new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, getHeaderForReads(), nonRandomDownsamplingMode) : null;
}
/**
* {@inheritDoc}
*
* Implementation of assembly region traversal.
*
* NOTE: You should only override {@link #traverse()} if you are writing a new walker base class in the
* engine package that extends this class. It is not meant to be overridden by tools outside of the
* engine package.
*/
@Override
public void traverse() {
CountingReadFilter countedFilter = makeReadFilter();
// Since we're processing regions rather than individual reads, tell the progress
// meter to check the time more frequently (every 10 regions instead of every 1000 regions).
progressMeter.setRecordsBetweenTimeChecks(10L);
for ( final MultiIntervalLocalReadShard readShard : readShards ) {
// Since reads in each shard are lazily fetched, we need to pass the filter and transformers to the window
// instead of filtering the reads directly here
readShard.setPreReadFilterTransformer(makePreReadFilterTransformer());
readShard.setReadFilter(countedFilter);
readShard.setDownsampler(createDownsampler());
readShard.setPostReadFilterTransformer(makePostReadFilterTransformer());
processReadShard(readShard, reference, features);
}
logger.info(countedFilter.getSummaryLine());
}
/**
* Divide the given Shard up into active/inactive AssemblyRegions using the {@link #assemblyRegionEvaluator},
* and send each region to the tool implementation for processing.
*
* @param shard MultiIntervalLocalReadShard to process
* @param reference Reference data source
* @param features FeatureManager
*/
private void processReadShard(MultiIntervalLocalReadShard shard, ReferenceDataSource reference, FeatureManager features ) {
final Iterator assemblyRegionIter = new AssemblyRegionIterator(shard, getHeaderForReads(), reference, features, assemblyRegionEvaluator(), assemblyRegionArgs, shouldTrackPileupsForAssemblyRegions());
// Call into the tool implementation to process each assembly region from this shard.
while ( assemblyRegionIter.hasNext() ) {
final AssemblyRegion assemblyRegion = assemblyRegionIter.next();
if ( assemblyRegionArgs.forceActive ) {
assemblyRegion.setIsActive(true);
}
logger.debug("Processing assembly region at " + assemblyRegion.getSpan() + " isActive: " + assemblyRegion.isActive() + " numReads: " + assemblyRegion.getReads().size());
writeAssemblyRegion(assemblyRegion);
apply(assemblyRegion,
new ReferenceContext(reference, assemblyRegion.getPaddedSpan()),
new FeatureContext(features, assemblyRegion.getPaddedSpan()));
// For this traversal, the progress meter unit is the assembly region rather than the read shard
progressMeter.update(assemblyRegion.getSpan());
}
}
private void writeAssemblyRegion(final AssemblyRegion region) {
if ( assemblyRegionOutStream != null ) {
IGVUtils.printIGVFormatRow(assemblyRegionOutStream, new SimpleInterval(region.getContig(), region.getStart(), region.getStart()),
"end-marker", 0.0);
IGVUtils.printIGVFormatRow(assemblyRegionOutStream, region,
"size=" + new SimpleInterval(region).size(), region.isActive() ? 1.0 : -1.0);
}
}
/**
* Shutdown data sources.
*
* Marked final so that tool authors don't override it. Tool authors should override onTraversalDone() instead.
*/
@Override
protected final void onShutdown() {
// Overridden only to make final so that concrete tool implementations don't override
super.onShutdown();
if ( assemblyRegionOutStream != null ) {
assemblyRegionOutStream.close();
}
}
/**
* @return The evaluator to be used to determine whether each locus is active or not. Must be implemented by tool authors.
* The results of this per-locus evaluator are used to determine the bounds of each active and inactive region.
*/
public abstract AssemblyRegionEvaluator assemblyRegionEvaluator();
/**
* Allows implementing tools to decide whether pileups must be tracked and attached to assembly regions for later processing.
* This is configurable for now in order to save on potential increases in memory consumption variant calling machinery.
*/
public abstract boolean shouldTrackPileupsForAssemblyRegions();
/**
* Process an individual AssemblyRegion. Must be implemented by tool authors.
*
* Each region will come pre-marked as either "active" or "inactive" using the results of the configured
* {@link #assemblyRegionEvaluator}. This method will be called once for each active AND inactive region,
* and it is up to the implementation how to handle/process active vs. inactive regions.
*
* @param region region to process (pre-marked as either active or inactive)
* @param referenceContext reference data overlapping the padded span of the assembly region
* @param featureContext features overlapping the padded span of the assembly region
*/
public abstract void apply( final AssemblyRegion region, final ReferenceContext referenceContext, final FeatureContext featureContext );
public boolean nonRandomDownsamplingMode() {
return false;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy