All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.dragstr.ComposeSTRTableFile Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.dragstr;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.apache.commons.io.FileUtils;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.Hidden;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.GATKTool;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.dragstr.STRTableFile;
import org.broadinstitute.hellbender.utils.dragstr.STRTableFileBuilder;
import picard.cmdline.programgroups.ReferenceProgramGroup;

import java.io.*;
import java.util.*;
import java.util.stream.Collectors;

/**
 * This tool looks for low-complexity STR sequences along the reference that are later used to estimate the Dragstr model 
 * during single sample auto calibration {@link CalibrateDragstrModel}.
 * 

Inputs

*

* This command takes as input the reference (possibly a subset of intervals) and an optional {@link STRDecimationTable decimation table} (herein referred as DT). *

*

* The DT modulates how often we sample a site for each possible period and repeat length. Since there is far more * positions with short period and short repeat length sampling for those combinations should be less frequent. * For further details about the format of this table and interpretation of its values please check the documentation * in class {@link STRDecimationTable}. *

*

* If no DT is provided, the tool uses a default one that has been tailored to work fine when run over * the entire Human genome and it should be alright with other genomes of comparable size (i.e. 1 to 10Gbps). * With larger genomes, that default DT will likely result in an unecessarely large number of sampled sites * that it turn may increase the run time of tools that depend on the output. In contrast, * with smaller genomes or subsets (using targeted intervals) it might result in a number of sampled sites * too small to build accurate Dragstr model. In this case you really need to compose and provide * your own DT or perhaps try out not to decimate at all ({@code --decimation NONE}). *

*

Output

*

* The output of this command is a zip file that contains the collection of sampled sites in * binary form ({@code all.bin}), and index for that file for quick access by location interval * ({@code all.idx}). Other files in the zip provide some summary and tracking information, for example * the reference sequence dictionary ({@code reference.dict}), a copy of the DT ({@code decimation.txt}) * and summarized stats ({@code summary.txt}). *

*

*

Examples

*

 *     # Human? just use the default.
 *     gatk ComposeSTRTableFile -R hg19.fasta -O hg19.str.zip
 *     # or ...
 *     gatk ComposeSTRTableFile -R hg19.fasta --decimation DEFAULT -O hg19.str.zip
 *
 * 
*
 *
 *     # yeast genome is roughly ~ 12Mbp long.
 *     gatk ComposeSTRTableFile -R yeast.fasta --decimation custom-yeast.dt -O yeast.str.zip
 *
 * 
*
 *
 *     #  Carsonella ruddii just about 160Kbps, prorably we don't want to decimate at all:
 *     gatk ComposeSTRTableFile -R Cruddii.fasta --decimation NONE -O yeast.str.zip
 *
 * 
*

*/ @CommandLineProgramProperties( programGroup = ReferenceProgramGroup.class, oneLineSummary = "Composes a genome-wide STR location table used for DragSTR model auto-calibration", summary = "Composes a genome-wide STR location table used for DragSTR model auto-calibration" ) @DocumentedFeature public final class ComposeSTRTableFile extends GATKTool { public static final String REFERENCE_SEQUENCE_BUFFER_SIZE_FULL_NAME = "reference-sequence-buffer-size"; public static final String GENERATE_SITES_TEXT_OUTPUT_FULL_NAME = "generate-sites-text-output"; public static final int MINIMUM_REFERENCE_SEQUENCE_BUFFER_SIZE = 1024; public static final int MAXIMUM_REFERENCE_SEQUENCE_BUFFER_SIZE = 100_000_000; public static final int DEFAULT_REFERENCE_SEQUENCE_BUFFER_SIZE = 100_000; @Argument(fullName = "decimation", doc = "decimation per period and repeat. It can be \"DEFAULT\" to use the default values (DEFAULT), " + " \"NONE\" to deactivate decimation (potentially resulting in a very large output file) " + "or indicate the path to a file that contains the decimation matrix.", optional = true) private STRDecimationTable decimationTable = STRDecimationTable.DEFAULT; @Argument(doc = "name of the zip file where the sites sampled will be stored", fullName= StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME) private GATKPath outputPath = null; @Argument(doc = "request to generate a text formatted version of the STR table in the output zip (" + STRTableFile.SITES_TEXT_FILE_NAME + ")", fullName = GENERATE_SITES_TEXT_OUTPUT_FULL_NAME, optional = true) @Hidden private boolean generateSitesTextOutput = false; @Argument(fullName = DragstrHyperParameters.MAX_PERIOD_ARGUMENT_FULL_NAME, doc="maximum STR period sampled", optional = true, minValue = 1, maxValue = 20) private int maxPeriod = DragstrHyperParameters.DEFAULT_MAX_PERIOD; @Argument(fullName = DragstrHyperParameters.MAX_REPEATS_ARGUMENT_FULL_NAME, doc="maximum STR repeat sampled", optional = true, minValue = 1, maxValue = 100) private int maxRepeat = DragstrHyperParameters.DEFAULT_MAX_REPEAT_LENGTH; @Argument(fullName= REFERENCE_SEQUENCE_BUFFER_SIZE_FULL_NAME, doc="size of the look ahead reference sequence buffer", optional = true, minValue = MINIMUM_REFERENCE_SEQUENCE_BUFFER_SIZE, maxValue = MAXIMUM_REFERENCE_SEQUENCE_BUFFER_SIZE) @Hidden private int referenceSequenceBufferSize = DEFAULT_REFERENCE_SEQUENCE_BUFFER_SIZE; @Override public boolean requiresReference() { return true; } private static final String COMMAND_LINE_ANNOTATION_NAME = "commandLine"; private File tempDir; private int[][][] nextMasks; public void onStartup() { super.onStartup(); try { tempDir = File.createTempFile("gatk-sample-dragstr-sites", ".tmp"); } catch (final IOException ex) { throw new GATKException("could not create temporary disk space", ex); } if (!tempDir.delete()) { throw new GATKException("could not create temporary disk space: could not delete tempfile"); } else if (!tempDir.mkdir()) { throw new GATKException("could not create temporary disk space: could not create tempdir"); } } public void onShutdown() { try { if (tempDir != null) { FileUtils.deleteDirectory(tempDir); } } catch (final IOException e) { throw new GATKException("issues removing temporary directory: " + tempDir, e); } finally { super.onShutdown(); } } @Override public void traverse() { final SAMSequenceDictionary dictionary = getBestAvailableSequenceDictionary(); initializeMasks(dictionary); try (final STRTableFileBuilder output = STRTableFileBuilder.newInstance(dictionary, decimationTable, generateSitesTextOutput, maxPeriod, maxRepeat)) { output.annotate(COMMAND_LINE_ANNOTATION_NAME, getCommandLine()); final Map> intervalsByContig = composeAndGroupTraversalIntervalsByContig(dictionary); for (final Map.Entry> contigEntry : intervalsByContig.entrySet()) { final BufferedReferenceBases nucleotideSequence = BufferedReferenceBases.of(directlyAccessEngineReferenceDataSource(), contigEntry.getKey(), referenceSequenceBufferSize); final SAMSequenceRecord sequenceRecord = dictionary.getSequence(contigEntry.getKey()); for (final SimpleInterval interval : contigEntry.getValue()) { traverseInterval(sequenceRecord.getSequenceIndex(), nucleotideSequence, interval.getStart(), interval.getEnd(), decimationTable, output); } } output.store(outputPath); } progressMeter.stop(); } private Map> composeAndGroupTraversalIntervalsByContig(final SAMSequenceDictionary dictionary) { if (!intervalArgumentCollection.intervalsSpecified()) { return dictionary.getSequences().stream() .map(s -> new SimpleInterval(s.getSequenceName(), 1, s.getSequenceLength())) .collect(Collectors.groupingBy(SimpleInterval::getContig, LinkedHashMap::new, Collectors.toList())); } else { final Map> keyUnsorted = IntervalUtils.sortAndMergeIntervals(intervalArgumentCollection.getIntervals(dictionary), dictionary, IntervalMergingRule.ALL); final Map> keySorted = new LinkedHashMap<>(keyUnsorted.size()); keyUnsorted.entrySet().stream() .sorted(Comparator.comparingInt(entry -> dictionary.getSequenceIndex(entry.getKey()))) .forEach(entry -> keySorted.put(entry.getKey(), entry.getValue())); return keySorted; } } private void initializeMasks(final SAMSequenceDictionary dictionary) { nextMasks = new int[dictionary.getSequences().size()][maxPeriod + 1][maxRepeat + 1]; for (int i = 0; i < nextMasks.length; i++) { for (final int[] masks : nextMasks[i]) { Arrays.fill(masks, i); } } } private void traverseInterval(final int seqNumber, final BufferedReferenceBases sequence, final long seqStart, final long seqEnd, final STRDecimationTable decimationTable, final STRTableFileBuilder output) { final String id = sequence.contigID(); final long length = sequence.length(); final byte[] unitBuffer = new byte[this.maxPeriod]; for (long pos = seqStart; pos <= seqEnd; pos++) { final BestPeriodRepeat best = findBestPeriodRepeatCombination(sequence, pos, length, unitBuffer); if (best != null) { pos = best.end; emitOrDecimateSTR(seqNumber, id, best, decimationTable, output); } } } /** * Returns the STR spec starting at a particular position on the input reference base sequence. * @return {@code null} iff the base at the starting position is not a standard base (i.e. A, C, G, T) for example N X or ?. */ private BestPeriodRepeat findBestPeriodRepeatCombination(final BufferedReferenceBases sequence, final long pos, final long length, final byte[] unitBuffer) { final BestPeriodRepeat best; // usually maxPeriodAtPos == maxPeriod except when close to the end of the sequence // is the maximum period to be considered that cannot exceed min(maxPeriod, seq-length - pos + 1) // but is always 1 or greater. final int maxPeriodAtPos = sequence.copyBytesAt(pos, unitBuffer, 0, maxPeriod); // Efficient code for period == 1: final byte firstUnitBase; if (!Nucleotide.decode(firstUnitBase = unitBuffer[0]).isStandard()) { best = null; } else { long beg, end; // we look upstream for same bases: for (beg = pos - 1; beg >= 1 && Nucleotide.same(sequence.byteAt(beg), firstUnitBase); beg--) { /* nothing to be done */ } beg++; // we look downstream for same bases: for (end = pos + 1; end <= length && Nucleotide.same(sequence.byteAt(end), firstUnitBase); end++) { /* nothing to be done */ } end--; // Initialize best period to 1: best = BestPeriodRepeat.initializeToOne(beg, end); // Now wo do period 2 and beyond: int cmp; // var to hold the next position in the unit (buffer) to compare against. for (int period = 2; period <= maxPeriodAtPos; period++) { // we stop if the last base in unit is not ACGT (usually N): if (!Nucleotide.decode(unitBuffer[period - 1]).isStandard()) { break; } // We look upstream for matching p-mers for (beg = pos - 1, cmp = period - 1; beg >= 1 && Nucleotide.same(sequence.byteAt(beg), unitBuffer[cmp]); beg--) { if (--cmp == -1) { cmp = period - 1; } } beg++; // We look downstream for matching p-mers for (cmp = 0, end = pos + period; end <= length && Nucleotide.same(sequence.byteAt(end), unitBuffer[cmp]); end++) { if (++cmp == period) { cmp = 0; } } end--; best.updateIfBetter(period, beg, end); } } return best; } private void emitOrDecimateSTR(final int seqNumber, final String seqId, final BestPeriodRepeat best, final STRDecimationTable decimationTable, final STRTableFileBuilder output) { final int effectiveRepeats = Math.min(maxRepeat, best.repeats); final int mask = nextMasks[seqNumber][best.period][effectiveRepeats]++; if (!decimationTable.decimate(mask, best.period, best.repeats)) { final DragstrLocus locus = DragstrLocus.make(seqNumber, best.start, (byte) best.period, (short) (best.end - best.start + 1), mask); output.emit(locus); progressMeter.update(new SimpleInterval(seqId, (int) best.start, (int) best.start)); } else { output.decimate(best.period, best.repeats); } } private static class BestPeriodRepeat { private int period; private int repeats; private long start; private long end; BestPeriodRepeat(final int period, final long start, final long end) { this.start = start; this.end = end; this.period = period; this.repeats = (int) (end - start + 1) / period; } private static BestPeriodRepeat initializeToOne(final long start, final long end) { return new BestPeriodRepeat(1, start, end); } private void updateIfBetter(final int newPeriod, final long newStart, final long newEnd) { final int newRepeats = (int) (newEnd - newStart + 1) / newPeriod; if (newRepeats > repeats || (newRepeats == repeats && newPeriod < period)) { start = newStart; end = newEnd; period = newPeriod; repeats = newRepeats; } } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy