org.broadinstitute.hellbender.tools.dragstr.ComposeSTRTableFile Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.tools.dragstr;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.apache.commons.io.FileUtils;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.Hidden;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.GATKTool;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.dragstr.STRTableFile;
import org.broadinstitute.hellbender.utils.dragstr.STRTableFileBuilder;
import picard.cmdline.programgroups.ReferenceProgramGroup;

import java.io.*;
import java.util.*;
import java.util.stream.Collectors;

/**
 * This tool looks for low-complexity STR sequences along the reference that are later used to estimate the Dragstr model 
 * during single sample auto calibration {@link CalibrateDragstrModel}.
 * Inputs
 * 
 *     This command takes as input the reference (possibly a subset of intervals) and an optional {@link STRDecimationTable decimation table} (herein referred as DT). 
 * 
 * 
 *     The DT modulates how often we sample a site for each possible period and repeat length. Since there is far more
 *     positions with short period and short repeat length sampling for those combinations should be less frequent.
 *     For further details about the format of this table and interpretation of its values please check the documentation
 *     in class {@link STRDecimationTable}.
 * 
 * 
 *    If no DT is provided, the tool uses a default one that has been tailored to work fine when run over 
 *    the entire Human genome and it should be alright with other genomes of comparable size (i.e. 1 to 10Gbps).
 *    With larger genomes, that default DT will likely result in an unecessarely large number of sampled sites 
 *    that it turn may increase the run time of tools that depend on the output. In contrast, 
 *    with smaller genomes or subsets (using targeted intervals) it might result in a number of sampled sites 
 *    too small to build accurate Dragstr model. In this case you really need to compose and provide
 *    your own DT or perhaps try out not to decimate at all ({@code --decimation NONE}). 
 *  
 * Output
 * 
 *    The output of this command is a zip file that contains the collection of sampled sites in 
 *    binary form ({@code all.bin}), and index for that file for quick access by location interval 
 *    ({@code all.idx}). Other files in the zip provide some summary and tracking information, for example
 *    the reference sequence dictionary ({@code reference.dict}), a copy of the DT ({@code decimation.txt}) 
 *    and summarized stats ({@code summary.txt}).
 * 
 * 
 * 
Examples
 * 
 *     # Human? just use the default.
 *     gatk ComposeSTRTableFile -R hg19.fasta -O hg19.str.zip
 *     # or ...
 *     gatk ComposeSTRTableFile -R hg19.fasta --decimation DEFAULT -O hg19.str.zip
 *
 * 
 *  *
 *     # yeast genome is roughly ~ 12Mbp long.
 *     gatk ComposeSTRTableFile -R yeast.fasta --decimation custom-yeast.dt -O yeast.str.zip
 *
 * 
 *  *
 *     #  Carsonella ruddii just about 160Kbps, prorably we don't want to decimate at all:
 *     gatk ComposeSTRTableFile -R Cruddii.fasta --decimation NONE -O yeast.str.zip
 *
 * 
 * 
 */
@CommandLineProgramProperties(
        programGroup = ReferenceProgramGroup.class,
        oneLineSummary = "Composes a genome-wide STR location table used for DragSTR model auto-calibration",
        summary = "Composes a genome-wide STR location table used for DragSTR model auto-calibration"
)
@DocumentedFeature
public final class ComposeSTRTableFile extends GATKTool {

    public static final String REFERENCE_SEQUENCE_BUFFER_SIZE_FULL_NAME = "reference-sequence-buffer-size";
    public static final String GENERATE_SITES_TEXT_OUTPUT_FULL_NAME = "generate-sites-text-output";
    public static final int MINIMUM_REFERENCE_SEQUENCE_BUFFER_SIZE = 1024;
    public static final int MAXIMUM_REFERENCE_SEQUENCE_BUFFER_SIZE = 100_000_000;
    public static final int DEFAULT_REFERENCE_SEQUENCE_BUFFER_SIZE = 100_000;

    @Argument(fullName = "decimation", 
              doc = "decimation per period and repeat. It can be \"DEFAULT\" to use the default values (DEFAULT), " +
                    " \"NONE\" to deactivate decimation (potentially resulting in a very large output file) " + 
                    "or indicate the path to a file that contains the decimation matrix.", 
              optional = true)
    private STRDecimationTable decimationTable = STRDecimationTable.DEFAULT;

    @Argument(doc = "name of the zip file where the sites sampled will be stored",
              fullName= StandardArgumentDefinitions.OUTPUT_LONG_NAME,
              shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME)
    private GATKPath outputPath = null;

    @Argument(doc = "request to generate a text formatted version of the STR table in the output zip (" + STRTableFile.SITES_TEXT_FILE_NAME + ")",
              fullName = GENERATE_SITES_TEXT_OUTPUT_FULL_NAME, optional = true)
    @Hidden
    private boolean generateSitesTextOutput = false;

    @Argument(fullName = DragstrHyperParameters.MAX_PERIOD_ARGUMENT_FULL_NAME, doc="maximum STR period sampled", optional = true, minValue = 1, maxValue = 20)
    private int maxPeriod = DragstrHyperParameters.DEFAULT_MAX_PERIOD;

    @Argument(fullName = DragstrHyperParameters.MAX_REPEATS_ARGUMENT_FULL_NAME, doc="maximum STR repeat sampled", optional = true, minValue = 1, maxValue = 100)
    private int maxRepeat = DragstrHyperParameters.DEFAULT_MAX_REPEAT_LENGTH;

    @Argument(fullName= REFERENCE_SEQUENCE_BUFFER_SIZE_FULL_NAME, doc="size of the look ahead reference sequence buffer",
            optional = true, minValue = MINIMUM_REFERENCE_SEQUENCE_BUFFER_SIZE, maxValue = MAXIMUM_REFERENCE_SEQUENCE_BUFFER_SIZE)
    @Hidden
    private int referenceSequenceBufferSize = DEFAULT_REFERENCE_SEQUENCE_BUFFER_SIZE;

    @Override
    public boolean requiresReference() {
        return true;
    }

    private static final String COMMAND_LINE_ANNOTATION_NAME = "commandLine";

    private File tempDir;
    private int[][][] nextMasks;

    public void onStartup() {
        super.onStartup();
        try {
            tempDir = File.createTempFile("gatk-sample-dragstr-sites", ".tmp");
        } catch (final IOException ex) {
            throw new GATKException("could not create temporary disk space", ex);
        }
        if (!tempDir.delete()) {
            throw new GATKException("could not create temporary disk space: could not delete tempfile");
        } else if (!tempDir.mkdir()) {
            throw new GATKException("could not create temporary disk space: could not create tempdir");
        }
    }

    public void onShutdown() {
        try {
            if (tempDir != null) {
                FileUtils.deleteDirectory(tempDir);
            }
        } catch (final IOException e) {
            throw new GATKException("issues removing temporary directory: " + tempDir, e);
        } finally {
            super.onShutdown();
        }
    }

    @Override
    public void traverse() {
        final SAMSequenceDictionary dictionary = getBestAvailableSequenceDictionary();
        initializeMasks(dictionary);
        try (final STRTableFileBuilder output = STRTableFileBuilder.newInstance(dictionary, decimationTable,
                generateSitesTextOutput, maxPeriod, maxRepeat)) {
            output.annotate(COMMAND_LINE_ANNOTATION_NAME, getCommandLine());
            final Map> intervalsByContig = composeAndGroupTraversalIntervalsByContig(dictionary);
            for (final Map.Entry> contigEntry : intervalsByContig.entrySet()) {
                final BufferedReferenceBases nucleotideSequence = BufferedReferenceBases.of(directlyAccessEngineReferenceDataSource(), contigEntry.getKey(), referenceSequenceBufferSize);
                final SAMSequenceRecord sequenceRecord = dictionary.getSequence(contigEntry.getKey());
                for (final SimpleInterval interval : contigEntry.getValue()) {
                    traverseInterval(sequenceRecord.getSequenceIndex(), nucleotideSequence, interval.getStart(), interval.getEnd(), decimationTable, output);
                }
            }
            output.store(outputPath);
        }
        progressMeter.stop();
    }

    private Map> composeAndGroupTraversalIntervalsByContig(final SAMSequenceDictionary dictionary) {
        if (!intervalArgumentCollection.intervalsSpecified()) {
            return dictionary.getSequences().stream()
                    .map(s -> new SimpleInterval(s.getSequenceName(), 1, s.getSequenceLength()))
                    .collect(Collectors.groupingBy(SimpleInterval::getContig, LinkedHashMap::new, Collectors.toList()));
        } else {
            final Map> keyUnsorted = IntervalUtils.sortAndMergeIntervals(intervalArgumentCollection.getIntervals(dictionary), dictionary, IntervalMergingRule.ALL);
            final Map> keySorted = new LinkedHashMap<>(keyUnsorted.size());
            keyUnsorted.entrySet().stream()
                    .sorted(Comparator.comparingInt(entry -> dictionary.getSequenceIndex(entry.getKey())))
                    .forEach(entry -> keySorted.put(entry.getKey(), entry.getValue()));
            return keySorted;
        }
    }

    private void initializeMasks(final SAMSequenceDictionary dictionary) {
        nextMasks = new int[dictionary.getSequences().size()][maxPeriod + 1][maxRepeat + 1];
        for (int i = 0; i < nextMasks.length; i++) {
            for (final int[] masks : nextMasks[i]) {
                Arrays.fill(masks, i);
            }
        }
    }

    private void traverseInterval(final int seqNumber, final BufferedReferenceBases sequence, final long seqStart, final long seqEnd,
                                  final STRDecimationTable decimationTable, final STRTableFileBuilder output)
    {
        final String id = sequence.contigID();
        final long length = sequence.length();
        final byte[] unitBuffer = new byte[this.maxPeriod];
        for (long pos = seqStart; pos <= seqEnd; pos++) {
            final BestPeriodRepeat best = findBestPeriodRepeatCombination(sequence, pos, length, unitBuffer);
            if (best != null) {
               pos = best.end;
               emitOrDecimateSTR(seqNumber, id, best, decimationTable, output);
            }
        }
    }

    /**
     * Returns the STR spec starting at a particular position on the input reference base sequence.
     * @return {@code null} iff the base at the starting position is not a standard base (i.e. A, C, G, T) for example N X or ?.
     */
    private BestPeriodRepeat findBestPeriodRepeatCombination(final BufferedReferenceBases sequence, final long pos,
                                                             final long length, final byte[] unitBuffer) {
        final BestPeriodRepeat best;
        // usually maxPeriodAtPos == maxPeriod except when close to the end of the sequence
        // is the maximum period to be considered that cannot exceed min(maxPeriod, seq-length - pos + 1)
        // but is always 1 or greater.
        final int maxPeriodAtPos = sequence.copyBytesAt(pos, unitBuffer, 0, maxPeriod);

        // Efficient code for period == 1:
        final byte firstUnitBase;
        if (!Nucleotide.decode(firstUnitBase = unitBuffer[0]).isStandard()) {
            best = null;
        } else {
            long beg, end;

            // we look upstream for same bases:
            for (beg = pos - 1; beg >= 1 && Nucleotide.same(sequence.byteAt(beg), firstUnitBase); beg--) { /* nothing to be done */ }
            beg++;

            // we look downstream for same bases:
            for (end = pos + 1; end <= length && Nucleotide.same(sequence.byteAt(end), firstUnitBase); end++) { /* nothing to be done */ }
            end--;

            // Initialize best period to 1:
            best = BestPeriodRepeat.initializeToOne(beg, end);
            // Now wo do period 2 and beyond:
            int cmp; // var to hold the next position in the unit (buffer) to compare against.
            for (int period = 2; period <= maxPeriodAtPos; period++) {
                // we stop if the last base in unit is not ACGT (usually N):
                if (!Nucleotide.decode(unitBuffer[period - 1]).isStandard()) {
                    break;
                }
                // We look upstream for matching p-mers
                for (beg = pos - 1,
                             cmp = period - 1; beg >= 1 && Nucleotide.same(sequence.byteAt(beg), unitBuffer[cmp]); beg--) {
                    if (--cmp == -1) {
                        cmp = period - 1;
                    }
                }
                beg++;
                // We look downstream for matching p-mers
                for (cmp = 0, end = pos + period; end <= length && Nucleotide.same(sequence.byteAt(end), unitBuffer[cmp]); end++) {
                    if (++cmp == period) {
                        cmp = 0;
                    }
                }
                end--;
                best.updateIfBetter(period, beg, end);
            }
        }
        return best;
    }

    private void emitOrDecimateSTR(final int seqNumber, final String seqId, final BestPeriodRepeat best,
                                   final STRDecimationTable decimationTable, final STRTableFileBuilder output)
    {
        final int effectiveRepeats = Math.min(maxRepeat, best.repeats);
        final int mask = nextMasks[seqNumber][best.period][effectiveRepeats]++;
        if (!decimationTable.decimate(mask, best.period, best.repeats)) {
            final DragstrLocus locus = DragstrLocus.make(seqNumber, best.start, (byte) best.period, (short) (best.end - best.start + 1), mask);
            output.emit(locus);
            progressMeter.update(new SimpleInterval(seqId, (int) best.start, (int) best.start));
        } else {
            output.decimate(best.period, best.repeats);
        }
    }

    private static class BestPeriodRepeat {
        private int period;
        private int repeats;
        private long start;
        private long end;

        BestPeriodRepeat(final int period, final long start, final long end) {
            this.start = start;
            this.end = end;
            this.period = period;
            this.repeats = (int) (end - start + 1) / period;
        }

        private static BestPeriodRepeat initializeToOne(final long start, final long end) {
            return new BestPeriodRepeat(1, start, end);
        }

        private void updateIfBetter(final int newPeriod, final long newStart, final long newEnd) {
            final int newRepeats = (int) (newEnd - newStart + 1) / newPeriod;
            if (newRepeats > repeats || (newRepeats == repeats && newPeriod < period)) {
                start = newStart;
                end = newEnd;
                period = newPeriod;
                repeats = newRepeats;
            }
        }
    }
}