All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.utils.reference.AbsoluteCoordinates Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.utils.reference;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.List;
import java.util.function.IntFunction;
import java.util.function.ToIntFunction;

/**
 * Class to translate back and forth from absolute {@code long-typed} base positions to relative ones (the usual contig, position pairs).
 * 

* If we were to concatenate the sequence of every chromosome in the order these appear in reference dictionary, * each base in the reference with a absolute position or offset from the beginning of this imaginary super-contig. *

*

* Some times it might be handy to use this type of address/coordinate rather than the usual contig (name or index) and position * within the contig. *

*

* This class implement such a transformation from absolute coordinates efficiently. *

*

* Relative to absolute is rather a trivial matter if you keep track of the accumulative number of bases on the reference * right before the start of each contig. *

*

Absolute to relative is a bit trickier. A obvious solution would consist in a binary search amogst the contig for the one that * include the absolute base range that encloses the requested position. The cost of this lookup would be O(log C) where C is the number of * contigs in the reference. In some cases like hg38 there are over 3000+ such contigs due to the alt-contigs and decoy, resulting in around 11 iterations * to find the target's location contig

*

* This class has a couple of accelerations: *

* First we check whether the target contig is the last returned, which should be quite often the case when accessing the refernce in * sequence. *

*

* Then we keep a "percentile" table that contains the contig index that would include that percentile absolute position. The granularity * of such table is linked to the number of contigs in the reference with a O(C) additional memory cost. *

*

* These two acceleration may effectively reduce the look-up cost to O(1) is most scenarios. The draw back is a more complicate code and the * need to deal with float-point arithmetic for the percentile look-up. *

*

*/ public final class AbsoluteCoordinates { /** * Length of each contig. */ private final int[] lengths; /** * Percentile look up table. */ private final int[] percentiles; /** * Contains the number of bases before the contig with the ith index (0-based). */ private final long[] accumulative; /** * Total length of the reference. */ private final long total; /** * Factor to multiply to an absolute position to get its corresponding percentile. */ private final float percentileFactor; /** * Function that resolves the contig index given its name. */ private final ToIntFunction contigToIndex; /** * Function that resolves the contig name given its index. */ private final IntFunction indexToContig; private int lastCtg; private AbsoluteCoordinates(final int[] lengths, final long[] accumulative, final ToIntFunction contigToIndex, final IntFunction indexToContig) { this.lengths = lengths; this.accumulative = accumulative; this.contigToIndex = contigToIndex; this.indexToContig = indexToContig; this.total = accumulative[accumulative.length - 1]; this.percentiles = calculatePercentiles(lengths, accumulative, total); this.percentileFactor = (percentiles.length - 1) / (float) this.total; this.lastCtg = 0; } private static int[] calculatePercentiles(final int[] lengths, final long[] accumulative, final long total) { final int[] result = new int[(accumulative.length << 1) + 1]; final float fraction = total / (float)(result.length - 1); double fractionAccumulator = 0; for (int i = 0, j = 0; i < lengths.length; i++) { final long accumulativePlusLength = accumulative[i + 1]; // == accumulative[i] + lengths[i]; while (fractionAccumulator < accumulativePlusLength && j < result.length - 1) { result[j++] = i; fractionAccumulator += fraction; } } result[result.length - 1] = lengths.length - 1; return result; } public static AbsoluteCoordinates of(final SAMSequenceDictionary dictionary) { final ToIntFunction contigToIndex = dictionary::getSequenceIndex; final IntFunction indexToContig = i -> dictionary.getSequence(i).getContig(); final List sequences = dictionary.getSequences(); final int numberOfSequences = sequences.size(); final int[] lengths = new int[numberOfSequences]; final long[] accumulative = new long[numberOfSequences + 1]; for (int i = 0; i < numberOfSequences; i++) { final SAMSequenceRecord sequence = sequences.get(i); lengths[i] = sequence.getSequenceLength(); } long leftSum = lengths[0]; for (int i = 1; i < numberOfSequences; i++) { accumulative[i] = leftSum; leftSum += lengths[i]; } accumulative[numberOfSequences] = leftSum; return new AbsoluteCoordinates(lengths, accumulative, contigToIndex, indexToContig); } public long toAbsolute(final String ctgName, final int position) { return toAbsolute(contigToIndex.applyAsInt(ctgName), position); } /** * Obtains the absolute coordinate for the start position in an interval. * @param simpleInterval the target position in relative coordinates * @return 1 or greater. * @throws IllegalArgumentException no such contig name or tht conting is too small. */ public long toAbsolute(final SimpleInterval simpleInterval) { return toAbsolute(simpleInterval.getContig(), simpleInterval.getStart()); } public long toAbsolute(final int ctgIdx, final int position) { if (lengths[ctgIdx] < position) { throw new IllegalArgumentException("position outside containg contig"); } lastCtg = ctgIdx; return accumulative[ctgIdx] + position; } public static class Relative { public final String contig; public final int contigIndex; public final int position; Relative(final String contig, final int contigIndex, final int position) { this.contig = contig; this.contigIndex = contigIndex; this.position = position; } } @FunctionalInterface interface RelativeFactory { E create(final String ctgName, final int ctgIdx, final int position); } public SimpleInterval toSimpleInterval(final long absoluteStart, final int length) { return toRelative(absoluteStart, (n, i, p) -> new SimpleInterval(n, p, p + length - 1)); } public Relative toRelative(final long absolute) { return toRelative(absolute, Relative::new); } public E toRelative(final long absolute, final RelativeFactory factory) { if (absolute < 1) { throw new IllegalArgumentException("absolute cannot be less than 1"); } else if (absolute > total) { throw new IllegalArgumentException("absolute is too large"); } else if (accumulative[lastCtg] < absolute) { if (absolute <= accumulative[lastCtg + 1]) { return factory.create(indexToContig.apply(lastCtg), lastCtg, (int) (absolute - accumulative[lastCtg])); } else { return searchRelative(absolute, factory); } } else { return searchRelative(absolute, factory); } } private E searchRelative(final long target, final RelativeFactory factory) { final int percentileIndex = (int) (target * percentileFactor); if (percentileIndex >= percentiles.length) { throw new IllegalArgumentException("xx " + target + " " + total ); } final int percentileCtg = percentiles[percentileIndex]; if (percentileIndex < percentiles.length - 1) { return searchRelative(target, percentileCtg, percentiles[percentileIndex + 1], factory); } else { return searchRelative(target, percentileCtg, lengths.length - 1, factory); } } private E searchRelative(final long target, final int minCtgIdx, final int maxCtgIdx, final RelativeFactory factory) { int i = minCtgIdx, j = maxCtgIdx; while (i < j) { final int mid = (i + j) >> 1; if (accumulative[mid] >= target) { j = mid - 1; } else if (accumulative[mid + 1] < target) { i = mid + 1; } else { i = mid; break; } } lastCtg = i; return factory.create(indexToContig.apply(i), i, (int) (target - accumulative[i])); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy