
// Source listing: org.broadinstitute.hellbender.utils.IntervalUtils (Maven / Gradle / Ivy artifact page header)
package org.broadinstitute.hellbender.utils;
import htsjdk.samtools.QueryInterval;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.OverlapDetector;
import htsjdk.tribble.Feature;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.FeatureManager;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.nio.PathLineIterator;
import org.broadinstitute.hellbender.utils.param.ParamUtils;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* Parse text representations of interval strings that
* can appear in GATK-based applications.
*/
public final class IntervalUtils {
private static final Logger log = LogManager.getLogger(IntervalUtils.class);
/**
 * Recognized extensions for interval files
 */
public static final List<String> GATK_INTERVAL_FILE_EXTENSIONS = Collections.unmodifiableList(Arrays.asList(
        ".list", ".intervals"
));
/**
 * Lexicographical (contig) order comparator.
 *
 * <p>Intervals from different contigs are ordered by their enclosing contig names in
 * ascending lexicographical order.</p>
 *
 * <p>Intervals on the same contig are ordered by ascending start position and, in case
 * of a tie, by ascending stop position.</p>
 *
 * <p>The {@code null} contig is supported and comes last.</p>
 */
public static final Comparator<Locatable> LEXICOGRAPHICAL_ORDER_COMPARATOR =
        Comparator.comparing(Locatable::getContig, Comparator.nullsLast(String::compareTo))
                .thenComparingInt(Locatable::getStart)
                .thenComparingInt(Locatable::getEnd);
// NOTE(review): duplicates the "log" logger declared near the top of the class; kept because
// code outside this view may reference either field name. Consider consolidating to one logger.
private static final Logger logger = LogManager.getLogger(IntervalUtils.class);
/**
 * Compare two locations using a {@link htsjdk.samtools.SAMSequenceDictionary} sequence ordering.
 *
 * Ordering is by dictionary contig index first, then start position, then end position.
 *
 * @param first first location to compare (may not be null)
 * @param second second location to compare (may not be null)
 * @param dictionary sequence dictionary that defines the contig ordering (may not be null)
 * @return 0 if first and second are equal, a negative value if first comes before second,
 *         or a positive value if first comes after second, using the ordering in the provided dictionary
 * @throws IllegalArgumentException if either first or second contigs could not be found in the dictionary
 */
public static int compareLocatables(final Locatable first, final Locatable second, final SAMSequenceDictionary dictionary) {
    Utils.nonNull(first);
    Utils.nonNull(second);
    Utils.nonNull(dictionary);
    if (first == second) {
        return 0; // same object: trivially equal, skip the dictionary lookups
    }
    // compare the contigs; fall through to start and then end position on ties
    int result = compareContigs(first, second, dictionary);
    if (result == 0) {
        result = Integer.compare(first.getStart(), second.getStart());
        if (result == 0) {
            result = Integer.compare(first.getEnd(), second.getEnd());
        }
    }
    return result;
}
/**
 * Tests whether the first Locatable ends before the start of the second Locatable
 *
 * @param first first Locatable
 * @param second second Locatable
 * @param dictionary sequence dictionary used to determine contig ordering
 * @return true if first ends before the start of second, otherwise false
 */
public static boolean isBefore(final Locatable first, final Locatable second, final SAMSequenceDictionary dictionary) {
    Utils.nonNull(first);
    Utils.nonNull(second);
    Utils.nonNull(dictionary);
    final int contigComparison = compareContigs(first, second, dictionary);
    // compareContigs (via Integer.compare) only guarantees the sign, not the magnitude,
    // so test "< 0" rather than "== -1"
    return contigComparison < 0 || (contigComparison == 0 && first.getEnd() < second.getStart());
}
/**
 * Tests whether the first Locatable starts after the end of the second Locatable
 *
 * @param first first Locatable
 * @param second second Locatable
 * @param dictionary sequence dictionary used to determine contig ordering
 * @return true if first starts after the end of second, otherwise false
 */
public static boolean isAfter(final Locatable first, final Locatable second, final SAMSequenceDictionary dictionary) {
    Utils.nonNull(first);
    Utils.nonNull(second);
    Utils.nonNull(dictionary);
    final int contigComparison = compareContigs(first, second, dictionary);
    // compareContigs (via Integer.compare) only guarantees the sign, not the magnitude,
    // so test "> 0" rather than "== 1"
    return contigComparison > 0 || (contigComparison == 0 && first.getStart() > second.getEnd());
}
/**
 * Determines the relative contig ordering of first and second using the provided sequence dictionary
 *
 * @param first first Locatable
 * @param second second Locatable
 * @param dictionary sequence dictionary used to determine contig ordering
 * @return 0 if the two contigs are the same, a negative value if first's contig comes before second's contig,
 *         or a positive value if first's contig comes after second's contig
 * @throws IllegalArgumentException if either contig is absent from the dictionary
 */
public static int compareContigs(final Locatable first, final Locatable second, final SAMSequenceDictionary dictionary) {
    Utils.nonNull(first);
    Utils.nonNull(second);
    Utils.nonNull(dictionary);
    final int firstRefIndex = dictionary.getSequenceIndex(first.getContig());
    final int secondRefIndex = dictionary.getSequenceIndex(second.getContig());
    if (firstRefIndex == -1 || secondRefIndex == -1) {
        // include the offending contig names so the failure is diagnosable
        throw new IllegalArgumentException(String.format(
                "Can't do comparison because Locatables' contigs not found in sequence dictionary: first=%s second=%s",
                first.getContig(), second.getContig()));
    }
    // compare the contigs by their index in the dictionary
    return Integer.compare(firstRefIndex, secondRefIndex);
}
/**
* getSpanningInterval returns interval that covers all of the locations passed in.
* @param locations the locations to be spanned (on a single contig)
* @return the minimal span that covers all locations (could be null if no locations are passed in).
* @throws IllegalArgumentException if the argument is null
* or if the argument contains any null element
* or if the locations are not all on the same contig (compared by String.equals)
*/
public static SimpleInterval getSpanningInterval(final List extends Locatable> locations) {
Utils.nonNull(locations);
Utils.containsNoNull(locations, "locations must not contain a null");
if (locations.isEmpty()){
return null;
}
final List contigs = locations.stream().map(l -> l.getContig()).distinct().collect(Collectors.toList());
Utils.validateArg(contigs.size() == 1, () -> "found different contigs from inputs:" + contigs);
final int minStart = locations.stream().mapToInt(l -> l.getStart()).min().getAsInt();
final int maxEnd = locations.stream().mapToInt(l -> l.getEnd()).max().getAsInt();
return new SimpleInterval(contigs.get(0), minStart, maxEnd);
}
/**
 * Convert a List of intervals in GenomeLoc format into a List of intervals in SimpleInterval format.
 *
 * @param genomeLocIntervals list of GenomeLoc intervals to convert
 * @return equivalent List of SimpleIntervals
 * @throws UserException if any interval is unmapped (unmapped locs have no contig/start/end)
 */
public static List<SimpleInterval> convertGenomeLocsToSimpleIntervals( final List<GenomeLoc> genomeLocIntervals ) {
    final List<SimpleInterval> convertedIntervals = new ArrayList<>(genomeLocIntervals.size());
    for ( final GenomeLoc genomeLoc : genomeLocIntervals ) {
        if ( genomeLoc.isUnmapped() ) {
            throw new UserException("Unmapped intervals cannot be converted to SimpleIntervals");
        }
        convertedIntervals.add(new SimpleInterval(genomeLoc));
    }
    return convertedIntervals;
}
/**
 * Converts an interval in SimpleInterval format into an htsjdk QueryInterval.
 *
 * A lookup against the supplied sequence dictionary translates the interval's
 * contig name into the contig index required by QueryInterval.
 *
 * @param interval interval to convert
 * @param sequenceDictionary sequence dictionary used to perform the conversion
 * @return an equivalent interval in QueryInterval format
 */
public static QueryInterval convertSimpleIntervalToQueryInterval( final SimpleInterval interval, final SAMSequenceDictionary sequenceDictionary ) {
    Utils.nonNull(interval);
    Utils.nonNull(sequenceDictionary);
    final int referenceIndex = sequenceDictionary.getSequenceIndex(interval.getContig());
    if ( referenceIndex == -1 ) {
        throw new UserException("Contig " + interval.getContig() + " not present in reads sequence dictionary");
    }
    return new QueryInterval(referenceIndex, interval.getStart(), interval.getEnd());
}
/**
 * Loads, combines, and merges the intervals described by the given interval strings.
 *
 * @param intervalStrings interval strings from the command line (explicit intervals or interval files); may not be null
 * @param intervalSetRule rule used to combine the interval set from each string with the running set (e.g. union/intersection)
 * @param intervalMergingRule rule used to merge overlapping/abutting intervals in the final result
 * @param padding number of bases of flanking sequence added to each side of every interval (applied only when > 0)
 * @param genomeLocParser parser used to turn the strings into GenomeLocs
 * @return a sorted, merged set of the parsed intervals
 */
public static GenomeLocSortedSet loadIntervals(
        final List<String> intervalStrings,
        final IntervalSetRule intervalSetRule,
        final IntervalMergingRule intervalMergingRule,
        final int padding,
        final GenomeLocParser genomeLocParser) {
    Utils.nonNull(intervalStrings);
    List<GenomeLoc> allIntervals = new ArrayList<>();
    for ( final String intervalString : intervalStrings) {
        Utils.nonNull(intervalString);
        List<GenomeLoc> intervals = parseIntervalArguments(genomeLocParser, intervalString);
        if ( padding > 0 ) {
            intervals = getIntervalsWithFlanks(genomeLocParser, intervals, padding);
        }
        // fold this string's intervals into the running set using the requested set rule
        allIntervals = mergeListsBySetOperator(intervals, allIntervals, intervalSetRule);
    }
    return sortAndMergeIntervals(genomeLocParser, allIntervals, intervalMergingRule);
}
/**
 * Method that takes a list of interval strings in the command line format and parses them into GenomicLoc objects
 * without attempting to perform any sort of interval merging whatsover regardless of overlap between the intervals.
 *
 * @param intervalStrings interval strings from the command line
 * @param padding padding to be added to the intervals
 * @param genomeLocParser parser to be applied to the provided intervals
 * @return a list of intervals parsed into genomic locs without merging, sorted in natural GenomeLoc order.
 */
public static List<GenomeLoc> loadIntervalsNonMerging(
        final List<String> intervalStrings,
        final int padding,
        final GenomeLocParser genomeLocParser) {
    Utils.nonNull(intervalStrings);
    final List<GenomeLoc> allIntervals = new ArrayList<>();
    for ( final String intervalString : intervalStrings) {
        Utils.nonNull(intervalString);
        List<GenomeLoc> intervals = parseIntervalArguments(genomeLocParser, intervalString);
        if (padding > 0) {
            // pad each interval individually; overlaps created by the padding are deliberately kept
            intervals = intervals.stream()
                    .map(loc -> genomeLocParser.createPaddedGenomeLoc(loc, padding))
                    .collect(Collectors.toList());
        }
        allIntervals.addAll(intervals);
    }
    return allIntervals.stream()
            .sorted()
            .collect(Collectors.toList());
}
/**
 * Turns a set of strings describing intervals into a parsed set of intervals. Valid string elements can be files,
 * intervals in samtools notation (chrA:B-C), or some combination of the above separated by semicolons. Additionally,
 * 'all' can be supplied to indicate all possible intervals, but 'all' must be exclusive of all other interval
 * specifications.
 *
 * @param parser Genome loc parser.
 * @param argList A list of strings containing interval data.
 * @return an unsorted, unmerged representation of the given intervals. Null is used to indicate that all intervals should be used.
 */
public static List<GenomeLoc> parseIntervalArguments(final GenomeLocParser parser, final List<String> argList) {
    final List<GenomeLoc> rawIntervals = new ArrayList<>(); // running list of raw GenomeLocs
    if (argList != null) { // now that we can be in this function if only the ROD-to-Intervals was provided, we need to
        // ensure that the arg list isn't null before looping.
        for (final String argument : argList) {
            rawIntervals.addAll(parseIntervalArguments(parser, argument));
        }
    }
    return rawIntervals;
}
/**
 * Parses a single interval argument into a list of GenomeLocs.
 *
 * The argument may be a Feature-containing file (vcf/bed/interval_list/...), a GATK interval
 * file (see {@link #GATK_INTERVAL_FILE_EXTENSIONS}), or an explicit interval string in
 * samtools notation (chrA:B-C). No merging is performed.
 *
 * @param parser Genome loc parser used to create the intervals (may not be null)
 * @param arg a single interval argument (may not be null)
 * @return an unsorted, unmerged list of the intervals described by arg
 */
public static List<GenomeLoc> parseIntervalArguments(final GenomeLocParser parser, final String arg) {
    Utils.nonNull(parser, "parser is null");
    Utils.nonNull(arg, "arg is null");
    final List<GenomeLoc> rawIntervals = new ArrayList<>(); // running list of raw GenomeLocs
    if ( arg.indexOf(';') != -1 ) {
        throw new CommandLineException.BadArgumentValue("-L " + arg, "The legacy -L \"interval1;interval2\" syntax " +
                "is no longer supported. Please use one -L argument for each " +
                "interval or an interval file instead.");
    }
    // If it's a Feature-containing file, convert it to a list of intervals
    else if ( FeatureManager.isFeatureFile(IOUtils.getPath(arg)) ) {
        try {
            rawIntervals.addAll(featureFileToIntervals(parser, arg));
        } catch (final IllegalArgumentException e){
            throw new UserException.MalformedFile(IOUtils.getPath(arg), "Failure while loading intervals from file.", e);
        }
    }
    // If it's an interval file, add its contents to the raw interval list
    else if ( isGatkIntervalFile(arg) ) {
        rawIntervals.addAll(gatkIntervalFileToList(parser, arg));
    }
    // If it's neither a Feature-containing file nor an interval file, but is an existing file, throw an error.
    // Note that since contigs can contain periods in their names, we can't use the mere presence of an "extension"
    // as evidence that the user intended the String to be interpreted as a file.
    else if ( new File(arg).exists() ) {
        throw new UserException.CouldNotReadInputFile(arg, String.format("The file %s exists, but does not contain Features " +
                "(ie., is not in a supported Feature file format such as vcf, bcf, bed, or interval_list), " +
                "and does not have one of the supported interval file extensions (" + GATK_INTERVAL_FILE_EXTENSIONS + "). " +
                "Please rename your file with the appropriate extension. If %s is NOT supposed to be a file, " +
                "please move or rename the file at location %s", arg, arg, new File(arg).getAbsolutePath()));
    }
    // Otherwise treat as an interval -> parse and add to raw interval list
    else {
        rawIntervals.add(parser.parseGenomeLoc(arg));
    }
    return rawIntervals;
}
/**
* Converts a Feature-containing file to a list of intervals
*
* @param parser GenomeLocParser for creating intervals
* @param featureFile file containing Features to convert to intervals
* @return a List of intervals corresponding to the locations of the Features in the provided file
* @throws UserException.CouldNotReadInputFile if the provided file is not in a supported Feature file format
*/
public static List featureFileToIntervals( final GenomeLocParser parser, final String featureFile ) {
try ( final FeatureDataSource extends Feature> dataSource = new FeatureDataSource<>(featureFile) ) {
final List featureIntervals = new ArrayList<>();
for ( final Feature feature : dataSource ) {
featureIntervals.add(parser.createGenomeLoc(feature, true));
}
return featureIntervals;
}
}
/**
 * Read a file of genome locations to process, this file must be in GATK interval format.
 *
 * @param glParser GenomeLocParser (may not be null)
 * @param fileName interval file (may not be null)
 * @return List of GenomeLocs that have been parsed from file
 * @throws UserException.MalformedFile if the file is empty or cannot be parsed as a GATK interval file
 */
private static List<GenomeLoc> gatkIntervalFileToList(final GenomeLocParser glParser, final String fileName) {
    Utils.nonNull(glParser, "glParser is null");
    Utils.nonNull(fileName, "file name is null");
    final Path inputPath = IOUtils.getPath(fileName);
    final List<GenomeLoc> ret = new ArrayList<>();
    try (final PathLineIterator reader = new PathLineIterator(inputPath)) {
        for (final String line : reader) {
            final String trimmedLine = line.trim();
            // blank lines are tolerated and skipped
            if (!trimmedLine.isEmpty()) {
                ret.add(glParser.parseGenomeLoc(trimmedLine));
            }
        }
        if (ret.isEmpty()) {
            throw new UserException.MalformedFile(inputPath, "It contains no intervals.");
        }
        return ret;
    } catch (final UserException e){
        // don't re-wrap our own exceptions (e.g. the empty-file error above)
        throw e;
    } catch ( final Exception e ) {
        throw new UserException.MalformedFile(inputPath, "Interval file could not be parsed as a GATK interval file", e);
    }
}
/**
 * merge two interval lists, using an interval set rule
 * @param setOne a list of genomeLocs, in order (cannot be NULL)
 * @param setTwo a list of genomeLocs, also in order (cannot be NULL)
 * @param rule the rule to use for merging, i.e. union, intersection, etc
 * @return a list, correctly merged using the specified rule
 * @throws UserException.EmptyIntersection if rule is INTERSECTION and the two sets do not overlap anywhere
 */
public static List<GenomeLoc> mergeListsBySetOperator(final List<GenomeLoc> setOne, final List<GenomeLoc> setTwo, final IntervalSetRule rule) {
    // shortcut, if either set is zero, return the other set
    if (setOne == null || setOne.isEmpty() || setTwo == null || setTwo.isEmpty()) {
        return Collections.unmodifiableList((setOne == null || setOne.isEmpty()) ? setTwo : setOne);
    }
    // our master list, since we can't guarantee removal time in a generic list
    final LinkedList<GenomeLoc> retList = new LinkedList<>();
    // if we're set to UNION, just add them all
    if (rule == null || rule == IntervalSetRule.UNION) {
        retList.addAll(setOne);
        retList.addAll(setTwo);
        return Collections.unmodifiableList(retList);
    }
    // else we're INTERSECTION, create two indexes into the lists
    int iOne = 0;
    int iTwo = 0;
    // merge the second into the first using the rule
    while (iTwo < setTwo.size() && iOne < setOne.size())
    // if the first list is ahead, drop items off the second until we overlap
    {
        if (setTwo.get(iTwo).isBefore(setOne.get(iOne))) {
            iTwo++;
        }// if the second is ahead, drop intervals off the first until we overlap
        else if (setOne.get(iOne).isBefore(setTwo.get(iTwo))) {
            iOne++;
        }// we overlap, intersect the two intervals and add the result. Then remove the interval that ends first.
        else {
            retList.add(setOne.get(iOne).intersect(setTwo.get(iTwo)));
            if (setOne.get(iOne).getStop() < setTwo.get(iTwo).getStop()) {
                iOne++;
            } else {
                iTwo++;
            }
        }
    }
    //if we have an empty list, throw an exception. If they specified intersection and there are no items, this is bad.
    if (retList.isEmpty()) {
        throw new UserException.EmptyIntersection("There was an empty intersection");
    }
    // we don't need to add the rest of remaining locations, since we know they don't overlap. return what we have
    return Collections.unmodifiableList(retList);
}
/**
 * Sorts and merges an interval list. Multiple techniques are available for merging: ALL, which combines
 * all overlapping and abutting intervals into an interval that spans the union of all covered bases, and
 * OVERLAPPING_ONLY, which unions overlapping intervals but keeps abutting intervals separate.
 *
 * @param parser Genome loc parser for the intervals.
 * @param intervals A collection of intervals to merge.
 * @param mergingRule A descriptor for the type of merging to perform.
 * @return A sorted, merged version of the intervals passed in.
 */
public static GenomeLocSortedSet sortAndMergeIntervals(final GenomeLocParser parser, List<GenomeLoc> intervals, final IntervalMergingRule mergingRule) {
    // Make a copy of the (potentially unmodifiable) list to be sorted
    intervals = new ArrayList<>(intervals);
    // sort raw interval list
    Collections.sort(intervals);
    // now merge raw interval list
    intervals = mergeIntervalLocations(intervals, mergingRule);
    return GenomeLocSortedSet.createSetFromList(parser, intervals);
}
/**
 * Merges a list of potentially-overlapping SimpleIntervals from a single contig into a new sorted list
 * of intervals covering the same genomic territory but with all overlapping (and, if specified, adjacent)
 * intervals merged.
 *
 * Since this method does not take a sequence dictionary, it imposes the restriction that all supplied intervals
 * must be on the same contig (otherwise we would not know the contig ordering).
 *
 * @param raw List of intervals to merge, all from the same contig. Does not need to be sorted.
 * @param rule Whether to merge adjacent intervals, or just overlapping intervals
 * @return A new list of merged disjoint intervals, sorted by start position
 * @throws IllegalArgumentException if intervals from more than one contig are supplied
 */
public static List<SimpleInterval> sortAndMergeIntervalsFromSameContig(final List<SimpleInterval> raw, final IntervalMergingRule rule) {
    if (raw.isEmpty()) {
        return new ArrayList<>();
    }
    // Since we're merging overlapping/adjacent intervals, we only need to sort by the start positions
    final List<SimpleInterval> rawSorted = new ArrayList<>(raw);
    rawSorted.sort(Comparator.comparingInt(SimpleInterval::getStart)); // comparingInt avoids boxing each start
    final List<SimpleInterval> merged = new ArrayList<>();
    final Iterator<SimpleInterval> it = rawSorted.iterator();
    SimpleInterval prev = it.next();
    final String commonContig = prev.getContig();
    while (it.hasNext()) {
        final SimpleInterval curr = it.next();
        if ( ! curr.getContig().equals(commonContig) ) {
            throw new IllegalArgumentException("This method can handle only intervals on the same contig, " +
                    "but found intervals from contigs " + commonContig + " and " + curr.getContig());
        }
        if (prev.overlaps(curr)) {
            prev = prev.mergeWithContiguous(curr);
        } else if (prev.contiguous(curr) && (rule == null || rule == IntervalMergingRule.ALL)) {
            // abutting intervals are merged only under the ALL rule (null defaults to ALL)
            prev = prev.mergeWithContiguous(curr);
        } else {
            merged.add(prev);
            prev = curr;
        }
    }
    merged.add(prev);
    return merged;
}
/**
 * computes whether the test interval list is equivalent to master. To be equivalent, test must
 * contain GenomeLocs covering every base in master, exactly once. Note that this algorithm
 * assumes that master genomelocs are all discontiguous (i.e., we don't have locs like 1-3 and 4-6 but
 * rather just 1-6). In order to use this algorithm with contiguous genomelocs first merge them. The algorithm
 * doesn't assume that test has discontinuous genomelocs.
 *
 * Returns a null string if there are no differences, otherwise returns a string describing the difference
 * (useful for UnitTests). Assumes both lists are sorted
 *
 * @param masterArg sorted master genome locs
 * @param testArg sorted test genome locs
 * @return null string if there are no difference, otherwise a string describing the difference
 */
public static String equateIntervals(final List<GenomeLoc> masterArg, final List<GenomeLoc> testArg) {
    final LinkedList<GenomeLoc> master = new LinkedList<>(masterArg);
    final LinkedList<GenomeLoc> test = new LinkedList<>(testArg);
    while ( ! master.isEmpty() ) { // there's still unchecked bases in master
        final GenomeLoc masterHead = master.pop();
        // NOTE(review): if test runs out before master, this pop throws NoSuchElementException
        // rather than reporting a difference — confirm callers guarantee test is non-empty here.
        final GenomeLoc testHead = test.pop();
        if ( testHead.overlapsP(masterHead) ) {
            // remove the parts of test that overlap master, and push the remaining
            // parts onto master for further comparison.
            reverse(masterHead.subtract(testHead)).forEach(master::push);
        } else {
            // testHead is incompatible with masterHead, so we must have extra bases in testHead
            // that aren't in master
            return "Incompatible locs detected masterHead=" + masterHead + ", testHead=" + testHead;
        }
    }
    if ( test.isEmpty() ) // everything is equal
    {
        return null; // no differences
    } else {
        return "Remaining elements found in test: first=" + test.peek();
    }
}
private static > List reverse(final List l) {
return l.stream().sorted(Collections.reverseOrder()).collect(Collectors.toList());
}
/**
 * Check if string argument was intended as a file.
 * Accepted file extensions are defined in {@link #GATK_INTERVAL_FILE_EXTENSIONS}; existence of the
 * file is verified (equivalent to {@code isGatkIntervalFile(str, true)}).
 * @param str token to identify as a filename.
 * @return true if the token looks like a filename, or false otherwise.
 */
public static boolean isGatkIntervalFile(final String str) {
    // delegate to the two-argument overload with existence checking enabled
    return isGatkIntervalFile(str, true);
}
/**
 * Check if string argument was intended as a file
 * Accepted file extensions are defined in {@link #GATK_INTERVAL_FILE_EXTENSIONS}
 * @param str token to identify as a filename.
 * @param checkExists if true throws an exception if the file doesn't exist and has an interval file extension
 * @return true if the token looks like an interval file name, or false otherwise.
 * @throws UserException.CouldNotReadInputFile if checkExists is true and the named file is missing
 */
public static boolean isGatkIntervalFile(final String str, final boolean checkExists) {
    Utils.nonNull(str);
    // lower-case once, with a fixed locale so the extension match is not affected by
    // locale-specific casing rules (e.g. the Turkish dotless i)
    final String lowered = str.toLowerCase(Locale.ROOT);
    boolean hasIntervalFileExtension = false;
    for ( final String extension : GATK_INTERVAL_FILE_EXTENSIONS) {
        if ( lowered.endsWith(extension) ) {
            hasIntervalFileExtension = true;
            break; // one match is enough
        }
    }
    if ( hasIntervalFileExtension ) {
        final Path path = IOUtils.getPath(str);
        if ( ! checkExists || Files.exists(path) ) {
            return true;
        } else {
            throw new UserException.CouldNotReadInputFile(path, "The interval file does not exist.");
        }
    }
    else {
        return false;
    }
}
/**
 * Returns a map of contig names with their sizes.
 * @param reference The reference for the intervals.
 * @return A map of contig names with their sizes, in reference dictionary order.
 */
public static Map<String, Integer> getContigSizes(final Path reference) {
    // try-with-resources ensures the reference file handle is released
    try(final CachingIndexedFastaSequenceFile referenceSequenceFile = new CachingIndexedFastaSequenceFile(reference)) {
        final List<GenomeLoc> locs = GenomeLocSortedSet.createSetFromSequenceDictionary(
                referenceSequenceFile.getSequenceDictionary()).toList();
        // LinkedHashMap preserves the dictionary's contig ordering
        final Map<String, Integer> lengths = new LinkedHashMap<>();
        for (final GenomeLoc loc : locs) {
            lengths.put(loc.getContig(), loc.size());
        }
        return lengths;
    }
}
/**
 * Splits an interval list into multiple files, never splitting a contig across parts.
 * @param fileHeader The sam file header.
 * @param locs The genome locs to split (must already be sorted).
 * @param scatterParts The output interval lists to write to.
 * @throws UserException.BadInput if the region is too short or has too few contigs for the requested number of parts
 */
public static void scatterContigIntervals(final SAMFileHeader fileHeader, final List<GenomeLoc> locs, final List<File> scatterParts) {
    // Contract: must divide locs up so that each of scatterParts gets a sublist such that:
    // (a) all locs concerning a particular contig go to the same part
    // (b) locs are not split or combined, and remain in the same order (so scatterParts[0] + ... + scatterParts[n] == locs)
    // Locs are already sorted.
    long totalBases = 0;
    for(final GenomeLoc loc : locs) {
        totalBases += loc.size();
    }
    final long idealBasesPerPart = totalBases / scatterParts.size();
    if(idealBasesPerPart == 0) {
        throw new UserException.BadInput(String.format("Genome region is too short (%d bases) to split into %d parts", totalBases, scatterParts.size()));
    }
    // Find the indices in locs where we switch from one contig to the next.
    final ArrayList<Integer> contigStartLocs = new ArrayList<>();
    String prevContig = null;
    for(int i = 0; i < locs.size(); ++i) {
        final GenomeLoc loc = locs.get(i);
        if(prevContig == null || !loc.getContig().equals(prevContig)) {
            contigStartLocs.add(i);
        }
        prevContig = loc.getContig();
    }
    if(contigStartLocs.size() < scatterParts.size()) {
        throw new UserException.BadInput(String.format("Input genome region has too few contigs (%d) to split into %d parts", contigStartLocs.size(), scatterParts.size()));
    }
    long thisPartBases = 0;
    int partIdx = 0;
    IntervalList outList = new IntervalList(fileHeader);
    for(int i = 0; i < locs.size(); ++i) {
        final GenomeLoc loc = locs.get(i);
        thisPartBases += loc.getStop() - loc.getStart();
        outList.add(toInterval(loc, i));
        boolean partMustStop = false;
        if(partIdx < (scatterParts.size() - 1)) {
            // If there are n contigs and n parts remaining then we must split here,
            // otherwise we will run out of contigs.
            final int nextPart = partIdx + 1;
            final int nextPartMustStartBy = contigStartLocs.get(nextPart + (contigStartLocs.size() - scatterParts.size()));
            if(i + 1 == nextPartMustStartBy) {
                partMustStop = true;
            }
        }
        else if(i == locs.size() - 1) {
            // We're done! Write the last scatter file.
            partMustStop = true;
        }
        if(partMustStop || thisPartBases > idealBasesPerPart) {
            // Ideally we would split here. However, we must make sure to do so
            // on a contig boundary. Test always passes with partMustStop == true
            // since that indicates we're at a contig boundary.
            GenomeLoc nextLoc = null;
            if((i + 1) < locs.size()) {
                nextLoc = locs.get(i + 1);
            }
            if(nextLoc == null || !nextLoc.getContig().equals(loc.getContig())) {
                // Write out this part:
                outList.write(scatterParts.get(partIdx));
                // Reset. If this part ran long, leave the excess in thisPartBases
                // and the next will be a little shorter to compensate.
                outList = new IntervalList(fileHeader);
                thisPartBases -= idealBasesPerPart;
                ++partIdx;
            }
        }
    }
}
/**
* Splits an interval list into multiple sublists.
* @param locs The genome locs to split.
* @param splits The stop points for the genome locs returned by splitFixedIntervals.
* @return A list of lists of genome locs, split according to splits
*/
public static List> splitIntervalsToSubLists(final List locs, final List splits) {
Utils.nonNull(locs, "locs is null");
Utils.nonNull(splits, "splits is null");
int start = 0;
final List> sublists = new ArrayList<>(splits.size());
for (final Integer stop: splits) {
final List curList = new ArrayList<>();
for (int i = start; i < stop; i++) {
curList.add(locs.get(i));
}
start = stop;
sublists.add(curList);
}
return sublists;
}
/**
* Splits an interval list into multiple files.
* @param fileHeader The sam file header.
* @param splits Pre-divided genome locs returned by splitFixedIntervals.
* @param scatterParts The output interval lists to write to.
*/
public static void scatterFixedIntervals(final SAMFileHeader fileHeader, final List> splits, final List scatterParts) {
Utils.nonNull(fileHeader, "fileHeader is null");
Utils.nonNull(splits, "splits is null");
Utils.nonNull(scatterParts, "scatterParts is null");
Utils.containsNoNull(splits, "null split loc");
if (splits.size() != scatterParts.size()) {
throw new CommandLineException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
}
int fileIndex = 0;
int locIndex = 1;
for (final List split : splits) {
final IntervalList intervalList = new IntervalList(fileHeader);
for (final GenomeLoc loc : split) {
intervalList.add(toInterval(loc, locIndex++));
}
intervalList.write(scatterParts.get(fileIndex++));
}
}
/**
* Splits the genome locs up by size.
* @param locs Genome locs to split.
* @param numParts Number of parts to split the locs into.
* @return The stop points to split the genome locs.
*/
public static List> splitFixedIntervals(final List locs, final int numParts) {
Utils.nonNull(locs, "locs is null");
if (locs.size() < numParts) {
throw new CommandLineException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
}
final long locsSize = intervalSize(locs);
final List splitPoints = new ArrayList<>();
addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
Collections.sort(splitPoints);
splitPoints.add(locs.size());
return splitIntervalsToSubLists(locs, splitPoints);
}
public static List> splitLocusIntervals(final List locs, final int numParts) {
Utils.nonNull(locs, "locs is null");
if (numParts < 0) {
throw new CommandLineException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
}
// the ideal size of each split
final long bp = IntervalUtils.intervalSize(locs);
final long idealSplitSize = Math.max((long)Math.floor(bp / (1.0*numParts)), 1);
// algorithm:
// split = ()
// set size = 0
// pop the head H off locs.
// If size + size(H) < splitSize:
// add H to split, continue
// If size + size(H) == splitSize:
// done with split, put in splits, restart
// if size + size(H) > splitSize:
// cut H into two pieces, first of which has splitSize - size bp
// push both pieces onto locs, continue
// The last split is special -- when you have only one split left, it gets all of the remaining locs
// to deal with rounding issues
final List> splits = new ArrayList<>(numParts);
LinkedList locsLinkedList = new LinkedList<>(locs);
while ( ! locsLinkedList.isEmpty() ) {
if ( splits.size() + 1 == numParts ) {
// the last one gets all of the remaining parts
splits.add(new ArrayList<>(locsLinkedList));
locsLinkedList.clear();
} else {
final SplitLocusRecursive one = splitLocusIntervals1(locsLinkedList, idealSplitSize);
splits.add(one.split);
locsLinkedList = one.remaining;
}
}
return splits;
}
/**
 * Pops locs off the head of {@code remaining} until exactly {@code idealSplitSize} bases have
 * been accumulated, cutting the final loc in two if it overshoots.
 *
 * @param remaining locs not yet assigned to any split; consumed from the head
 * @param idealSplitSize target number of bases for the produced split
 * @return the completed split plus whatever locs remain
 */
private static SplitLocusRecursive splitLocusIntervals1(final LinkedList<GenomeLoc> remaining, final long idealSplitSize) {
    final List<GenomeLoc> split = new ArrayList<>();
    long size = 0;
    while ( ! remaining.isEmpty() ) {
        final GenomeLoc head = remaining.pop();
        final long newSize = size + head.size();
        if ( newSize == idealSplitSize ) {
            split.add(head);
            break; // we are done
        } else if ( newSize > idealSplitSize ) {
            // cut head so the first piece exactly fills this split, and push both pieces back
            final long remainingBp = idealSplitSize - size;
            final long cutPoint = head.getStart() + remainingBp;
            final GenomeLoc[] parts = head.split((int)cutPoint);
            remaining.push(parts[1]);
            remaining.push(parts[0]);
            // when we go around, head.size' = idealSplitSize - size
            // so newSize' = splitSize + head.size' = size + (idealSplitSize - size) = idealSplitSize
        } else {
            split.add(head);
            size = newSize;
        }
    }
    return new SplitLocusRecursive(split, remaining);
}
/**
 * Check whether two locatables overlap.
 *
 * Two locatables overlap if they share the same contig and have at least one
 * base in common based on their start and end positions.
 *
 * This method returns {@code false} if either input {@link Locatable} has a {@code null}
 * contig.
 *
 * @param left first locatable.
 * @param right second locatable.
 * @throws IllegalArgumentException if either {@code left} or {@code right} locatable
 * is {@code null}.
 * @return {@code true} iff there is an overlap between both locatables.
 */
public static boolean overlaps(final Locatable left, final Locatable right) {
    Utils.nonNull(left,"the left locatable is null");
    Utils.nonNull(right,"the right locatable is null");
    final String leftContig = left.getContig();
    final String rightContig = right.getContig();
    // a null contig on either side, or differing contigs, means no overlap
    if (leftContig == null || rightContig == null || !leftContig.equals(rightContig)) {
        return false;
    }
    // same contig: overlap iff the coordinate ranges intersect
    return left.getStart() <= right.getEnd() && right.getStart() <= left.getEnd();
}
/**
 * Renders a locatable as a {@code contig:start-end} coordinate string.
 *
 * @param interval the locatable to render; must not be {@code null}
 * @return the formatted coordinate string
 */
public static String locatableToString(Locatable interval) {
    Utils.nonNull(interval);
    final String contig = interval.getContig();
    return String.format("%s:%s-%s", contig, interval.getStart(), interval.getEnd());
}
/**
 * Immutable holder pairing a completed split with the loci that remain to be assigned.
 * Used as the return value of {@code splitLocusIntervals1}.
 */
private static final class SplitLocusRecursive {
    // the loci assigned to the split that was just built
    final List split;
    // loci not yet consumed; handed to the next round of splitting
    final LinkedList remaining;

    private SplitLocusRecursive(final List split, final LinkedList remaining) {
        this.split = split;
        this.remaining = remaining;
    }
}
public static List flattenSplitIntervals(final List> splits) {
Utils.nonNull(splits, "splits is null");
final List locs = new ArrayList<>();
splits.forEach(locs::addAll);
return locs;
}
/**
 * Recursively chooses split-point indices that divide {@code locs[startIndex, stopIndex)} into
 * {@code numParts} pieces of roughly equal cumulative size, appending each chosen index to
 * {@code splitPoints}. Works divide-and-conquer: picks one split near the size midpoint, then
 * recurses on each half with the part counts divided between them.
 *
 * @param splitPoints output accumulator of chosen split indices; must not be {@code null}
 * @param locs the loci being partitioned (generic parameters stripped by extraction; holds GenomeLoc)
 * @param locsSize total size (bp) of the loci in the [startIndex, stopIndex) range
 * @param startIndex first index (inclusive) of the range being partitioned
 * @param stopIndex last index (exclusive) of the range being partitioned
 * @param numParts number of parts to produce for this range
 */
private static void addFixedSplit(final List splitPoints, final List locs, final long locsSize, final int startIndex, final int stopIndex, final int numParts) {
    Utils.nonNull(splitPoints, "splitPoints is null");
    if (numParts < 2) {
        // a single part needs no split point
        return;
    }
    // Split the part count as evenly as possible: ceil(numParts/2) on the left, the rest on the right.
    final int halfParts = (numParts + 1) / 2;
    final Pair splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts, numParts - halfParts);
    final int splitIndex = splitPoint.getLeft();
    final long splitSize = splitPoint.getRight();
    splitPoints.add(splitIndex);
    // Recurse on each side with its share of the parts and its share of the total size.
    addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts);
    addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts);
}
/**
 * Finds a single split point within {@code locs[startIndex, stopIndex)}: first takes at least
 * {@code minLocs} loci, then keeps consuming until either half of {@code locsSize} is reached
 * or only {@code maxLocs} loci would remain for the other side.
 *
 * @param locs the loci being partitioned (generic parameters stripped by extraction; holds GenomeLoc)
 * @param locsSize total size (bp) of the loci in the range
 * @param startIndex first index (inclusive) of the range
 * @param stopIndex last index (exclusive) of the range
 * @param minLocs minimum number of loci that must land on the left side of the split
 * @param maxLocs number of loci that must be reserved for the right side of the split
 * @return pair of (split index, cumulative size in bp of the loci before the split index)
 */
private static Pair getFixedSplit(final List locs, final long locsSize, final int startIndex, final int stopIndex, final int minLocs, final int maxLocs) {
    int splitIndex = startIndex;
    long splitSize = 0;
    // Unconditionally take the minimum number of loci for the left side.
    for (int i = 0; i < minLocs; i++) {
        splitSize += locs.get(splitIndex).size();
        splitIndex++;
    }
    // Keep consuming until we reach half the total size, while leaving at least
    // maxLocs loci for the right side.
    final long halfSize = locsSize / 2;
    while (splitIndex < (stopIndex - maxLocs) && splitSize < halfSize) {
        splitSize += locs.get(splitIndex).size();
        splitIndex++;
    }
    return new ImmutablePair<>(splitIndex, splitSize);
}
/**
 * Converts a GenomeLoc to a Picard interval on the forward strand.
 * @param loc The GenomeLoc.
 * @param locIndex The loc index for use in the file.
 * @return The picard interval, named {@code interval_<locIndex>}.
 */
private static Interval toInterval(final GenomeLoc loc, final int locIndex) {
    final String intervalName = "interval_" + locIndex;
    final boolean negativeStrand = false;
    return new Interval(loc.getContig(), loc.getStart(), loc.getStop(), negativeStrand, intervalName);
}
/**
 * Merge a list of genome locs that may be overlapping, returning the list of unique genomic locations.
 *
 * Overlapping neighbors are always merged; merely adjacent (contiguous) neighbors are merged
 * only when {@code rule} is {@code null} or {@link IntervalMergingRule#ALL}.
 *
 * @param raw the unchecked genome loc list
 * @param rule the merging rule we're using
 * @return the list of merged locations (unmodifiable)
 */
public static List<GenomeLoc> mergeIntervalLocations(final List<GenomeLoc> raw, final IntervalMergingRule rule) {
    if (raw.size() <= 1) {
        // Nothing to merge for zero or one element.
        return Collections.unmodifiableList(raw);
    }
    final boolean mergeContiguous = (rule == null || rule == IntervalMergingRule.ALL);
    final List<GenomeLoc> merged = new ArrayList<>();
    GenomeLoc current = raw.get(0);
    for (int i = 1; i < raw.size(); i++) {
        final GenomeLoc next = raw.get(i);
        if (current.overlapsP(next) || (mergeContiguous && current.contiguousP(next))) {
            // Fold the neighbor into the running merged loc.
            current = current.merge(next);
        } else {
            // Gap reached: emit the accumulated loc and start a new one.
            merged.add(current);
            current = next;
        }
    }
    merged.add(current);
    return Collections.unmodifiableList(merged);
}
/**
 * Sums the sizes (in bp) of all the given loci.
 *
 * @param locs loci to total
 * @return the combined size of all loci in the list
 */
public static long intervalSize(final List<GenomeLoc> locs) {
    return locs.stream()
            .mapToLong(GenomeLoc::size)
            .sum();
}
/**
 * Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs.
 * Each loc is padded by {@code basePairs} on both sides, then the padded locs are sorted and
 * adjacent/overlapping ones merged.
 *
 * @param parser A genome loc parser for creating the new intervals
 * @param locs Original genome locs
 * @param basePairs Number of base pairs on each side of loc
 * @return The list of intervals between the locs
 */
public static List<GenomeLoc> getIntervalsWithFlanks(final GenomeLocParser parser, final List<GenomeLoc> locs, final int basePairs) {
    if (locs.isEmpty()) {
        return Collections.emptyList();
    }
    final List<GenomeLoc> padded = new ArrayList<>(locs.size());
    for (final GenomeLoc loc : locs) {
        padded.add(parser.createPaddedGenomeLoc(loc, basePairs));
    }
    return sortAndMergeIntervals(parser, padded, IntervalMergingRule.ALL).toList();
}
/**
 * Pads the provided intervals by the specified amount, sorts the resulting intervals, and merges intervals
 * that are adjacent/overlapping after padding.
 *
 * @param intervals intervals to pad
 * @param basePairs number of bases of padding to add to each side of each interval
 * @param dictionary sequence dictionary used to restrict padded intervals to the bounds of their contig
 * @return the provided intervals padded by the specified amount, sorted, with adjacent/overlapping intervals merged
 */
public static List<SimpleInterval> getIntervalsWithFlanks(final List<SimpleInterval> intervals, final int basePairs, final SAMSequenceDictionary dictionary) {
    final GenomeLocParser locParser = new GenomeLocParser(dictionary);
    // Convert to GenomeLocs, delegate the padding/sorting/merging, then convert back.
    final List<GenomeLoc> padded =
            getIntervalsWithFlanks(locParser, genomeLocsFromLocatables(locParser, intervals), basePairs);
    return convertGenomeLocsToSimpleIntervals(padded);
}
/**
* Accepts a sorted List of intervals, and returns a List of Lists of intervals grouped by contig,
* one List per contig.
*
* @param sortedIntervals sorted List of intervals to group by contig
* @return A List of Lists of intervals, one List per contig
*/
public static List> groupIntervalsByContig(final List sortedIntervals) {
final List> intervalGroups = new ArrayList<>();
List currentGroup = new ArrayList<>();
String currentContig = null;
for ( final SimpleInterval currentInterval : sortedIntervals ) {
if ( currentContig != null && ! currentContig.equals(currentInterval.getContig()) ) {
intervalGroups.add(currentGroup);
currentGroup = new ArrayList<>();
}
currentContig = currentInterval.getContig();
currentGroup.add(currentInterval);
}
if ( ! currentGroup.isEmpty() ) {
intervalGroups.add(currentGroup);
}
return intervalGroups;
}
/**
 * Groups the given locs by contig, preserving contig encounter order, skipping unmapped locs.
 *
 * NOTE(review): assumes all locs for a contig are consecutive in {@code sorted}; if a contig
 * reappears after a different one, the later run replaces the earlier map entry -- confirm
 * callers always pass fully sorted input. Generic parameters stripped by extraction; this maps
 * contig name to its list of GenomeLocs.
 *
 * @param sorted locs sorted so that same-contig locs are adjacent
 * @return insertion-ordered map from contig name to the locs on that contig
 */
private static LinkedHashMap> splitByContig(final List sorted) {
    final LinkedHashMap> splits = new LinkedHashMap<>();
    GenomeLoc last = null;
    List contigLocs = null;
    for (final GenomeLoc loc: sorted) {
        // Unmapped locs have no contig and are excluded from the grouping.
        if (GenomeLoc.isUnmapped(loc)) {
            continue;
        }
        // Start a fresh bucket when the contig changes (or on the first mapped loc).
        if (last == null || !last.onSameContig(loc)) {
            contigLocs = new ArrayList<>();
            splits.put(loc.getContig(), contigLocs);
        }
        contigLocs.add(loc);
        last = loc;
    }
    return splits;
}
/**
* Generates a list of {@link GenomeLoc} instances given the appropriate {@link GenomeLocParser} factory
* and a collection of {@link Locatable} instances.
*
*
* The order in the result list is will correspond to the traversal order in the input collection.
*
* @param locatables input locatable collection.
* @throws IllegalArgumentException if {@code locatable} is {@code null} or contains any {@code null}.
* @return never {@code null}. The result is an unmodifiable list.
*/
public static List genomeLocsFromLocatables(final GenomeLocParser parser, final Collection extends Locatable> locatables) {
Utils.nonNull(parser, "the input genome-loc parser cannot be null");
Utils.nonNull(locatables, "the input locatable collection cannot be null");
Utils.containsNoNull(locatables, "some element in the locatable input collection is null");
final List result = locatables.stream().map(parser::createGenomeLoc).collect(Collectors.toList());
return Collections.unmodifiableList(result);
}
/**
 * Builds a list of intervals that cover the whole given sequence, one interval per contig.
 *
 * @param sequenceDictionary dictionary describing the reference's contigs
 * @return full-contig intervals in dictionary order
 */
public static List<SimpleInterval> getAllIntervalsForReference(final SAMSequenceDictionary sequenceDictionary) {
    final List<SimpleInterval> intervals = new ArrayList<>();
    for (final GenomeLoc loc : GenomeLocSortedSet.createSetFromSequenceDictionary(sequenceDictionary)) {
        intervals.add(new SimpleInterval(loc));
    }
    return intervals;
}
/**
 * Given an interval query string and a sequence dictionary, determine if the query string can be
 * resolved as a valid interval query against more than one contig in the dictionary, i.e., more than
 * one of:
 *
 *     prefix
 *     prefix:nnn
 *     prefix:nnn+
 *     prefix:nnn-nnn
 *
 * and return the list of all possible interpretations (there can never be more than 2). Note that for
 * an ambiguity to exist, the query string must contain at least one colon.
 *
 * @param intervalQueryString the query string to resolve; never {@code null} or empty
 * @param sequenceDictionary dictionary whose contig names the query is resolved against
 * @return List containing 0, 1, or 2 valid interpretations of {@code intervalQueryString} given
 * {@code sequenceDictionary}. If the list is empty, the query doesn't match any contig in the sequence
 * dictionary. If the list contains more than one interval, the query string is ambiguous and should be
 * rejected. If the list contains a single interval, the query is unambiguous and can be safely used to
 * conduct a query.
 * @throws IllegalArgumentException if the query only matches a single contig in the dictionary, but the start
 * and end positions are not valid
 * @throws NumberFormatException if the query only matches a single contig in the dictionary, but the query
 * interval parameters (start, end) cannot be parsed
 */
public static List getResolvedIntervals(
        final String intervalQueryString,
        final SAMSequenceDictionary sequenceDictionary) {
    Utils.nonNull(intervalQueryString);
    Utils.validateArg(!intervalQueryString.isEmpty(), "intervalQueryString should not be empty");
    // Keep a list of all valid interpretations
    final List resolvedIntervals = new ArrayList<>();
    // Treat the entire query string as a contig name. If it exists in the sequence dictionary,
    // count that as one valid interpretation.
    final SAMSequenceRecord queryAsContigName = sequenceDictionary.getSequence(intervalQueryString);
    if (queryAsContigName != null) {
        resolvedIntervals.add(new SimpleInterval(intervalQueryString, 1, queryAsContigName.getSequenceLength()));
    }
    // The query must contain at least one colon for an ambiguity to exist.
    final int lastColonIndex = intervalQueryString.lastIndexOf(SimpleInterval.CONTIG_SEPARATOR);
    if (lastColonIndex == -1) {
        return resolvedIntervals;
    }
    // Get a prefix containing everything up to the last colon, and see if it represents a valid contig.
    final String prefix = intervalQueryString.substring(0, lastColonIndex);
    final SAMSequenceRecord prefixSequence = sequenceDictionary.getSequence(prefix);
    if (prefixSequence == null) {
        // No contig matches the prefix, so only the whole-string interpretation (if any) survives.
        return resolvedIntervals;
    }
    try {
        final int lastDashIndex = intervalQueryString.lastIndexOf(SimpleInterval.START_END_SEPARATOR);
        int startPos;
        int endPos;
        // Try to resolve the suffix as a query against the contig represented by the prefix.
        if (intervalQueryString.endsWith(SimpleInterval.END_OF_CONTIG)) {
            // try to resolve as "prefix:nnn+" (from nnn to the end of the contig)
            startPos = SimpleInterval.parsePositionThrowOnFailure(intervalQueryString.substring(lastColonIndex + 1, intervalQueryString.length()-1));
            endPos = prefixSequence.getSequenceLength();
        } else if (lastDashIndex > lastColonIndex) {
            // try to resolve as "prefix:start-end"
            startPos = SimpleInterval.parsePositionThrowOnFailure(intervalQueryString.substring(lastColonIndex + 1, lastDashIndex));
            endPos = SimpleInterval.parsePositionThrowOnFailure(intervalQueryString.substring(lastDashIndex + 1, intervalQueryString.length()));
        } else {
            // finally, try to resolve as "prefix:nnn" (single-position query)
            startPos = SimpleInterval.parsePositionThrowOnFailure(intervalQueryString.substring(lastColonIndex + 1, intervalQueryString.length()));
            endPos = startPos;
        }
        if (SimpleInterval.isValid(prefix, startPos, endPos)) {
            // We've pre-tested to validate the positions, so add this interval. This should never throw.
            resolvedIntervals.add(new SimpleInterval(prefix, startPos, endPos));
        } else {
            // Positions don't appear to be valid, but we don't want to throw if there is any other valid
            // interpretation of the query string
            if (resolvedIntervals.isEmpty()) {
                // validatePositions throws on validation failure, which is guaranteed if we got here
                SimpleInterval.validatePositions(prefix, startPos, endPos);
            }
        }
    } catch (NumberFormatException e) {
        // parsing of the start or end pos failed
        if (resolvedIntervals.isEmpty()) {
            // No fallback interpretation exists, so surface the parse failure to the caller.
            throw e;
        } else {
            // We're interpreting this as a query against a full contig, but its POSSIBLE that the user
            // mis-entered the start or stop position, and had they entered them correctly, would have resulted
            // in an ambiguity. So accept the query as an interval for the full contig, but issue a warning
            // saying how the query was resolved.
            log.warn(String.format(
                    "The query interval string \"%s\" is interpreted as a query against the contig named \"%s\", " +
                            "but may have been intended as an (accidentally malformed) query against the contig named \"%s\"",
                    intervalQueryString,
                    resolvedIntervals.get(0).getContig(),
                    prefixSequence.getSequenceName()));
        }
    }
    return resolvedIntervals;
}
/**
 * Create a new interval, bounding start and stop by the start and end of contig.
 *
 * This function will return null if start and stop cannot be adjusted in any reasonable way
 * to be on the contig. For example, if start and stop are both past the end of the contig,
 * there's no way to fix this, and null will be returned.
 *
 * @param contig our contig
 * @param start our start as an arbitrary integer (may be negative, etc)
 * @param stop our stop as an arbitrary integer (may be negative, etc)
 * @param contigLength length of the contig
 * @return a valid interval over contig, or null if a meaningful interval cannot be created
 */
public static SimpleInterval trimIntervalToContig(final String contig, final int start, final int stop, final int contigLength) {
    Utils.nonNull(contig);
    Utils.validateArg(contigLength >= 1, () -> "contigLength should be at least 1 but was " + contigLength);
    // Clamp both endpoints into [1, contigLength].
    final int clampedStart = Math.max(1, start);
    final int clampedStop = Math.min(contigLength, stop);
    // If clamping pushed the endpoints past each other's bounds, the original
    // interval lay entirely off the contig and cannot be repaired.
    final boolean entirelyOffContig = clampedStart > contigLength || clampedStop < 1;
    return entirelyOffContig ? null : new SimpleInterval(contig, clampedStart, clampedStop);
}
/**
 * Determines whether the provided interval is within the bounds of its assigned contig according to the provided dictionary.
 *
 * @param interval interval to check
 * @param dictionary dictionary to use to validate contig bounds
 * @return true if the interval's contig exists in the dictionary, and the interval is within its bounds, otherwise false
 */
public static boolean intervalIsOnDictionaryContig( final SimpleInterval interval, final SAMSequenceDictionary dictionary ) {
    Utils.nonNull(interval);
    Utils.nonNull(dictionary);
    // Unknown contig means the interval cannot be on the dictionary's reference.
    final SAMSequenceRecord contigRecord = dictionary.getSequence(interval.getContig());
    return contigRecord != null && interval.getEnd() <= contigRecord.getSequenceLength();
}
//-------------------------------------------------------------------------------------------------
// Utility code related to the notion of splitting intervals at well-defined boundaries,
// so that whichever intervals you start from, the resulting shards will line up.
/**
*
* Splits the given input intervals into shards of at most the requested size.
* The shard boundaries lie at integer multiples of shardSize.
*
* chr2:1-200 -> chr2:1-100,chr2:101-200
*
* This method will return all intervals in RAM. If you need a solution that is light on RAM usage, though it
* returns an iterator, see {@link org.broadinstitute.hellbender.utils.iterators.ShardedIntervalIterator}
*/
static public List cutToShards(Iterable intervals, int shardSize) {
ArrayList ret = new ArrayList<>();
for (SimpleInterval i : intervals) {
int beginShard = shardIndex(i.getStart(), shardSize);
int endShard = shardIndex(i.getEnd(), shardSize);
if (beginShard==endShard) {
ret.add(i);
continue;
}
// more than one shard: output begin to end-of-shard, then multiple full shards, then begin-of-shard to end.
ret.add(new SimpleInterval(i.getContig(), i.getStart(), endOfShard(beginShard, shardSize)));
for (int shard = beginShard+1; shard getSpanningIntervals(final List extends Locatable> locations, final SAMSequenceDictionary sequenceDictionary){
return locations.stream()
.collect(Collectors.groupingBy(Locatable::getContig))
.values().stream().map(IntervalUtils::getSpanningInterval).sorted((i1,i2)->compareLocatables(i1,i2,sequenceDictionary))
.collect(Collectors.toList());
}
/**
 * Combine the breakpoints of multiple intervals and return a list of locatables based on the updated breakpoints.
 *
 * Example -- given list 1:
 *     1  1000  2000
 * and list 2:
 *     1   500  2500
 *     1  2501  3000
 *     1  4000  5000
 * the result is:
 *     1   500   999
 *     1  1000  2000
 *     1  2001  2500
 *     1  2501  3000
 *     1  4000  5000
 *
 * Note that start breakpoints will always appear as starts of the resulting intervals.
 * Does not alter the inputs. Any single input list containing duplicates or overlapping
 * intervals will cause an exception. Intervals are assumed to include their start and end
 * bases. This method performs all necessary sorting.
 *
 * NOTE(review): generic parameters stripped by extraction; the inputs are lists of Locatables
 * and the result holds SimpleIntervals.
 *
 * @param unsortedLocatables1 list of locatables
 * @param unsortedLocatables2 list of locatables
 * @param dictionary Sequence dictionary to base the sort. The order of contigs/sequences in the dictionary is the order of the sorting here.
 * @return Locatables from the combined breakpoints of unsortedLocatables1 and unsortedLocatables2. If both inputs are null, return an
 * empty list. Please note that returned values are new copies. If exactly one of the inputs is null, this method
 * returns a copy of the non-null input.
 */
public static List combineAndSortBreakpoints(final List unsortedLocatables1,
                                             final List unsortedLocatables2,
                                             final SAMSequenceDictionary dictionary) {
    Utils.nonNull(dictionary);
    final List locatables1 = sortLocatablesBySequenceDictionary(unsortedLocatables1, dictionary);
    final List locatables2 = sortLocatablesBySequenceDictionary(unsortedLocatables2, dictionary);
    if ((locatables1 == null) && (locatables2 == null)) {
        return Collections.emptyList();
    }
    // Each individual list must be free of duplicates and internal overlaps.
    validateNoOverlappingIntervals(locatables1);
    validateNoOverlappingIntervals(locatables2);
    // If one side is empty, the answer is simply a copy of the other side.
    if (CollectionUtils.isEmpty(locatables1)) {
        return locatables2.stream().map(SimpleInterval::new).collect(Collectors.toList());
    }
    if (CollectionUtils.isEmpty(locatables2)) {
        return locatables1.stream().map(SimpleInterval::new).collect(Collectors.toList());
    }
    final List masterList = new ArrayList<>();
    masterList.addAll(locatables1);
    masterList.addAll(locatables2);
    final Set contigs = masterList.stream()
            .map(Locatable::getContig).collect(Collectors.toSet());
    // Map each contig to the set of its breakpoints (position plus start/end classification).
    final Map>> contigToBreakpoints = contigs.stream()
            .collect(Collectors.toMap(Function.identity(), l -> new HashSet<>()));
    // Populate initialized maps with contigs to the break points. Also, keep a mapping of contigs
    // to start breakpoints.
    masterList.forEach(l -> {
        contigToBreakpoints.get(l.getContig()).add(Pair.of(l.getStart(), IntervalBreakpointType.START_BREAKPOINT));
        contigToBreakpoints.get(l.getContig()).add(Pair.of(l.getEnd(), IntervalBreakpointType.END_BREAKPOINT));
    });
    final List result = new ArrayList<>();
    for (final String contig : contigs) {
        // Sort the breakpoints for this contig. Use the pair structure, since we need to differentiate between a
        // breakpoint that is a start and a breakpoint that is end, yet have the same position. This is especially
        // important for single base intervals.
        final List> breakpoints = new ArrayList<>(contigToBreakpoints.get(contig));
        breakpoints.sort((p1, p2) -> {
            final int firstComparison = p1.getLeft().compareTo(p2.getLeft());
            if (firstComparison != 0) {
                return firstComparison;
            } else {
                // We want start breakpoints before end breakpoints
                return p1.getRight().compareTo(p2.getRight());
            }
        });
        // Running counts of starts/ends seen so far; used to detect whether we are
        // inside at least one source interval.
        int numCurrentStarts = 0;
        int numCurrentEnds = 0;
        // Walk consecutive breakpoint pairs; each adjacent pair delimits a candidate output interval.
        for (int i = 0; i < (breakpoints.size() - 1); i++) {
            final int currentBreakpoint = breakpoints.get(i).getLeft();
            final int nextBreakpoint = breakpoints.get(i + 1).getLeft();
            final boolean isCurrentBreakpointStart = breakpoints.get(i).getRight() == IntervalBreakpointType.START_BREAKPOINT;
            final boolean isNextBreakpointStart = breakpoints.get(i + 1).getRight() == IntervalBreakpointType.START_BREAKPOINT;
            // if both breakpoints are starts of intervals, then the result is bp1, bp2-1
            // if both breakpoints are ends of intervals, then the result is bp1+1, bp2
            int start = (!isCurrentBreakpointStart && !isNextBreakpointStart ? currentBreakpoint + 1 : currentBreakpoint);
            int end = (isCurrentBreakpointStart && isNextBreakpointStart ? nextBreakpoint - 1 : nextBreakpoint);
            // If the current breakpoint is an end and the next is a start, then we want to shrink both ends.
            // Note that this could indicate that we are between intervals for one list of locatables,
            // but not the other.
            final boolean isBetweenIntervals = !isCurrentBreakpointStart && isNextBreakpointStart;
            if (isBetweenIntervals) {
                start++;
                end--;
            }
            // If the next breakpoint is a start and the current is end AND we are not in the middle of an interval,
            // then we do NOT add the interval at all (we are between intervals in both lists).
            if (isCurrentBreakpointStart) {
                numCurrentStarts++;
            } else {
                numCurrentEnds++;
            }
            // We also need to check if start > end, since we could have been between intervals that were one adjacent,
            // so we don't want to add an interval for that.
            if (((!isBetweenIntervals) || (numCurrentStarts > numCurrentEnds)) &&
                    start <= end){
                result.add(new SimpleInterval(contig, start, end));
            }
        }
    }
    // Contig iteration order came from a HashSet, so a final dictionary sort is required.
    return sortLocatablesBySequenceDictionary(result, dictionary);
}
/**
 * Throws a Bad Input exception if any duplicates or overlaps are detected within the list of locatables.
 *
 * @param locatables List of locatables to test. {@code null} will never throw an exception.
 * @param <T> Locatable class
 * @throws UserException.BadInput if the list contains duplicates or internally overlapping intervals
 */
public static <T extends Locatable> void validateNoOverlappingIntervals(List<T> locatables) {
    // Do not throw an exception for empty or null lists.
    if (CollectionUtils.isEmpty(locatables)) {
        return;
    }
    // Duplicate check: a HashSet collapses equal intervals, so a size mismatch means duplicates.
    final Set<T> deduplicated = new HashSet<>(locatables);
    if (deduplicated.size() != locatables.size()) {
        throw new UserException.BadInput("Duplicate(s) detected in input: " + locatables.size() + " intervals had " + (locatables.size() - deduplicated.size()) + " duplicates.");
    }
    // Overlap check: every interval overlaps itself, so any result set larger than one is a true overlap.
    final OverlapDetector<T> detector = OverlapDetector.create(locatables);
    for (final T locatable : locatables) {
        final Set<T> overlaps = detector.getOverlaps(locatable);
        if (overlaps.size() > 1) {
            throw new UserException.BadInput("Overlap detected in input: " + locatable + " overlapped " + StringUtils.join(overlaps, ", "));
        }
    }
}
/**
 * Sort by the contig then position as specified by the index order in the given sequence dictionary.
 *
 * @param locatables collection of locatables; may be {@code null}
 * @param dictionary Never {@code null}
 * @param <T> Locatable
 * @return new list that is sorted using the sequence index of the given dictionary. Returns {@code null} if locatables
 * is {@code null}. Instances in the list are not copies of input.
 */
public static <T extends Locatable> List<T> sortLocatablesBySequenceDictionary(final Collection<T> locatables, final SAMSequenceDictionary dictionary) {
    Utils.nonNull(dictionary);
    if (locatables == null) {
        // Preserve null-in/null-out behavior for callers that distinguish null from empty.
        return null;
    }
    final List<T> sorted = new ArrayList<>(locatables);
    sorted.sort(getDictionaryOrderComparator(dictionary));
    return sorted;
}
/**
* Creates a map of which locatables (keys) overlap the other list of locatables (vals)
*
* Input lists will be sorted sorted by the input dictionary.
*
* Within a single input list, segments cannot overlap (or duplicate). If this occurs, an exception is thrown.
*
* No copies of inputs are created.
*
* @param keys -- the intervals we wish to query. Sorted by interval. No intervals overlap. Never {@code null}
* @param vals -- the intervals that we wish to map to the keys. Sorted by interval. No intervals overlap. Never {@code null}
* @param dictionary -- the SAMSequenceDictionary that the intervals (and sorting) derive from. Never {@code null}
* @return a mapping of intervals from keys to the list of overlapping intervals in vals. All item in keys will
* have a key. Never {@code null}
*/
public static Map> createOverlapMap(final List keys, final List vals,
final SAMSequenceDictionary dictionary) {
Utils.nonNull(keys);
Utils.nonNull(vals);
Utils.nonNull(dictionary);
validateNoOverlappingIntervals(keys);
validateNoOverlappingIntervals(vals);
final List sortedKeys = sortLocatablesBySequenceDictionary(keys, dictionary);
final OverlapDetector overlapDetector = OverlapDetector.create(vals);
final Map> result = new HashMap<>();
for (final T key: sortedKeys) {
// Get the overlaps, sort'em, and create a map entry.
final Set overlaps = overlapDetector.getOverlaps(key);
final List overlapsAsList = sortLocatablesBySequenceDictionary(overlaps, dictionary);
result.put(key, overlapsAsList);
}
return result;
}
/**
 * Builds a comparator ordering locatables by the contig/sequence order of the given dictionary,
 * then by position.
 *
 * @param dictionary dictionary to use for the sorting. Intervals with sequences not in this dictionary will cause
 *                   exceptions to be thrown. Never {@code null}.
 * @return an instance of {@code Comparator} for use in sorting of Locatables.
 */
public static <T extends Locatable> Comparator<T> getDictionaryOrderComparator(final SAMSequenceDictionary dictionary) {
    Utils.nonNull(dictionary);
    return (first, second) -> IntervalUtils.compareLocatables(first, second, dictionary);
}
/**
 * An enum to classify breakpoints whether the breakpoint is the start or end of a region.
 *
 * Declaration order matters: breakpoint sorting in combineAndSortBreakpoints compares these
 * by enum order so that START_BREAKPOINT sorts before END_BREAKPOINT at equal positions.
 */
public enum IntervalBreakpointType {
    START_BREAKPOINT, END_BREAKPOINT
}
/**
 * Determine whether the two intervals specified overlap each other by at least the threshold proportion specified.
 * This is a commutative operation.
 *
 * @param interval1 Never {@code null}
 * @param interval2 Never {@code null}
 * @param reciprocalOverlapThreshold proportion of each segment that must be covered by the overlap.
 *                                   Must be between 0.0 and 1.0 (inclusive).
 * @return whether there is a reciprocal overlap exceeding the given threshold. If reciprocalOverlapThreshold is 0,
 * always returns true, even if intervals do not overlap.
 */
public static boolean isReciprocalOverlap(final SimpleInterval interval1, final SimpleInterval interval2, final double reciprocalOverlapThreshold) {
    Utils.nonNull(interval1);
    Utils.nonNull(interval2);
    ParamUtils.inRange(reciprocalOverlapThreshold, 0.0, 1.0, "Reciprocal threshold must be between 0.0 and 1.0.");
    if (reciprocalOverlapThreshold == 0.) {
        // A zero threshold is trivially satisfied, even with no overlap at all.
        return true;
    }
    // The overlap check is required, since the intersect call below requires that the intervals overlap.
    if (!interval1.overlaps(interval2)) {
        return false;
    }
    // The intersection is the same region regardless of operand order, so compute its size once
    // (the original computed both interval1.intersect(interval2) and interval2.intersect(interval1)).
    final int overlapSize = interval1.intersect(interval2).size();
    return overlapSize >= interval2.size() * reciprocalOverlapThreshold
            && overlapSize >= interval1.size() * reciprocalOverlapThreshold;
}
/**
* Given a collection of {@link SimpleInterval} instances returns a map where the key is the enclosing contig name and the value is the
* list of the intervals on that contig that result from sorting and then merging those that are contiguous or overlap.
*
*
* The output collections (map and lists) are mutable, and any modification on them should not change the inputs as a side effect.
*
*
* @param input input collection of lacatables, may contain duplicates.
* @param dictionary the reference dictionary.
* @param the locatable type.
* @throws IllegalArgumentException if input is {@code null}.
* @return never {@code null}, but perhaps an empty map. It is guarantee that no value in the map is an empty list upon return.
*/
public static Map> sortAndMergeIntervals(final Collection input,
final SAMSequenceDictionary dictionary, final IntervalMergingRule rule) {
final GenomeLocParser parser = new GenomeLocParser(dictionary);
final List locs = input.stream().map(parser::createGenomeLoc).collect(Collectors.toList());
final GenomeLocSortedSet resultSet = sortAndMergeIntervals(parser, locs, rule);
return resultSet.stream().map(loc -> new SimpleInterval(loc.contigName, loc.start, loc.stop)).collect(Collectors.groupingBy(SimpleInterval::getContig));
}
}