All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.util.PickIlluminaIndicesCommand Maven / Gradle / Ivy

The newest version!
/**
 * Copyright (c) 2015, Fulcrum Genomics LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

package com.fulcrumgenomics.util;

import htsjdk.samtools.util.*;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import static java.lang.Math.pow;

/**
 * Program for picking sets of indices of arbitrary length that meet certain constraints
 * and attempt to maximize the edit distance between all members of the set picked.
 *
 * @author Tim Fennell
 */
class PickIlluminaIndicesCommand {
    /** The temperature at which the adapter ligation happens. */
    public static final int LIGATION_TEMPERATURE = 30;

    public int INDEX_LENGTH;
    public int N_INDICES;
    public int EDIT_DISTANCE;
    public boolean ALLOW_REVERSES;
    public boolean ALLOW_REVERSE_COMPLEMENTS;
    public boolean ALLOW_PALINDROMES;
    public int MAX_HOMOPOLYMER;
    public double MIN_GC;
    public double MAX_GC;
    public File OUTPUT;
    public int NUM_THREADS;
    public File VIENNA_RNA_DIR;
    public double MIN_DELTAG;
    public List INDEX_ADAPTER;
    public List AVOID_SEQUENCE;
    public List CANDIDATES;

    final static Log log = Log.getInstance(PickIlluminaIndicesCommand.class);

    private static final int[] SCORE_BY_DISTANCE = {0, 500000, 5000, 1};
    private int defaultEdgeSetSize = 0;

    /**
     *  Little class to encapsulate a index sequence and it's relationships to other indices.
     */
    class Index {
        final byte[] sequence;
        private final Collection related;
        private int _score;
        private byte _minEditDistance;
        private List folds = new ArrayList<>();

        Index(final byte[] sequence) {
            this.sequence = sequence;
            this.related = new HashSet<>(defaultEdgeSetSize);
            reset();
        }

        private void reset() { _score = Integer.MIN_VALUE; _minEditDistance = -1; }

        synchronized void addRelated(final Index b) {
            this.related.add(b);
            reset();
        }

        void removeRelated(final Index b) {
            this.related.remove(b);
            reset();
        }

        /** Returns the barcode that is this one reversed. */
        Index reverse() {
            final byte[] tmp = Arrays.copyOf(this.sequence, this.sequence.length);
            SequenceUtil.reverseQualities(tmp); // tmp isn't a quality score array, but all this method does is reverse in place!
            return new Index(tmp);
        }

        /** Generates a score where large scores are bad. */
        final int score() {
            if (_score == Integer.MIN_VALUE ) recalculate();
            return _score;
        }

        /** Returns the minimum edit distance between this barcode and any barcode in it's related set. */
        final int minEditDistance() {
            if (_minEditDistance == -1) recalculate();
            return _minEditDistance;
        }

        /** Gives the edit distance between this barcode and another of the same length. */
        byte calculateEditDistance(final Index that) {
            byte tmp = 0;
            for (int i=0; i {
        final BlockingQueue queue = new ArrayBlockingQueue<>(10000);
        final ThreadPoolExecutor exec = new ThreadPoolExecutor(NUM_THREADS, NUM_THREADS, 1, TimeUnit.MINUTES, queue);

        /** Basic constructor. */
        IndexSet(final Comparator comparator) {
            super(comparator);
        }


        /**
         * Overridden to remove a Barcode and the "edit" all the related indices to remove their relationship
         * to this barcode and re-insert them into the Set to resort them.
         */
        @Override public boolean remove(final Object o) {
            final Index index = (Index) o;

            /** Initialize a countdown int that each thread will update so we know when everything is processed. */
            final AtomicInteger countdown = new AtomicInteger(index.related.size());
            final Object latch = new Object();

            if (super.remove(index)) {
                /** For each barcode queue up a job to recalculate scores and remove/add from the ranked set. */
                for (final Index other : index.related) {
                    exec.execute(() -> {
                        try {
                            IndexSet.super.remove(other);
                            other.removeRelated(index);
                            other.recalculate();
                            IndexSet.this.add(other);

                            if (countdown.decrementAndGet() == 0) {
                                synchronized (latch) { latch.notify(); }
                            }
                        } catch (final Exception e) {
                            log.warn("Exception during the removing and index and recalculating its neighbors' scores: " + e.getMessage() + "\n" +
                                    Arrays.stream(e.getStackTrace()).map(Object::toString).collect(Collectors.joining("\n")));
                        }
                    });
                }

                /** Wait until all the related indices get updated. */
                while (countdown.get() > 0) {
                    try { synchronized(latch) { latch.wait(1000); } }
                    catch (InterruptedException ie) { /* Do Nothing */ }
                }

                return true;
            }
            else {
                return false;
            }
        }
    }

    protected int execute() {
        // Generate all kmers
        final List kmers = (this.CANDIDATES.isEmpty()) ? generateAllKmers(INDEX_LENGTH) : this.CANDIDATES;
        List indexes = new LinkedList<>();
        log.info("Generated " + kmers.size() + " kmers.");

        this.defaultEdgeSetSize = guessAtNumberOfEdgesPerBarcode(INDEX_LENGTH, EDIT_DISTANCE);
        log.info("Guess at number of edges: " + this.defaultEdgeSetSize);

        // Remove any that violate the max homopolyer length and convert them into indices
        for (final byte[] kmer : kmers) {
            if (lengthOfLongestHomopolymer(kmer) <= MAX_HOMOPOLYMER) {
                indexes.add(new Index(kmer));
            }
        }
        log.info("Retained " + indexes.size() + " indices after applying max homopolymer restriction.");

        // Shuffle the indexes so there is no bias in nucleotide representation.
        Collections.shuffle(indexes);

        if (!ALLOW_PALINDROMES) {
            filterForPalindromes(indexes);
            log.info("Retained " + indexes.size() + " indices after filtering for palindromes.");
        }

        filterForAvoidSequences(indexes);
        log.info("Retained " + indexes.size() + " indices after filtering for sequences to avoid.");

        filterForGc(indexes);
        log.info("Retained " + indexes.size() + " indices after filtering for GC content.");

        // Filter out reverses
        if (!ALLOW_REVERSES) {
            filterOutReverses(indexes);
            log.info("Retained " + indexes.size() + " indices after removing indices that are reversals of other indices.");
        }

        // Filter out reverse complements
        if (!ALLOW_REVERSE_COMPLEMENTS) {
            filterOutReverseComplements(indexes);
            log.info("Retained " + indexes.size() + " indices after removing indices that are reverse-complements of other indices.");
        }

        // Filter out indices that have very bad secondary structure
        if (VIENNA_RNA_DIR != null) {
            DnaFoldPredictor predictor = new DnaFoldPredictor(VIENNA_RNA_DIR, LIGATION_TEMPERATURE);
            final Iterator iterator = indexes.iterator();
            while (iterator.hasNext()) {
                final Index index = iterator.next();
                final String bases = StringUtil.bytesToString(index.sequence);

                // Predict folding for each adapter that the index will be embedded in
                for (final String adapter : this.INDEX_ADAPTER) {
                    final String seq = adapter.replaceFirst("N+", bases);
                    final DnaFoldPrediction prediction = predictor.predict(seq);
                    index.folds.add(prediction);
                }

                final double deltaG = index.folds.stream().mapToDouble(DnaFoldPrediction::deltaG).min().orElse(MIN_DELTAG);
                if (deltaG < MIN_DELTAG) iterator.remove();
            }
            log.info("Retained " + indexes.size() + " indices after removing indicies with too low deltaG.");
        }

        if (indexes.isEmpty()) {
            log.error("No indices left after previous filters.");
            System.exit(1);
        }

        // Make a big graph that connects all indices with edit distances between 1 and the
        // maximum EDIT_DISTANCE + 1
        log.info("Building the graph of all indices.");
        buildGraph(indexes);

        // First build a tree set where the "first" item is the one with the highest number of other indices
        log.info("Ranking indices.");
        final IndexSet rankedIndexes = rankBarcodes(indexes);
        indexes = null;

        // Finally go through and throw out indices in order until we achieve the desired edit distance
        int lastMinEdit = 0;
        int count = rankedIndexes.size();
        while (count > N_INDICES || lastMinEdit < EDIT_DISTANCE) {
            final Index first = rankedIndexes.first();

            if (first.minEditDistance() > lastMinEdit) {
                lastMinEdit = first.minEditDistance();
                log.info("There are " + count + " indices with a minimum edit distance of " + lastMinEdit);
                if (count == N_INDICES) break;
            }

            if (rankedIndexes.remove(first)) count--;
            else throw new IllegalStateException("Unable to remove the index from the ranked indices: " + first.toString());
            if (count % 100 == 0) log.info("Down to " + count + " indices.");
        }

        log.info("Ended with " + count + " indices with a min edit distance of " + rankedIndexes.first().minEditDistance());
        try {
            final int adapters = this.INDEX_ADAPTER.size();
            final BufferedWriter out = IOUtil.openFileForBufferedWriting(OUTPUT);
            final NumberFormat fmt = NumberFormat.getIntegerInstance();
            final NumberFormat pct = NumberFormat.getPercentInstance();
            final NumberFormat dec = new DecimalFormat("0.00");
            out.write("Barcode\tMinEditDistance\tNumBarcodesAtMinEditDistance\tGC%");
            for (int i=1; i<=adapters; ++i) {
                for (final String header : new String[] {"seq", "fold", "deltag"}) {
                    out.write("\t");
                    out.write(header + "_" + i);
                }
            }
            out.newLine();

            for (final Index bc : rankedIndexes) {
                final int minEditDistance = bc.minEditDistance();
                final int numberAtEditDistance = bc.getCountAtEditDistance(minEditDistance);

                out.write(StringUtil.bytesToString(bc.sequence));
                out.write('\t');
                out.write(fmt.format(minEditDistance));
                out.write('\t');
                out.write(fmt.format(numberAtEditDistance));
                out.write('\t');
                out.write(pct.format(SequenceUtil.calculateGc(bc.sequence)));
                for (int i=0; i indexes) {
        final Iterator iterator = indexes.iterator();
        while (iterator.hasNext()) {
            final Index b = iterator.next();
            final double gc = SequenceUtil.calculateGc(b.sequence);
            if (gc < MIN_GC || gc > MAX_GC) iterator.remove();
        }
    }

    /**
     * Takes the list of indices and identifies ones that have reverse complements in the set and then removes
     * from the set the barcode that comes later in the list.
     */
    private void filterOutReverseComplements(final List indexes) {
        final LinkedHashSet tmp = new LinkedHashSet<>();
        for (final Index bc : indexes) {
            if (!tmp.contains(bc.reverseComplement())) tmp.add(bc);
        }
        indexes.clear();
        indexes.addAll(tmp);
    }

    /**
     * Takes the list of indices and identifies ones that have reverses (not RCs) in the set and then removes
     * from the set the barcode that comes later in the list.
     */
    private void filterOutReverses(final List indexes) {
        final LinkedHashSet tmp = new LinkedHashSet<>();
        for (final Index bc : indexes) {
            if (!tmp.contains(bc.reverse())) tmp.add(bc);
        }
        indexes.clear();
        indexes.addAll(tmp);
    }

    /** Filters out any indices that are kmers contained in the list of sequences to avoid. */
    private void filterForAvoidSequences(final List indexes) {
        final Set avoidKmers = new HashSet<>();
        for (final String seq : AVOID_SEQUENCE) {
            for (int i=0; i iterator = indexes.iterator();
        while (iterator.hasNext()) {
            final String barcodeString = StringUtil.bytesToString(iterator.next().sequence);
            if (avoidKmers.contains(barcodeString)) {
                iterator.remove();
            }
        }
    }

    /** Removes all sequences where the sequence == revcomp(sequence) */
    private void filterForPalindromes(final List indexes) {
        final Iterator iterator = indexes.iterator();
        while (iterator.hasNext()) {
            final Index bc = iterator.next();
            final byte[] rc = Arrays.copyOf(bc.sequence, bc.sequence.length);
            SequenceUtil.reverseComplement(rc);
            if (Arrays.equals(bc.sequence, rc)) {
                iterator.remove();
            }
        }
    }

    /**
     * Generates a SortetSet of indices where they are ranked by the "most connected" to
     * the "least connected".  Must be called after constructing the graph.
     */
    private IndexSet rankBarcodes(final Collection input) {
        final Comparator comparator = (lhs, rhs) -> {
            // Remember: BIGGER scores == WORSE scores
            int retval = rhs.score() - lhs.score();
            if (retval == 0) {
                for (int i=0; i indexes) {
        final ExecutorService exec = Executors.newFixedThreadPool(NUM_THREADS);

        final Index[] bcs = indexes.toArray(new Index[indexes.size()]);
        final int len = bcs.length;
        final AtomicInteger counter = new AtomicInteger(0);

        for (int i=0; i {
                for (int j=firstJ; j generateAllKmers(final int length) {
        final List sofar = new LinkedList<>();
        final byte[] bases = {'A', 'C', 'G', 'T'};

        if (sofar.size() == 0) {
            sofar.add(new byte[length]);
        }

        while (true) {
            final byte[] bs = sofar.remove(0);
            final int indexOfNextBase = findIndexOfNextBase(bs);

            if (indexOfNextBase == -1) {
                sofar.add(bs);
                break;
            }
            else {
                for (final byte b : bases) {
                    final byte[] next = Arrays.copyOf(bs, bs.length);
                    next[indexOfNextBase] = b;
                    sofar.add(next);
                }
            }
        }

        return sofar;
    }

    /** Finds the first non-zero character in the array, or returns -1 if all are non-zero. */
    int findIndexOfNextBase(final byte[] bs) {
        for (int i=0; i0; --i) total += nPickK(length, i) * (int) pow(3, i);
        return (int) (total / 2d);
    }

    /** Implementation of the calculation for how many combinations are there of K items from N items. */
    final int nPickK(final int n, final int k) {
        return fac(n) / ( fac(k)  * fac(n-k));
    }

    /** Simple factorial() implementation. */
    final int fac(final int n) {
        if (n == 0) return 1;
        int result = n;
        for (int i=n-1; i>1; --i) result *= i;
        return result;
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy