All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.spark.pathseq.PSKmerUtils Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.spark.pathseq;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.spark.datasources.ReferenceFileSparkSource;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmerShort;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmerizer;
import org.broadinstitute.hellbender.tools.spark.utils.LargeLongHopscotchSet;
import org.broadinstitute.hellbender.tools.spark.utils.LongBloomFilter;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.reference.ReferenceBases;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;

/**
 * PathSeq utilities for kmer libraries
 */
public class PSKmerUtils {

    public static final String HOPSCOTCH_SET_EXTENSION = ".hss";
    public static final String BLOOM_FILTER_EXTENSION = ".bfi";
    private static final Logger logger = LogManager.getLogger(PSKmerUtils.class);

    /**
     * Gets all kmers from a given reference as a Collection of long arrays, while logging handy progress messages.
     */
    protected static Collection getMaskedKmersFromLocalReference( final ReferenceFileSparkSource ref, final int kSize,
                                                                          final int kSpace, final SVKmerShort mask) {

        //Load reference records
        final Map records;
        try {
            records = ref.getAllReferenceBases();
        } catch (final IOException e) {
            throw new GATKException("Could not get reference bases");
        }

        final Collection collection = new ArrayList<>(records.size());

        //Initialize progress state
        final long totalBases = records.values().stream().mapToLong(refBases -> refBases.getBases().length).sum();
        logger.info("Generating kmers from " + totalBases + " bases in " + records.size() + " records...");
        final ProgressCounter counter = new ProgressCounter(totalBases, 1e6, "Mbp", logger);

        //Get kmers from each reference record
        for (final String recName : records.keySet()) {

            //Kmerize the record
            final byte[] bases = records.get(recName).getBases();
            final long[] list = SVKmerizer.stream(bases, kSize, kSpace, new SVKmerShort(kSize))
                    .mapToLong(kmer -> PSKmerCollection.canonicalizeAndMask((SVKmerShort) kmer, kSize, mask))
                    .toArray();

            //Add kmers to the result
            collection.add(list);

            counter.update(bases.length);
        }
        logger.info("Finished generating kmers!");
        return collection;
    }

    /**
     * Returns the total number of longs in the Collection
     */
    public static long longArrayCollectionSize(final Collection lists) {
        return lists.stream().mapToLong(arr -> arr.length).sum();
    }

    /**
     * Converts a Collection of Lists of Longs's into a Hopscotch set
     */
    protected static LargeLongHopscotchSet longArrayCollectionToSet(final Collection longs, final long numLongs) {
        final LargeLongHopscotchSet kmerHopscotchSet = new LargeLongHopscotchSet(numLongs);
        for (final long[] array : longs) {
            for (final long val : array) {
                kmerHopscotchSet.add(val);
            }
        }
        return kmerHopscotchSet;
    }

    /**
     * Converts a Collection of Lists of Longs's into a Bloom filter
     */
    protected static LongBloomFilter longArrayCollectionToBloomFilter(final Collection longs, final long numLongs, final double bloomFpp) {
        final LongBloomFilter bloomFilter = new LongBloomFilter(numLongs, bloomFpp);
        final ProgressCounter counter = new ProgressCounter(numLongs, 1e6, "million kmers", logger);
        for (final long[] array : longs) {
            bloomFilter.addAll(array);
            counter.update(array.length);
        }
        return bloomFilter;
    }

    /**
     * Writes an object to a URI using Kryo serialization.
     */
    public static void writeKryoObject(final Object obj, String uri) {
        final Output output = new Output(BucketUtils.createFile(uri));
        final Kryo kryo = new Kryo();
        kryo.writeObject(output, obj);
        output.close();
    }

    public static void writeKmerSet(final String uri, final PSKmerSet set) {
        String filePath = uri;
        if (!uri.toLowerCase().endsWith(HOPSCOTCH_SET_EXTENSION.toLowerCase())) {
            filePath = filePath + HOPSCOTCH_SET_EXTENSION;
        }
        writeKryoObject(set, filePath);
    }

    public static void writeKmerBloomFilter(final String uri, final PSKmerBloomFilter bloomFilter) {
        String filePath = uri;
        if (!uri.toLowerCase().endsWith(BLOOM_FILTER_EXTENSION.toLowerCase())) {
            filePath = filePath + BLOOM_FILTER_EXTENSION;
        }
        writeKryoObject(bloomFilter, filePath);
    }

    public static PSKmerCollection readKmerFilter(final String uri) {
        final Input input = new Input(BucketUtils.openFile(uri));
        final Kryo kryo = new Kryo();
        if (uri.endsWith(HOPSCOTCH_SET_EXTENSION)) {
            return kryo.readObject(input, PSKmerSet.class);
        } else if (uri.endsWith(BLOOM_FILTER_EXTENSION)) {
            return kryo.readObject(input, PSKmerBloomFilter.class);
        }
        throw new UserException.BadInput("Unknown kmer set extension in file name " + uri);
    }

    private final static class ProgressCounter {
        long processedItems, processedItemsSinceLast;
        final long initialTime, totalItems, itemsInterval;
        final double normFactor;
        final Logger logger;
        final String name;

        public ProgressCounter (final long totalItems, final double normFactor, final String name, final Logger logger) {
            this.processedItems = 0;
            this.processedItemsSinceLast = 0;
            this.totalItems = totalItems;
            this.itemsInterval = totalItems / 100;
            this.initialTime = System.currentTimeMillis();
            this.normFactor = normFactor;
            this.logger = logger;
            this.name = name;
        }

        public void update(final long numItems) {
            processedItems += numItems;
            processedItemsSinceLast += numItems;
            if (processedItemsSinceLast > itemsInterval) {
                final long currentTime = System.currentTimeMillis();
                final double percentComplete = 100.0 * processedItems / (double) totalItems;
                final double averageRate = processedItems / (normFactor * (currentTime - initialTime) / (1000.0 * 60.0));
                final double estimatedTime = ((totalItems - processedItems) / normFactor) / averageRate;
                final String percentStr = String.format("%1$.1f", percentComplete);
                final String totalStr = String.format("%1$.1f", processedItems / normFactor);
                final String rateStr = String.format("%1$.1f", averageRate);
                final String timeStr = String.format("%1$.2f", estimatedTime);
                logger.info(percentStr + "% complete - " + totalStr + " " + name + " at " + rateStr + " " + name + "/min, " + timeStr + " min remaining");
                processedItemsSinceLast = 0;
            }
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy