All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.campagnelab.dl.somatic.intermediaries.FirstSimulationStrategy Maven / Gradle / Ivy

package org.campagnelab.dl.somatic.intermediaries;

import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.util.XorShift1024StarRandom;
import org.campagnelab.dl.somatic.tools.Mutate;
import org.campagnelab.dl.somatic.utils.ProtoPredictor;
import org.campagnelab.dl.varanalysis.protobuf.BaseInformationRecords;
import org.campagnelab.goby.predictions.ProtoHelper;

import java.util.Collections;
import java.util.List;
import java.util.Random;

/**
 * Introduce a strategy to introduce mutations.
 * Created by fac2003 on 7/19/16.
 */
public class FirstSimulationStrategy implements SimulationStrategy {
    long seed;

    public FirstSimulationStrategy(double deltaSmall, double deltaBig, double zygHeuristic, long seed) {
        setup(deltaSmall, deltaBig, zygHeuristic, seed, 0);
    }

    @Override
    public void setup(double deltaSmall, double deltaBig, double zygHeuristic, long seed, double canonThreshold) {
        this.seed = seed;
        rand = new XorShift1024StarRandom(seed);

        this.deltaSmall = deltaSmall;
        this.deltaBig = deltaBig;
        this.zygHeuristic = zygHeuristic;
    }

    public FirstSimulationStrategy() {

    }


    //delta will be halved in homozygous cases (to account for twice the reads at a base)
    //min fraction of bases mutated at a record (ceilinged fraction)
    double deltaSmall;
    //max fraction of bases mutated at a record (floored fraction)
    double deltaBig;
    //minimum proportion of counts to presume allele
    double zygHeuristic;
    final String[] STRING = new String[]{"A", "T", "C", "G"};
    Random rand;


    class MutationDirection {
        int oldBase;
        int newBase;
        double delta;
        /**
         * The frequency of mutation for the source allele. Can be different from somaticFrequency when two alleles or
         * more.
         */
        double somaticFrequency;

        @Override
        public String toString() {
            return String.format("%d>%d delta=%f somaticFrequency=%f", oldBase, newBase, delta, somaticFrequency);
        }

        /**
         * creates a mutationDirection object which defines where reads will be moved from,to, and how many
         *
         * @param oldBase   index of source base
         * @param newBase   index of destination base
         * @param delta     proportion source base reads to move (0 - 1 in heterozygous case, 0 - 0.5 in homozygous case)
         * @param frequency proportion of source allele reads to move (always 0 - 1, either delta or delta/2 in heterozygous case)
         */
        MutationDirection(int oldBase, int newBase, double delta, double frequency) {
            this.oldBase = oldBase;
            this.newBase = newBase;
            this.delta = delta;
            this.somaticFrequency = frequency;
        }
    }

    /**
     * @param counts arrays of counts, in genotype order.
     * @return
     */
    MutationDirection dirFromCounts(int referenceBase, int[] counts) {
        int numGenos = counts.length;
        int maxCount = 0;
        int secondMostCount = 0;
        int maxCountIdx = -1;
        int secondMostCountIdx = -1;
        int totalNumCounts = 0;
        //find highest count idx, second highest count idx, and record number of counts
        for (int i = 0; i < numGenos; i++) {
            totalNumCounts += counts[i];
            if (counts[i] > maxCount) {
                secondMostCountIdx = maxCountIdx;
                secondMostCount = maxCount;
                maxCountIdx = i;
                maxCount = counts[i];
            } else if (counts[i] > secondMostCount) {
                secondMostCountIdx = i;
                secondMostCount = counts[i];
            }
        }
        //no reads whatsoever
        if (maxCountIdx == -1) {
            return null;
        }
        boolean monozygotic;
        //all reads same base, monozygotic
        if (secondMostCountIdx == -1) {
            monozygotic = true;
        } else {
            //see if base with second most reads exceeds heuristic
            monozygotic = (zygHeuristic * totalNumCounts > counts[secondMostCountIdx]);
        }
        //make rand generator and generate proportion mutating bases
        //generate mutation rate for the source allele.
        double delta = deltaSmall + ((deltaBig - deltaSmall) * rand.nextDouble());
        double originalDelta = delta;
        int newBase = -1;
        int oldBase;
        int otherAlleleBase = -1;
        boolean allowed = false;
        if (monozygotic) {
            oldBase = maxCountIdx;
            //generate from non-max bases uniformly

            // homozygote site: somatic mutation likely to mutate only one of the two alleles, so we halve delta
            // to simulate this:
            if (rand.nextDouble() > 0.001) {
                // Most of the time, we follow the rule. Sometimes we don't because two mutations in the same site can occur,
                // albeit rarely (we don't know the exact frequency).
                // If we did not allow this at all, the model could never see any examples of the rare case, likely causing odd
                // behavior when a rare case shows up.
                delta = delta / 2;
            }
            while (!allowed) {
                newBase = rand.nextInt(numGenos);
                allowed = true;
                if (newBase == oldBase || newBase == 4 || newBase == referenceBase) {
                    //replace self case, N case
                    allowed = false;
                }
            }
        } else { //handle heterozygous case
            boolean mutatingAllele = rand.nextBoolean();
            oldBase = mutatingAllele ? maxCountIdx : secondMostCountIdx;
            otherAlleleBase = !mutatingAllele ? maxCountIdx : secondMostCountIdx;
            while (!allowed) {
                newBase = rand.nextInt(numGenos);
                allowed = true;
                if (newBase == oldBase || newBase == 4 || newBase == otherAlleleBase || newBase == referenceBase) {
                    //replace self case, other allele case, N case
                    allowed = false;
                }
            }

        }

        double somaticFrequency = originalDelta;

        return new MutationDirection(oldBase, newBase, delta, somaticFrequency);
    }

    @Override
    public int numberOfSamplesSupported() {
        return 2;
    }

    @Override
    public BaseInformationRecords.BaseInformation mutate(boolean makeSomatic,
                                                         BaseInformationRecords.BaseInformation record,
                                                         BaseInformationRecords.SampleInfo germlineSample,
                                                         BaseInformationRecords.SampleInfo otherSample,
                                                         SimulationCharacteristics sim) {

        BaseInformationRecords.BaseInformation.Builder baseBuild = record.toBuilder();
        baseBuild.setMutated(makeSomatic);
        int numSamples = record.getSamplesCount();
        for (int i = 0; i < numSamples - 1; i++) {
            baseBuild.setSamples(i, record.getSamples(i).toBuilder().setIsTumor(false));
        }
        baseBuild.setSamples(numSamples - 1, record.getSamples(numSamples - 1).toBuilder().setIsTumor(true));
        if (!makeSomatic) {
            // don't change counts if we are not to make the second sample somatic.
            return baseBuild.build();
        }
        BaseInformationRecords.SampleInfo somatic = baseBuild.getSamples(numSamples - 1);
        int numGenos = somatic.getCountsList().size();
        int[] forward = new int[numGenos];
        int[] backward = new int[numGenos];
        int[] sums = new int[numGenos];
        //fill declared arrays
        int i = 0;
        int referenceBase = -1;
        for (BaseInformationRecords.CountInfo count : somatic.getCountsList()) {
            forward[i] = count.getGenotypeCountForwardStrand();
            backward[i] = count.getGenotypeCountReverseStrand();
            sums[i] = forward[i] + backward[i];
            if (count.getMatchesReference() == true) {
                referenceBase = i;
            }
            i++;
        }
        assert referenceBase != -1 : "A count must match the reference.";
        MutationDirection dir = dirFromCounts(referenceBase, sums);

        if (dir == null) {
            return baseBuild.build();
        }

        final int oldBase = dir.oldBase;
        final int newBase = dir.newBase;
        final double delta = dir.delta;
        double frequency = dir.somaticFrequency;
        int changedCount = 0;
        int germlineCount = 0;
        int sumCount = 0;

        int fMutCount = 0;
        int oldCount = forward[oldBase];
        sumCount += oldCount;
        for (i = 0; i < oldCount; i++) {
            if (rand.nextDouble() < delta) {
                forward[oldBase]--;
                forward[newBase]++;
                fMutCount++;
                changedCount++;
            } else {
                germlineCount++;
            }
        }
        int bMutCount = 0;
        oldCount = backward[oldBase];
        sumCount += oldCount;
        for (i = 0; i < oldCount; i++) {
            if (rand.nextDouble() < delta) {
                backward[oldBase]--;
                backward[newBase]++;
                bMutCount++;
                changedCount++;
            } else {
                germlineCount++;
            }
        }
        //write to respective builders and return rebuild
        BaseInformationRecords.SampleInfo.Builder somaticBuild = somatic.toBuilder();


        //generate mutated numVariationsLists score lists (some boilerplate here...)
        //will add 1 to mutated variant counts using mutateIntegerListsVarAdd
        List fromVC = new ObjectArrayList();
        List toVC = new ObjectArrayList();
        if (somaticBuild.getCounts(oldBase).getNumVariationsInReadsCount() > 0) {

            //forward strand
            fromVC.addAll(ProtoPredictor.expandFreq(somaticBuild.getCounts(oldBase).getNumVariationsInReadsList()));
            toVC.addAll(ProtoPredictor.expandFreq(somaticBuild.getCounts(newBase).getNumVariationsInReadsList()));
            mutateIntegerListsVarAdd(fMutCount, fromVC, toVC, somaticBuild.getCounts(oldBase).getMatchesReference(), somaticBuild.getCounts(newBase).getMatchesReference());
        }

        //generate mutated insert sizes lists (some boilerplate here...)
        List fromIS = new ObjectArrayList();
        List toIS = new ObjectArrayList();
        if (somaticBuild.getCounts(oldBase).getNumVariationsInReadsCount() > 0) {

            //forward strand
            fromIS.addAll(ProtoPredictor.expandFreq(somaticBuild.getCounts(oldBase).getInsertSizesList()));
            toIS.addAll(ProtoPredictor.expandFreq(somaticBuild.getCounts(newBase).getInsertSizesList()));
            mutateIntegerLists(fMutCount, fromIS, toIS);
        }


        //generate mutated quality score lists (some boilerplate here...)
        //get old list of from scores
        List fromForward = new ObjectArrayList();
        List fromBackward = new ObjectArrayList();
        List toForward = new ObjectArrayList();
        List toBackward = new ObjectArrayList();
        if (somaticBuild.getCounts(oldBase).getQualityScoresForwardStrandCount() > 0 && somaticBuild.getCounts(oldBase).getQualityScoresReverseStrandCount() > 0) {

            //forward strand
            fromForward.addAll(ProtoPredictor.expandFreq(somaticBuild.getCounts(oldBase).getQualityScoresForwardStrandList()));
            toForward.addAll(ProtoPredictor.expandFreq(somaticBuild.getCounts(newBase).getQualityScoresForwardStrandList()));
            mutateIntegerLists(fMutCount, fromForward, toForward);

            //reverse strand
            fromBackward.addAll(ProtoPredictor.expandFreq(somaticBuild.getCounts(oldBase).getQualityScoresReverseStrandList()));
            toBackward.addAll(ProtoPredictor.expandFreq(somaticBuild.getCounts(newBase).getQualityScoresReverseStrandList()));
            mutateIntegerLists(bMutCount, fromBackward, toBackward);

        }

        //generate mutated readIndex lists
        List fromForwardR = new ObjectArrayList();
        List fromBackwardR = new ObjectArrayList();
        List toForwardR = new ObjectArrayList();
        List toBackwardR = new ObjectArrayList();
        if (somaticBuild.getCounts(oldBase).getReadIndicesForwardStrandCount() > 0 && somaticBuild.getCounts(oldBase).getReadIndicesReverseStrandCount() > 0) {

            //forward strand
            fromForwardR.addAll(ProtoPredictor.expandFreq(somaticBuild.getCounts(oldBase).getReadIndicesForwardStrandList()));
            toForwardR.addAll(ProtoPredictor.expandFreq(somaticBuild.getCounts(newBase).getReadIndicesForwardStrandList()));
            mutateIntegerLists(fMutCount, fromForwardR, toForwardR);

            //reverse strand
            fromBackwardR.addAll(ProtoPredictor.expandFreq(somaticBuild.getCounts(oldBase).getReadIndicesReverseStrandList()));
            toBackwardR.addAll(ProtoPredictor.expandFreq(somaticBuild.getCounts(newBase).getReadIndicesReverseStrandList()));
            mutateIntegerLists(bMutCount, fromBackwardR, toBackwardR);


        }
        String mutatedAllele = "?";


        i = 0;
        for (BaseInformationRecords.CountInfo count : somaticBuild.getCountsList()) {
            BaseInformationRecords.CountInfo.Builder countBuild = count.toBuilder();
            countBuild.setGenotypeCountForwardStrand(forward[i]);
            countBuild.setGenotypeCountReverseStrand(backward[i]);
            if (i == oldBase) {
                //replace quality scores
                countBuild.clearQualityScoresForwardStrand();
                countBuild.clearQualityScoresReverseStrand();
                countBuild.addAllQualityScoresForwardStrand(ProtoHelper.compressFreq(fromForward));
                countBuild.addAllQualityScoresReverseStrand(ProtoHelper.compressFreq(fromBackward));

                //replace readIndices
                countBuild.clearReadIndicesForwardStrand();
                countBuild.clearReadIndicesReverseStrand();
                countBuild.addAllReadIndicesForwardStrand(ProtoHelper.compressFreq(fromForwardR));
                countBuild.addAllReadIndicesReverseStrand(ProtoHelper.compressFreq(fromBackwardR));

                //replace numVars
                countBuild.clearNumVariationsInReads();
                countBuild.addAllNumVariationsInReads(ProtoHelper.compressFreq(fromVC));

                //replace insert sizes
                countBuild.clearInsertSizes();
                countBuild.addAllInsertSizes(ProtoHelper.compressFreq(fromIS));


            } else if (i == newBase) {
                mutatedAllele = countBuild.getToSequence();
                //replace quality scores
                countBuild.clearQualityScoresForwardStrand();
                countBuild.clearQualityScoresReverseStrand();
                countBuild.addAllQualityScoresForwardStrand(ProtoHelper.compressFreq(toForward));
                countBuild.addAllQualityScoresReverseStrand(ProtoHelper.compressFreq(toBackward));

                //replace readIndices
                countBuild.clearReadIndicesForwardStrand();
                countBuild.clearReadIndicesReverseStrand();
                countBuild.addAllReadIndicesForwardStrand(ProtoHelper.compressFreq(toForwardR));
                countBuild.addAllReadIndicesReverseStrand(ProtoHelper.compressFreq(toBackwardR));
                baseBuild.setMutatedBase(count.getToSequence());

                //replace numVars
                countBuild.clearNumVariationsInReads();
                countBuild.addAllNumVariationsInReads(ProtoHelper.compressFreq(toVC));

                //replace insert sizes
                countBuild.clearInsertSizes();
                countBuild.addAllInsertSizes(ProtoHelper.compressFreq(toIS));


            }
            somaticBuild.setCounts(i, countBuild);
            i++;
        }
        somaticBuild.setFormattedCounts(Mutate.regenerateFormattedCounts(somaticBuild, mutatedAllele));
        baseBuild.setSamples(numSamples - 1, somaticBuild);
        baseBuild.setFrequencyOfMutation((float) frequency);
        //   System.out.printf("delta=%f somaticFreq=%f oldCount=%d%n", delta, frequency, sumCount);
        // String newBaseString = newBase source, List dest) {
        Collections.shuffle(source, rand);
        dest.addAll(source.subList(0, fMutCount));
        List tmp = new IntArrayList(source.subList(fMutCount, source.size()));
        source.clear();
        source.addAll(tmp);

    }


    private void mutateIntegerListsVarAdd(int fMutCount, List source, List dest, boolean sourceIsRef, boolean destIsRef) {
        //compute increment (-1,0,or 1)
        int increment = 0;
        if (sourceIsRef) increment++;
        if (destIsRef) increment--;

        int newDestIndex = dest.size();
        Collections.shuffle(source, rand);
        dest.addAll(source.subList(0, fMutCount));
        for (int i = newDestIndex; i < dest.size(); i++) {
            dest.set(i, dest.get(i) + increment);
        }
        List tmp = new IntArrayList(source.subList(fMutCount, source.size()));
        source.clear();
        source.addAll(tmp);

    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy