eqtlmappingpipeline.metaqtl4.MetaQTL4CorrelationTask Maven / Gradle / Ivy

Go to download
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package eqtlmappingpipeline.metaqtl4;

import gnu.trove.map.hash.TObjectIntHashMap;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.Callable;
import org.molgenis.genotype.variant.GeneticVariant;
import umcg.genetica.console.ProgressBar;
import umcg.genetica.containers.Pair;
import umcg.genetica.math.stats.Correlation;
import umcg.genetica.math.stats.Descriptives;
import umcg.genetica.math.stats.ZScores;
import umcg.genetica.util.Primitives;

/**
 *
 * @author harm-jan
 */
public class MetaQTL4CorrelationTask implements Callable> {

    private final int distributionSize;
    private final long[] randomizationSeeds;
    private final ArrayList availableTraits;
    private final TObjectIntHashMap availableTraitsHash;
    private final MetaQTL4Dataset[] datasets;
    private final GeneticVariant[][] geneticVariantIndex;
    private final MetaQTL4Settings m_settings;
    private final MetaQTL4TraitAnnotation traitAnnotation;
    private final Integer[][] traitIndex;
    private final Set traitsToInclude;
    private final Set variantsToInclude;
    private final int threadIndex;
    private final int[] realDistribution;
    private final int[] permutedDistribution;

    MetaQTL4CorrelationTask(int nrThreads,
            int distributionSize,
            long[] randomizationSeeds,
            ArrayList availableTraits,
            TObjectIntHashMap availableTraitsHash,
            MetaQTL4Dataset[] datasets,
            GeneticVariant[][] geneticVariantIndex,
            MetaQTL4Settings m_settings,
            MetaQTL4TraitAnnotation traitAnnotation,
            Integer[][] traitIndex,
            Set traitsToInclude,
            Set variantsToInclude,
            int threadIndex) {
        this.distributionSize = distributionSize;
        this.randomizationSeeds = randomizationSeeds;
        this.availableTraits = availableTraits;
        this.availableTraitsHash = availableTraitsHash;
        this.datasets = datasets;
        this.geneticVariantIndex = geneticVariantIndex;
        this.m_settings = m_settings;
        this.traitAnnotation = traitAnnotation;
        this.traitIndex = traitIndex;
        this.traitsToInclude = traitsToInclude;
        this.variantsToInclude = variantsToInclude;
        this.threadIndex = threadIndex;
        this.realDistribution = new int[distributionSize];
        this.permutedDistribution = new int[distributionSize];
    }

    private int test(int[] sampleSizes, double[][] genotypes, double[] genotypeVariances, Integer traitId, boolean[][] includeTraitSample) {
        int nrDatasets = datasets.length;
        double[] zscores = new double[nrDatasets];
        for (int datasetId = 0; datasetId < nrDatasets; datasetId++) {

            if (genotypes[datasetId] == null) {
                zscores[datasetId] = Double.NaN;
            } else {
                double[] x = genotypes[datasetId];
                if (x == null) {
                    System.err.println("ERROR: genotype is null");
                    zscores[datasetId] = Double.NaN;
                } else {
                    double varianceX = genotypeVariances[datasetId];
                    boolean[] includeDatasetTraitSample = includeTraitSample[datasetId];

                    double meanY;
                    double varianceY;

                    // re-normalize the trait data when the genotypes had missing values
                    Integer datasetTraitId = traitIndex[datasetId][traitId];
                    double[] y = datasets[datasetId].getTraitData(datasetTraitId);
//                    if (sampleSizes[datasetId] != y.length) {
                    double[] newY = new double[x.length];
                    int itr = 0;
                    double sum = 0;

                    // recalculate mean and variance
                    for (int s = 0; s < y.length; s++) {
//                        if (includeDatasetTraitSample[s]) {
                            newY[itr] = y[s];
                            sum += newY[itr];
                            itr++;
//                        }
                    }

                    y = newY;
                    meanY = sum / itr;
                    double varsum = 0;
                    for (int i = 0; i < y.length; i++) {
                        y[i] -= meanY;
                        varsum += y[i] * y[i];
                    }
                    varianceY = varsum / (y.length - 1);
//                    } else {
                    double varianceYOrig = datasets[datasetId].getTraitVariance(traitId);
//                    }

                    if (varianceY == 0) {
                        // trait has no variance
                        zscores[datasetId] = Double.NaN;
                    } else {
                        //Calculate correlation coefficient:
                        double correlation = Correlation.correlateMeanCenteredData(x, y, varianceX, varianceY);
                        double correlation2 = JSci.maths.ArrayMath.correlation(x, y);
                        System.out.println(correlation + "\t" + correlation2 + "\t" + x.length + "\t" + varianceX + "\t" + varianceYOrig + "\t" + varianceY + "\t" + Descriptives.variance(x) + "\t" + Descriptives.variance(y));
                        if (correlation >= -1 && correlation <= 1) {
                            zscores[datasetId] = Correlation.convertCorrelationToZScore(x.length, correlation);
                        } else {
                            System.err.println("Error! correlation invalid: " + correlation);
                            System.exit(-1);
                        }
                    }
                }
            }
        }

        double metaZ = ZScores.getWeightedZ(zscores, sampleSizes);
        double p = ZScores.zToP(metaZ);
        int bin = (int) Math.round(p * distributionSize);
        if (bin == distributionSize) {
            bin--;
        }

        return bin;
    }

    private Pair correctGenotypesForMissingValuesAndNormalize(int[] coupledTraitInt, GeneticVariant variant, float[] genotypesTMP, boolean[] includeTraitSample) {
        int xLen = genotypesTMP.length;
        int nrMissing = 0;
        double[] snpmeancorrectedgenotypes;
        double varianceX;
        double meanX;
        if (variant.getCallRate() < 1d) {
            double sum = 0;
            for (int i = 0; i < xLen; i++) {
                if (genotypesTMP[i] < 0) {
                    nrMissing++;
                    int coupledTrait = coupledTraitInt[i]; // not really sure if this is still required..
                    includeTraitSample[coupledTrait] = false;
                } else {
                    sum += genotypesTMP[i];
                }
            }
            int newXLen = xLen - nrMissing;
            meanX = sum / newXLen;
            snpmeancorrectedgenotypes = new double[newXLen];

            int itr = 0;
            for (int i = 0; i < newXLen; i++) {
                if (genotypesTMP[i] >= 0) {
                    snpmeancorrectedgenotypes[itr] = genotypesTMP[i] - meanX;
                    itr++;
                }
            }
        } else {
            double sum = 0;
            snpmeancorrectedgenotypes = new double[xLen];
            for (int i = 0; i < xLen; i++) {
                double genotype = genotypesTMP[i];
                sum += genotype;
                snpmeancorrectedgenotypes[i] = genotype;
            }
            //  meanX = sum / genotypesTMP.length;
        }

        varianceX = JSci.maths.ArrayMath.variance(snpmeancorrectedgenotypes);
        return new Pair(snpmeancorrectedgenotypes, varianceX);
    }

    private double[] permuteGenotypes(double[] genotypes, Random rand) {
        ArrayList availableGenotypes = new ArrayList();
        for (int i = 0; i < genotypes.length; i++) {
            availableGenotypes.add(genotypes[i]);
        }
        Collections.shuffle(availableGenotypes, rand);
        return Primitives.toPrimitiveArr(availableGenotypes.toArray(new Double[0]));
    }

    @Override
    public Pair call() throws Exception {
        try {
            // normal procedure
            int nrPermutations = randomizationSeeds.length;
            int nrDatasets = geneticVariantIndex.length;
            int nrVariants = geneticVariantIndex[nrDatasets - 1].length;

            // iterate the variants
            ArrayList traitsToTest = null;
            boolean[] flipEffects = new boolean[nrDatasets];
            int numberOfDatasetsPassingQC = 0;

            double mafthreshold = m_settings.getSnpQCMAFThreshold();
            double hwepthreshold = m_settings.getSnpQCHWEThreshold();
            double callratethreshold = m_settings.getSnpQCHWEThreshold();

            int[] maxNrSamplesPerDataset = new int[datasets.length];
            boolean[][] includeTraitSample = new boolean[nrDatasets][0];
            for (int d = 0; d < maxNrSamplesPerDataset.length; d++) {
                int nrSamples = datasets[d].getGenotypeToTraitCouplingInt().length;
                maxNrSamplesPerDataset[d] = nrSamples;
                includeTraitSample[d] = new boolean[nrSamples];
            }

            ProgressBar pb = new ProgressBar(nrVariants, "Running calculations...");
            for (int variantId = 0; variantId < nrVariants; variantId += threadIndex) {
                pb.iterate();
                // load the genotypes
                double[][] genotypes = new double[nrDatasets][0];
                double[] genotypeVariances = new double[nrDatasets];
                int[] sampleSizes = new int[nrDatasets];

                int startPosition = -1;
                String sequence = null;
                numberOfDatasetsPassingQC = 0;

                for (int datasetId = 0; datasetId < nrDatasets; datasetId++) {
                    GeneticVariant variant = geneticVariantIndex[datasetId][variantId];

                    MetaQTL4Dataset dataset = datasets[datasetId];
                    int[] gte = dataset.getGenotypeToTraitCouplingInt();
                    if (variant != null) {
                        float[] genotypesTMP = dataset.getSampleDosages(variant);
                        double maf = variant.getMinorAlleleFrequency();
                        double hwep = variant.getHwePvalue();
                        double callrate = variant.getCallRate();

                        // TODO: best done on loading dataset with filter.
                        if (maf > mafthreshold && hwep > hwepthreshold && callrate > callratethreshold) {
                            // TODO: remove missing genotypes and rescale the genotype data
                            Pair genotypedata = correctGenotypesForMissingValuesAndNormalize(gte, variant, genotypesTMP, includeTraitSample[datasetId]);
                            genotypes[datasetId] = genotypedata.getLeft();

                            genotypeVariances[datasetId] = genotypedata.getRight();
                            sampleSizes[datasetId] = genotypes[datasetId].length;
                            if (sequence == null) {
                                startPosition = variant.getStartPos();
                                sequence = variant.getSequenceName();
                            }
                        } else {
                            // TODO: output to the log?
                            genotypes[datasetId] = null;
                        }
                    } else {
                        genotypes[datasetId] = null;
                    }
                }

                // determine which traits to test
                // If this is a cis OR a trans analysis but NOT BOTH: use XOR
                // else test ALL traits
                if (m_settings.isCisAnalysis() ^ m_settings.isTransAnalysis()) {
                    // index once, use many
                    if (traitsToTest == null) {
                        traitsToTest = new ArrayList();
                        // possible faster with hashmap lookup
                        for (int i = 0; i < availableTraits.size(); i++) {
                            MetaQTL4MetaTrait trait = availableTraits.get(i); // a treemap may be more efficient here
                            boolean sameChr = trait.getChr().equals(sequence);
                            if (sameChr) {
                                int distance = Math.abs(trait.getChrMidpoint() - startPosition); // precalculate this, if possible
                                if (m_settings.isCisAnalysis() && distance < m_settings.getCiseQTLAnalysMaxSNPProbeMidPointDistance()) {
                                    traitsToTest.add(i);
                                }
                                if (m_settings.isTransAnalysis() && distance > m_settings.getCiseQTLAnalysMaxSNPProbeMidPointDistance()) {
                                    traitsToTest.add(i);
                                }
                            }
                            if (!sameChr && m_settings.isTransAnalysis()) {
                                traitsToTest.add(i);
                            }
                        }
                    }
                }

                if (traitsToTest != null && traitsToTest.isEmpty()) {
                    // do something, log perhaps
                } else {
                    // run test
                    int[] dist;
                    for (int permutation = -1; permutation < nrPermutations; permutation++) {

                        if (permutation >= 0) {
                            dist = permutedDistribution;
                            long seed = randomizationSeeds[permutation];
                            Random rand = new Random(seed);

                            // reorder genotypes on the basis of the permuted sample links
                            for (int d = 0; d < datasets.length; d++) {
                                if (genotypes[d] != null) {
                                    genotypes[d] = permuteGenotypes(genotypes[d], rand);
                                }
                            }
                        } else {
                            dist = realDistribution;
                        }

                        if (traitsToTest == null) {
                            // cistrans analysis
                            for (int trait = 0; trait < availableTraits.size(); trait++) {
                                int bin = test(sampleSizes, genotypes, genotypeVariances, trait, includeTraitSample);
                                dist[bin]++;
                            }
                        } else {
                            for (Integer trait : traitsToTest) {
//                                MetaQTL4MetaTrait traitObj = availableTraits.get(trait);
//                                System.out.println(Strings.concat(traitObj.getPlatformIds(), Strings.comma));
                                int bin = test(sampleSizes, genotypes, genotypeVariances, trait, includeTraitSample);
                                dist[bin]++;
                            }
                        }
                    }
                }
            }
            pb.close();

        } catch (Exception e) {
            e.printStackTrace();
        }
        return new Pair(realDistribution, permutedDistribution);
    }

//    private int test(int[] sampleSizes, double[][] genotypes, double[] genotypeVariances, Integer traitId, boolean[][] includeTraitSample) {
//        int nrDatasets = datasets.length;
//        double[] zscores = new double[nrDatasets];
//        for (int datasetId = 0; datasetId < nrDatasets; datasetId++) {
//
//            if (genotypes[datasetId] == null) {
//                zscores[datasetId] = Double.NaN;
//            } else {
//                double[] x = genotypes[datasetId];
//
//                if (x == null) {
//                    System.err.println("ERROR: genotype is null");
//                    zscores[datasetId] = Double.NaN;
//                } else {
//                    double varianceX = genotypeVariances[datasetId];
//                    boolean[] includeDatasetTraitSample = includeTraitSample[datasetId];
//
//                    double meanY;
//                    double varianceY;
//
//                    // re-normalize the trait data when the genotypes had missing values
//                    Integer datasetTraitId = traitIndex[datasetId][traitId];
//                    double[] y = datasets[datasetId].getTraitData(datasetTraitId);
//                    if (sampleSizes[datasetId] != y.length) {
//                        double[] newY = new double[x.length];
//                        int itr = 0;
//                        double sum = 0;
//
//                        // recalculate mean and variance
//                        for (int s = 0; s < y.length; s++) {
//                            if (includeDatasetTraitSample[s]) {
//                                newY[itr] = y[s];
//                                sum += newY[itr];
//                                itr++;
//                            }
//                        }
//
//                        y = newY;
//                        meanY = sum / itr;
//                        double varsum = 0;
//                        for (int i = 0; i < y.length; i++) {
//                            y[i] -= meanY;
//                            varsum += y[i] * y[i];
//                        }
//                        varianceY = varsum / (y.length - 1);
//                    } else {
//                        varianceY = datasets[datasetId].getTraitVariance(datasetTraitId);
//                    }
//
//                    if (varianceY == 0) {
//                        // trait has no variance
//                        zscores[datasetId] = Double.NaN;
//                    } else {
//                        //Calculate correlation coefficient:
//                        double correlation = Correlation.correlate(x, y, varianceX, varianceY);
////                        double correlation2 = JSci.maths.ArrayMath.correlation(x, y);
////
////                        if (correlation != correlation2) {
////                            System.err.println("ERROR: "+(correlation-correlation2));
////                            double[] newY = new double[x.length];
////                            int itr = 0;
////                            double sum = 0;
////
////                            // recalculate mean and variance
////                            for (int s = 0; s < y.length; s++) {
////                                if (includeDatasetTraitSample[s]) {
////                                    newY[itr] = y[s];
////                                    sum += newY[itr];
////                                    itr++;
////                                }
////                            }
////
////                            meanY = sum / itr;
////                            double varsum = 0;
////                            for (int i = 0; i < newY.length; i++) {
////                                newY[i] -= meanY;
////                                varsum += newY[i] * newY[i];
////                            }
////                            double varianceYRecalculated = varsum / (newY.length - 1);
////                            System.err.println(correlation + "\t" + correlation2 + "\t"
////                                    + x.length + "\t"
////                                    + varianceYRecalculated + "\t"
////                                    + varianceX + "\t"
////                                    + varianceY + "\t"
////                                    + datasets[datasetId].getTraitVariance(datasetTraitId) + "\t"
////                                    + Descriptives.variance(x) + "\t"
////                                    + Descriptives.variance(y)
////                            );
//////                            for (int i = 0; i < y.length; i++) {
//////                                System.out.println(i + "\t" + y[i] + "\t" + x[i]);
//////                            }
//////                            System.exit(-1);
////                        }
//
//                        if (correlation >= -1 && correlation <= 1) {
//                            zscores[datasetId] = Correlation.convertCorrelationToZScore(x.length, correlation);
//                        } else {
//                            System.err.println("Error! correlation invalid: " + correlation);
//                            System.exit(-1);
//                        }
//                    }
//                }
//            }
//        }
//
//        double metaZ = ZScores.getWeightedZ(zscores, sampleSizes);
//        double p = ZScores.zToP(metaZ);
//        int bin = (int) Math.round(p * distributionSize);
//        if (bin == distributionSize) {
//            bin--;
//        }
//
//        return bin;
//    }
//
//    private Pair correctGenotypesForMissingValuesAndNormalize(int[] coupledTraitInt, GeneticVariant variant, float[] genotypesTMP, boolean[] includeTraitSample) {
//        int xLen = genotypesTMP.length;
//        int nrMissing = 0;
//        double[] snpmeancorrectedgenotypes;
//        double varianceX;
//        double meanX;
//        if (variant.getCallRate() < 1d) {
//            double sum = 0;
//            for (int i = 0; i < xLen; i++) {
//                if (genotypesTMP[i] < 0) {
//                    nrMissing++;
//                    int coupledTrait = coupledTraitInt[i]; // not really sure if this is still required..
//                    includeTraitSample[coupledTrait] = false;
//                } else {
//                    sum += genotypesTMP[i];
//                }
//            }
//            int newXLen = xLen - nrMissing;
//            meanX = sum / newXLen;
//            snpmeancorrectedgenotypes = new double[newXLen];
//
//            int itr = 0;
//            for (int i = 0; i < newXLen; i++) {
//                if (genotypesTMP[i] >= 0) {
//                    snpmeancorrectedgenotypes[itr] = genotypesTMP[i] - meanX;
//                    itr++;
//                }
//            }
//        } else {
//            double sum = 0;
//            snpmeancorrectedgenotypes = new double[xLen];
//            for (int i = 0; i < xLen; i++) {
//                double genotype = genotypesTMP[i];
//                sum += genotype;
//                snpmeancorrectedgenotypes[i] = genotype;
//            }
//            //  meanX = sum / genotypesTMP.length;
//        }
//
//        varianceX = JSci.maths.ArrayMath.variance(snpmeancorrectedgenotypes);
//        return new Pair(snpmeancorrectedgenotypes, varianceX);
//    }
//
//    private double[] permuteGenotypes(double[] genotypes, Random rand) {
//        ArrayList availableGenotypes = new ArrayList();
//
//        if (genotypes == null) {
//            System.err.println("ERROR: genotypes null");
//            return null;
//        }
//
//        for (int i = 0; i < genotypes.length; i++) {
//            availableGenotypes.add(genotypes[i]);
//        }
//        Collections.shuffle(availableGenotypes, rand);
//        return Primitives.toPrimitiveArr(availableGenotypes.toArray(new Double[0]));
//    }

}