All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eqtlmappingpipeline.metaqtl3.EQTLRegression Maven / Gradle / Ivy

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package eqtlmappingpipeline.metaqtl3;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import umcg.genetica.console.ProgressBar;
import umcg.genetica.containers.Pair;
import umcg.genetica.io.trityper.EQTL;
import umcg.genetica.io.trityper.SNP;
import umcg.genetica.io.trityper.SNPLoader;
import umcg.genetica.io.trityper.TriTyperExpressionData;
import umcg.genetica.io.trityper.TriTyperGeneticalGenomicsDataset;
import umcg.genetica.io.trityper.QTLTextFile;
import umcg.genetica.math.PCA;
import umcg.genetica.math.matrix.DoubleMatrixDataset;
import umcg.genetica.math.stats.Regression;

/**
 *
 * @author harmjan
 */
public class EQTLRegression {

    TriTyperGeneticalGenomicsDataset[] gg;
    EQTL[] eqtlsToRegressOut;

    public void regressOutEQTLEffects(ArrayList> eqtls, TriTyperGeneticalGenomicsDataset[] gg) throws IOException {
        this.gg = gg;



        this.eqtlsToRegressOut = new EQTL[eqtls.size()];
        for (int q = 0; q < eqtls.size(); q++) {
            eqtlsToRegressOut[q] = new EQTL();
            eqtlsToRegressOut[q].setRsName(eqtls.get(q).getLeft());
            eqtlsToRegressOut[q].setProbe(eqtls.get(q).getRight());
        }
        System.out.println("About to regress out: " + eqtls.size() + " QTLs from data.");
        regressOutEQTLEffects();
    }

    public void regressOutEQTLEffects(EQTL[] eqtls, TriTyperGeneticalGenomicsDataset[] gg) throws IOException {
        this.gg = gg;
        this.eqtlsToRegressOut = eqtls;
        System.out.println("About to regress out: " + eqtls.length + " QTLs from data.");
        regressOutEQTLEffects();
    }

    public void regressOutEQTLEffects(String regressOutEQTLEffectFileName, boolean outputfiles, TriTyperGeneticalGenomicsDataset[] gg) throws IOException {


        this.gg = gg;
        System.out.println("\n\n\nRemoving eQTL effects from the following eQTL file: '" + regressOutEQTLEffectFileName);
        QTLTextFile in = new QTLTextFile(regressOutEQTLEffectFileName, QTLTextFile.R);
        eqtlsToRegressOut = in.read();
        in.close();
        System.out.println("Number of eQTLs to regress out found in file:\t" + eqtlsToRegressOut.length);
        regressOutEQTLEffects();
        if (outputfiles) {
            for (int d = 0; d < gg.length; d++) {
                TriTyperGeneticalGenomicsDataset ds = gg[d];
                TriTyperExpressionData dsexp = ds.getExpressionData();
                double[][] matrix = dsexp.getMatrix();
                String[] probes = dsexp.getProbes();
                String[] individuals = dsexp.getIndividuals();
                String filename = ds.getSettings().expressionLocation;
                DoubleMatrixDataset dsout = new DoubleMatrixDataset(matrix, Arrays.asList(probes), Arrays.asList(individuals));
                dsout.recalculateHashMaps();
                System.out.println("Saving expression file after removal of eQTL effects: " + filename + "-EQTLEffectsRemoved.txt.gz");
                dsout.save(filename + "-EQTLEffectsRemoved.txt.gz");
            }
        }

    }

    /**
     * Removes the effect of a supplied list of eQTL from the datasets by use of
     * regression
     *
     * @param regressOutEQTLEffectFileName the location of the file containing
     * the eQTL to be removed
     */
    private void regressOutEQTLEffects() throws IOException {

        //Inventorize whether for a particular probe there are multiple SNPs that we want to regress out:
        HashMap> hashProbesCovariates = new HashMap>();
        HashMap hashEQTLIds = new HashMap();
        int nrProbesWithMultipleCovariates = 0;

        for (int v = 0; v < eqtlsToRegressOut.length; v++) {
            EQTL current = eqtlsToRegressOut[v];
            hashEQTLIds.put(current, v);
            String probe = current.getProbe();

            if (!hashProbesCovariates.containsKey(probe)) {
                ArrayList eqtls = new ArrayList();
                eqtls.add(current);
                hashProbesCovariates.put(probe, eqtls);
            } else {
                hashProbesCovariates.get(probe).add(current);
                nrProbesWithMultipleCovariates++;
            }
        }

        if (nrProbesWithMultipleCovariates > 0) {
            System.out.println("There are:\t" + nrProbesWithMultipleCovariates + "\tprobes for which we want to regress out multiple SNPs. This will be conducted through multiple regression employing PCA.");
        }

        // remove the eqtl effects
        System.out.println("Removing eQTLs:");
        int[] nrEQTLsRegressedOut = new int[gg.length];
        int[][] explainedVariancePerEQTLProbe = new int[gg.length][101];



        SNPLoader[] ggSNPLoaders = new SNPLoader[gg.length];
        boolean dosageInformationPresentForAllDatasets = true;
        for (int d = 0; d < gg.length; d++) {
            ggSNPLoaders[d] = gg[d].getGenotypeData().createSNPLoader();
            if (!ggSNPLoaders[d].hasDosageInformation()) {
                dosageInformationPresentForAllDatasets = false;
            }
        }

        //Remove multiple SNPs acting on one single probe:
        for (int d = 0; d < gg.length; d++) {
            HashSet hashEQTLsMultipleRegressionRegressedOut = new HashSet();
            HashMap snpPassesQC = new HashMap();

            TriTyperGeneticalGenomicsDataset currentDataset = gg[d];
            String[] probes = gg[d].getExpressionData().getProbes();
            System.out.print("Dataset:\t" + gg[d].getSettings().name);
            ProgressBar pgb = new ProgressBar(probes.length);

            for (int p = 0; p < probes.length; p++) {

                ArrayList covariatesForThisProbe = hashProbesCovariates.get(probes[p]);

                if (covariatesForThisProbe != null) {
                    ArrayList eventualListOfEQTLs = new ArrayList();
                    ArrayList snpsForProbe = new ArrayList();
                    ArrayList xs = new ArrayList();
                    ArrayList meanxs = new ArrayList();


                    for (EQTL e : covariatesForThisProbe) {
                        if (!hashEQTLsMultipleRegressionRegressedOut.contains(e)) {
                            Integer snpId = gg[d].getGenotypeData().getSnpToSNPId().get(e.getRsName());

                            if (snpId != -9 && (snpPassesQC.get(snpId) == null || snpPassesQC.get(snpId))) {
                                // load SNP

                                SNP currentSNP = currentDataset.getGenotypeData().getSNPObject(snpId);
                                ggSNPLoaders[d].loadGenotypes(currentSNP);

                                if (ggSNPLoaders[d].hasDosageInformation()) {
                                    ggSNPLoaders[d].loadDosage(currentSNP);
                                }

                                if (currentSNP.passesQC()) {
                                    int[] indWGA = currentDataset.getExpressionToGenotypeIdArray();
                                    double[] x = currentSNP.selectGenotypes(indWGA);
                                    double meanX = JSci.maths.ArrayMath.mean(x);
                                    double varianceX = JSci.maths.ArrayMath.variance(x);
                                    for (int i = 0; i < x.length; i++) {
                                        x[i] -= meanX;
                                    }

                                    if (varianceX != 0) {
                                        eventualListOfEQTLs.add(e);
                                        snpsForProbe.add(currentSNP);
                                        xs.add(x);
                                        meanxs.add(meanX);
                                        snpPassesQC.put(snpId, true);
                                    } else {
                                        snpPassesQC.put(snpId, false);
                                    }
                                } else {
                                    snpPassesQC.put(snpId, false);
                                    currentSNP.clearGenotypes();
                                }
                            }
                        }
                    }

                    // regress out single effects
                    if (eventualListOfEQTLs.size() == 1) {
                        SNP currentSNP = snpsForProbe.get(0);

                        int[] expressionToGenotypeId = currentDataset.getExpressionToGenotypeIdArray();
                        double[] x = xs.get(0);
                        double meanX = meanxs.get(0);

                        //Get the expression data:
                        double[][] rawData = currentDataset.getExpressionData().getMatrix();
                        double meanY;
                        double varianceY;

                        //Check what the number of samples is with genotype data available:
                        int nrSamplesWGenotypeData = x.length;

                        double[] y = new double[nrSamplesWGenotypeData];
                        int totalGGSamples = currentDataset.getTotalGGSamples();
                        if (nrSamplesWGenotypeData == totalGGSamples) {
                            //All genotypes have been succesfully called, use quick approach:
                            meanY = currentDataset.getExpressionData().getProbeMean()[p];
                            varianceY = currentDataset.getExpressionData().getProbeVariance()[p];
                            for (int s = 0; s < totalGGSamples; s++) {
                                y[s] = rawData[p][s] - meanY;
                            }
                        } else {

                            //Not all genotypes have been succesfully called, use slow approach:
                            int itr = 0;
                            for (int s = 0; s < rawData[p].length; s++) {
                                int genotypeId = expressionToGenotypeId[s];
                                if (genotypeId != -1) {
                                    byte genotype = currentSNP.getGenotypes()[genotypeId];
                                    if (genotype != -1 && currentDataset.getGenotypeData().getIsIncluded()[genotypeId]) {
                                        double dVal = rawData[p][s];
                                        y[itr] = dVal;
                                        itr++;
                                    }
                                }
                            }
                            //Normalize subset of data:
                            meanY = JSci.maths.ArrayMath.mean(y);
                            varianceY = JSci.maths.ArrayMath.variance(y);
                            for (int i = 0; i < y.length; i++) {
                                y[i] -= meanY;
                            }

                        }

                        //Get regression coefficient:
                        double[] rc = Regression.getLinearRegressionCoefficients(x, y);

                        double correlation = JSci.maths.ArrayMath.correlation(x, y);
                        double propExplainedVarianceTrait = correlation * correlation - 1.0d / (double) y.length;
                        if (propExplainedVarianceTrait < 0) {
                            propExplainedVarianceTrait = 0;
                        }
                        explainedVariancePerEQTLProbe[d][(int) Math.round(propExplainedVarianceTrait * 100d)]++;

                        //Make copy of this particular eQTL:
                        double[] rawDataUpdated = new double[totalGGSamples];

                        if (nrSamplesWGenotypeData == totalGGSamples) {

                            //Regress out eQTL affect in linear regression way:
                            for (int s = 0; s < totalGGSamples; s++) {
                                double residual = y[s] - x[s] * rc[0];
                                rawDataUpdated[s] = residual;
                            }

                        } else {

                            //Correct for missing genotypes, first determine average genotype of called samples:
//                           int[] indWGA = currentDataset.getWGAGenotypeIDs();
                            for (int s = 0; s < totalGGSamples; s++) {
                                int ind = expressionToGenotypeId[s];
                                if (ind != -1) {
                                    double valX = currentSNP.getGenotypes()[ind];
                                    if (valX == -1) {
                                        valX = 0;
                                    } else {
                                        valX -= meanX;
                                    }
                                    rawDataUpdated[s] = (double) rawData[p][s] - valX * rc[0];
                                }
                            }

                        }

                        //Make mean and standard deviation of residual gene expression identical to what it was before:
                        double meanUpdated = JSci.maths.ArrayMath.mean(rawDataUpdated);
                        double stdDevRatio = JSci.maths.ArrayMath.standardDeviation(rawDataUpdated) / Math.sqrt(varianceY);
                        for (int s = 0; s < totalGGSamples; s++) {
                            rawDataUpdated[s] -= meanUpdated;
                            rawDataUpdated[s] /= stdDevRatio;
                            rawDataUpdated[s] += meanY;
                        }
                        System.arraycopy(rawDataUpdated, 0, rawData[p], 0, totalGGSamples);
                        nrEQTLsRegressedOut[d]++;
                    } else if (eventualListOfEQTLs.size() > 1 && !dosageInformationPresentForAllDatasets) {
                        System.err.println("Multiple linear regression is not supported for datasets that do not have dosage information.");
                        System.exit(-1);
                    } else if (eventualListOfEQTLs.size() > 1 && dosageInformationPresentForAllDatasets) {
                        // use multiple linear regression via PCA

                        hashEQTLsMultipleRegressionRegressedOut.addAll(eventualListOfEQTLs);

                        int nrSNPs = snpsForProbe.size();
                        int totalGGSamples = currentDataset.getTotalGGSamples();

                        // use PCA
                        // Multiple SNPs need to be regressed out. Get SNP genotype values, standardize mean and std dev for each of these:
                        double[][] dataMatrix = new double[nrSNPs][0];
                        for (int i = 0; i < dataMatrix.length; i++) {
                            dataMatrix[i] = xs.get(i);
                        }

                        //Calculate covariance matrix:
                        double[][] correlationMatrix = new double[nrSNPs][nrSNPs];
                        double sampleCountMinusOne = totalGGSamples - 1;

                        for (int f = 0; f < nrSNPs; f++) {
                            for (int g = f; g < nrSNPs; g++) {
                                double covarianceInterim = 0;
                                for (int h = 0; h < totalGGSamples; h++) {
                                    covarianceInterim += dataMatrix[f][h] * dataMatrix[g][h];
                                }
                                double covariance = covarianceInterim / (double) (sampleCountMinusOne);
                                correlationMatrix[f][g] = covariance;
                                correlationMatrix[g][f] = covariance;
                            }
                        }
                        
                        //Perform eigenvalue decomposition:
                        Jama.EigenvalueDecomposition eig = PCA.eigenValueDecomposition(correlationMatrix);
                        double[][] eigenArrayLists = new double[correlationMatrix.length][correlationMatrix.length];
                        for (int pca = 0; pca < nrSNPs; pca++) {
                            eigenArrayLists[pca] = PCA.getEigenVector(eig, pca);
                        }
                        
                        //Calculate principal component scores:
                        double[][] dataMatrixPCScores = new double[nrSNPs][totalGGSamples];
                        for (int sample = 0; sample < totalGGSamples; sample++) {
                            for (int pca = 0; pca < nrSNPs; pca++) {
                                for (int snp = 0; snp < nrSNPs; snp++) {
                                    double probeCoefficient = eigenArrayLists[pca][snp];
                                    dataMatrixPCScores[pca][sample] += dataMatrix[snp][sample] * probeCoefficient;
                                }
                            }
                        }
                   
                        //Orthogonal PCAs have been determined, use these to regress out the effects on gene expression levels:
                        TriTyperExpressionData expresionData = currentDataset.getExpressionData();
                        double[][] rawData = currentDataset.getExpressionData().getMatrix();

                        //Check what the number of samples is with genotype data available:
                        double[] y = new double[totalGGSamples];

                        //All genotypes have been succesfully called:
                        double meanYOriginal = expresionData.getProbeMean()[p];
                        double varianceYOriginal = expresionData.getProbeVariance()[p];
                        System.arraycopy(rawData[p], 0, y, 0, totalGGSamples);

                        boolean[] regressOutPCA = new boolean[nrSNPs];
                        double[] eigenValues = eig.getRealEigenvalues();
                        boolean atLeastOnePCANotRegressedOut = false;
                        for (int pca = 0; pca < nrSNPs; pca++) {
                            regressOutPCA[pca] = true;
                            if (PCA.getEigenValueVar(eigenValues, pca) < 1e-10) {
                                regressOutPCA[pca] = false;
                                atLeastOnePCANotRegressedOut = true;
                            }
                        }

                        if (atLeastOnePCANotRegressedOut) {
                            //Provide information on the PCAs:
                            System.out.println("There is at least one PCA that has not been regressed out as it does not explain a lot of genetic variation!:");
                            for (int pca = 0; pca < nrSNPs; pca++) {
                                double[] x = dataMatrixPCScores[pca];
                                double correlation = JSci.maths.ArrayMath.correlation(x, y);
                                double r2 = correlation * correlation;
                                int pcaNr = pca + 1;
                                String snpsStronglyCorrelatedWithPCA = "";
                                for (int snp = 0; snp < nrSNPs; snp++) {
                                    double correlationPCASNP = Math.abs(JSci.maths.ArrayMath.correlation(x, dataMatrix[snp]));
                                    double r2PCASNP = correlationPCASNP * correlationPCASNP;
                                    if (r2PCASNP > 0.1) {
                                        snpsStronglyCorrelatedWithPCA += "\t" + snpsForProbe.get(snp).getName() + ", " + r2PCASNP;
                                    }
                                }
                                System.out.println(probes[p] + "\tPCA" + pcaNr + "\tExplainedVariance:\t" + PCA.getEigenValueVar(eigenValues, pca) + "\tEigenvalue:\t" + eigenValues[eigenValues.length - 1 - pca] + "\tPCATraitR2:\t" + r2 + "\tSNPsStronglyCorrelatedWithPCA:\t" + snpsStronglyCorrelatedWithPCA);
                            }
                            System.out.println("");
                        }

                        //Process each PC, determine total amount of variation explained by the combination of PCs:
                        double propExplainedVarianceTrait = 0;
                        for (int pca = 0; pca < nrSNPs; pca++) {
                            if (regressOutPCA[pca]) {
                                //Get PCA scores:
                                double[] x = dataMatrixPCScores[pca];
                                //Get correlation coefficient with trait:
                                double correlation = JSci.maths.ArrayMath.correlation(x, y);
                                propExplainedVarianceTrait += correlation * correlation - 1.0d / (double) y.length;
                            }
                        }
                        if (propExplainedVarianceTrait < 0) {
                            propExplainedVarianceTrait = 0;
                        }
                        explainedVariancePerEQTLProbe[d][(int) Math.round(propExplainedVarianceTrait * 100d)]++;

                        //Regress out PC effects on trait:
                        for (int pca = 0; pca < nrSNPs; pca++) {
                            if (regressOutPCA[pca]) {
                                //Get PC scores:
                                double[] x = dataMatrixPCScores[pca];

                                //Get regression coefficient:
                                double[] rc = Regression.getLinearRegressionCoefficients(x, y);

                                //Regress out eQTL affect in linear regression way:
                                for (int s = 0; s < totalGGSamples; s++) {
                                    y[s] = y[s] - x[s] * rc[0];
                                }
                            }
                        }

                        double meanYUpdated = JSci.maths.ArrayMath.mean(y);
                        double varianceYUpdated = JSci.maths.ArrayMath.variance(y);

                        //Make mean and standard deviation of residual gene expression identical to what it was before:
                        double stdDevRatio = Math.sqrt(varianceYUpdated) / Math.sqrt(varianceYOriginal);
                        for (int s = 0; s < totalGGSamples; s++) {
                            y[s] -= meanYUpdated;
                            y[s] /= stdDevRatio;
                            y[s] += meanYOriginal;
                        }

                        //Replace original expression data with updated residual gene expression data:
                        for (int s = 0; s < totalGGSamples; s++) {
                            if (Double.isNaN(y[s])) {
                                System.out.println("Error!:\t" + probes[p] + "\t" + gg[d].getSettings().name + "\t" + s + "\t" + meanYUpdated + "\t" + stdDevRatio + "\t" + meanYOriginal);
                            }
                            rawData[p][s] = y[s];
                        }

                        nrEQTLsRegressedOut[d]++;

                    }

                    for (SNP s : snpsForProbe) {
                        s.clearGenotypes();
                    }
                }
                pgb.iterate();
            }
            pgb.print();
            pgb.close();
            System.out.println("");
        }

        for (int ds = 0; ds < gg.length; ds++) {
            ggSNPLoaders[ds].close();
            ggSNPLoaders[ds] = null;
        }

        System.out.println("\n");
        System.out.println("eQTLs regressed per dataset:");
        for (int d = 0; d < gg.length; d++) {
            System.out.println(gg[d].getSettings().name + "\t" + nrEQTLsRegressedOut[d]);
        }

        String output;
        System.out.println("\n");
        System.out.println("Proportion explained variance of genotypic variation on eQTLs per dataset:");


        output = "r2";
        for (TriTyperGeneticalGenomicsDataset gg1 : gg) {
            output += "\t" + gg1.getSettings().name;
        }

        System.out.println(output);
        for (int e = 0; e <= 100; e++) {
            double r2 = (double) e / 100;
            output = String.valueOf(r2);
            for (int d = 0; d < gg.length; d++) {
                output += "\t" + explainedVariancePerEQTLProbe[d][e];
            }
            System.out.println(output);
        }

    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy