All Downloads are FREE. Search and download functionalities are using the official Maven repository.

umcg.genetica.gwas.Independifier Maven / Gradle / Ivy

There is a newer version: 1.0.7
Show newest version
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package umcg.genetica.gwas;

import gnu.trove.map.hash.THashMap;
import gnu.trove.map.hash.TObjectIntHashMap;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;
import umcg.genetica.io.text.TextFile;
import umcg.genetica.io.trityper.SNP;
import umcg.genetica.io.trityper.SNPLoader;
import umcg.genetica.io.trityper.TriTyperGenotypeData;
import umcg.genetica.io.trityper.util.DetermineLD;
import umcg.genetica.math.matrix.DoubleMatrixDataset;

/**
 *
 * All shall be independent.
 *
 * @author juha
 */
public class Independifier {

    private static final Logger LOGGER = Logger.getLogger(Independifier.class.getName());
    TriTyperGenotypeData genotypeData;
    final SNPLoader snpLoader;
    Map> permDatasets;

    public Independifier(String datadir) throws IOException {
        genotypeData = new TriTyperGenotypeData();
        genotypeData.load(datadir);
        snpLoader = genotypeData.createSNPLoader();
    }

    public Independifier(TriTyperGenotypeData genotypeData, SNPLoader snpLoader) throws IOException {
        this.genotypeData = genotypeData;
        this.snpLoader = snpLoader;
    }

    public void selectSNPsWithSimilarMAFsAsRealInputSNPs(String[] snps) throws IOException {

        TObjectIntHashMap snpToSNPId = genotypeData.getSnpToSNPId();

        Vector vecMAFs = new Vector();
        for (int s = 0; s < snps.length; s++) {
            if (snpToSNPId.get(snps[s]) != -9) {
                int id = snpToSNPId.get(snps[s]);
                SNP snp = genotypeData.getSNPObject(id);
                synchronized (snpLoader) {
                    snpLoader.loadGenotypes(snp);
                }
                Double maf = snp.getMAF();
                vecMAFs.add(maf);
                System.out.println(snp.getName() + "\t" + maf);
            } else {
                System.out.println("Error! SNP " + snps[s] + " is not present in the genotype data!!!!!!");
            }
        }


        double[] mafs = new double[vecMAFs.size()];
        for (int m = 0; m < mafs.length; m++) {
            mafs[m] = ((Double) vecMAFs.get(m)).doubleValue();
        }
        double median = JSci.maths.ArrayMath.median(mafs);
        System.out.println("Median MAF:\t" + median);

        System.out.println("\n\n\n\n\n\n");
        for (int snpId = 0, len = snpToSNPId.size(); snpId < len; snpId++) {
            SNP snp = genotypeData.getSNPObject(snpId);
            synchronized (snpLoader) {

                snpLoader.loadGenotypes(snp);
            }
            Double maf = snp.getMAF();
            System.out.println(snpId + "\t" + snp.getName() + "\t" + maf);
        }

    }

    /**
     *
     * Independifies given SNPs with given thresholds.
     *
     * @param snps
     * @param r2Threshold
     * @param bpThreshold Base pair distance threshold, ld will be calculated
     * for SNPs
     * @return
     * @throws IOException
     */
    public String[] independify(String[] snps, double r2Threshold, int bpThreshold) throws IOException {

        // limit to SNPs present in genotype data
        TObjectIntHashMap snpToSNPId = genotypeData.getSnpToSNPId();
        Vector vecSNPsPresentInGenotypeData = new Vector();
        for (int s = 0; s < snps.length; s++) {
            if (snpToSNPId.get(snps[s]) == -9) {
                vecSNPsPresentInGenotypeData.add(snps[s]);
            } else {
                System.out.println("Error! SNP " + snps[s] + " is not present in the genotype data!!!!!!");
            }
        }
        snps = new String[vecSNPsPresentInGenotypeData.size()];
        for (int s = 0; s < snps.length; s++) {
            snps[s] = (String) vecSNPsPresentInGenotypeData.get(s);
        }

        // calculate pairwise LD over all SNPs in the same chromosome within the given distance
        DetermineLD ldCalc = new DetermineLD();
        double[][] r2Matrix = new double[snps.length][snps.length];
        int[] snpsChr = new int[snps.length];
        int[] snpsChrPos = new int[snps.length];
        for (int s1 = 0; s1 < snps.length; s1++) {
            Integer get = snpToSNPId.get(snps[s1]);
            SNP snp1 = genotypeData.getSNPObject(get);
            synchronized (snpLoader) {
                snpLoader.loadGenotypes(snp1);
            }
            byte chr1 = snp1.getChr();
            int chrPos1 = snp1.getChrPos();
            snpsChr[s1] = chr1;
            snpsChrPos[s1] = chrPos1;
            for (int s2 = s1 + 1; s2 < snps.length; s2++) {
                SNP snp2 = genotypeData.getSNPObject(snpToSNPId.get(snps[s2]));
                if (s1 != s2 && chr1 == snp2.getChr() && Math.abs(chrPos1 - snp2.getChrPos()) < bpThreshold) {
                    synchronized (snpLoader) {
                        snpLoader.loadGenotypes(snp2);
                    }
                    double r2 = ldCalc.getRSquared(snp1, snp2, genotypeData, DetermineLD.RETURN_R_SQUARED, DetermineLD.INCLUDE_CASES_AND_CONTROLS, false);
                    r2Matrix[s1][s2] = r2;
                    r2Matrix[s2][s1] = r2;
                }
            }
        }

        int nrIndependifiedLeadSNPs = 0;
        int nrSNPsToUse = snps.length;
        System.out.println("Nr of SNPs with available genotype data in imputed dataset:\t" + nrSNPsToUse);

        int[] clusters = new int[snps.length];
        for (int s1 = 0; s1 < snps.length; s1++) {
            clusters[s1] = -1;
        }
        int nrClusters = 0;

        // assign SNPs to clusters
        for (int s1 = 0; s1 < nrSNPsToUse; s1++) {
            boolean SNPInLDWithOtherSNP = false;
            for (int s2 = 0; s2 < nrSNPsToUse; s2++) {
                if (s1 != s2 && snpsChr[s1] == snpsChr[s2] && Math.abs(snpsChrPos[s1] - snpsChrPos[s2]) < bpThreshold) {
                    double r2 = r2Matrix[s1][s2];
                    if (r2 > r2Threshold) {
                        SNPInLDWithOtherSNP = true;
                        if (clusters[s1] == -1 && clusters[s2] == -1) {
                            //Both SNPs have not yet been assigned to any cluster, easy!
                            clusters[s1] = nrClusters;
                            clusters[s2] = nrClusters;
                            nrClusters++;
                        } else {
                            if (clusters[s1] != -1 && clusters[s2] != -1) {
                                //Both SNPS have already been assigned to clusters, merge these clusters:
                                int previousClusterS1 = clusters[s1];
                                int previousClusterS2 = clusters[s2];
                                for (int q = 0; q < nrSNPsToUse; q++) {
                                    if (clusters[q] == previousClusterS1) {
                                        clusters[q] = nrClusters;
                                    }
                                    if (clusters[q] == previousClusterS2) {
                                        clusters[q] = nrClusters;
                                    }
                                }
                                nrClusters++;
                            } else {
                                if (clusters[s1] == -1) {
                                    clusters[s1] = clusters[s2];
                                }
                                if (clusters[s2] == -1) {
                                    clusters[s2] = clusters[s1];
                                }
                            }
                        }
                    }
                }
            }
            if (!SNPInLDWithOtherSNP) {
                clusters[s1] = nrClusters;
                nrClusters++;
            }
        }

        // build string array representation of SNP clusters
        // the first SNP in each cluster will be the most significant one if the SNPs were given in order of significance
        List independifiedSNPs = new ArrayList();
        nrIndependifiedLeadSNPs = 0;
        for (int c = 0; c < nrClusters; c++) {
            boolean clusterContainsSNPs = false;
            for (int s1 = 0; s1 < nrSNPsToUse; s1++) {
                if (clusters[s1] == c) {
                    clusterContainsSNPs = true;
                    break;
                }
            }
            if (clusterContainsSNPs) {
                nrIndependifiedLeadSNPs++;
                String snpsThisCluster = "";
                String delim = "";
                for (int s1 = 0; s1 < nrSNPsToUse; s1++) {
                    if (clusters[s1] == c) {
                        snpsThisCluster += delim + snps[s1];
                        delim = ";";
                    }
                }
                independifiedSNPs.add(snpsThisCluster);
            }
        }

        return independifiedSNPs.toArray(new String[0]);
    }

    public String[] independify(String permutationDataset, int permutation, double pThreshold, double r2Threshold, int bpThreshold, int nrIndependentSNPsWanted) throws IOException {
        return independify(permutationDataset, permutation, pThreshold, r2Threshold, bpThreshold, nrIndependentSNPsWanted, null);
    }

    public void addPermutationDataset(String name, String filename) throws IOException {
        DoubleMatrixDataset permDataset = new DoubleMatrixDataset(filename);
        if (permDatasets == null) {
            permDatasets = new HashMap>();
        }
        permDatasets.put(name, permDataset);
        if (filename.endsWith(".txt")) {
            permDataset.save(filename.replaceAll(".txt", ".binary"));
        }
    }

    public String[] independify(String permutationDataset, int permutation, double pThreshold, double r2Threshold, int bpThreshold, int nrIndependentSNPsWanted, String outFile) throws IOException {

        DoubleMatrixDataset permDataset = permDatasets.get(permutationDataset);
        if (permDataset == null) {
            throw new IllegalStateException("Dataset '" + permutationDataset + "' not set!");
        }

        TObjectIntHashMap snpToSNPId = genotypeData.getSnpToSNPId();
        Vector vecTopResults = new Vector();
        for (int s = 0; s < permDataset.nrCols; s++) {
            if (permDataset.rawData[permutation][s] < pThreshold) {
                if (snpToSNPId.get(permDataset.colObjects.get(s)) == -9) {
//                    System.err.println("SNP not in data! " + permDataset.colObjects.get(s));
                    continue;
                }
                Byte chr = genotypeData.getChr(s);
                if (chr < 0 || chr > 23) {
                    // bad SNP is bad
                } else {
                    umcg.genetica.containers.StringDoubleObject e = new umcg.genetica.containers.StringDoubleObject(permDataset.colObjects.get(s), permDataset.rawData[permutation][s]);
                    vecTopResults.add(e);
                }
            }
        }
        umcg.genetica.util.StringDoubleObjectSorterSortOnDouble sorter = new umcg.genetica.util.StringDoubleObjectSorterSortOnDouble();
        sorter.sort(vecTopResults);

        String[] snps = new String[vecTopResults.size()];
        for (int q = 0; q < vecTopResults.size(); q++) {
            snps[q] = ((umcg.genetica.containers.StringDoubleObject) vecTopResults.get(q)).stringValue;
        }

        LOGGER.log(Level.INFO, "Permutation {0}:\tUsing {1} SNPs, this will require {2} MB of memory", new Object[]{permutation, snps.length, 4 * snps.length * (snps.length / 1024d / 1024d)});

        DetermineLD ldCalc = new DetermineLD();
        float[][] r2Matrix = new float[snps.length][snps.length];
        int[] snpsChr = new int[snps.length];
        int[] snpsChrPos = new int[snps.length];
        for (int s1 = 0; s1 < snps.length; s1++) {
            Integer get = snpToSNPId.get(snps[s1]);
            SNP snp1 = genotypeData.getSNPObject(get);
            synchronized (snpLoader) {
                snpLoader.loadGenotypes(snp1);
            }
            byte chr1 = snp1.getChr();
            int chrPos1 = snp1.getChrPos();
            snpsChr[s1] = chr1;
            snpsChrPos[s1] = chrPos1;
            for (int s2 = s1 + 1; s2 < snps.length; s2++) {
                SNP snp2 = genotypeData.getSNPObject(snpToSNPId.get(snps[s2]));
                if (s1 != s2 && chr1 == snp2.getChr() && Math.abs(chrPos1 - snp2.getChrPos()) < bpThreshold) {
                    synchronized (snpLoader) {
                        snpLoader.loadGenotypes(snp2);
                    }
                    double r2 = ldCalc.getRSquared(snp1, snp2, genotypeData, DetermineLD.RETURN_R_SQUARED, DetermineLD.INCLUDE_CASES_AND_CONTROLS, false);
                    r2Matrix[s1][s2] = (float) r2;
                    r2Matrix[s2][s1] = (float) r2;
                }
                snp2.clearGenotypes();
            }
            snp1.clearGenotypes();
        }

        int nrIndependifiedLeadSNPs = 0;
        int nrSNPsToUse = snps.length;

        int lower = 0;
        int upper = snps.length;
        List independifiedSNPs = new ArrayList();

        while (nrIndependifiedLeadSNPs != nrIndependentSNPsWanted) {

            nrSNPsToUse = (upper + lower) / 2;

            int[] clusters = new int[snps.length];
            for (int s1 = 0; s1 < snps.length; s1++) {
                clusters[s1] = -1;
            }
            int nrClusters = 0;

            for (int s1 = 0; s1 < nrSNPsToUse; s1++) {
                boolean SNPInLDWithOtherSNP = false;
                for (int s2 = 0; s2 < nrSNPsToUse; s2++) {
                    if (s1 != s2 && snpsChr[s1] == snpsChr[s2] && Math.abs(snpsChrPos[s1] - snpsChrPos[s2]) < bpThreshold) {
                        double r2 = r2Matrix[s1][s2];
                        if (r2 > r2Threshold) {
                            SNPInLDWithOtherSNP = true;
                            if (clusters[s1] == -1 && clusters[s2] == -1) {
                                //Both SNPs have not yet been assigned to any cluster, easy!
                                clusters[s1] = nrClusters;
                                clusters[s2] = nrClusters;
                                nrClusters++;
                            } else {
                                if (clusters[s1] != -1 && clusters[s2] != -1) {
                                    //Both SNPS have already been assigned to clusters, merge these clusters:
                                    int previousClusterS1 = clusters[s1];
                                    int previousClusterS2 = clusters[s2];
                                    for (int q = 0; q < nrSNPsToUse; q++) {
                                        if (clusters[q] == previousClusterS1) {
                                            clusters[q] = nrClusters;
                                        }
                                        if (clusters[q] == previousClusterS2) {
                                            clusters[q] = nrClusters;
                                        }
                                    }
                                    nrClusters++;
                                } else {
                                    if (clusters[s1] == -1) {
                                        clusters[s1] = clusters[s2];
                                    }
                                    if (clusters[s2] == -1) {
                                        clusters[s2] = clusters[s1];
                                    }
                                }
                            }
                        }
                    }
                }
                if (!SNPInLDWithOtherSNP) {
                    clusters[s1] = nrClusters;
                    nrClusters++;
                }
            }

            independifiedSNPs = new ArrayList();
            TextFile out = null;
            if (outFile != null && !outFile.isEmpty()) {
                out = new TextFile(outFile, true);
            }
            nrIndependifiedLeadSNPs = 0;
            for (int c = 0; c < nrClusters; c++) {
                boolean clusterContainsSNPs = false;
                for (int s1 = 0; s1 < nrSNPsToUse; s1++) {
                    if (clusters[s1] == c) {
                        clusterContainsSNPs = true;
                        break;
                    }
                }
                if (clusterContainsSNPs) {
                    nrIndependifiedLeadSNPs++;
                    if (out != null) {
                        out.write(c + "");
                    }
                    String snpsThisCluster = "";
                    String delim = "";
                    for (int s1 = 0; s1 < nrSNPsToUse; s1++) {
                        if (clusters[s1] == c) {
                            snpsThisCluster += delim + snps[s1];
                            delim = ";";
                            if (out != null) {
                                out.write("\t" + snps[s1]);
                            }
                        }
                    }
                    independifiedSNPs.add(snpsThisCluster);
                    if (out != null) {
                        out.writeln();
                    }
                }
            }
            if (out != null) {
                out.close();
            }

//            System.out.println("Number of SNPs to include:\t" + snps.length + "\t" + nrIndependifiedLeadSNPs);
            if (nrIndependifiedLeadSNPs != nrIndependentSNPsWanted) {
                if (nrIndependifiedLeadSNPs < nrIndependentSNPsWanted) {
                    lower = nrSNPsToUse;
                } else {
                    upper = nrSNPsToUse;
                }
            }
            if (upper - lower <= 1) {
                break;
            }

        }

        return independifiedSNPs.toArray(new String[0]);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy