All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eqtlmappingpipeline.util.CisEQTLProbeSNPLDCheck Maven / Gradle / Ivy

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package eqtlmappingpipeline.util;

import umcg.genetica.io.trityper.probeannotation.ProbeTranslation;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import umcg.genetica.console.ProgressBar;
import umcg.genetica.containers.Pair;
import umcg.genetica.containers.SortableSNP;
import umcg.genetica.io.text.TextFile;
import umcg.genetica.io.trityper.SNP;
import umcg.genetica.io.trityper.SNPLoader;
import umcg.genetica.io.trityper.TriTyperGenotypeData;
import umcg.genetica.io.trityper.util.ChrAnnotation;
import umcg.genetica.io.trityper.util.DetermineLD;
import umcg.genetica.text.Strings;

/**
 *
 * @author harmjan
 */
public class CisEQTLProbeSNPLDCheck {

    double cr = 0.95;
    double hwe = 0.001;
    double maf = 0.01;

    //For reference only
//    public static void main(String[] args) {
//        try {
//
//            CisEQTLProbeSNPLDCheck p = new CisEQTLProbeSNPLDCheck();
//
//            int probecol = 0;
//            int probechrcol = 2;
//            int probechrposcol = 3;
//            String probeMapFile = "/target/gpfs2/gcc/groups/wijmenga/home/mjbonder/VCF/QTL-Filtering/IlluminaHT-12-V3_EQtlMappingFileAA2_4.txt";
//            
//            String snpProbeFile = "/target/gpfs2/gcc/groups/wijmenga/home/mjbonder/VCF/QTL-Filtering/All_eQTL-SNP-Probe_Combies_MJ-26-9-2013.txt";
//            String[] referenceDatasets = new String[]{
//                "/target/gpfs2/gcc/groups/wijmenga/home/mjbonder/VCF/1kg/",
//                "/target/gpfs2/gcc/groups/wijmenga/home/mjbonder/VCF/HapMap3/",
//                "/target/gpfs2/gcc/groups/wijmenga/home/mjbonder/VCF/GoNL_v4/"
//            };
//            String[] referenceDatasetNames = new String[]{
//                "1000Genomes",
//                "HapMapCEU",
//                "GoNL_V4"
//            };
//            String outputdirectory = "/target/gpfs2/gcc/groups/wijmenga/home/mjbonder/VCF/QTL-Filtering/Expression_";
//            p.runNew(probeMapFile, snpProbeFile, referenceDatasets, referenceDatasetNames, outputdirectory, 0.2, 0.2, probecol, probechrposcol, probechrcol);
//
//        } catch (Exception e) {
//            e.printStackTrace();
//        }
//    }
    public void runNew(String probeMapFile, String snpprobefile, String[] referenceDatasets, String[] referenceDatasetNames, String outputDirectory, double rSquaredThreshold, double dPrimeThreshold,
            int probecol, int probechrposcol, int probechrcol) throws IOException {
        ArrayList probeChr = new ArrayList();
        ArrayList probeChrPos = new ArrayList();
        ArrayList probes = new ArrayList();
        HashMap probeToId = new HashMap();



        TextFile tf = new TextFile(probeMapFile, TextFile.R);
        tf.readLine();
        String[] probePositions = tf.readLineElems(TextFile.tab);
        int id = 0;
        while (probePositions != null) {
            String probeStr = probePositions[probecol];
            String probechrposStr = probePositions[probechrposcol];
            String probeChrStr = probePositions[probechrcol];
            probeChr.add(ChrAnnotation.parseChr(probeChrStr));
            probeChrPos.add(probechrposStr);
            probes.add(probeStr);
            probeToId.put(probeStr, id);
            id++;
            probePositions = tf.readLineElems(TextFile.tab);
        }
        tf.close();

        TextFile tf2 = new TextFile(snpprobefile, TextFile.R);
        HashSet> snpProbePairs = tf2.readAsPairs(0, 1);
        tf2.close();

        DetermineLD ldcalc = new DetermineLD();

        HashMap, Boolean> qcResultsBoolean = new HashMap, Boolean>();
        HashMap, String[]> qcResultsStr = new HashMap, String[]>();
        HashSet> snpProbePairsTested = new HashSet>();

        for (int d = 0; d < referenceDatasets.length; d++) {
            HashSet> snpProbePairsTestedForThisReferenceDs = new HashSet>();
            String referenceDatasetLocation = referenceDatasets[d];
            String referenceDatasetName = referenceDatasetNames[d];

            TriTyperGenotypeData reference = new TriTyperGenotypeData(referenceDatasetLocation);
            SNPLoader loader = reference.createSNPLoader();
            String[] snps = reference.getSNPs();
            TextFile output = new TextFile(outputDirectory + referenceDatasetName + "-QCOutput.txt", TextFile.W);
            output.writeln("SNPProbe\tFailsQC\tReason");

            HashSet> snpProbePairsToTestForThisReferenceDataset = new HashSet>();

            for (Pair snpProbePair : snpProbePairs) {
                String snp = snpProbePair.getLeft();
                Integer snpId = reference.getSnpToSNPId().get(snp);

                if (snpId == -9) {
                    snpProbePairsTested.add(snpProbePair);
                    snpProbePairsTestedForThisReferenceDs.add(snpProbePair);
                    output.writeln(snpProbePair.toString() + "\tUNKNOWN: SNP not present in " + referenceDatasetName);


                    String[] qcStrArr = qcResultsStr.get(snpProbePair);
                    if (qcStrArr == null) {
                        qcStrArr = new String[referenceDatasetNames.length];
                    }
                    qcStrArr[d] = "SNP not present in " + referenceDatasetName;
                    qcResultsStr.put(snpProbePair, qcStrArr);

                    Boolean b = qcResultsBoolean.get(snpProbePair);
                    if (b == null) {
                        qcResultsBoolean.put(snpProbePair, null);
                    }

                } else if (reference.getChr(snpId) == null || reference.getChr(snpId) < 1 || reference.getChr(snpId) > 23) {
                    // get the chromosome location
                    snpProbePairsTested.add(snpProbePair);
                    snpProbePairsTestedForThisReferenceDs.add(snpProbePair);
                    String[] qcStrArr = qcResultsStr.get(snpProbePair);
                    if (qcStrArr == null) {
                        qcStrArr = new String[referenceDatasetNames.length];
                    }
                    output.writeln(snpProbePair.toString() + "\tUNKNOWN: SNP maps to chromosome " + reference.getChr(snpId) + " in dataset " + referenceDatasetName);
                    qcStrArr[d] = "SNP maps to chromosome " + reference.getChr(snpId) + " in dataset " + referenceDatasetName;
                    qcResultsStr.put(snpProbePair, qcStrArr);
                    Boolean b = qcResultsBoolean.get(snpProbePair);
                    if (b == null) {
                        qcResultsBoolean.put(snpProbePair, null);
                    }
                } else {
                    snpProbePairsToTestForThisReferenceDataset.add(snpProbePair);
                }
            }

            for (byte chr = 1; chr < 23; chr++) {
                ArrayList snpsOnChr = new ArrayList();
                ArrayList probesOnChr = new ArrayList();

                int nrProbes = probes.size();
                for (int p = 0; p < nrProbes; p++) {
                    if (probeChr.get(p).equals(chr)) {
                        probesOnChr.add(p);
                    }
                }

                HashSet snpsOnChrStrHash = new HashSet();
                for (int s = 0; s < snps.length; s++) {
                    if (reference.getChr(s) == chr) {
                        snpsOnChrStrHash.add(snps[s]);
                        snpsOnChr.add(new SortableSNP(snps[s], s, chr, reference.getChrPos(s), SortableSNP.SORTBY.ID));
                    }
                }

                System.out.println(snpsOnChr.size() + "\tSNPs on Chr " + chr);
                System.out.println(probesOnChr.size() + "\tProbes on Chr " + chr);
                Collections.sort(snpsOnChr);

                HashMap> snpsInProbes = new HashMap>();

                for (int p = 0; p < probesOnChr.size(); p++) {
                    Integer probeId = probesOnChr.get(p);
                    String actualAnnotation = probeChrPos.get(probeId);
                    probePositions = actualAnnotation.split(":");
                    HashSet snpsInProbe = new HashSet();
                    for (String pos : probePositions) {
                        String[] probePositionElements = pos.split("-");
                        if (probePositionElements.length < 2) {
                            System.err.println("ERROR: " + pos + "\tis not parseable for probe: " + probes.get(probeId));
                        } else {
                            try {
                                Integer start = Integer.parseInt(probePositionElements[0]);
                                Integer stop = Integer.parseInt(probePositionElements[1]);
                                for (SortableSNP s : snpsOnChr) {
                                    int chrPos = s.chrpos;
                                    if (chrPos >= start && chrPos <= stop) {
                                        snpsInProbe.add(s.id);
                                    }
                                }
                            } catch (NumberFormatException e) {
                                System.err.println(pos + "\tis not parseable for probe: " + probes.get(probeId));
                            }
                        }
                    }
                    snpsInProbes.put(probeId, snpsInProbe);
                }

                for (Pair snpProbePair : snpProbePairsToTestForThisReferenceDataset) {
                    if (snpProbePairsTestedForThisReferenceDs.contains(snpProbePair)) {
                        // do not test again
                        continue;
                    }

                    String snp = snpProbePair.getLeft();
                    Integer snpId = reference.getSnpToSNPId().get(snp);
                    if (snpsOnChrStrHash.contains(snp)) {
                        snpProbePairsTestedForThisReferenceDs.add(snpProbePair);
                        snpProbePairsTested.add(snpProbePair);
                        String probe = snpProbePair.getRight();
                        Integer probeId = probeToId.get(probe);
                        if (probeId == null) {
                            System.err.println("ERROR: no annotation loaded for probe: " + probe);
                            output.writeln(snpProbePair.toString() + "\tUNKNOWN: \tProbe annotation not loaded.");
                            String[] qcStrArr = qcResultsStr.get(snpProbePair);
                            if (qcStrArr == null) {
                                qcStrArr = new String[referenceDatasetNames.length];
                            }
                            qcStrArr[d] = "Probe annotation not loaded.";
                            qcResultsStr.put(snpProbePair, qcStrArr);
                            qcResultsBoolean.put(snpProbePair, null);
                        } else {
                            HashSet probeSNPs = snpsInProbes.get(probeId);
                            if (probeSNPs.isEmpty()) {
                                output.writeln(snpProbePair.toString() + "\tFALSE\tNo SNPs underneath probe in " + referenceDatasetName);
                                String[] qcStrArr = qcResultsStr.get(snpProbePair);
                                if (qcStrArr == null) {
                                    qcStrArr = new String[referenceDatasetNames.length];
                                }
                                qcStrArr[d] = "No SNPs underneath probe in " + referenceDatasetName;
                                qcResultsStr.put(snpProbePair, qcStrArr);
                                qcResultsBoolean.put(snpProbePair, false);
                            } else {
                                SNP snpObj1 = reference.getSNPObject(snpId);
                                loader.loadGenotypes(snpObj1);
                                String qcStr = "";
                                Boolean failsQC = false;
                                if (snpObj1.getMAF() > maf && snpObj1.getHWEP() > hwe && snpObj1.getCR() > cr) {
                                    for (Integer otherSNP : probeSNPs) {
                                        SNP snpObj2 = reference.getSNPObject(otherSNP);
                                        loader.loadGenotypes(snpObj2);

                                        if (snpObj2.getMAF() > maf && snpObj2.getHWEP() > hwe && snpObj2.getCR() > cr) {
                                            Pair ld = ldcalc.getLD(snpObj1, snpObj2, reference, DetermineLD.INCLUDE_CASES_AND_CONTROLS, false);
                                            if (ld.getLeft() > rSquaredThreshold || ld.getRight() > dPrimeThreshold) {
                                                failsQC = true;
                                                if (qcStr.length() == 0) {
                                                    qcStr += snpObj2.getName() + "; TRUE (rsq: " + ld.getLeft() + ",D': " + ld.getRight() + ", MAF: " + snpObj2.getMAF() + ", CR: " + snpObj2.getCR() + ", HWEP: " + snpObj2.getHWEP() + ")";
                                                } else {
                                                    qcStr += "; " + snpObj2.getName() + "; TRUE (rsq: " + ld.getLeft() + ",D': " + ld.getRight() + ", MAF: " + snpObj2.getMAF() + ", CR: " + snpObj2.getCR() + ", HWEP: " + snpObj2.getHWEP() + ")";
                                                }
                                            } else {
                                                if (qcStr.length() == 0) {
                                                    qcStr += snpObj2.getName() + "; FALSE (rsq: " + ld.getLeft() + ",D': " + ld.getRight() + ", MAF: " + snpObj2.getMAF() + ", CR: " + snpObj2.getCR() + ", HWEP: " + snpObj2.getHWEP() + ")";
                                                } else {
                                                    qcStr += "; " + snpObj2.getName() + "; FALSE (rsq: " + ld.getLeft() + ",D': " + ld.getRight() + ", MAF: " + snpObj2.getMAF() + ", CR: " + snpObj2.getCR() + ", HWEP: " + snpObj2.getHWEP() + ")";
                                                }
                                            }
                                        } else {
                                            if (qcStr.length() == 0) {
                                                qcStr += snpObj2.getName() + "; UNKNOWN (MAF: " + snpObj2.getMAF() + ", CR: " + snpObj2.getCR() + ", HWEP: " + snpObj2.getHWEP() + ")";
                                            } else {
                                                qcStr += "; " + snpObj2.getName() + "; UNKNOWN (MAF: " + snpObj2.getMAF() + ", CR: " + snpObj2.getCR() + ", HWEP: " + snpObj2.getHWEP() + ")";
                                            }
                                        }
                                        snpObj2.clearGenotypes();
                                        snpObj2 = null;
                                    }
                                    output.writeln(snpProbePair.toString() + "\t" + failsQC + "\t" + qcStr);

                                } else {
                                    qcStr += "SNP does not pass QC thresholds in " + referenceDatasetName + "(MAF: " + snpObj1.getMAF() + ", CR: " + snpObj1.getCR() + ", HWEP: " + snpObj1.getHWEP() + ")";
                                    output.writeln(snpProbePair.toString() + qcStr);
                                    failsQC = null;
                                }
                                Boolean b = qcResultsBoolean.get(snpProbePair);

                                if (b == null && failsQC == null) {
                                    qcResultsBoolean.put(snpProbePair, null);
                                } else {
                                    boolean finalb = false;
                                    if (failsQC == null && b != null) {
                                        finalb = b.booleanValue();
                                    } else if (failsQC != null && b == null) {
                                        finalb = failsQC;
                                    } else {
                                        boolean bb = b.booleanValue();
                                        if (!failsQC && !bb) {
                                            finalb = false;
                                        } else {
                                            finalb = true;
                                        }
                                    }
                                    qcResultsBoolean.put(snpProbePair, finalb);
                                }

                                String[] qcStrArr = qcResultsStr.get(snpProbePair);
                                if (qcStrArr == null) {
                                    qcStrArr = new String[referenceDatasetNames.length];
                                }
                                qcStrArr[d] = qcStr;
                                qcResultsStr.put(snpProbePair, qcStrArr);
                                snpObj1.clearGenotypes();
                                snpObj1 = null;
                            }

                        }
                    }
                }
            }

            output.close();
            loader.close();

        }

        TextFile output2 = new TextFile(outputDirectory + "Merged-CombinationsNotTestedByQC.txt", TextFile.W);
        for (Pair p : snpProbePairs) {
            if (!snpProbePairsTested.contains(p)) {
                output2.writeln(p.toString());
            }
        }
        output2.close();


        TextFile output3 = new TextFile(outputDirectory + "Merged-QCResults.txt", TextFile.W);

        String header = "SNP-Probe\tFailsQC";
        for (String refDs : referenceDatasetNames) {
            header += "\tQCOutput-" + refDs;
        }

        output3.writeln(header);
        for (Pair p : snpProbePairs) {
            if (!qcResultsStr.containsKey(p)) {
            } else {
                String[] qcStr = qcResultsStr.get(p);
                Boolean b = qcResultsBoolean.get(p);
                if (b == null) {
                    output3.writeln(p.toString() + "\tUNKNOWN\t" + Strings.concat(qcStr, Strings.tab));
                } else {
                    output3.writeln(p.toString() + "\t" + b + "\t" + Strings.concat(qcStr, Strings.tab));
                }

            }
        }
        output3.close();


    }

    public void determineSNPProbePairsWhichMayHaveFalsePositiveEffect(String probeTranslation, String reference, String outdir) throws IOException {

        ProbeTranslation pb = new ProbeTranslation();
        pb.load(probeTranslation);


        TriTyperGenotypeData ds = new TriTyperGenotypeData();
        ds.load(reference);

        String[] snps = ds.getSNPs();

        TextFile output = new TextFile(outdir + "SNPProbeCombosWithPossibleHybArtifacts1Kg.txt", TextFile.W);
        for (byte chr = 1; chr < 23; chr++) {
            ArrayList snpsOnChr = new ArrayList();
            ArrayList probesOnChr = new ArrayList();

            int nrProbes = pb.getNumProbes();
            for (int p = 0; p < nrProbes; p++) {
                if (pb.getProbeChr(p) == chr) {
                    probesOnChr.add(p);
                }
            }

            for (int s = 0; s < snps.length; s++) {
                if (ds.getChr(s) == chr) {
                    snpsOnChr.add(new SortableSNP(snps[s], s, chr, ds.getChrPos(s), SortableSNP.SORTBY.ID));
                }
            }

            System.out.println(snpsOnChr.size() + "\tSNPs on Chr " + chr);
            System.out.println(probesOnChr.size() + "\tProbes on Chr " + chr);
            Collections.sort(snpsOnChr);

            HashMap> snpsInProbes = new HashMap>();

            for (int p = 0; p < probesOnChr.size(); p++) {
                String actualAnnotation = pb.getActualMappingPosition(probesOnChr.get(p));
                String[] elems = actualAnnotation.split(":");

                for (String pos : elems) {
                    String[] elems2 = pos.split("-");
                    Integer start = Integer.parseInt(elems2[0]);
                    Integer stop = Integer.parseInt(elems2[1]);

                    for (SortableSNP s : snpsOnChr) {
                        int chrPos = s.chrpos;
                        if (chrPos >= start && chrPos <= stop) {
                            HashSet snpsInProbe = snpsInProbes.get(probesOnChr.get(p));
                            if (snpsInProbe == null) {
                                snpsInProbe = new HashSet();
                            }
                            snpsInProbe.add(s.id);
                            snpsInProbes.put(probesOnChr.get(p), snpsInProbe);
                        }
                    }
                }
            }
            System.out.println(snpsInProbes.size() + "\tprobes with SNPs");

            SNPLoader loader = ds.createSNPLoader();
            DetermineLD ldcalc = new DetermineLD();

            ProgressBar progress = new ProgressBar(snpsOnChr.size(), "Testing chr: " + chr);

            HashSet snpsNotPassingQC = new HashSet();
            for (SortableSNP s : snpsOnChr) {
                // for each probe within 1Mb, check
                if (!snpsNotPassingQC.contains(s.id)) {
                    SNP snpObj1 = ds.getSNPObject(s.id);
                    loader.loadGenotypes(snpObj1);

                    if (snpObj1.getMAF() > maf && snpObj1.getHWEP() > hwe && snpObj1.getCR() > cr) {
                        for (int p = 0; p < probesOnChr.size(); p++) {
                            HashSet snpsInProbe = snpsInProbes.get(probesOnChr.get(p));
                            if (snpsInProbe != null) {
                                boolean failsQC = false;
                                String qcStr = "";
                                for (Integer snpInProbe : snpsInProbe) {
                                    if (!snpsNotPassingQC.contains(snpInProbe)) {
                                        SNP snpObj2 = ds.getSNPObject(snpInProbe);
                                        loader.loadGenotypes(snpObj2);
                                        if (snpObj2.getMAF() > maf && snpObj2.getHWEP() > hwe && snpObj2.getCR() > cr) {
                                            Pair ld = ldcalc.getLD(snpObj1, snpObj2, ds, DetermineLD.INCLUDE_CASES_AND_CONTROLS, false);
                                            if (ld.getLeft() > 0.2 || ld.getRight() > 0.2) {
                                                failsQC = true;
                                                qcStr += "\t" + snpObj2.getName() + " (" + ld.getLeft() + ", " + ld.getRight() + ")";
                                            }
                                        } else {
                                            snpsNotPassingQC.add(snpInProbe);
                                        }
                                        snpObj2.clearGenotypes();
                                    }
                                }
                                if (failsQC) {
                                    String outputStr = s.name + "\t" + pb.getProbes()[probesOnChr.get(p)] + qcStr;
                                    output.writeln(outputStr);
                                }
                            }
                        }
                    } else {
                        snpsNotPassingQC.add(s.id);
                    }
                    snpObj1.clearGenotypes();
                }
                progress.iterate();
            }
            progress.close();
        }
        output.close();
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy