All Downloads are FREE. Search and download functionalities are using the official Maven repository.

nl.umcg.bondermj.microbialmetanalysis.BinaryMicrobePcaAnalysis Maven / Gradle / Ivy

The newest version!
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nl.umcg.bondermj.microbialmetanalysis;

import nl.umcg.westrah.binarymetaanalyzer.*;
import gnu.trove.map.hash.TObjectIntHashMap;
import java.io.File;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import umcg.genetica.console.ProgressBar;
import umcg.genetica.io.Gpio;
import umcg.genetica.io.text.TextFile;
import umcg.genetica.io.trityper.util.BaseAnnot;
import umcg.genetica.math.stats.Correlation;
import umcg.genetica.math.stats.Descriptives;
import umcg.genetica.math.stats.ZScores;
import umcg.genetica.text.Strings;

/**
 *
 * @author Marc Jan & Harm-Jan
 */
public class BinaryMicrobePcaAnalysis {

    public static void main(String[] args) {

        String settingsFile = null;
        String textToReplace = null;
        String replaceTextWith = null;

        if (args.length == 3) {
            settingsFile = args[0];
            textToReplace = args[1];
            replaceTextWith = args[2];

        } else if (args.length == 1) {
            settingsFile = args[0];

        } else {
            System.out.println("Usage: settings.xml replacetext replacetextwith");
            System.exit(-1);
        }

        BinaryMicrobePcaAnalysis meta = new BinaryMicrobePcaAnalysis(settingsFile, textToReplace, replaceTextWith);
        System.exit(0);

    }

    private MetaQTL4TraitAnnotation probeAnnotation;
    private BinaryMetaAnalysisDataset[] datasets = new BinaryMetaAnalysisDataset[0];
    private int[][] snpIndex;
    private String[] snpList;
    private final BinaryMetaAnalysisSettings settings;
    private String[] snpChr;
    private int[] snpPositions;
    private Integer[][] probeIndex;

    private QTL[] finalEQTLs;
    private boolean bufferHasOverFlown;
    private double maxSavedPvalue = -Double.MAX_VALUE;
    private int locationToStoreResult;

    TObjectIntHashMap traitMap = new TObjectIntHashMap();
    MetaQTL4MetaTrait[] traitList = null;

    public BinaryMicrobePcaAnalysis(String settingsFile, String textToReplace, String replaceTextWith) {
        // initialize settings
        settings = new BinaryMetaAnalysisSettings();
        settings.parse(settingsFile, textToReplace, replaceTextWith);
        int maxResults = settings.getFinalEQTLBufferMaxLength();
        int tmpbuffersize = (maxResults / 10);

        if (tmpbuffersize == 0) {
            tmpbuffersize = 10;
        } else if (tmpbuffersize > 250000) {
            tmpbuffersize = 250000;
        }

        try {
            run((maxResults + tmpbuffersize));
        } catch (IOException ex) {
            Logger.getLogger(BinaryMicrobePcaAnalysis.class.getName()).log(Level.SEVERE, null, ex);
        }

    }

    public void run(int bufferSize) throws IOException {

        String outdir = settings.getOutput();
        outdir = Gpio.formatAsDirectory(outdir);
        Gpio.createDir(outdir);
        // load probe annotation and index
        // this particular probe annotation can take multiple probes for a single location into account.
        System.out.println("Loading probe annotation from: " + settings.getProbetranslationfile());
        loadProbeAnnotation();

        for (int permutation = 0; permutation < settings.getNrPermutations() + 1; permutation++) {
            clearResultsBuffer();
            maxSavedPvalue = -Double.MAX_VALUE;
            // create dataset objects
            System.out.println("Running permutation " + permutation);
            datasets = new BinaryMetaAnalysisDataset[settings.getDatasetlocations().size()];

            System.out.println("Loading datasets");
            for (int d = 0; d < datasets.length; d++) {
                datasets[d] = new BinaryMetaAnalysisDataset(settings.getDatasetlocations().get(d), settings.getDatasetnames().get(d), settings.getDatasetPrefix().get(d), permutation, settings.getDatasetannotations().get(d), probeAnnotation);
            }
            System.out.println("Loaded " + datasets.length + " datasets");

            // create meta-analysis SNP index. have to recreate this every permutation, 
            // since the order of SNPs is generated at random.
            System.out.println("Creating SNP index");
            createSNPIndex();
            System.out.println("Total of " + snpIndex.length + " SNPs");
            System.out.println("Creating probe index");
            createProbeIndex();
            System.out.println("Total of " + probeIndex.length + " probes");

            if (snpChr == null) {
                System.out.println("Loading SNP annotation from " + settings.getSNPAnnotationFile());
                loadSNPAnnotation();
            }

            // run analysis
            System.out.println("Cis-analysis: " + settings.isCis());
            System.out.println("Trans-analysis: " + settings.isTrans());
            System.out.println("Cis-window: " + settings.getCisdistance());
            System.out.println("Trans-window: " + settings.getTransdistance());

            System.out.println("Starting meta-analysis");
            ProgressBar pb = new ProgressBar(snpList.length);

            snpLoop:
            for (int snp = 0; snp < snpList.length; snp++) {

                boolean printed = false;
                int[] sampleSizes = new int[datasets.length];
                int totalSampleSize = 0;
                Boolean[] flipZScores = new Boolean[datasets.length];
                String alleles = null;
                String alleleAssessed = null;

                // determine whether to flip the alleles for a certain dataset
                for (int d = 0; d < datasets.length; d++) {

                    int datasetSNPId = snpIndex[snp][d];
                    if (datasetSNPId == -9 && settings.isConfineSNPs()) {
                        continue snpLoop;
                    }
                    if (datasetSNPId != -9) {
                        sampleSizes[d] = datasets[d].getSampleSize(datasetSNPId);
                        totalSampleSize += sampleSizes[d];

                        if (alleles == null) {
                            flipZScores[d] = false;
                            alleles = datasets[d].getAlleles(datasetSNPId);
                            alleleAssessed = datasets[d].getAlleleAssessed(datasetSNPId);
                        } else {
                            String alleles2 = datasets[d].getAlleles(datasetSNPId);
                            String alleleAssessed2 = datasets[d].getAlleleAssessed(datasetSNPId);
                            flipZScores[d] = BaseAnnot.flipalleles(alleles, alleleAssessed, alleles2, alleleAssessed2);
                        }
                    }
                }
                Correlation.correlationToZScore(totalSampleSize + 10);

                Set cisProbes = null;
                if (!settings.isCis()) {
                    // do not test the cis probes
                    cisProbes = probeAnnotation.getMetatraits().getTraitInWindow(snpChr[snp], snpPositions[snp], settings.getTransdistance());
                }

                // iterate over the probe index
                float[][] finalZScores = new float[probeIndex.length][datasets.length];

                for (int d = 0; d < datasets.length; d++) {
                    if (datasets[d].getIsCisDataset()) {
                        System.err.println("ERROR: cannot run trans analysis on a cis dataset: " + settings.getDatasetlocations().get(d));
                        System.exit(-1);
                    }

                    if (flipZScores[d] == null) {
                        for (float[] zScore : finalZScores) {
                            zScore[d] = Float.NaN;
                        }
                    } else {
                        int datasetSNPId = snpIndex[snp][d];
                        float[] datasetZScores = datasets[d].getZScores(datasetSNPId);

// probeIndex[t.getMetaTraitId()][d] = p;
                        for (int p = 0; p < traitList.length; p++) {
                            MetaQTL4MetaTrait t = traitList[p];
                            if (cisProbes != null && cisProbes.contains(t)) {
                                finalZScores[p][d] = Float.NaN;
                            } else {
                                Integer datasetProbeId = probeIndex[p][d];
                                if (datasetProbeId != null) {
                                    finalZScores[p][d] = datasetZScores[datasetProbeId];
                                    if (flipZScores[d]) {
                                        finalZScores[p][d] *= -1;
                                    }
                                } else {
                                    finalZScores[p][d] = Float.NaN;
                                }
                            }
                        }
                    }
                }
                // meta-analyze!
                if (permutation > 0) {
                    double summedRsquare = 0;
                    for (int probe = 0; probe < traitList.length; probe++) {
                        double metaAnalysisZ = ZScores.getWeightedZ(finalZScores[probe], sampleSizes);
                        double tScore = ZScores.zScoreToCorrelation(metaAnalysisZ, totalSampleSize);
                        summedRsquare += tScore * tScore;
                    }
                    double newMetaZ = Correlation.convertCorrelationToZScore(totalSampleSize, Math.sqrt(summedRsquare));
                    double newMetaAnalysisP = Descriptives.convertZscoreToPvalue(newMetaZ);

                    // create output object
                    if (!Double.isNaN(newMetaAnalysisP)) {
                        MetaQTL4MetaTrait t = new MetaQTL4MetaTrait(21, "Microbe_Components", "-", -1, -1, "", traitList[0].getPlatformIds());
                        QTL q = new QTL(newMetaAnalysisP, t, snp, BaseAnnot.toByte(alleleAssessed), newMetaZ, BaseAnnot.toByteArray(alleles), new float[finalZScores[0].length], sampleSizes); // sort buffer if needed.
                        addEQTL(q);
                    } else {
                        System.out.println("Error in procedure.");
                    }
                } else {
                    double summedRsquare = 0;
                    float[] summedPerDataSet = new float[finalZScores[0].length];
                    for (int probe = 0; probe < traitList.length; probe++) {
                        double metaAnalysisZ = ZScores.getWeightedZ(finalZScores[probe], sampleSizes);
                        for (int i = 0; i < finalZScores[probe].length; i++) {
                            double tScore = ZScores.zScoreToCorrelation(finalZScores[probe][i], sampleSizes[i]);
                            summedPerDataSet[i] += tScore * tScore;
                        }
                        double tScore = ZScores.zScoreToCorrelation(metaAnalysisZ, totalSampleSize);
                        summedRsquare += tScore * tScore;
                    }

                    for (int i = 0; i < summedPerDataSet.length; i++) {
                        summedPerDataSet[i] = (float) Correlation.convertCorrelationToZScore(sampleSizes[i], Math.sqrt(summedPerDataSet[i]));
                    }
                    double newMetaZ = Correlation.convertCorrelationToZScore(totalSampleSize, Math.sqrt(summedRsquare));
                    double newMetaAnalysisP = Descriptives.convertZscoreToPvalue(newMetaZ);

                    // create output object
                    if (!Double.isNaN(newMetaAnalysisP)) {
                        MetaQTL4MetaTrait t = new MetaQTL4MetaTrait(21, "Microbe_Components", "-", -1, -1, "", traitList[0].getPlatformIds());
                        QTL q = new QTL(newMetaAnalysisP, t, snp, BaseAnnot.toByte(alleleAssessed), newMetaZ, BaseAnnot.toByteArray(alleles), summedPerDataSet, sampleSizes); // sort buffer if needed.
                        addEQTL(q);
                    } else {
                        System.out.println("Error in procedure.");
                    }
                }
                pb.iterate();
            }
            pb.close();

            for (BinaryMetaAnalysisDataset dataset : datasets) {
                dataset.close();
            }
            writeBuffer(outdir, permutation);
        }

        /*
         TODO:
         - ZSCORE RETRIEVAL
         - Plotting of z-scores
         - validation
         - multithreadalize
         */
    }

    private void createSNPIndex() throws IOException {

        HashSet confineToTheseSNPs = null;
        if (settings.getSNPSelection() != null) {
            System.out.println("Selecting SNPs from file: " + settings.getSNPSelection());
            confineToTheseSNPs = new HashSet();
            TextFile tf = new TextFile(settings.getSNPSelection(), TextFile.R);
            confineToTheseSNPs.addAll(tf.readAsArrayList());
            tf.close();

            System.out.println(confineToTheseSNPs.size() + " SNPs loaded.");
        }

        // create a list of all available SNPs
        HashSet allSNPs = new HashSet();
        for (BinaryMetaAnalysisDataset dataset : datasets) {
            String[] snps = dataset.getSNPs();
            for (String snp : snps) {
                if (confineToTheseSNPs == null || confineToTheseSNPs.contains(snp)) {
                    allSNPs.add(snp);
                }
            }

        }

        // create a temporary map that maps each SNP to a meta-analysis position
        int ctr = 0;
        TObjectIntHashMap snpMap = new TObjectIntHashMap(allSNPs.size(), 0.85f, -9);
        snpList = new String[allSNPs.size()];
        for (String s : allSNPs) {
            snpMap.put(s, ctr);
            snpList[ctr] = s;
            ctr++;
        }

        // fill index
        snpIndex = new int[allSNPs.size()][datasets.length];
        for (int d = 0; d < datasets.length; d++) {
            for (int s = 0; s < allSNPs.size(); s++) {
                snpIndex[s][d] = -9;
            }
        }
        for (int d = 0; d < datasets.length; d++) {
            String[] snps = datasets[d].getSNPs();
            for (int s = 0; s < snps.length; s++) {
                String snp = snps[s];
                int id = snpMap.get(snp);
                if (id != -9) {
                    snpIndex[id][d] = s;
                }
            }
        }
    }

    private void loadProbeAnnotation() throws IOException {

        HashSet platforms = new HashSet();
        platforms.addAll(settings.getDatasetannotations());
        probeAnnotation = new MetaQTL4TraitAnnotation(new File(settings.getProbetranslationfile()), platforms);
        traitList = new MetaQTL4MetaTrait[probeAnnotation.getMetatraits().size()];

        int q = 0;
        for (MetaQTL4MetaTrait t : probeAnnotation.getMetatraits()) {
            traitList[q] = t;
            traitMap.put(t, q);
            q++;
        }

    }

    private void loadSNPAnnotation() throws IOException {

        snpChr = new String[snpList.length];
        snpPositions = new int[snpList.length];
        for (int s = 0; s < snpList.length; s++) {
            snpChr[s] = "-10".intern();
            snpPositions[s] = -10;
        }

        TObjectIntHashMap snpMap = new TObjectIntHashMap(snpList.length);
        for (int s = 0; s < snpList.length; s++) {
            snpMap.put(snpList[s], s);
        }
        TextFile tf = new TextFile(settings.getSNPAnnotationFile(), TextFile.R);

        String[] elems = tf.readLineElems(TextFile.tab);
        while (elems != null) {
            String snp = elems[2];
            if (snpMap.contains(snp)) {
                int id = snpMap.get(snp);
                snpChr[id] = new String(elems[0].getBytes("UTF-8")).intern();
                snpPositions[id] = Integer.parseInt(elems[1]);
            }
            elems = tf.readLineElems(TextFile.tab);
        }
        tf.close();

    }

    // index the probes 
    private void createProbeIndex() {
        probeIndex = new Integer[traitList.length][datasets.length];
        for (int d = 0; d < datasets.length; d++) {
            String[] probes = datasets[d].getProbeList();
            int platformId = probeAnnotation.getPlatformId(settings.getDatasetannotations().get(d));
            for (int p = 0; p < probes.length; p++) {
                MetaQTL4MetaTrait t = probeAnnotation.getTraitForPlatformId(platformId, probes[p]);
                int index = traitMap.get(t);
                probeIndex[index][d] = p;
            }
        }
    }

    private void addEQTL(QTL q) {

        double pval = q.getPvalue();
        if (bufferHasOverFlown) {
            if (pval <= maxSavedPvalue) {

                finalEQTLs[locationToStoreResult] = q;
                locationToStoreResult++;

                if (locationToStoreResult == finalEQTLs.length) {
                    Arrays.sort(finalEQTLs);
                    locationToStoreResult = settings.getFinalEQTLBufferMaxLength();
                    maxSavedPvalue = finalEQTLs[(settings.getFinalEQTLBufferMaxLength() - 1)].getPvalue();
                }
            }

        } else {
            if (pval > maxSavedPvalue) {
                maxSavedPvalue = pval;
            }

            finalEQTLs[locationToStoreResult] = q;
            locationToStoreResult++;

            if (locationToStoreResult == settings.getFinalEQTLBufferMaxLength()) {
                bufferHasOverFlown = true;
            }
        }
    }

    private void writeBuffer(String outdir, int permutation) throws IOException {

        // sort the finalbuffer for a last time
        if (locationToStoreResult != 0) {
            Arrays.sort(finalEQTLs, 0, locationToStoreResult);
        }

        String outfilename = outdir + "eQTLs.txt.gz";
        if (permutation > 0) {
            outfilename = outdir + "PermutedEQTLsPermutationRound" + permutation + ".txt.gz";
        }

        System.out.println("Writing output: " + outfilename);

        TextFile output = new TextFile(outfilename, TextFile.W);
        String header = "PValue\t"
                + "SNPName\t"
                + "SNPChr\t"
                + "SNPChrPos\t"
                + "ProbeName\t"
                + "ProbeChr\t"
                + "ProbeCenterChrPos\t"
                + "CisTrans\t"
                + "SNPType\t"
                + "AlleleAssessed\t"
                + "OverallZScore\t"
                + "DatasetsWhereSNPProbePairIsAvailableAndPassesQC\t"
                + "DatasetsZScores\t"
                + "DatasetsNrSamples\t"
                + "IncludedDatasetsMeanProbeExpression\t"
                + "IncludedDatasetsProbeExpressionVariance\t"
                + "HGNCName\t"
                + "IncludedDatasetsCorrelationCoefficient\t"
                + "Meta-Beta (SE)\t"
                + "Beta (SE)\t"
                + "FoldChange";

        output.writeln(header);
// PValue	SNPName	SNPChr	SNPChrPos	ProbeName	ProbeChr	ProbeCenterChrPos	CisTrans	SNPType	AlleleAssessed	OverallZScore	DatasetsWhereSNPProbePairIsAvailableAndPassesQC	DatasetsZScores	DatasetsNrSamples	IncludedDatasetsMeanProbeExpression	IncludedDatasetsProbeExpressionVariance	HGNCName	IncludedDatasetsCorrelationCoefficient	Meta-Beta (SE)	Beta (SE)	FoldChange	FDR

        DecimalFormat format = new DecimalFormat("###.#######", new DecimalFormatSymbols(Locale.US));
        DecimalFormat smallFormat = new DecimalFormat("0.#####E0", new DecimalFormatSymbols(Locale.US));
        for (int i = 0; i < settings.getFinalEQTLBufferMaxLength(); i++) {
            QTL q = finalEQTLs[i];
            if (q != null) {
                StringBuilder sb = new StringBuilder();
                if (q.getPvalue() < 1E-4) {
                    sb.append(smallFormat.format(q.getPvalue()));
                } else {
                    sb.append(format.format(q.getPvalue()));
                }

                sb.append("\t");
                int snpId = q.getSNPId();
                sb.append(snpList[snpId]);
                sb.append("\t");
                sb.append(snpChr[snpId]);
                sb.append("\t");
                sb.append(snpPositions[snpId]);
                sb.append("\t");
                MetaQTL4MetaTrait t = q.getMetaTrait();
                sb.append(t.getMetaTraitName());
                sb.append("\t");
                sb.append(t.getChr());
                sb.append("\t");
                sb.append(t.getChrMidpoint());
                sb.append("\t");
                int dist = Math.abs(t.getChrMidpoint() - snpPositions[snpId]);
                boolean sameChr = t.getChr().equals(snpChr[snpId]);
                if (sameChr && dist < settings.getCisdistance()) {
                    sb.append("Cis");
                } else if (sameChr && dist < settings.getTransdistance()) {
                    sb.append("Greyzone");
                } else {
                    sb.append("Trans");
                }

                sb.append("\t");
                sb.append(q.getAlleles());
                sb.append("\t");
                sb.append(q.getAlleleAssessed());
                sb.append("\t");
                sb.append(format.format(q.getZscore()));

                float[] datasetZScores = q.getDatasetZScores();
                String[] dsBuilder = new String[datasets.length];
                String[] dsNBuilder = new String[datasets.length];
                for (int d = 0; d < datasetZScores.length; d++) {
                    if (!Float.isNaN(datasetZScores[d])) {
                        dsBuilder[d] = settings.getDatasetnames().get(d);
                        dsNBuilder[d] = "" + q.getDatasetSampleSizes()[d];
                    } else {
                        dsBuilder[d] = "-";
                        dsNBuilder[d] = "-";
                    }
                }

                sb.append("\t");
                sb.append(Strings.concat(dsBuilder, Strings.semicolon));

                sb.append("\t");
                sb.append(Strings.concat(datasetZScores, format, Strings.semicolon));

                sb.append("\t");
                sb.append(Strings.concat(dsNBuilder, Strings.semicolon));
                sb.append("\t-\t-\t");

                sb.append(t.getAnnotation());
                sb.append("\t-\t-\t-\t-");

                output.writeln(sb.toString());
            }
        }

        output.close();

        System.out.println(
                "Done.");
    }

    private void clearResultsBuffer() {
        Arrays.fill(finalEQTLs, null);
        bufferHasOverFlown = false;
        locationToStoreResult = 0;
        maxSavedPvalue = -Double.MAX_VALUE;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy