All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.gbs.repgen.RepGenLDAnalysisPlugin Maven / Gradle / Ivy

/**
 * 
 */
package net.maizegenetics.analysis.gbs.repgen;

import java.awt.Frame;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.IntStream;

import javax.swing.ImageIcon;

import org.apache.commons.math3.stat.correlation.PearsonsCorrelation;
import org.apache.commons.math3.stat.correlation.SpearmansCorrelation;
import org.apache.log4j.Logger;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;

import net.maizegenetics.analysis.popgen.LinkageDisequilibrium;
import net.maizegenetics.dna.tag.RepGenDataWriter;
import net.maizegenetics.dna.tag.RepGenSQLite;
import net.maizegenetics.dna.tag.Tag;
import net.maizegenetics.dna.tag.TaxaDistribution;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;

/**
 * This class takes a rAmpSeq (formerly RepGen) database and for
 * each tag in the tag table, performs the following tag-tag
 * correlations based on the taxa distribution for each tag
 *   (1) tag-tag Pearson's correlation
 *   (2) tag-tag Spearman's correlation
 *   (3) tag-tag presence/absence correlations
 *   (4) r-squared
 *   
 *   The vectors presented to the analysis methods represent
 *   a list of taxa and the number of times the tag was seen
 *   in that taxa.  The presence/absence vectors have a 1 or 0
 *   as values in each slot.
 * @author lcj34
 *
 */
public class RepGenLDAnalysisPlugin extends AbstractPlugin {
    private static final Logger myLogger = Logger.getLogger(RepGenLDAnalysisPlugin.class);
    
    private PluginParameter myDBFile = new PluginParameter.Builder("db", null, String.class).guiName("Input DB").required(true).inFile()
            .description("Input database file with tags and taxa distribution").build();
    private PluginParameter minTaxa = new PluginParameter.Builder<>("minTaxa", 20, Integer.class).guiName("Min Taxa for RSquared")
            .description("Minimum number of taxa that must be present for R-squared to be calculated.").build();    
    public RepGenLDAnalysisPlugin() {
        super(null, false);
    }

    public RepGenLDAnalysisPlugin(Frame parentFrame) {
        super(parentFrame, false);
    }

    public RepGenLDAnalysisPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }
    
    @Override
    public void postProcessParameters() {

        if (myDBFile.isEmpty() || !Files.exists(Paths.get(inputDB()))) {
            throw new IllegalArgumentException("RepGenLDAnalysisPlugin: postProcessParameters: Input DB not set or found");
        }
    }
    
    @Override
    public DataSet processData(DataSet input) {
        long totalTime = System.nanoTime();
        long time=System.nanoTime();
 
        try {           
            System.out.println("RepGenLDAnalysis:processData begin, get all tags/taxadist from db"); 
            RepGenDataWriter repGenData=new RepGenSQLite(inputDB());

            Map  tagTaxaMap = repGenData.getAllTagsTaxaMap();
            int tagcount = 0;

            int processedTags = 0;
            System.out.println("TIme to get all tags with taxa from db: " + (System.nanoTime() - totalTime)/1e9 + " seconds.\n");
            time = System.nanoTime();
            Multimap tagTagCorrelations = Multimaps.synchronizedMultimap(HashMultimap.create());
            System.out.println("\nStart processing tag correlations.  Number of tags in db: " + tagTaxaMap.keySet().size());
            Set tagSet = tagTaxaMap.keySet();
            List tagList = new ArrayList(tagSet);
            
            for (int tidx = 0; tidx < tagList.size(); tidx++) {
            //for (Tag tag1 : tagTaxaMap.keySet()) {
                Tag tag1 = tagList.get(tidx);
                tagcount++;
                processedTags++;
                // get dist for each taxa
                TaxaDistribution tag1TD = tagTaxaMap.get(tag1);
                if (tag1TD == null) {
                    ((RepGenSQLite)repGenData).close();
                    System.out.println("GetTagTaxaDist: got null tagTD at tagcount " + tagcount);
                    return null; // But this should return an error?
                }
                int[] depths1 = tag1TD.depths(); // gives us the depths for each taxon
                // This needs to be doubles for Pearson
                double[] ddepths1 = new double[depths1.length];
                
                // Apparently no shorter method of casting int to double 
                for (int idx = 0; idx < depths1.length; idx++) {
                    ddepths1[idx] = (double)depths1[idx];
                }

                double[] depthsPrime1 = new double[depths1.length];
                for (int idx = 0; idx < depthsPrime1.length; idx++) {
                    // boolean - presence/absence
                    if (ddepths1[idx] > 0) depthsPrime1[idx] = 1;
                    else depthsPrime1[idx] = 0;
                }
                final int tIdxFinal = tidx; // IntStream forEach must have "final" variable, tidx is not final
                IntStream.range(tidx+1,tagList.size()).parallel().forEach(item -> {
                    calculateCorrelations(tagTagCorrelations, tagTaxaMap, 
                            tagList.get(tIdxFinal), tagList.get(item), ddepths1, depthsPrime1);
                });
                
                if (tagcount > 1000) {
                    // comment out when run for real!
                    System.out.println("FInished processing " + processedTags + " tags, this set took " + (System.nanoTime() - time)/1e9 + " seconds, now load to db ..." );
                    time = System.nanoTime();
                    repGenData.putTagTagCorrelationMatrix(tagTagCorrelations);
                    System.out.println("Loading DB took " + (System.nanoTime() - time)/1e9 + " seconds.\n");
                    tagcount = 0;
                    tagTagCorrelations.clear(); // start fresh with next 1000
                    time = System.nanoTime();
                }
            
            }
            if (tagcount > 0 ) {
                System.out.println("Finished processing last tags, load to DB");
                time = System.nanoTime();
                repGenData.putTagTagCorrelationMatrix(tagTagCorrelations);
                System.out.println("Loading DB took " + (System.nanoTime() - time)/1e9 + " seconds.\n");
                tagcount = 0;
                tagTagCorrelations.clear(); // start fresh with next 1000
            }
            System.out.println("Total number of tags processed: " + processedTags );           
            
            ((RepGenSQLite)repGenData).close();
            
        } catch (Exception exc) {
            System.out.println("RepGenLDAnalysis:process_data:  processing error");
            exc.printStackTrace();
        }
        System.out.println("Process took " + (System.nanoTime() - totalTime)/1e9 + " seconds.\n");
        return null;
    }
    
    public void calculateCorrelations(Multimap tagTagCorrelations, Map  tagTaxaMap, 
            Tag tag1, Tag tag2, double[] ddepths1,double[] depthsPrime1) {
        TaxaDistribution tag2TD = tagTaxaMap.get(tag2);
        if (tag2TD == null) {                       
            System.out.println("GetTagTaxaDist: got null tagTD for sequence " + tag2.sequence());
            return ;
        }
        
        // I need doubles below !!
        int[] depths2 = tag2TD.depths(); // gives us the depths for each taxon
        double[] ddepths2 = new double[depths2.length];
        // Apparently no shorter method of casting int to double 
        for (int idx = 0; idx < depths2.length; idx++) {
            ddepths2[idx] = (double)depths2[idx];
        }
        double[] depthsPrime2 = new double[depths2.length];
        for (int idx = 0; idx < depthsPrime1.length; idx++) {
            // boolean - presence/absence
            if (ddepths2[idx] > 0) depthsPrime2[idx] = 1;
            else depthsPrime2[idx] = 0;
        }
        // From analysis.association.GenomicSelectionPlugin.java:
        PearsonsCorrelation Pearsons = new PearsonsCorrelation();
        double p1 = Pearsons.correlation(ddepths1,ddepths2);
        
        SpearmansCorrelation Spearmans = new SpearmansCorrelation();
        double spearval = Spearmans.correlation(ddepths1, ddepths2);
        
        double p2 = Pearsons.correlation(depthsPrime1, depthsPrime2);
        
        // Count number of times both tags appeared in a taxa, number
        // of times neither tag appeared in a taxa, number of times
        // tag1 appeared but not tag2, and number of times tag2 appeared by not tag1
        int t1Nott2 = 0;
        int t2Nott1 = 0;
        int neither = 0;
        int both = 0;
        
        for (int didx = 0; didx < depthsPrime2.length; didx++) {
            if (depthsPrime1[didx] > 0 && depthsPrime2[didx] > 0) both++;
            if (depthsPrime1[didx] == 0 && depthsPrime2[didx] == 0) neither++;
            if (depthsPrime1[didx] > 0 && depthsPrime2[didx] == 0) t1Nott2++;
            if (depthsPrime1[didx] == 0 && depthsPrime2[didx] > 0) t2Nott1++;
        }
        // Calculate r-squared based on presence/absence of tags at each taxa.
        double r2 = LinkageDisequilibrium.calculateRSqr(neither, t1Nott2, t2Nott1, both, minTaxa());
        TagCorrelationInfo tci = new TagCorrelationInfo(tag2,p1,spearval,p2,r2);
        tagTagCorrelations.put(tag1, tci);       
    }
    
    @Override
    public ImageIcon getIcon() {
        // TODO Auto-generated method stub
        return null;
    }

    @Override
    public String getButtonName() {
        // TODO Auto-generated method stub
        return null;
    }

    // The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
     public static void main(String[] args) {
         GeneratePluginCode.generate(RepGenLDAnalysisPlugin.class);
     }
    @Override
    public String getToolTipText() {
        // TODO Auto-generated method stub
        return null;
    }
    /**
     * Input database file with tags and taxa distribution
     *
     * @return Input DB
     */
    public String inputDB() {
        return myDBFile.value();
    }

    /**
     * Set Input DB. Input database file with tags and taxa
     * distribution
     *
     * @param value Input DB
     *
     * @return this plugin
     */
    public RepGenLDAnalysisPlugin inputDB(String value) {
        myDBFile = new PluginParameter<>(myDBFile, value);
        return this;
    }
    
    /**
     * Minimum number of taxa that must be present for R-squared
     * to be calculated.
     *
     * @return Min Taxa for RSquared
     */
    public Integer minTaxa() {
        return minTaxa.value();
    }

    /**
     * Set Min Taxa for RSquared. Minimum number of taxa that
     * must be present for R-squared to be calculated.
     *
     * @param value Min Taxa for RSquared
     *
     * @return this plugin
     */
    public RepGenLDAnalysisPlugin minTaxa(Integer value) {
        minTaxa = new PluginParameter<>(minTaxa, value);
        return this;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy