umcg.genetica.methylation.AssociatingPcasWithAnnotation Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of genetica-libraries Show documentation
There is a newer version: 1.0.7
package umcg.genetica.methylation;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.commons.collections.primitives.ArrayDoubleList;
import org.apache.commons.math3.stat.correlation.SpearmansCorrelation;
import umcg.genetica.io.text.TextFile;
import umcg.genetica.math.matrix.DoubleMatrixDataset;
import umcg.genetica.math.stats.Correlation;
import umcg.genetica.math.stats.TTest;
import umcg.genetica.math.stats.ZScores;

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
/**
 *
 * @author MarcJan & Juha
 */
public class AssociatingPcasWithAnnotation {

    /** Pre-compiled splitter for tab-separated lines; compiled once and never reassigned. */
    private static final Pattern SPLIT_ON_TAB = Pattern.compile("\\t");

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {

//        DoubleMatrixDataset ds = new DoubleMatrixDataset("/Data/Sasha/GeneGSEAgeCorrelationZScoresGPL570.txt").getTransposedDataset();
////        Integer get = ds.hashRows.get("GSE16716+GSE20194+GSE24080");
//        Integer get = ds.hashRows.get("GSE14924");
//        System.out.println(ArrayMath.max(ds.rawData[get]));
//        return;
        
//        FisherExactTest fe = new FisherExactTest();
//        double fisherPValue = fe.getFisherPValue(67, 420, 55, 19930);
//        System.out.println(fisherPValue);
//        return;
        
//        String fileWithAnnotation = "/Data/MJ/Annotation/GPL8490_family_annotation_mesh_2013_2.txt";
        String fileWithAnnotation = "/Data/Sasha/GPL96GPL570AgeSamplesWithRangesAveragedInfantsLeftOut.txt";
//        String eigenVectorFile = "/Data/MJ/PCA_GPL8490_19102012/eigenvectors_Filtered.txt";
//        String eigenVectorFile = "/Data/MJ/PCA_GPL8490_SexChrs-Filtered/eigenvectors_Filtered.txt";
//        String eigenVectorFile = "/Data/MJ/PCA_GPL8490_19102012/GPL8490_family_all.quantilenormalized-missingvaluesreplaced.txt";
//        String eigenVectorFile = "/Data/MJ/GPL8490_family_SexProbesRemoved.quantilenormalized.missingvaluesreplaced-transposed.binary";
        String datafile = "/Data/GeneExpressionFinal/PCA/GPL570/GPL570ExpressiondataQNOnlyHumanSamplesOnlyENSGsCollapsed.binary";
//        String datafile = "/Data/GeneExpressionFinal/PCA/GPL96/GPL96ExpressiondataQNOnlyHumanSamplesOnlyENSGsCollapsed.binary";

        System.out.print("Read annotation file .... ");
        HashMap sampleAnnotation = readAnnotationFile(fileWithAnnotation);
        System.out.println("done");

//        TextFile tf = new TextFile("/Data/MJ/Top500AgeProbes.txt", TextFile.R);
//        Set probes = new HashSet(tf.readAsArrayList());
//        System.out.println(probes.size() + " probes read");

        TextFile tf = new TextFile("/Data/GeneExpressionFinal/SampleAnnotation/GPL570/GPL570CellLineSamplesAsPerTextMiningAndCorrelationWithCellLineProfile.txt", TextFile.R);
//        TextFile tf = new TextFile("/Data/GeneExpressionFinal/SampleAnnotation/GPL96/GPL96CellLineSamplesAsPerTextMiningAndCorrelationWithCellLineProfile.txt", TextFile.R);
        ArrayList kickOutSamples = new ArrayList(tf.readAsArrayList());
        System.out.println(kickOutSamples.size() + " samples will be kicked out");

        System.out.print("Read data file .... ");
//        DoubleMatrixDataset data = readDoubleMatrixFile(datafile);
        DoubleMatrixDataset data = readDoubleMatrixFileWithOutGivenColumns(datafile, kickOutSamples);
//        eigenVectors = eigenVectors.getTransposedDataset();
//        eigenVectors.save("/Data/MJ/GPL8490_family_SexProbesRemoved.quantilenormalized.missingvaluesreplaced-transposed.binary");
//        data.save("/Data/GeneExpressionFinal/PCA/GPL96/GPL96ExpressiondataQNOnlyHumanSamplesOnlyENSGsCollapsed.binary");
        System.out.println("done");

//        String infoKey = "Gender";
        String infoKey = "Age";
//        ArrayList entries = new ArrayList();

        //entries.addAll(Arrays.asList("Male", "Female"));

        HashMap> interestSets;
        interestSets = selectSamplesWithInformationOfInterest(sampleAnnotation, infoKey, data, false);
        //interestSets = selectSamplesWithSeriesInformation(sampleAnnotation, eigenVectors);

        System.out.println("Number of interest sets: " + interestSets.size());

        //associateScoreAndItemOfInterest(eigenVectors, interestSets, entries);
        correlateScoreAndItemOfInterest(data, interestSets, "/Data/Sasha/GenesCorrelatedWithAgeGPL570CellLinesExcludedLeaveOneOut.txt", false);
    }

    /**
     * Reads a tab-separated sample annotation file.
     *
     * The first line is treated as the header. Every following line becomes
     * one {@code SoftfileAnnotation}, stored under the value of its first
     * column. The third column of each row is overwritten with the value
     * looked up (by the row's first column) in the hard-coded GSM-to-GSE
     * mapping file before the row's columns are stored as annotation
     * information under their header names. If a header containing "mesh"
     * (case-insensitive) exists, the first such column is also stored as the
     * MeSH terms of the sample.
     *
     * Rows that are shorter than the header (trailing empty fields are
     * dropped by the splitter) or longer than the header are handled without
     * failing: only columns present in both the row and the header are stored.
     *
     * An {@code IOException} while reading the annotation file itself is
     * printed and terminates the JVM with exit code -1.
     *
     * @param fileWithAnnotation path of the tab-separated annotation file
     * @return Sample annotation keyed by the first column of each row; empty
     *         if the file has no header line
     * @throws IOException if the GSM-to-GSE mapping file cannot be read
     */
    private static HashMap readAnnotationFile(String fileWithAnnotation) throws IOException {

        TextFile tf = new TextFile("/Data/GeneExpressionFinal/SampleAnnotation/GSMToGenericGSEName-GSE2109SplitPerTissue.txt", false);
        Map<String, String> gsm2gse;
        try {
            gsm2gse = tf.readAsHashMap(0, 1);
        } finally {
            // The mapping is fully loaded in memory; the file is no longer needed.
            tf.close();
        }

        HashMap<String, SoftfileAnnotation> sampleInfo = new HashMap<String, SoftfileAnnotation>();

        try {
            TextFile in = new TextFile(fileWithAnnotation, TextFile.R);
            try {
                String str = in.readLine();
                if (str == null) {
                    // Empty file: there is no header and therefore nothing to annotate.
                    return sampleInfo;
                }

                String[] headers = SPLIT_ON_TAB.split(str);

                // Index of the first header (after the accession column) mentioning "mesh", or -1.
                int meshInfoIndex = -1;
                for (int i = 1; i < headers.length; ++i) {
                    if (headers[i].toLowerCase().contains("mesh")) {
                        meshInfoIndex = i;
                        break;
                    }
                }

                while ((str = in.readLine()) != null) {
                    String[] entries = SPLIT_ON_TAB.split(str);
                    if (entries.length == 0) {
                        // A line consisting only of tabs splits into nothing; there is no accession to store.
                        continue;
                    }

                    String gse = gsm2gse.get(entries[0]);
                    if (gse == null) {
                        System.out.println("problem");
                    }
                    if (entries.length > 2) {
                        entries[2] = gse;
                    }

                    SoftfileAnnotation tmp = new SoftfileAnnotation();
                    tmp.setAccession(entries[0]);

                    // Trailing empty fields are dropped by split, so the mesh column may be absent in this row.
                    if (meshInfoIndex >= 0 && meshInfoIndex < entries.length) {
                        tmp.setMeshTerms(entries[meshInfoIndex]);
                    }

                    // Only columns that exist in both the row and the header can be stored.
                    int columns = Math.min(entries.length, headers.length);
                    for (int i = 1; i < columns; ++i) {
                        tmp.putAnnotationInformation(headers[i], entries[i]);
                    }

                    sampleInfo.put(entries[0], tmp);
                }
            } finally {
                in.close();
            }
        } catch (IOException e) {
            System.out.println(e.getMessage());
            System.exit(-1);
        }

        return sampleInfo;
    }

    /**
     * Loads a double matrix file (eigenvector file / pc file / probe matrix)
     * without restricting the rows.
     *
     * @param eigenVectorFile path of the matrix file to load
     * @return the result of {@link #readDoubleMatrixFile(String, Set)} called
     *         with no row restriction
     */
    private static DoubleMatrixDataset readDoubleMatrixFile(String eigenVectorFile) {
        // Passing null as the row set makes the two-argument variant load the file unrestricted.
        DoubleMatrixDataset unrestricted = readDoubleMatrixFile(eigenVectorFile, null);
        return unrestricted;
    }

    /**
     * Loads a double matrix file (eigenvector file / pc file / probe matrix),
     * optionally restricted to a given set of rows.
     *
     * An {@code IOException} during loading is logged at SEVERE level and
     * not rethrown; in that case the freshly constructed, unfilled dataset
     * is returned instead.
     *
     * @param eigenVectorFile path of the matrix file to load
     * @param rowsToInclude rows to restrict the dataset to, or {@code null}
     *        to load the file without a row restriction
     * @return the loaded dataset, or a default-constructed dataset if reading failed
     */
    private static DoubleMatrixDataset readDoubleMatrixFile(String eigenVectorFile, Set rowsToInclude) {
        // Fallback result that is handed back unchanged when loading fails.
        DoubleMatrixDataset dataset = new DoubleMatrixDataset();
        try {
            dataset = (rowsToInclude != null)
                    ? new DoubleMatrixDataset(eigenVectorFile, null, rowsToInclude)
                    : new DoubleMatrixDataset(eigenVectorFile);
        } catch (IOException ioe) {
            Logger.getLogger(AssociatingPcasWithAnnotation.class.getName()).log(Level.SEVERE, null, ioe);
        }
        return dataset;
    }

    /**
     * Read double matrix file not including given columns
     * Eigenvector file / pc file / probe matrix
     * @param eigenVectorFile
     * @return 
     */
    private static DoubleMatrixDataset readDoubleMatrixFileWithOutGivenColumns(String eigenVectorFile, ArrayList columnsToExclude) throws IOException, ClassNotFoundException {

        List