Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
umcg.genetica.methylation.AssociatingPcasWithAnnotation Maven / Gradle / Ivy
package umcg.genetica.methylation;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.commons.collections.primitives.ArrayDoubleList;
import org.apache.commons.math3.stat.correlation.SpearmansCorrelation;
import umcg.genetica.io.text.TextFile;
import umcg.genetica.math.matrix.DoubleMatrixDataset;
import umcg.genetica.math.stats.Correlation;
import umcg.genetica.math.stats.TTest;
import umcg.genetica.math.stats.ZScores;
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
/**
*
* @author MarcJan & Juha
*/
public class AssociatingPcasWithAnnotation {
private static Pattern SPLIT_ON_TAB = Pattern.compile("\\t");
/**
* @param args the command line arguments
*/
public static void main(String[] args) throws IOException, ClassNotFoundException {
// DoubleMatrixDataset ds = new DoubleMatrixDataset("/Data/Sasha/GeneGSEAgeCorrelationZScoresGPL570.txt").getTransposedDataset();
//// Integer get = ds.hashRows.get("GSE16716+GSE20194+GSE24080");
// Integer get = ds.hashRows.get("GSE14924");
// System.out.println(ArrayMath.max(ds.rawData[get]));
// return;
// FisherExactTest fe = new FisherExactTest();
// double fisherPValue = fe.getFisherPValue(67, 420, 55, 19930);
// System.out.println(fisherPValue);
// return;
// String fileWithAnnotation = "/Data/MJ/Annotation/GPL8490_family_annotation_mesh_2013_2.txt";
String fileWithAnnotation = "/Data/Sasha/GPL96GPL570AgeSamplesWithRangesAveragedInfantsLeftOut.txt";
// String eigenVectorFile = "/Data/MJ/PCA_GPL8490_19102012/eigenvectors_Filtered.txt";
// String eigenVectorFile = "/Data/MJ/PCA_GPL8490_SexChrs-Filtered/eigenvectors_Filtered.txt";
// String eigenVectorFile = "/Data/MJ/PCA_GPL8490_19102012/GPL8490_family_all.quantilenormalized-missingvaluesreplaced.txt";
// String eigenVectorFile = "/Data/MJ/GPL8490_family_SexProbesRemoved.quantilenormalized.missingvaluesreplaced-transposed.binary";
String datafile = "/Data/GeneExpressionFinal/PCA/GPL570/GPL570ExpressiondataQNOnlyHumanSamplesOnlyENSGsCollapsed.binary";
// String datafile = "/Data/GeneExpressionFinal/PCA/GPL96/GPL96ExpressiondataQNOnlyHumanSamplesOnlyENSGsCollapsed.binary";
System.out.print("Read annotation file .... ");
HashMap sampleAnnotation = readAnnotationFile(fileWithAnnotation);
System.out.println("done");
// TextFile tf = new TextFile("/Data/MJ/Top500AgeProbes.txt", TextFile.R);
// Set probes = new HashSet(tf.readAsArrayList());
// System.out.println(probes.size() + " probes read");
TextFile tf = new TextFile("/Data/GeneExpressionFinal/SampleAnnotation/GPL570/GPL570CellLineSamplesAsPerTextMiningAndCorrelationWithCellLineProfile.txt", TextFile.R);
// TextFile tf = new TextFile("/Data/GeneExpressionFinal/SampleAnnotation/GPL96/GPL96CellLineSamplesAsPerTextMiningAndCorrelationWithCellLineProfile.txt", TextFile.R);
ArrayList kickOutSamples = new ArrayList(tf.readAsArrayList());
System.out.println(kickOutSamples.size() + " samples will be kicked out");
System.out.print("Read data file .... ");
// DoubleMatrixDataset data = readDoubleMatrixFile(datafile);
DoubleMatrixDataset data = readDoubleMatrixFileWithOutGivenColumns(datafile, kickOutSamples);
// eigenVectors = eigenVectors.getTransposedDataset();
// eigenVectors.save("/Data/MJ/GPL8490_family_SexProbesRemoved.quantilenormalized.missingvaluesreplaced-transposed.binary");
// data.save("/Data/GeneExpressionFinal/PCA/GPL96/GPL96ExpressiondataQNOnlyHumanSamplesOnlyENSGsCollapsed.binary");
System.out.println("done");
// String infoKey = "Gender";
String infoKey = "Age";
// ArrayList entries = new ArrayList();
//entries.addAll(Arrays.asList("Male", "Female"));
HashMap> interestSets;
interestSets = selectSamplesWithInformationOfInterest(sampleAnnotation, infoKey, data, false);
//interestSets = selectSamplesWithSeriesInformation(sampleAnnotation, eigenVectors);
System.out.println("Number of interest sets: " + interestSets.size());
//associateScoreAndItemOfInterest(eigenVectors, interestSets, entries);
correlateScoreAndItemOfInterest(data, interestSets, "/Data/Sasha/GenesCorrelatedWithAgeGPL570CellLinesExcludedLeaveOneOut.txt", false);
}
/**
* Read annotation file Tab separated file containing sample annotation
*
* @param fileWithAnnotation
* @return Sample annotation
*/
private static HashMap readAnnotationFile(String fileWithAnnotation) throws IOException {
TextFile tf = new TextFile("/Data/GeneExpressionFinal/SampleAnnotation/GSMToGenericGSEName-GSE2109SplitPerTissue.txt", false);
Map gsm2gse = tf.readAsHashMap(0, 1);
HashMap sampleInfo = new HashMap();
try {
TextFile in = new TextFile(fileWithAnnotation, TextFile.R);
String str = in.readLine();
String[] headers = SPLIT_ON_TAB.split(str);
int meshInfoIndex = -1;
for (int i = 1; i < headers.length; ++i) {
if (headers[i].toLowerCase().contains("mesh")) {
meshInfoIndex = i;
break;
}
}
while ((str = in.readLine()) != null) {
String[] entries = SPLIT_ON_TAB.split(str);
String gse = gsm2gse.get(entries[0]);
if (gse == null) {
System.out.println("problem");
}
entries[2] = gse;
SoftfileAnnotation tmp = new SoftfileAnnotation();
tmp.setAccession(entries[0]);
if (!(meshInfoIndex < 0)) {
tmp.setMeshTerms(entries[meshInfoIndex]);
}
for (int i = 1; i < entries.length; ++i) {
tmp.putAnnotationInformation(headers[i], entries[i]);
}
sampleInfo.put(entries[0], tmp);
}
in.close();
} catch (IOException e) {
System.out.println(e.getMessage());
System.exit(-1);
}
return (sampleInfo);
}
/**
* Read double matrix file
* Eigenvector file / pc file / probe matrix
* @param eigenVectorFile
* @return
*/
private static DoubleMatrixDataset readDoubleMatrixFile(String eigenVectorFile) {
return readDoubleMatrixFile(eigenVectorFile, null);
}
/**
* Read double matrix file restricting to given rows
* Eigenvector file / pc file / probe matrix
* @param eigenVectorFile
* @return
*/
private static DoubleMatrixDataset readDoubleMatrixFile(String eigenVectorFile, Set rowsToInclude) {
DoubleMatrixDataset tmp = new DoubleMatrixDataset();
try {
if (rowsToInclude == null) {
tmp = new DoubleMatrixDataset(eigenVectorFile);//, "\t");
} else {
tmp = new DoubleMatrixDataset(eigenVectorFile, null, rowsToInclude);//, "\t");
}
} catch (IOException ex) {
Logger.getLogger(AssociatingPcasWithAnnotation.class.getName()).log(Level.SEVERE, null, ex);
}
return (tmp);
}
/**
* Read double matrix file not including given columns
* Eigenvector file / pc file / probe matrix
* @param eigenVectorFile
* @return
*/
private static DoubleMatrixDataset readDoubleMatrixFileWithOutGivenColumns(String eigenVectorFile, ArrayList columnsToExclude) throws IOException, ClassNotFoundException {
List columnObjectsOnly = DoubleMatrixDataset.getColumnObjectsOnly(eigenVectorFile);
Set colsToRetain = new HashSet(columnObjectsOnly);
colsToRetain.removeAll(columnsToExclude);
DoubleMatrixDataset tmp = new DoubleMatrixDataset(eigenVectorFile, null, colsToRetain);
return (tmp);
}
/**
* Filter out interest GSE sets.
* Sets need to have at least 2 different values for the infoKey of interest.
* Automagicaly it checks if samples are in the double matrix dataset
*
* @param sampleAnnotation Annotation information
* @param infoKey Key for annotation of interest
* @param doubleMatrix double matrix dataset
* @return
*/
private static HashMap> selectSamplesWithInformationOfInterest(HashMap sampleAnnotation, String infoKey, DoubleMatrixDataset eigenVectors, boolean samplesOnRows) {
HashMap> gseSets = new HashMap>();
ArrayList removeSamples = new ArrayList();
for (Entry tmp : sampleAnnotation.entrySet()) {
if (!(tmp.getValue().getAnnotationInformation().containsKey(infoKey))) {
System.out.print("No " + infoKey + " information");
System.exit(0);
}
break;
}
for (Entry sample : sampleAnnotation.entrySet()) {
if (!(sample.getValue().getAnnotationInformation().get(infoKey).isEmpty()) || !(sample.getValue().getAnnotationInformation().get(infoKey).equals(""))) {
boolean contains;
if (samplesOnRows) {
contains = eigenVectors.rowObjects.contains(sample.getKey());
} else {
contains = eigenVectors.colObjects.contains(sample.getKey());
}
if (contains) {
String seriesId = sample.getValue().getAnnotationInformation().get("series id");
if (gseSets.containsKey(seriesId)) {
gseSets.get(seriesId).put(sample.getKey(), sample.getValue().getAnnotationInformation().get(infoKey));
} else {
HashMap tmp = new HashMap();
tmp.put(sample.getKey(), sample.getValue().getAnnotationInformation().get(infoKey));
gseSets.put(seriesId, tmp);
}
} else {
removeSamples.add(sample.getKey());
}
} else {
removeSamples.add(sample.getKey());
}
}
ArrayList removeGseSets = new ArrayList();
int numberOfInterestSets = 0;
int numberOfInterestSamples = 0;
if (gseSets.size() > 0) {
for (Entry> gse : gseSets.entrySet()) {
ArrayList uniqueValues = new ArrayList();
for (Entry sample : gse.getValue().entrySet()) {
if (!uniqueValues.contains(sample.getValue())) {
uniqueValues.add(sample.getValue());
}
}
if (uniqueValues.size() >= 2 && gse.getValue().size() >= 10) {
numberOfInterestSets++;
// System.out.println(gse.getKey());
for (Entry sample : gse.getValue().entrySet()) {
numberOfInterestSamples++;
//System.out.println("\t" + sample.getKey() + "\t" + sample.getValue());
}
} else {
removeGseSets.add(gse.getKey());
for (Entry sample : gse.getValue().entrySet()) {
removeSamples.add(sample.getKey());
}
}
}
} else {
System.out.println("Unforeseen error check Key and code");
System.exit(0);
}
System.out.println("Number of sets: " + numberOfInterestSets);
System.out.println("Total samples of interest: " + numberOfInterestSamples);
for (String removeEntry : removeGseSets) {
gseSets.remove(removeEntry);
}
for (String removeEntry : removeSamples) {
sampleAnnotation.remove(removeEntry);
}
return (gseSets);
}
/**
* Filter out interest GSE sets. Samples need to have a GSE id. Automagicaly
* it checks if samples are in the double matrix dataset
*
* @param sampleAnnotation Annotation information
* @param infoKey Key for annotation of interest
* @param doubleMatrix double matrix dataset
* @return
*/
private static HashMap> selectSamplesWithSeriesInformation(HashMap sampleAnnotation, DoubleMatrixDataset eigenVectors) {
HashMap> gseSets = new HashMap>();
ArrayList removeSamples = new ArrayList();
for (Entry sample : sampleAnnotation.entrySet()) {
if (!(sample.getValue().getAnnotationInformation().get("series id").isEmpty()) || !(sample.getValue().getAnnotationInformation().get("series id").equals(""))) {
if (eigenVectors.rowObjects.contains(sample.getKey())) {
String seriesId = sample.getValue().getAnnotationInformation().get("series id");
if (gseSets.containsKey(seriesId)) {
gseSets.get(seriesId).put(sample.getKey(), sample.getValue().getAnnotationInformation().get("series id"));
} else {
HashMap tmp = new HashMap();
tmp.put(sample.getKey(), sample.getValue().getAnnotationInformation().get("series id"));
gseSets.put(seriesId, tmp);
}
}
} else {
removeSamples.add(sample.getKey());
}
}
for (String removeEntry : removeSamples) {
sampleAnnotation.remove(removeEntry);
}
return (gseSets);
}
/**
* Test for difference between 2 groups with T-test.
*
* @param doubleMatrix
* @param interestSets
* @param entries names of the two groups
*/
public static void associateScoreAndItemOfInterest(DoubleMatrixDataset doubleMatrix, HashMap> interestSets, ArrayList entries) {
//System.out.println(eigenVectors.nrCols);
//System.out.println(eigenVectors.nrRows);
HashMap scorePerGse = new HashMap();
HashMap indeces = new HashMap();
for (Entry> set : interestSets.entrySet()) {
for (Entry sample : set.getValue().entrySet()) {
if (doubleMatrix.rowObjects.contains(sample.getKey())) {
int index = doubleMatrix.rowObjects.indexOf(sample.getKey());
indeces.put(sample.getKey(), index);
} else {
System.out.println("Potential mismatch between annotation and samples");
System.out.println(sample.getKey() + " is not in value matrix");
System.out.println("\n However :" + indeces.size() + " are in the matrix");
System.exit(0);
}
}
}
//System.out.println(indeces.size());
for (int i = 0; i < doubleMatrix.nrCols; ++i) {
//System.out.println(doubleMatrix.colObjects.get(i));
for (Entry> set : interestSets.entrySet()) {
ArrayDoubleList valueSet1 = new ArrayDoubleList();
ArrayDoubleList valueSet2 = new ArrayDoubleList();
//ArrayList keySet1 = new ArrayList();
//ArrayList keySet2 = new ArrayList();
for (Entry sample : set.getValue().entrySet()) {
if (sample.getValue().equals(entries.get(0))) {
//keySet1.add(sample.getKey());
valueSet1.add(doubleMatrix.rawData[indeces.get(sample.getKey())][i]);
} else if (sample.getValue().equals(entries.get(1))) {
//keySet2.add(sample.getKey());
valueSet2.add(doubleMatrix.rawData[indeces.get(sample.getKey())][i]);
}
}
double[] set1 = valueSet1.toArray(new double[0]);
double[] set2 = valueSet2.toArray(new double[0]);
if (set1.length > 2 && set2.length > 2) {
double zScore = TTest.testZscore(set1, set2);
//System.out.println(doubleMatrix.colObjects.get(i)+"_"+set.getKey()+"\t"+pValue);
scorePerGse.put(doubleMatrix.colObjects.get(i) + "_" + set.getKey(), zScore);
}
}
}
}
public static String[] readGeneNamesForProbes(String filename, List probes) throws IOException {
String[] geneNames = new String[probes.size()];
TextFile in = new TextFile(filename, TextFile.R);
String line;
in.readLine();
while ((line = in.readLine()) != null) {
String[] split = SPLIT_ON_TAB.split(line);
int indexOf = probes.indexOf(split[0]);
if (indexOf >= 0) {
geneNames[indexOf] = split[1];
}
}
in.close();
return geneNames;
}
/**
* Correlate values of interest to age
*
* @param doubleMatrix
* @param interestSets
*/
public static void correlateScoreAndItemOfInterest(DoubleMatrixDataset doubleMatrix, HashMap> interestSets, String outfile, boolean samplesOnRows) throws IOException {
HashMap indeces = new HashMap();
int largestSet = 0;
for (Entry> set : interestSets.entrySet()) {
for (Entry sample : set.getValue().entrySet()) {
int index;
if (samplesOnRows) {
index = doubleMatrix.rowObjects.indexOf(sample.getKey());
} else {
index = doubleMatrix.colObjects.indexOf(sample.getKey());
}
if (index > -1) {
indeces.put(sample.getKey(), index);
} else {
System.out.println("Potential mismatch between annotation and samples");
System.out.println(sample.getKey() + " is not in value matrix");
System.out.println("\n However :" + indeces.size() + " are in the matrix");
System.exit(0);
}
}
if (largestSet < set.getValue().size()) {
largestSet = set.getValue().size();
}
}
Correlation.correlationToZScore(largestSet);
// String[] geneNames = readGeneNamesForProbes("/Data/MJ/gpl_8490.chip", doubleMatrix.colObjects);
TextFile plos = new TextFile("/Data/MJ/Epigenome-Wide_Scans.txt", TextFile.R);
Map plosPValues = plos.readAsHashMap(0, 5);
int nrProbes;
if (samplesOnRows) {
nrProbes = doubleMatrix.nrCols;
} else {
nrProbes = doubleMatrix.nrRows;
}
double[] metaZ = new double[nrProbes];
double[][] probeGSEZ = new double[nrProbes][interestSets.size()];
double[][] leaveOneOutProbeGSEZ = new double[nrProbes][interestSets.size() + 1];
SpearmansCorrelation sc = new SpearmansCorrelation();
TextFile out = new TextFile(outfile, TextFile.W);
String[] setNames = new String[interestSets.size()];
int[] setSizes = new int[interestSets.size()];
for (int i = 0; i < nrProbes; ++i) {
double[] zScores = new double[interestSets.size()];
int index = 0;
for (Entry> set : interestSets.entrySet()) {
int sizeOfGseSet = set.getValue().size();
setSizes[index] = sizeOfGseSet;
setNames[index] = set.getKey();
ArrayDoubleList valueSet = new ArrayDoubleList();
ArrayDoubleList ageSet = new ArrayDoubleList();
for (Entry sample : set.getValue().entrySet()) {
if (samplesOnRows) {
valueSet.add(doubleMatrix.rawData[indeces.get(sample.getKey())][i]);
} else {
valueSet.add(doubleMatrix.rawData[i][indeces.get(sample.getKey())]);
}
try {
ageSet.add(Double.parseDouble(sample.getValue()));
} catch (NumberFormatException ex) { // data not numerical, assume gender here as a quick fix
ageSet.add("male".equals(sample.getValue().toLowerCase()) ? 1 : 2);
}
}
double[] setValues = valueSet.toArray(new double[0]);
double[] setAges = ageSet.toArray(new double[0]);
double spearman = sc.correlation(setValues, setAges);
// double correlation = JSci.maths.ArrayMath.correlation(setValues, setAges);
// double zScore = Correlation.convertCorrelationToZScore(sizeOfGseSet, correlation);
double zScore = Correlation.convertCorrelationToZScore(sizeOfGseSet, spearman);
zScores[index] = zScore;
index++;
// if (Math.abs(zScore) > 4.5 && "GSE16716+GSE20194+GSE24080".equals(set.getKey())) {
// if (Math.abs(zScore) > 6.66 && setSizes[index-1] < 35) {
if ("ENSG00000117525".equals(doubleMatrix.rowObjects.get(i))) {
// if ("GSE8167".equals(set.getKey()) && "ENSG00000088448".equals(doubleMatrix.rowObjects.get(i))) {
System.out.println(zScore + " " + spearman + " " + ZScores.zToP(zScore));
// for (int j = 0; j < setValues.length; j++) {
// System.out.println(doubleMatrix.rowObjects.get(i) + " " + setValues[j] + " " + setAges[j]);
// }
}
}
probeGSEZ[i] = zScores;
// leave-one-out z weighting
for (int leave = 0; leave < zScores.length; leave++) { // leave leave'th z score out
double[] zScoresLeft = new double[zScores.length - 1];
int[] setSizesLeft = new int[zScores.length - 1];
int zi = 0;
for (int j = 0; j < zScores.length; j++) {
if (j != leave) {
zScoresLeft[zi] = zScores[j];
setSizesLeft[zi] = setSizes[j];
zi++;
}
}
double leftZ = ZScores.getWeightedZ(zScoresLeft, setSizesLeft);
double p = ZScores.zToP(leftZ);
leaveOneOutProbeGSEZ[i][leave + 1] = leftZ;
// out.writeln(doubleMatrix.colObjects.get(i) + "\t" + geneNames[i] + "\t" + setNames[leave] + "\t" + leftZ + "\t" + p);
out.writeln(doubleMatrix.colObjects.get(i) + "\t" + setNames[leave] + "\t" + leftZ + "\t" + p);
}
metaZ[i] = ZScores.getWeightedZ(zScores, setSizes);
leaveOneOutProbeGSEZ[i][0] = metaZ[i];
double p = ZScores.zToP(metaZ[i]);
// String plosP = plosPValues.get(doubleMatrix.colObjects.get(i));
// out.writeln(doubleMatrix.colObjects.get(i) + "\t" + geneNames[i] + "\t-\t" + metaZ[i] + "\t" + p);
if (samplesOnRows) {
out.writeln(doubleMatrix.colObjects.get(i) + "\t-\t" + metaZ[i] + "\t" + p);
} else {
out.writeln(doubleMatrix.rowObjects.get(i) + "\t-\t" + metaZ[i] + "\t" + p);
}
}
out.close();
for (int i = 0; i < setNames.length; i++) {
System.out.println(setNames[i] + "\t" + setSizes[i]);
}
DoubleMatrixDataset probeGSEDataset = new DoubleMatrixDataset(probeGSEZ);
// probeGSEDataset.rowObjects = doubleMatrix.colObjects;
probeGSEDataset.rowObjects = doubleMatrix.rowObjects;
probeGSEDataset.colObjects = Arrays.asList(setNames);
// probeGSEDataset.removeColumnsWithNaNs();
// probeGSEDataset.save("/Data/MJ/ProbeGSEAgeCorrelationZScores.txt");
probeGSEDataset.save("/Data/Sasha/GeneGSEAgeCorrelationZScoresGPL570.txt");
//
// DoubleMatrixDataset leaveOneOutDataset = new DoubleMatrixDataset(leaveOneOutProbeGSEZ);
// leaveOneOutDataset.rowObjects = doubleMatrix.colObjects;
// List setNamesAbstractList = Arrays.asList(setNames);
// List setNamesList = new ArrayList(setNamesAbstractList);
// setNamesList.add(0, "meta");
// leaveOneOutDataset.colObjects = setNamesList;
// leaveOneOutDataset.removeColumnsWithNaNs();
// leaveOneOutDataset.save("/Data/MJ/ProbeGSEAgeCorrelationLeaveOneOutZScores.txt");
// for (Entry> set : interestSets.entrySet()) {
// for (Entry e : set.getValue().entrySet()) {
// Integer sampleIndex = indeces.get(e.getKey());
// double correlation = ArrayMath.correlation(doubleMatrix.rawData[sampleIndex], metaZ);
// System.out.println(e.getKey() + "\t" + e.getValue() + "\t" + correlation);
// }
// }
}
/**
* Test for difference between all GSE groups with Anova.
*
* @param doubleMatrix
* @param interestSets
* @param entries names of the two groups
*/
public static void associateAnovaScoreAndItemOfInterest(DoubleMatrixDataset doubleMatrix, HashMap> interestSets) {
//System.out.println(eigenVectors.nrCols);
//System.out.println(eigenVectors.nrRows);
// HashMap scorePerGse = new HashMap();
HashMap indeces = new HashMap();
for (Entry> set : interestSets.entrySet()) {
for (Entry sample : set.getValue().entrySet()) {
if (doubleMatrix.rowObjects.contains(sample.getKey())) {
int index = doubleMatrix.rowObjects.indexOf(sample.getKey());
indeces.put(sample.getKey(), index);
} else {
System.out.println("Potential mismatch between annotation and samples");
System.out.println(sample.getKey() + " is not in value matrix");
System.out.println("\n However :" + indeces.size() + " are in the matrix");
System.exit(0);
}
}
}
}
}