
umcg.genetica.methylation.MethylationAssociatoingAnnotationWithValues Maven / Gradle / Ivy
package umcg.genetica.methylation;
import JSci.maths.ArrayMath;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map.Entry;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.commons.collections.primitives.ArrayDoubleList;
import org.apache.commons.math3.stat.correlation.SpearmansCorrelation;
import org.apache.commons.math3.stat.inference.OneWayAnova;
import umcg.genetica.containers.Pair;
import umcg.genetica.math.matrix.DoubleMatrixDataset;
import umcg.genetica.math.stats.Correlation;
import umcg.genetica.math.stats.Heterogeneity;
import umcg.genetica.math.stats.TTest;
import umcg.genetica.math.stats.ZScores;
import static umcg.genetica.methylation.ParseTcgaMethylationFile.ENCODING;
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
/**
*
* @author MarcJan
*/
public class MethylationAssociatoingAnnotationWithValues {
private static Pattern SPLIT_ON_TAB = Pattern.compile("\\t");
private static Pattern SPLIT_PARTS = Pattern.compile("-");
/**
* @param args the command line arguments
*/
public static void main(String[] args) {
String fileWithAnnotation = "D:\\UMCG\\Methylation_GPL8490\\TCGA+GEO_14112012\\Annotation_AllSamples.txt";
String dataFile = "D:\\UMCG\\Methylation_GPL8490\\TCGA+GEO_14112012\\methylation_Matrix_SexFiltered.QuantileNormalized.txt";
//String dataFile = "D:\\UMCG\\Methylation_GPL8490\\TCGA+GEO_14112012\\NeverQN-methylation_Matrix_SexFiltered.txt";
System.out.print("Read annotation file .... ");
HashMap sampleAnnotation = readAnnotationFile(fileWithAnnotation);
System.out.println("done");
System.out.print("Read eigenvector file .... ");
DoubleMatrixDataset eigenVectors = readDoubleMatrixFile(dataFile);
System.out.println("done");
ArrayList setSelection = new ArrayList();
//Only Blood
//setSelection.addAll(Arrays.asList("GSE20236", "GSE23638","GSE19711","GSE20067","GSE41037"));
//No Cancer
setSelection.addAll(Arrays.asList("GSE20236","GSE23638","GSE19711","GSE20067","GSE15745 // GSE36194","GSE15745","GSE41037","GSE32393","GSE31979","GSE20242",
"GSE20080","GSE36194","GSE22595","GSE29661","GSE21232","GSE30653 // GSE30654","GSE27097","GSE37988","GSE32861 // GSE32867","GSE17448","GSE33422",
"GSE25033","GSE34035","GSE28746","GSE32396"));
//No Blood
// setSelection.addAll(Arrays.asList("GSE15745 // GSE36194","GSE15745","GSE32393","TCGA.brca","GSE31979","GSE30758 // GSE30760","GSE30759 // GSE30760",
// "GSE20080","TCGA.coad","TCGA.gbm","GSE36194","GSE22595","GSE29661","GSE21232","GSE30653 // GSE30654","TCGA.kirc","TCGA.kirp","GSE37988",
// "GSE32861 // GSE32867","TCGA.lusc","TCGA.luad","GSE17448","GSE33422","TCGA.ov","GSE25033","TCGA.read","GSE34035","GSE28746","TCGA.stad",
// "GSE30844 // GSE31337","TCGA.ucec"));
//
//Analysis type 1) Age
String infoKey = "Age";
//Analysis type 2) Gender
//String infoKey = "Gender";
//ArrayList entries = new ArrayList();
//entries.addAll(Arrays.asList("Male", "Female"));
//Analysis type 3) GSE //Infokey is not used
//String infoKey = "";
String nameSeriesInfoColumn = "series id";
LinkedHashMap> interestSets;
//Depending on the data we need to transpone the data
eigenVectors = eigenVectors.getTransposedDataset();
interestSets = selectSamplesWithInformationOfInterest(sampleAnnotation, nameSeriesInfoColumn, infoKey, eigenVectors, 25);
if(setSelection.size()>0){
ArrayList removeEntry = new ArrayList();
for(Entry> e : interestSets.entrySet()){
if(!(setSelection.contains(e.getKey()))){
removeEntry.add(e.getKey());
}
}
for(String s : removeEntry){
interestSets.remove(s);
}
}
//interestSets = selectSamplesWithSeriesInformation(sampleAnnotation, eigenVectors);//This needs a refresh
//Willen we hier wat mee?
//int maxSize = 150;
//interestSets = splitInterstingSetInPortions(interestSets, maxSize);
//System.exit(0);
for(Entry> tmp : interestSets.entrySet()){
System.out.println(tmp.getKey()+"\t"+tmp.getValue().size());
}
System.out.println("Number of interest sets: " + interestSets.size());
//Analysis type 1) Age
correlateScoreAndItemOfInterest(eigenVectors, interestSets);
//Analysis type 2) Gender
//associateTTestScoreAndItemOfInterest(eigenVectors, interestSets, entries);
//Analysis type 3) GSE
//associateAnovaScoreAndItemOfInterest(eigenVectors, interestSets);
}
/**
* Read annotation file Tab separated file containing sample annotation
*
* @param fileWithAnnotation
* @return Sample annotation
*/
private static HashMap readAnnotationFile(String fileWithAnnotation) {
HashMap sampleInfo = new HashMap();
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fileWithAnnotation)), ENCODING), 8096);
String str = in.readLine();
String[] headers = SPLIT_ON_TAB.split(str);
int meshInfoIndex = -1;
for (int i = 1; i < headers.length; ++i) {
if (headers[i].toLowerCase().contains("mesh")) {
meshInfoIndex = i;
break;
}
}
while ((str = in.readLine()) != null) {
String[] entries = SPLIT_ON_TAB.split(str);
SoftfileAnnotation tmp = new SoftfileAnnotation();
tmp.setAccession(entries[0]);
if (!(meshInfoIndex < 0)) {
tmp.setMeshTerms(entries[meshInfoIndex]);
}
for (int i = 1; i < entries.length; ++i) {
tmp.putAnnotationInformation(headers[i], entries[i]);
}
sampleInfo.put(entries[0], tmp);
}
in.close();
} catch (IOException e) {
System.out.println(e.getMessage());
System.exit(-1);
}
return (sampleInfo);
}
/**
* Read double matrix file Eigenvector file / pc file / probe matrix
*
* @param eigenVectorFile
* @return
*/
private static DoubleMatrixDataset readDoubleMatrixFile(String eigenVectorFile) {
DoubleMatrixDataset tmp = new DoubleMatrixDataset();
try {
tmp = new DoubleMatrixDataset(eigenVectorFile, "\t");
} catch (IOException ex) {
Logger.getLogger(MethylationAssociatoingAnnotationWithValues.class.getName()).log(Level.SEVERE, null, ex);
}
return (tmp);
}
/**
* Filter out interest GSE sets. Sets need to have at least 2 different
* values for the infoKey of interest. Automagicaly it checks if samples are
* in the double matrix dataset
*
* @param sampleAnnotation Annotation information
* @param infoKey Key for annotation of interest
* @param doubleMatrix double matrix dataset
* @return
*/
private static LinkedHashMap> selectSamplesWithInformationOfInterest(HashMap sampleAnnotation, String nameSeriesInfoColumn , String infoKey, DoubleMatrixDataset eigenVectors, int minimalNumberSamplesInSeries) {
LinkedHashMap> gseSets = new LinkedHashMap>();
ArrayList removeSamples = new ArrayList();
for (Entry tmp : sampleAnnotation.entrySet()) {
if (!(tmp.getValue().getAnnotationInformation().containsKey(infoKey))) {
System.out.print("No " + infoKey + " information");
System.exit(0);
}
break;
}
for(String sampleName : eigenVectors.rowObjects){
if(sampleAnnotation.containsKey(sampleName)){
SoftfileAnnotation sampleAnnot = sampleAnnotation.get(sampleName);
if(!(sampleAnnot.getAnnotationInformation().get(infoKey).isEmpty()) || !(sampleAnnot.getAnnotationInformation().get(infoKey).equals(""))){
String seriesId = sampleAnnot.getAnnotationInformation().get(nameSeriesInfoColumn);
if (gseSets.containsKey(seriesId)) {
System.out.println(sampleName+"\t"+ sampleAnnot.getAnnotationInformation().get(infoKey));
gseSets.get(seriesId).put(sampleName, sampleAnnot.getAnnotationInformation().get(infoKey));
} else {
HashMap tmp = new HashMap();
tmp.put(sampleName, sampleAnnot.getAnnotationInformation().get(infoKey));
gseSets.put(seriesId, tmp);
}
}
else{
System.out.println("No age info: " + sampleName);
removeSamples.add(sampleName);
}
} else if(sampleName.startsWith("TCGA-")){
String[] sampleIdParts = SPLIT_PARTS.split(sampleName);
String newSampleName = sampleIdParts[0] + "-" + sampleIdParts[1] + "-" + sampleIdParts[2];
if(sampleAnnotation.containsKey(newSampleName)){
SoftfileAnnotation sampleAnnot = sampleAnnotation.get(newSampleName);
if(!(sampleAnnot.getAnnotationInformation().get(infoKey).isEmpty()) || !(sampleAnnot.getAnnotationInformation().get(infoKey).equals(""))){
String seriesId = sampleAnnot.getAnnotationInformation().get(nameSeriesInfoColumn);
if (gseSets.containsKey(seriesId)) {
System.out.println(sampleName+"\t"+ sampleAnnot.getAnnotationInformation().get(infoKey));
gseSets.get(seriesId).put(sampleName, sampleAnnot.getAnnotationInformation().get(infoKey));
} else {
HashMap tmp = new HashMap();
tmp.put(sampleName, sampleAnnot.getAnnotationInformation().get(infoKey));
gseSets.put(seriesId, tmp);
}
}
else{
System.out.println("No age info: " + sampleName);
removeSamples.add(sampleName);
}
} else{
System.out.println("Not in matrix: " + sampleName);
removeSamples.add(sampleName);
}
} else{
System.out.println("Not in matrix: " + sampleName);
removeSamples.add(sampleName);
}
}
ArrayList removeGseSets = new ArrayList();
int numberOfInterestSets = 0;
int numberOfInterestSamples = 0;
if (gseSets.size() > 0) {
for (Entry> gse : gseSets.entrySet()) {
ArrayList uniqueValues = new ArrayList();
for (Entry sample : gse.getValue().entrySet()) {
if (!uniqueValues.contains(sample.getValue())) {
uniqueValues.add(sample.getValue());
}
}
if (uniqueValues.size() > 1 && gse.getValue().size()>=minimalNumberSamplesInSeries) {
numberOfInterestSets++;
// System.out.println(gse.getKey());
for (Entry sample : gse.getValue().entrySet()) {
numberOfInterestSamples++;
//System.out.println("\t" + sample.getKey() + "\t" + sample.getValue());
}
} else {
removeGseSets.add(gse.getKey());
for (Entry sample : gse.getValue().entrySet()) {
removeSamples.add(sample.getKey());
}
}
}
} else {
System.out.println("Unforeseen error check Key and code");
System.exit(0);
}
System.out.println("Number of sets: " + numberOfInterestSets);
System.out.println("Total samples of interest: " + numberOfInterestSamples);
for (String removeEntry : removeGseSets) {
gseSets.remove(removeEntry);
}
for (String removeEntry : removeSamples) {
sampleAnnotation.remove(removeEntry);
}
return (gseSets);
}
/**
* Filter out interest GSE sets. Samples need to have a GSE id. Automagicaly
* it checks if samples are in the double matrix dataset
*
* @param sampleAnnotation Annotation information
* @param infoKey Key for annotation of interest
* @param doubleMatrix double matrix dataset
* @return
*/
private static HashMap> selectSamplesWithSeriesInformation(HashMap sampleAnnotation, DoubleMatrixDataset eigenVectors) {
HashMap> gseSets = new HashMap>();
ArrayList removeSamples = new ArrayList();
for (Entry sample : sampleAnnotation.entrySet()) {
if (!(sample.getValue().getAnnotationInformation().get("series id").isEmpty()) || !(sample.getValue().getAnnotationInformation().get("series id").equals(""))) {
if (eigenVectors.rowObjects.contains(sample.getKey())) {
String seriesId = sample.getValue().getAnnotationInformation().get("series id");
if (gseSets.containsKey(seriesId)) {
gseSets.get(seriesId).put(sample.getKey(), sample.getValue().getAnnotationInformation().get("series id"));
} else {
HashMap tmp = new HashMap();
tmp.put(sample.getKey(), sample.getValue().getAnnotationInformation().get("series id"));
gseSets.put(seriesId, tmp);
}
}
} else {
removeSamples.add(sample.getKey());
}
}
for (String removeEntry : removeSamples) {
sampleAnnotation.remove(removeEntry);
}
return (gseSets);
}
/**
* Test for difference between 2 groups with T-test.
*
* @param doubleMatrix
* @param interestSets
* @param entries names of the two groups
*/
private static void associateTTestScoreAndItemOfInterest(DoubleMatrixDataset doubleMatrix, HashMap> interestSets, ArrayList entries) {
//System.out.println(eigenVectors.nrCols);
//System.out.println(eigenVectors.nrRows);
HashMap scorePerGse = new HashMap();
HashMap indeces = new HashMap();
for (Entry> set : interestSets.entrySet()) {
for (Entry sample : set.getValue().entrySet()) {
if (doubleMatrix.rowObjects.contains(sample.getKey())) {
int index = doubleMatrix.rowObjects.indexOf(sample.getKey());
indeces.put(sample.getKey(), index);
} else {
System.out.println("Potential mismatch between annotation and samples");
System.out.println(sample.getKey() + " is not in value matrix");
System.out.println("\n However :" + indeces.size() + " are in the matrix");
System.exit(0);
}
}
}
//System.out.println(indeces.size());
for (int i = 0; i < doubleMatrix.nrCols; ++i) {
//System.out.println(doubleMatrix.colObjects.get(i));
for (Entry> set : interestSets.entrySet()) {
ArrayDoubleList valueSet1 = new ArrayDoubleList();
ArrayDoubleList valueSet2 = new ArrayDoubleList();
//ArrayList keySet1 = new ArrayList();
//ArrayList keySet2 = new ArrayList();
for (Entry sample : set.getValue().entrySet()) {
if (sample.getValue().equals(entries.get(0))) {
//keySet1.add(sample.getKey());
valueSet1.add(doubleMatrix.rawData[indeces.get(sample.getKey())][i]);
} else if (sample.getValue().equals(entries.get(1))) {
//keySet2.add(sample.getKey());
valueSet2.add(doubleMatrix.rawData[indeces.get(sample.getKey())][i]);
}
}
double[] set1 = valueSet1.toArray(new double[0]);
double[] set2 = valueSet2.toArray(new double[0]);
if (set1.length > 2 && set2.length > 2) {
double zScore = TTest.testZscore(set1, set2);
//System.out.println(doubleMatrix.colObjects.get(i)+"_"+set.getKey()+"\t"+pValue);
scorePerGse.put(doubleMatrix.colObjects.get(i) + "_" + set.getKey(), zScore);
}
}
}
}
/**
* Correlate values of interest to age
*
* @param doubleMatrix
* @param interestSets
*/
private static void correlateScoreAndItemOfInterest(DoubleMatrixDataset doubleMatrix, LinkedHashMap> interestSets) {
//System.out.println(eigenVectors.nrCols);
//System.out.println(eigenVectors.nrRows);
HashMap scorePerGse = new HashMap();
HashMap indeces = new HashMap();
int largestSet = 0;
for (Entry> set : interestSets.entrySet()) {
for (Entry sample : set.getValue().entrySet()) {
if (doubleMatrix.rowObjects.contains(sample.getKey())) {
int index = doubleMatrix.rowObjects.indexOf(sample.getKey());
indeces.put(sample.getKey(), index);
} else {
System.out.println("Potential mismatch between annotation and samples");
System.out.println(sample.getKey() + " is not in value matrix");
System.out.println("\n However :" + indeces.size() + " are in the matrix");
System.exit(0);
}
}
if (largestSet < set.getValue().size()) {
largestSet = set.getValue().size();
}
}
Correlation.correlationToZScore(largestSet);
//System.out.println(indeces.size());
double[] metaZ = new double[doubleMatrix.nrCols];
System.out.println("Z-scores");
System.out.print("\tMeta Z\tpValue\tHeterogeneity\tHeterogeneity pValue");
for(String t : interestSets.keySet()){
System.out.print("\t"+t);
}
System.out.print("\t");
for(String t : interestSets.keySet()){
System.out.print("\t"+t);
}
System.out.println("");
SpearmansCorrelation sc = new SpearmansCorrelation();
for (int i = 0; i < doubleMatrix.nrCols; ++i) {
//System.out.println(doubleMatrix.colObjects.get(i));
double[] zScores = new double[interestSets.size()];
double[] correlations = new double[interestSets.size()];
int[] setSizes = new int[zScores.length];
int index = 0;
for (Entry> set : interestSets.entrySet()) {
int sizeOfGseSet = set.getValue().size();
setSizes[index] = sizeOfGseSet;
ArrayDoubleList valueSet = new ArrayDoubleList();
ArrayDoubleList ageSet = new ArrayDoubleList();
//ArrayList keySet1 = new ArrayList();
for (Entry sample : set.getValue().entrySet()) {
//keySet1.add(sample.getKey());
//System.out.println(doubleMatrix.rawData[indeces.get(sample.getKey())][i]);
//System.out.println(Double.parseDouble(sample.getValue()));
valueSet.add(doubleMatrix.rawData[indeces.get(sample.getKey())][i]);
try {
ageSet.add(Double.parseDouble(sample.getValue()));
} catch (NumberFormatException ex) { // data not numerical, assume gender here as a quick fix
ageSet.add("male".equals(sample.getValue().toLowerCase()) ? 1 : 2);
}
}
double[] setValues = valueSet.toArray(new double[0]);
double[] setAges = ageSet.toArray(new double[0]);
if (setValues.length > 2) {
double spearman = sc.correlation(setValues, setAges);
// double correlation = JSci.maths.ArrayMath.correlation(setValues, setAges);
// double zScore = Correlation.convertCorrelationToZScore(sizeOfGseSet, correlation);
double zScore = Correlation.convertCorrelationToZScore(sizeOfGseSet, spearman);
zScores[index] = zScore;
correlations[index] = spearman;
// System.out.println(doubleMatrix.colObjects.get(i) + "_" + set.getKey() + "\t" + correlation);
scorePerGse.put(doubleMatrix.colObjects.get(i) + "_" + set.getKey(), zScore);
} else {
zScores[index]=Double.NaN;
}
index++;
}
double zSum = 0;
double sampleSizeSum = 0;
for (int j = 0; j < zScores.length; j++) {
if (!Double.isNaN(zScores[j])) {
zSum += Math.sqrt(setSizes[j]) * zScores[j];
sampleSizeSum += setSizes[j];
}
}
zSum = zSum / Math.sqrt(sampleSizeSum);
double p = ZScores.zToP(zSum);
Pair hg = Heterogeneity.getISq(zScores, setSizes);
//System.out.print("\tMeta Z\tpValue\tHeterogeneity\tHeterogeneity pValue");
System.out.print(doubleMatrix.colObjects.get(i) +"\t"+zSum+"\t"+p+ "\t" + hg.getLeft()+"\t"+hg.getRight());
for (double z : zScores) {
System.out.print("\t" + z);
}
System.out.print("\t");
for (double r : correlations) {
System.out.print("\t" + r);
}
System.out.println("");
metaZ[i] = zSum;
}
System.out.println("");
for (Entry> set : interestSets.entrySet()) {
for (Entry e : set.getValue().entrySet()) {
Integer sampleIndex = indeces.get(e.getKey());
double correlation = ArrayMath.correlation(doubleMatrix.rawData[sampleIndex], metaZ);
System.out.println(e.getKey() + "\t" + e.getValue() + "\t" + correlation);
}
}
// RankDoubleArray ra = new RankDoubleArray();
// double[] rank = ra.rank(metaZ);
// System.out.println(rank[0]);
}
/**
* Test for difference between all GSE groups with Anova.
*
* @param doubleMatrix
* @param interestSets
* @param entries names of the two groups
*/
private static void associateAnovaScoreAndItemOfInterest(DoubleMatrixDataset doubleMatrix, HashMap> interestSets) {
//System.out.println(eigenVectors.nrCols);
//System.out.println(eigenVectors.nrRows);
//HashMap scorePerGse = new HashMap();
HashMap indeces = new HashMap();
for (Entry> set : interestSets.entrySet()) {
for (Entry sample : set.getValue().entrySet()) {
if (doubleMatrix.rowObjects.contains(sample.getKey())) {
int index = doubleMatrix.rowObjects.indexOf(sample.getKey());
indeces.put(sample.getKey(), index);
} else {
System.out.println("Potential mismatch between annotation and samples");
System.out.println(sample.getKey() + " is not in value matrix");
System.out.println("\n However :" + indeces.size() + " are in the matrix");
System.exit(0);
}
}
}
//System.out.println(indeces.size());
for (int i = 0; i < doubleMatrix.nrCols; ++i) {
//System.out.println(doubleMatrix.colObjects.get(i));
//ArrayList keyList = new ArrayList();
ArrayList valueSets = new ArrayList();
for (Entry> set : interestSets.entrySet()) {
ArrayDoubleList valueSet = new ArrayDoubleList();
for (Entry sample : set.getValue().entrySet()) {
valueSet.add(doubleMatrix.rawData[indeces.get(sample.getKey())][i]);
}
if (valueSet.size() > 2) {
//keyList.add(set.getKey());
valueSets.add(valueSet.toArray(new double[0]));
}
}
OneWayAnova anova = new OneWayAnova();
double pValue = -1;
try {
pValue = anova.anovaPValue(valueSets);
} catch (IllegalArgumentException ex) {
Logger.getLogger(MethylationAssociatoingAnnotationWithValues.class.getName()).log(Level.SEVERE, null, ex);
}
System.out.println("Component: " + doubleMatrix.colObjects.get(i) + " capable of discriminating between the sets, with p-value: " + pValue);
}
}
private static LinkedHashMap> splitInterstingSetInPortions(LinkedHashMap> interestSets, int maxSize) {
//HashMap setsToSplit = new HashMap();
LinkedHashMap> newInterestingSets = new LinkedHashMap>();
for(Entry> series : interestSets.entrySet()){
if(series.getValue().size()>maxSize){
System.out.println(series.getValue().size()%maxSize);
}
}
return(newInterestingSets);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy