
umcg.genetica.io.geofiles.ParseSoftFile Maven / Gradle / Ivy
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package umcg.genetica.io.geofiles;
import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
import umcg.genetica.math.matrix.DoubleMatrixDataset;
import umcg.genetica.methylation.SoftfileAnnotation;
/**
*
* @author Lude & Marc Jan
*/
public class ParseSoftFile {
private static Pattern SPLIT_ON_TAB = Pattern.compile("\\t");
private static Pattern SPLIT_ON_EQUALS = Pattern.compile(" = ");
protected static final String ENCODING = "ISO-8859-1";
public static HashMap importAnnotationFromSOFTFile(String fileLocation) throws Exception {
HashMap dataAnnotation = new HashMap();
if (fileLocation.equals("") || !fileLocation.endsWith(".soft")) {
throw new Exception("No (correct) file specified");
}
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fileLocation)), ENCODING), 8096);
String str;
while ((str = in.readLine()) != null) {
if (str.startsWith("^SAMPLE")) {
String sampleName = SPLIT_ON_EQUALS.split(str)[1];
SoftfileAnnotation tmp = new SoftfileAnnotation();
while ((str = in.readLine()) != null) {
if (str.startsWith("^SAMPLE")) {
tmp.putAnnotationInformation("Error_In_SoftFile", "True");
dataAnnotation.put(sampleName, tmp);
sampleName = SPLIT_ON_EQUALS.split(str)[1];
tmp = new SoftfileAnnotation();
}
if (str.startsWith("!Sample_title = ")) {
tmp.setTitle(SPLIT_ON_EQUALS.split(str)[1]);
} else if (str.startsWith("!Sample_geo_accession = ")) {
tmp.setAccession(SPLIT_ON_EQUALS.split(str)[1]);
}
if (str.startsWith("!Sample_") && str.contains(" = ")) {
String[] stringSplit = SPLIT_ON_EQUALS.split(str);
if (stringSplit.length == 2) {
if (tmp.getAnnotationInformation().containsKey(SPLIT_ON_EQUALS.split(str)[0])) {
tmp.putAnnotationInformation(SPLIT_ON_EQUALS.split(str)[0], tmp.getAnnotationInformation().get(SPLIT_ON_EQUALS.split(str)[0]) + " // " + SPLIT_ON_EQUALS.split(str)[1]);
} else {
tmp.putAnnotationInformation(SPLIT_ON_EQUALS.split(str)[0], SPLIT_ON_EQUALS.split(str)[1]);
}
} else {
//System.out.println(str);
}
} else if (str.startsWith("#") && str.contains(" = ")) {
String[] stringSplit = SPLIT_ON_EQUALS.split(str);
if (stringSplit.length == 2) {
if (tmp.getAnnotationInformation().containsKey(SPLIT_ON_EQUALS.split(str)[0])) {
tmp.putAnnotationInformation(SPLIT_ON_EQUALS.split(str)[0], tmp.getAnnotationInformation().get(SPLIT_ON_EQUALS.split(str)[0]) + " " + SPLIT_ON_EQUALS.split(str)[1]);
} else {
tmp.putAnnotationInformation(SPLIT_ON_EQUALS.split(str)[0], SPLIT_ON_EQUALS.split(str)[1]);
}
} else {
//System.out.println(str);
}
}
if (str.startsWith("!sample_table_begin")) {
dataAnnotation.put(sampleName, tmp);
break;
}
}
}
}
in.close();
} catch (IOException e) {
System.out.println(e.getMessage());
System.exit(-1);
}
return (dataAnnotation);
}
public static DoubleMatrixDataset importSOFTFile(String fileLocation, int numberOfSamples, int numberOfProbes) throws Exception {
if (fileLocation.equals("") || !fileLocation.endsWith(".soft")) {
throw new Exception("No (correct) file specified");
}
boolean debug = false;
String probeHeader = "ID_REF";
String valueHeader = "VALUE";
String intensityHeader = "Intensity";
DoubleMatrixDataset dataset = new DoubleMatrixDataset(numberOfProbes, numberOfSamples);
HashMap hashUniqueProbes = new HashMap();
ArrayList uniqueProbes = new ArrayList();
ArrayList uniqueSamples = new ArrayList();
int sampleID = 0;
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fileLocation)), ENCODING), 8096);
String str;
while ((str = in.readLine()) != null) {
if (str.startsWith("^")) {
if (debug) {
System.out.println(str);
}
if (str.startsWith("^SAMPLE")) {
String sampleName = SPLIT_ON_EQUALS.split(str)[1];
uniqueSamples.add(str);
while ((str = in.readLine()) != null) {
//System.out.println(str);
int nrProbesThisSample = 0;
if (str.startsWith("!Sample_supplementary_file") && debug) {
System.out.println(str);
}
if (str.startsWith("!Sample_characteristics_ch1") && debug) {
if (str.toLowerCase().contains("male") || str.toLowerCase().contains("female")) {
System.out.println(sampleName + "\t" + str);
}
}
if (str.startsWith("!sample_table_begin")) {
str = in.readLine();
if (debug) {
System.out.println(str);
}
String[] data = SPLIT_ON_TAB.split(str);
int probeHeaderColumn = -1;
int valueHeaderColumn = -1;
int intensityHeaderColumn = -1;
double[] valsValue = new double[numberOfProbes];
for (int d = 0; d < data.length; ++d) {
if (data[d].equals(probeHeader)) {
probeHeaderColumn = d;
}
if (data[d].equals(valueHeader)) {
valueHeaderColumn = d;
}
if (data[d].equals(intensityHeader)) {
intensityHeaderColumn = d;
}
}
if (intensityHeaderColumn != -1) {
//valueHeaderColumn = intensityHeaderColumn;
}
int nrMissingValues = 0;
while ((str = in.readLine()) != null) {
if (str.startsWith("!sample_table_end")) {
break;
}
if (valueHeaderColumn != -1) {
data = SPLIT_ON_TAB.split(str);
double value;
if (data.length <= valueHeaderColumn || data[valueHeaderColumn] == null || data[valueHeaderColumn].length() == 0 || data[valueHeaderColumn].equalsIgnoreCase("null")) {
value = -999;
nrMissingValues++;
} else {
value = Double.parseDouble(data[valueHeaderColumn]);
nrProbesThisSample++;
}
if (!hashUniqueProbes.containsKey(data[probeHeaderColumn])) {
int probeID = hashUniqueProbes.size();
hashUniqueProbes.put(data[probeHeaderColumn], probeID);
uniqueProbes.add(data[probeHeaderColumn]);
valsValue[probeID] = value;
} else {
int probeID = ((Integer) hashUniqueProbes.get(data[probeHeaderColumn])).intValue();
valsValue[probeID] = value;
}
}
}
if (probeHeaderColumn != -1) {
for (int p = 0; p < valsValue.length; p++) {
dataset.rawData[p][sampleID] = valsValue[p];
}
dataset.colObjects.set(sampleID, sampleName);
if (debug) {
System.out.println(sampleName + "\t" + sampleID + "\tNrProbesThisSample:\t" + nrProbesThisSample + "\tNrMissingProbeValues:\t" + nrMissingValues + "\t" + hashUniqueProbes.size() + "\t" + uniqueSamples.size());
}
sampleID++;
}
break;
}
}
}
}
}
in.close();
} catch (IOException e) {
System.out.println(e.getMessage());
System.exit(-1);
}
if (debug) {
System.out.println("Total number of samples:\t" + sampleID);
}
for (int r = 0; r < dataset.nrRows; r++) {
dataset.rowObjects.set(r, (String) uniqueProbes.get(r));
}
dataset.recalculateHashMaps();
return (dataset);
}
public static DoubleMatrixDataset importSOFTFileSelection(String fileLocation, int numberOfSamples, int numberOfProbes, int samplesPerDataset) throws Exception {
if (fileLocation.equals("") || !fileLocation.endsWith(".soft")) {
throw new Exception("No (correct) file specified");
}
boolean debug = false;
String probeHeader = "ID_REF";
String valueHeader = "VALUE";
String intensityHeader = "Intensity";
HashMap SelectionBasedOnSeriesId = new HashMap();
DoubleMatrixDataset dataset = new DoubleMatrixDataset(numberOfProbes, numberOfSamples);
HashMap hashUniqueProbes = new HashMap();
ArrayList uniqueProbes = new ArrayList();
ArrayList uniqueSamples = new ArrayList();
int sampleID = 0;
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fileLocation)), ENCODING), 8096);
String str;
while ((str = in.readLine()) != null) {
if (str.startsWith("^")) {
if (debug) {
System.out.println(str);
}
if (str.startsWith("^SAMPLE")) {
String sampleName = SPLIT_ON_EQUALS.split(str)[1];
String sampleSeriesId = "";
while ((str = in.readLine()) != null) {
//System.out.println(str);
int nrProbesThisSample = 0;
if (str.startsWith("!Sample_series_id")) {
if (debug) {
System.out.println(str);
}
sampleSeriesId = SPLIT_ON_EQUALS.split(str)[1];
}
if (str.startsWith("!Sample_supplementary_file") && debug) {
System.out.println(str);
}
if (str.startsWith("!Sample_characteristics_ch1") && debug) {
if (str.toLowerCase().contains("male") || str.toLowerCase().contains("female")) {
System.out.println(sampleName + "\t" + str);
}
}
if (str.startsWith("!sample_table_begin")) {
str = in.readLine();
if (debug) {
System.out.println(str);
}
String[] data = SPLIT_ON_TAB.split(str);
int probeHeaderColumn = -1;
int valueHeaderColumn = -1;
int intensityHeaderColumn = -1;
double[] valsValue = new double[numberOfProbes];
for (int d = 0; d < data.length; ++d) {
if (data[d].equals(probeHeader)) {
probeHeaderColumn = d;
}
if (data[d].equals(valueHeader)) {
valueHeaderColumn = d;
}
if (data[d].equals(intensityHeader)) {
intensityHeaderColumn = d;
}
}
if (intensityHeaderColumn != -1) {
//valueHeaderColumn = intensityHeaderColumn;
}
int nrMissingValues = 0;
while ((str = in.readLine()) != null) {
if (str.startsWith("!sample_table_end")) {
break;
}
if (valueHeaderColumn != -1) {
data = SPLIT_ON_TAB.split(str);
double value;
if (data.length <= valueHeaderColumn || data[valueHeaderColumn] == null || data[valueHeaderColumn].length() == 0 || data[valueHeaderColumn].equalsIgnoreCase("null")) {
value = -999;
nrMissingValues++;
} else {
value = Double.parseDouble(data[valueHeaderColumn]);
nrProbesThisSample++;
}
if (!hashUniqueProbes.containsKey(data[probeHeaderColumn])) {
int probeID = hashUniqueProbes.size();
hashUniqueProbes.put(data[probeHeaderColumn], probeID);
uniqueProbes.add(data[probeHeaderColumn]);
valsValue[probeID] = value;
} else {
int probeID = ((Integer) hashUniqueProbes.get(data[probeHeaderColumn])).intValue();
valsValue[probeID] = value;
}
}
}
if (SelectionBasedOnSeriesId.containsKey(sampleSeriesId)) {
SelectionBasedOnSeriesId.put(sampleSeriesId, SelectionBasedOnSeriesId.get(sampleSeriesId) + 1);
} else {
SelectionBasedOnSeriesId.put(sampleSeriesId, 1);
}
if (probeHeaderColumn != -1 && !(SelectionBasedOnSeriesId.get(sampleSeriesId) > samplesPerDataset)) {
if (debug) {
System.out.println(sampleSeriesId + "\t" + SelectionBasedOnSeriesId.get(sampleSeriesId));
}
uniqueSamples.add(sampleName);
for (int p = 0; p < valsValue.length; p++) {
dataset.rawData[p][sampleID] = valsValue[p];
}
dataset.colObjects.set(sampleID, sampleName);
if (debug) {
System.out.println(sampleName + "\t" + sampleID + "\tNrProbesThisSample:\t" + nrProbesThisSample + "\tNrMissingProbeValues:\t" + nrMissingValues + "\t" + hashUniqueProbes.size() + "\t" + uniqueSamples.size());
}
sampleID++;
}
break;
}
}
}
}
}
in.close();
} catch (IOException e) {
System.out.println(e.getMessage());
System.exit(-1);
}
if (debug) {
System.out.println("Total number of samples:\t" + sampleID);
}
for (int r = 0; r < dataset.nrRows; r++) {
dataset.rowObjects.set(r, (String) uniqueProbes.get(r));
}
dataset.recalculateHashMaps();
return (dataset);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy