smile.data.parser.microarray.GCTParser Maven / Gradle / Ivy
The newest version!
/*******************************************************************************
* Copyright (c) 2010 Haifeng Li
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package smile.data.parser.microarray;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.text.ParseException;
import smile.data.Attribute;
import smile.data.AttributeDataset;
import smile.data.Datum;
import smile.data.NumericAttribute;
/**
* Gene Cluster Text file parser. The GCT format is a tab delimited
* file format that describes a gene expression dataset. It is organized as
* follows:
*
* The first line contains the version string and is always the same for this
* file format. Therefore, the first line must be as follows:
*
* #1.2
*
* The second line contains numbers indicating the size of the data table
* that is contained in the remainder of the file. Note that the name and
* description columns are not included in the number of data columns.
*
* Line format:
*
* (# of data rows) (tab) (# of data columns)
*
* Example:
*
* 7129 58
*
* The third line contains a list of identifiers for the samples associated
* with each of the columns in the remainder of the file.
*
* Line format:
*
* Name(tab)Description(tab)(sample 1 name)(tab)(sample 2 name) (tab) ... (sample N name)
*
* Example:
*
* Name Description DLBC1_1 DLBC2_1 ... DLBC58_0
*
* The remainder of the data file contains data for each of the genes. There
* is one row for each gene and one column for each of the samples. The number
* of rows and columns should agree with the number of rows and columns
* specified on line 2. Each row contains a name, a description, and an
* intensity value for each sample. Names and descriptions can contain spaces,
* but may not be empty. If no description is available, enter a text string
* such as NA or NULL. Intensity values may be missing. To specify a missing
* intensity value, leave the field empty: ...(tab)(tab)....
*
* Line format:
*
* (gene name) (tab) (gene description) (tab) (col 1 data) (tab) (col 2 data) (tab) ... (col N data)
*
* Example:
*
* AFFX-BioB-5_at AFFX-BioB-5_at (endogenous control) -104 -152 -158 ... -44
*
*
* @author Haifeng Li
*/
public class GCTParser {
    /** Version string that must make up the whole first line of a GCT file. */
    private static final String VERSION = "#1.2";

    /** Number of leading non-data columns (gene name and description) in each row. */
    private static final int LEADING_COLUMNS = 2;

    /**
     * Constructor.
     */
    public GCTParser() {
    }

    /**
     * Parses a GCT dataset from the file denoted by the given URI. The path
     * of that file is used as the dataset name.
     *
     * @param uri the URI of data source; must be convertible to a {@link File}.
     * @return the parsed dataset.
     * @throws FileNotFoundException if the file cannot be opened for reading.
     * @throws IOException if reading fails or the file structure is invalid.
     * @throws ParseException if a size or intensity value is not a valid number.
     */
    public AttributeDataset parse(URI uri) throws FileNotFoundException, IOException, ParseException {
        return parse(new File(uri));
    }

    /**
     * Parses a GCT dataset from the file denoted by the given URI.
     *
     * @param name the name of dataset.
     * @param uri the URI of data source; must be convertible to a {@link File}.
     * @return the parsed dataset.
     * @throws FileNotFoundException if the file cannot be opened for reading.
     * @throws IOException if reading fails or the file structure is invalid.
     * @throws ParseException if a size or intensity value is not a valid number.
     */
    public AttributeDataset parse(String name, URI uri) throws FileNotFoundException, IOException, ParseException {
        return parse(name, new File(uri));
    }

    /**
     * Parses a GCT dataset from the file at the given path. The path of that
     * file is used as the dataset name.
     *
     * @param path the file path of data source.
     * @return the parsed dataset.
     * @throws FileNotFoundException if the file cannot be opened for reading.
     * @throws IOException if reading fails or the file structure is invalid.
     * @throws ParseException if a size or intensity value is not a valid number.
     */
    public AttributeDataset parse(String path) throws FileNotFoundException, IOException, ParseException {
        return parse(new File(path));
    }

    /**
     * Parses a GCT dataset from the file at the given path.
     *
     * @param name the name of dataset.
     * @param path the file path of data source.
     * @return the parsed dataset.
     * @throws FileNotFoundException if the file cannot be opened for reading.
     * @throws IOException if reading fails or the file structure is invalid.
     * @throws ParseException if a size or intensity value is not a valid number.
     */
    public AttributeDataset parse(String name, String path) throws FileNotFoundException, IOException, ParseException {
        return parse(name, new File(path));
    }

    /**
     * Parses a GCT dataset from the given file. The path of the file is used
     * as the dataset name.
     *
     * @param file the file of data source.
     * @return the parsed dataset.
     * @throws FileNotFoundException if the file cannot be opened for reading.
     * @throws IOException if reading fails or the file structure is invalid.
     * @throws ParseException if a size or intensity value is not a valid number.
     */
    public AttributeDataset parse(File file) throws FileNotFoundException, IOException, ParseException {
        // The stream opened here is closed by parse(String, InputStream).
        return parse(file.getPath(), new FileInputStream(file));
    }

    /**
     * Parses a GCT dataset from the given file.
     *
     * @param name the name of dataset.
     * @param file the file of data source.
     * @return the parsed dataset.
     * @throws FileNotFoundException if the file cannot be opened for reading.
     * @throws IOException if reading fails or the file structure is invalid.
     * @throws ParseException if a size or intensity value is not a valid number.
     */
    public AttributeDataset parse(String name, File file) throws FileNotFoundException, IOException, ParseException {
        // The stream opened here is closed by parse(String, InputStream).
        return parse(name, new FileInputStream(file));
    }

    /**
     * Parses a GCT dataset from an input stream. The stream is always closed
     * before this method returns, whether parsing succeeds or fails.
     *
     * @param name the name of dataset.
     * @param stream the input stream of data.
     * @return the parsed dataset.
     * @throws IOException if reading fails, the data ends prematurely, the
     *         version line is not {@code #1.2}, the declared size is not
     *         positive, or a line has an unexpected number of fields.
     * @throws ParseException if a size or intensity value is not a valid number.
     */
    public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
        try {
            return readDataset(name, reader);
        } finally {
            // Closing the reader also closes the wrapped stream. Doing it in
            // finally releases the file handle on every failure path as well.
            reader.close();
        }
    }

    /**
     * Reads the version line, the size line, the sample header and all data
     * rows from the reader and assembles them into a dataset.
     */
    private AttributeDataset readDataset(String name, BufferedReader reader) throws IOException, ParseException {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }

        if (!line.equals(VERSION)) {
            throw new IOException("Invalid version.");
        }

        // Line 2: (# of data rows) (tab) (# of data columns).
        // The -1 limit keeps trailing empty fields so field counts stay exact.
        String[] tokens = readRequiredLine(reader).split("\t", -1);
        if (tokens.length != 2) {
            throw new IOException("Invalid data size inforamation.");
        }

        int n = parseCount(tokens[0], 0);
        int p = parseCount(tokens[1], tokens[0].length() + 1);
        if (n <= 0 || p <= 0) {
            throw new IOException(String.format("Invalid data size %d x %d.", n, p));
        }

        // Line 3: Name (tab) Description (tab) sample names.
        tokens = readRequiredLine(reader).split("\t", -1);
        if (tokens.length != p + LEADING_COLUMNS) {
            throw new IOException("Invalid title header.");
        }

        Attribute[] attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + LEADING_COLUMNS]);
        }

        AttributeDataset data = new AttributeDataset(name, attributes);

        for (int i = 0; i < n; i++) {
            // Data rows start on line 4 of the file.
            int lineNumber = i + 4;

            tokens = readRequiredLine(reader).split("\t", -1);
            if (tokens.length != p + LEADING_COLUMNS) {
                throw new IOException(String.format("Invalid number of elements of line %d: %d", lineNumber, tokens.length));
            }

            // Character offset of the first data field within the line.
            int offset = tokens[0].length() + 1 + tokens[1].length() + 1;

            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                String token = tokens[j + LEADING_COLUMNS];
                // An empty field denotes a missing intensity value.
                x[j] = token.isEmpty() ? Double.NaN : parseIntensity(token, lineNumber, offset);
                offset += token.length() + 1;
            }

            Datum datum = new Datum(x);
            datum.name = tokens[0];
            datum.description = tokens[1];
            data.add(datum);
        }

        return data;
    }

    /**
     * Reads the next line, failing if the end of the data has been reached.
     */
    private static String readRequiredLine(BufferedReader reader) throws IOException {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }
        return line;
    }

    /**
     * Parses a row or column count from the size line (line 2), reporting a
     * malformed value as a ParseException instead of an unchecked exception.
     */
    private static int parseCount(String token, int offset) throws ParseException {
        try {
            return Integer.parseInt(token);
        } catch (NumberFormatException e) {
            ParseException error = new ParseException(
                    String.format("Invalid data size value '%s' on line 2.", token), offset);
            // ParseException has no cause-taking constructor.
            error.initCause(e);
            throw error;
        }
    }

    /**
     * Parses one intensity value, reporting a malformed value as a
     * ParseException that carries the line number and character offset.
     */
    private static double parseIntensity(String token, int lineNumber, int offset) throws ParseException {
        try {
            return Double.parseDouble(token);
        } catch (NumberFormatException e) {
            ParseException error = new ParseException(
                    String.format("Invalid intensity value '%s' on line %d.", token, lineNumber), offset);
            // ParseException has no cause-taking constructor.
            error.initCause(e);
            throw error;
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy