smile.data.parser.microarray.PCLParser Maven / Gradle / Ivy
/*******************************************************************************
* Copyright (c) 2010 Haifeng Li
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package smile.data.parser.microarray;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.text.ParseException;
import smile.data.Attribute;
import smile.data.AttributeDataset;
import smile.data.Datum;
import smile.data.NumericAttribute;
/**
* Stanford cDNA file parser. The PCL file format is a tab delimited
* file format that describes a gene expression dataset. The first three
* columns are as follows:
*
* - Name
* - This column will contain a name ascribed to the entity on that row, such
* as ORF name, or CLONEID. The column itself can be named anything, but by
* convention is named YORF when it contains yeast ORF names, CLID when it
* contains clone IDs, and LUID when it contains LUIDs. This column MUST contain
* some text on every row.
* - Description
* - This column can contain descriptive information about the entity, e.g.
* process or function, or gene symbol. It too can be named anything. It can
* optionally be left blank, but the column itself must be present.
* - GWEIGHT
* - This column allows you to weight genes with different weights, for
* instance if a gene appears on an array twice, you may want to give them
* a weight of 0.5 each. For the most part people leave this column with a
* value of 1 for every gene. This column must be present, and each row must
* have an entry.
*
* In addition the file must begin with the following two rows:
*
* - Row 1
* - This contains the column headers as described above for columns 1, 2
* and 3, then contains the experiment names for all the data columns that
* exist in the file. Each data column must have a text entry as a name for
* that column.
* - Row 2
* - This is the EWEIGHT row. The entry in the first column for this row
* should say EWEIGHT, then for each experiment, there should be an EWEIGHT
* value. This will usually be 1, but if the same experiment is duplicated
* twice, you may want to give these repeats an EWEIGHT of 0.5.
*
* The remaining cells in the file contain the actual data, such that the row
* and column specifies to which gene and which experiment a particular piece
* of data corresponds.
*
* In general the PCL file will contain log-transformed data, which is needed
* for clustering to work properly.
*
* @author Haifeng Li
*/
public class PCLParser {
/**
* Constructor.
*/
public PCLParser() {
}
/**
* Parse a PCL dataset from given URI.
* @throws java.io.FileNotFoundException
*/
public AttributeDataset parse(URI uri) throws FileNotFoundException, IOException, ParseException {
return parse(new File(uri));
}
/**
* Parse a PCL dataset from given URI.
* @param uri the URI of data source.
* @throws java.io.FileNotFoundException
*/
public AttributeDataset parse(String name, URI uri) throws FileNotFoundException, IOException, ParseException {
return parse(name, new File(uri));
}
/**
* Parse a PCL dataset from given file.
* @throws java.io.FileNotFoundException
*/
public AttributeDataset parse(String path) throws FileNotFoundException, IOException, ParseException {
return parse(new File(path));
}
/**
* Parse a PCL dataset from given file.
* @param path the file path of data source.
* @throws java.io.FileNotFoundException
*/
public AttributeDataset parse(String name, String path) throws FileNotFoundException, IOException, ParseException {
return parse(name, new File(path));
}
/**
* Parse a PCL dataset from given file.
* @throws java.io.FileNotFoundException
*/
public AttributeDataset parse(File file) throws FileNotFoundException, IOException, ParseException {
return parse(file.getPath(), new FileInputStream(file));
}
/**
* Parse a PCL dataset from given file.
* @param file the file of data source.
* @throws java.io.FileNotFoundException
*/
public AttributeDataset parse(String name, File file) throws FileNotFoundException, IOException, ParseException {
return parse(name, new FileInputStream(file));
}
/**
* Parse a PCL dataset from an input stream.
* @param name the name of dataset.
* @param stream the input stream of data.
* @throws java.io.FileNotFoundException
*/
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
String line = reader.readLine();
if (line == null) {
throw new IOException("Empty data source.");
}
String[] tokens = line.split("\t", -1);
int p = tokens.length - 3;
line = reader.readLine();
if (line == null) {
throw new IOException("Premature end of file.");
}
String[] weight = line.split("\t", -1);
if (weight.length != tokens.length) {
throw new IOException("Invalid sample weight header.");
}
Attribute[] attributes = new Attribute[p];
for (int i = 0; i < p; i++) {
attributes[i] = new NumericAttribute(tokens[i+3], null, Double.valueOf(weight[i+3]));
}
AttributeDataset data = new AttributeDataset(name, attributes);
for (int i = 3; (line = reader.readLine()) != null; i++) {
tokens = line.split("\t", -1);
if (tokens.length != weight.length) {
throw new IOException(String.format("Invalid number of elements of line %d: %d", i, tokens.length));
}
double[] x = new double[p];
for (int j = 0; j < p; j++) {
if (tokens[j+3].isEmpty()) {
x[j] = Double.NaN;
} else {
x[j] = Double.valueOf(tokens[j+3]);
}
}
Datum datum = new Datum(x);
datum.name = tokens[0];
datum.description = tokens[1];
datum.weight = Double.valueOf(tokens[2]);
data.add(datum);
}
reader.close();
return data;
}
}