smile.data.parser.microarray.PCLParser
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.data.parser.microarray;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.text.ParseException;
import smile.data.Attribute;
import smile.data.AttributeDataset;
import smile.data.Datum;
import smile.data.NumericAttribute;

/**
 * Stanford cDNA file parser. The PCL file format is a tab delimited
 * file format that describes a gene expression dataset. The first three
 * columns are as follows:
 * 
 * <dl>
 * <dt>Name</dt>
 * <dd>This column will contain a name ascribed to the entity on that row, such
 * as ORF name, or CLONEID. The column itself can be named anything, but by
 * convention is named YORF when it contains yeast ORF names, CLID when it
 * contains clone IDs, and LUID when it contains LUIDs. This column MUST contain
 * some text on every row.</dd>
 * <dt>Description</dt>
 * <dd>This column can contain descriptive information about the entity, e.g.
 * process or function, or gene symbol. It too can be named anything. It can
 * optionally be left blank, but the column itself must be present.</dd>
 * <dt>GWEIGHT</dt>
 * <dd>This column allows you to weight genes with different weights, for
 * instance if a gene appears on an array twice, you may want to give them
 * a weight of 0.5 each. For the most part people leave this column with a
 * value of 1 for every gene. This column must be present, and each row must
 * have an entry.</dd>
 * </dl>
 * In addition the file must begin with the following two rows:
 * <dl>
 * <dt>Row 1</dt>
 * <dd>This contains the column headers as described above for columns 1, 2
 * and 3, then contains the experiment names for all the data columns that
 * exist in the file. Each data column must have a text entry as a name for
 * that column.</dd>
 * <dt>Row 2</dt>
 * <dd>This is the EWEIGHT row. The entry in the first column for this row
 * should say EWEIGHT, then for each experiment, there should be an EWEIGHT
 * value. This will usually be 1, but if the same experiment is duplicated
 * twice, you may want to give these repeats an EWEIGHT of 0.5.</dd>
 * </dl>
 * The remaining cells in the file contain the actual data, such that the row
 * and column specifies to which gene and which experiment a particular piece
 * of data corresponds.
 * <p>
 * In general the PCL file will contain log-transformed data, which is needed
 * for clustering to work properly.
 *
 * @author Haifeng Li
 */
public class PCLParser {
    /**
     * Constructor.
     */
    public PCLParser() {
    }

    /**
     * Parse a PCL dataset from given URI.
     * @throws java.io.FileNotFoundException
     */
    public AttributeDataset parse(URI uri) throws FileNotFoundException, IOException, ParseException {
        return parse(new File(uri));
    }

    /**
     * Parse a PCL dataset from given URI.
     * @param uri the URI of data source.
     * @throws java.io.FileNotFoundException
     */
    public AttributeDataset parse(String name, URI uri) throws FileNotFoundException, IOException, ParseException {
        return parse(name, new File(uri));
    }

    /**
     * Parse a PCL dataset from given file.
     * @throws java.io.FileNotFoundException
     */
    public AttributeDataset parse(String path) throws FileNotFoundException, IOException, ParseException {
        return parse(new File(path));
    }

    /**
     * Parse a PCL dataset from given file.
     * @param path the file path of data source.
     * @throws java.io.FileNotFoundException
     */
    public AttributeDataset parse(String name, String path) throws FileNotFoundException, IOException, ParseException {
        return parse(name, new File(path));
    }

    /**
     * Parse a PCL dataset from given file.
     * @throws java.io.FileNotFoundException
     */
    public AttributeDataset parse(File file) throws FileNotFoundException, IOException, ParseException {
        return parse(file.getPath(), new FileInputStream(file));
    }

    /**
     * Parse a PCL dataset from given file.
     * @param file the file of data source.
     * @throws java.io.FileNotFoundException
     */
    public AttributeDataset parse(String name, File file) throws FileNotFoundException, IOException, ParseException {
        return parse(name, new FileInputStream(file));
    }

    /**
     * Parse a PCL dataset from an input stream.
     * @param name the name of dataset.
     * @param stream the input stream of data.
     * @throws java.io.FileNotFoundException
     */
    public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream));

        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }

        String[] tokens = line.split("\t", -1);
        int p = tokens.length - 3;

        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }

        String[] weight = line.split("\t", -1);
        if (weight.length != tokens.length) {
            throw new IOException("Invalid sample weight header.");
        }

        Attribute[] attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i+3], null, Double.valueOf(weight[i+3]));
        }

        AttributeDataset data = new AttributeDataset(name, attributes);

        for (int i = 3; (line = reader.readLine()) != null; i++) {
            tokens = line.split("\t", -1);
            if (tokens.length != weight.length) {
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i, tokens.length));
            }

            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                if (tokens[j+3].isEmpty()) {
                    x[j] = Double.NaN;
                } else {
                    x[j] = Double.valueOf(tokens[j+3]);
                }
            }

            Datum datum = new Datum(x);
            datum.name = tokens[0];
            datum.description = tokens[1];
            datum.weight = Double.valueOf(tokens[2]);
            data.add(datum);
        }

        reader.close();
        return data;
    }
}
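For reference, here is a minimal sketch of how the stream-based overload might be driven. It feeds a tiny in-memory PCL table (header row, EWEIGHT row, two gene rows, with one cell left empty to show the NaN handling) through parse(String, InputStream). The class name PCLParserExample, the dataset name "toy", and the sample gene rows are illustrative only, not part of the library; the only API assumed is the parse overload shown in the source above.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import smile.data.AttributeDataset;
import smile.data.parser.microarray.PCLParser;

public class PCLParserExample {
    public static void main(String[] args) throws Exception {
        // Minimal PCL table: header row, EWEIGHT row, then one row per gene.
        // Columns: name, description, GWEIGHT, followed by one column per experiment.
        String pcl =
                  "YORF\tNAME\tGWEIGHT\texp1\texp2\n"
                + "EWEIGHT\t\t\t1\t1\n"
                + "YAL001C\tTFC3\t1\t0.12\t-0.34\n"
                + "YAL002W\tVPS8\t1\t\t0.56\n";   // the empty cell is parsed as Double.NaN

        try (InputStream in = new ByteArrayInputStream(pcl.getBytes(StandardCharsets.UTF_8))) {
            // "toy" is an arbitrary dataset name used for this sketch.
            AttributeDataset data = new PCLParser().parse("toy", in);
            System.out.println("Parsed: " + data);
        }
    }
}

Every row after the EWEIGHT row becomes one Datum whose name, description, and weight are taken from the first three columns, and whose numeric vector holds one value per experiment column.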




