All downloads are free. Search and download functionality uses the official Maven repository.

umcg.genetica.io.geofiles.ParseTextTable Maven / Gradle / Ivy

There is a newer version: 1.0.7
Show newest version
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package umcg.genetica.io.geofiles;

import cern.colt.matrix.tdouble.DoubleMatrix2D;
import cern.colt.matrix.tdouble.impl.DenseDoubleMatrix2D;
import cern.colt.matrix.tdouble.impl.DenseLargeDoubleMatrix2D;
import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.commons.lang.math.NumberUtils;
import umcg.genetica.io.text.TextFile;
import umcg.genetica.math.matrix2.DoubleMatrixDataset;

/**
 * Parser for tab-separated text tables (GEO style) that turns the numeric
 * sample columns of such a table into a {@link DoubleMatrixDataset}.
 *
 * <p>The parser skips a leading block of comment / preamble lines, locates the
 * probe identifier column in the header, keeps every remaining column whose
 * name is not excluded (p-values, detection columns, bead counts, ...) and
 * whose value in the first data row is numeric, and finally loads the kept
 * columns of all well-formed data rows into a dense matrix.</p>
 *
 * @author MarcJan
 */
public class ParseTextTable {

    private static final Pattern SPLIT_ON_TAB = Pattern.compile("\\t");
    /** Matches lines that are empty or contain only white space. */
    private static final Pattern BLANK_LINE = Pattern.compile("^\\s*$");
    /** Matches preamble lines that start with a GSM accession (GSM followed by digits). */
    private static final Pattern GSM_LINE = Pattern.compile("^GSM[0-9]+.*");
    protected static final String ENCODING = "ISO-8859-1";
    static final Logger LOGGER = Logger.getLogger(ParseTextTable.class.getName());

    /**
     * Parses a tab-separated GEO text table into a matrix dataset.
     *
     * <p>Rows whose number of tab-separated fields differs from the header are
     * silently skipped. Values that cannot be parsed as a double are stored as
     * {@code NaN} and a single warning is logged after parsing.</p>
     *
     * @param fileInput path of the table to read
     * @param debug when {@code true}, diagnostic information about rejected
     * columns is printed to standard out
     * @return the dataset holding the selected columns (rows keyed by the
     * identifier column, columns keyed by header name), or {@code null} when
     * the file has no header/data row or no usable numeric column
     * @throws IOException when reading fails, or when a duplicated column or
     * row name is encountered
     */
    // NOTE(review): DoubleMatrixDataset is kept raw because that is how the
    // signature appears in this file; presumably it is generic - confirm and
    // parameterize if so.
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static DoubleMatrixDataset parseGeoTables(String fileInput, boolean debug) throws IOException {
        System.out.println("\tNow parsing file: " + fileInput);

        TextFile in = new TextFile(fileInput, TextFile.R);
        try {
            // rowOffset = number of lines up to and including the header line.
            int rowOffset = 1;
            String str = in.readLine();
            while (str != null && isPreambleLine(str)) {
                str = in.readLine();
                rowOffset++;
            }
            String str2 = (str == null) ? null : in.readLine();
            if (str2 == null) {
                // Empty file, only preamble lines, or a header without any data row.
                LOGGER.warning("No header and/or data row found in file: " + fileInput);
                return null;
            }

            String[] headerData = SPLIT_ON_TAB.split(str);
            String[] nextRowData = SPLIT_ON_TAB.split(str2);

            int columnOffset = findColumnOffset(headerData);
            int tmpCols = headerData.length - columnOffset;

            LinkedHashSet<Integer> desiredColPos = new LinkedHashSet<Integer>();
            LinkedHashMap<String, Integer> colMap = new LinkedHashMap<String, Integer>((int) Math.ceil(tmpCols / 0.75));
            int storedCols = 0;

            for (int s = 0; s < tmpCols; s++) {
                int sourceIndex = s + columnOffset;
                String colName = headerData[sourceIndex];

                if (colMap.containsKey(colName)) {
                    LOGGER.warning("Duplicated column name:" + colName + "! In file: " + fileInput);
                    throw new IOException("Problem with parsing file");
                }
                if (isExcludedColumnName(colName)) {
                    if (debug) {
                        printExclusionDebug(colName);
                    }
                    continue;
                }
                if (nextRowData.length <= sourceIndex) {
                    // The first data row has no value for this column; it cannot be validated.
                    continue;
                }
                if (isNumeric(nextRowData[sourceIndex])) {
                    colMap.put(colName, storedCols);
                    desiredColPos.add(sourceIndex);
                    storedCols++;
                } else if (debug) {
                    System.out.println("In non-numeric, entry: " + nextRowData[sourceIndex]);
                    System.out.println("###############################");
                }
            }

            if (colMap.isEmpty()) {
                if (debug) {
                    System.out.println("#Nothing added for this file: " + fileInput + ". First two rows:");
                    // Guard the index: the first data row may be shorter than the header.
                    System.out.println("Parsing of values oke? " + (nextRowData.length > columnOffset && isNumeric(nextRowData[columnOffset])));
                    System.out.println(str);
                    System.out.println(str2);
                    System.out.println(fileInput);
                }
                return null;
            }

            // Upper bound on the number of data rows: the row already read plus all remaining lines.
            int tmpRows = 1;
            while (in.readLine() != null) {
                tmpRows++;
            }
            in.close();

            double[][] initialMatrix = new double[tmpRows][storedCols];

            // Unbox the selected column positions once instead of once per data row.
            int[] sourceColumns = new int[storedCols];
            int next = 0;
            for (Integer pos : desiredColPos) {
                sourceColumns[next++] = pos;
            }

            // Second pass: re-open the file and skip the preamble plus the header.
            in.open();
            for (int i = 0; i < rowOffset; ++i) {
                in.readLine();
            }

            LinkedHashMap<String, Integer> rowMap = new LinkedHashMap<String, Integer>((int) Math.ceil(tmpRows / 0.75));
            int row = 0;
            boolean correctData = true;

            while ((str = in.readLine()) != null) {
                String[] data = SPLIT_ON_TAB.split(str);
                if (data.length != headerData.length) {
                    // Malformed row (field count differs from the header): skip it.
                    continue;
                }
                String rowName = data[columnOffset - 1];
                if (rowMap.containsKey(rowName)) {
                    LOGGER.warning("Duplicated row name: " + rowName);
                    System.out.println(str);
                    throw new IOException("Problem in reading file.");
                }
                rowMap.put(rowName, row);
                double[] target = initialMatrix[row];
                for (int c = 0; c < sourceColumns.length; c++) {
                    double d;
                    try {
                        d = Double.parseDouble(data[sourceColumns[c]]);
                    } catch (NumberFormatException e) {
                        correctData = false;
                        d = Double.NaN;
                    }
                    target[c] = d;
                }
                row++;
            }
            if (!correctData) {
                LOGGER.warning("Your data contains NaN/unparseable values!");
            }

            // Skipped rows would otherwise remain as unnamed all-zero rows at the
            // end of the matrix, so keep only the rows that were actually filled.
            double[][] values = initialMatrix;
            if (row != tmpRows) {
                values = new double[row][];
                System.arraycopy(initialMatrix, 0, values, 0, row);
            }

            // Use the stored dimensions and long arithmetic: the cell count may exceed the int range.
            DoubleMatrix2D mat;
            if (((long) row * (long) storedCols) < (Integer.MAX_VALUE - 2)) {
                mat = new DenseDoubleMatrix2D(row, storedCols);
            } else {
                mat = new DenseLargeDoubleMatrix2D(row, storedCols);
            }
            mat.assign(values);

            DoubleMatrixDataset dataset = new DoubleMatrixDataset(mat, rowMap, colMap);

            LOGGER.log(Level.INFO, "''{0}'' has been loaded, nrRows: {1} nrCols: {2}", new Object[]{fileInput, dataset.rows(), dataset.columns()});
            return dataset;
        } finally {
            // Release the file on every exit path (early return, exception, success).
            in.close();
        }
    }

    /**
     * Tells whether a line belongs to the comment / preamble block that
     * precedes the header row.
     */
    private static boolean isPreambleLine(String line) {
        return line.startsWith("#")
                || line.startsWith("\"#")
                || BLANK_LINE.matcher(line).matches()
                || line.startsWith("This is our raw data.")
                || GSM_LINE.matcher(line).matches()
                || line.startsWith("Illumina Inc. GenomeStudio")
                || line.startsWith("Array Content =")
                || line.startsWith("Normalization =")
                || line.startsWith("log");
    }

    /**
     * Returns the index of the first data column: one past the first header
     * field that looks like a probe / reference id or an array address, or 1
     * when no such field is found.
     */
    private static int findColumnOffset(String[] headerData) {
        for (int s = 0; s < headerData.length; s++) {
            // Locale.ROOT keeps the match independent of the default locale (e.g. Turkish dotless i).
            String lower = headerData[s].toLowerCase(Locale.ROOT);
            boolean idColumn = lower.contains("id") && (lower.contains("probe") || lower.contains("ref"));
            boolean addressColumn = lower.contains("array") && lower.contains("address");
            if (idColumn || addressColumn) {
                return s + 1;
            }
        }
        return 1;
    }

    /**
     * Tells whether a header name denotes a column that must not be loaded
     * (empty name, identifier column, p-value / detection column, array or
     * bead information).
     */
    private static boolean isExcludedColumnName(String colName) {
        if (colName.equals("")
                || colName.equalsIgnoreCase("Target ID")
                || colName.equalsIgnoreCase("TargetID")
                || colName.equalsIgnoreCase("Probe ID")) {
            return true;
        }
        String lower = colName.toLowerCase(Locale.ROOT);
        return lower.contains(".p=")
                || lower.contains("pval")
                || lower.contains("detection")
                || lower.contains("p-val")
                || lower.contains("array")
                || lower.contains("bead");
    }

    /** Prints, for debugging, which exclusion rules a rejected column name matches. */
    private static void printExclusionDebug(String colName) {
        String lower = colName.toLowerCase(Locale.ROOT);
        System.out.println("Empthy colname:" + colName.equals(""));
        System.out.println("Colname contains \"target id\":" + colName.equalsIgnoreCase("Target ID"));
        System.out.println("Colname contains \"probe id\":" + colName.equalsIgnoreCase("Probe ID"));
        System.out.println("Colname contains \".p\":" + lower.contains(".p="));
        System.out.println("Colname contains \"pval\":" + lower.contains("pval"));
        System.out.println("Colname contains \"p-val\":" + lower.contains("p-val"));
        System.out.println("Colname contains \"detection\":" + lower.contains("detection"));
        System.out.println("Colname contains \"array\":" + lower.contains("array"));
        System.out.println("Colname contains \"bead\":" + lower.contains("bead"));
        System.out.println("###############################");
    }

    /**
     * Tells whether the given text is a number according to
     * {@link NumberUtils#isNumber(String)}.
     *
     * @param str the text to test
     * @return {@code true} when {@code NumberUtils.isNumber} accepts the text
     */
    public static boolean isNumeric(String str) {
        return NumberUtils.isNumber(str);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy