All downloads are free. Search and download functionality uses the official Maven repository.

org.opencb.cellbase.build.transform.ConservedRegionParser Maven / Gradle / Ivy

The newest version!
package org.opencb.cellbase.build.transform;

import org.opencb.cellbase.core.common.ConservedRegionChunk;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.*;
import java.util.*;
import java.util.zip.GZIPInputStream;

/**
 * Parses conservation scores stored as gzipped {@code fixedStep} wiggle files and hands them to
 * the serializer as {@link ConservedRegionChunk} objects.
 *
 * <p>The input directory is expected to contain two sub-folders, {@code phastCons} and
 * {@code phylop}. The chromosome of each file is taken from the part of the file name before the
 * first dot, with any {@code chr} text removed.
 *
 * <p>Download data:
 * <pre>
 * for i in `seq 1 22`; do wget ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/phastCons46way/primates/chr$i.phastCons46way.primates.wigFix.gz; done
 * ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/phyloP46way/primates/
 * </pre>
 */
public class ConservedRegionParser extends CellBaseParser {

    /** Number of positions per chunk; a chunk id is {@code start / CHUNKSIZE}. */
    private static final int CHUNKSIZE = 2000;

    private static final String PHASTCONS = "phastCons";
    private static final String PHYLOP = "phylop";

    private final Logger logger;
    private final Path conservedRegionPath;
    // NOTE(review): this value is stored but never read; chunking always uses CHUNKSIZE.
    // Confirm whether the constructor argument was meant to override the constant.
    private final int chunksize;

    /**
     * @param conservedRegionPath directory containing the {@code phastCons} and {@code phylop} folders
     * @param chunksize           requested chunk size (currently stored only, see field note)
     * @param serializer          destination for the generated chunks
     */
    public ConservedRegionParser(Path conservedRegionPath, int chunksize, CellBaseSerializer serializer) {
        super(serializer);
        this.conservedRegionPath = conservedRegionPath;
        this.chunksize = chunksize;
        logger = LoggerFactory.getLogger(ConservedRegionParser.class);
    }

    /**
     * Lists the files of both score folders and processes, for every chromosome found, first the
     * phastCons file and then the phylop file. A chromosome that is missing one of the two files
     * is processed for the file that exists; the missing one is reported with a warning.
     *
     * @throws IOException if a folder cannot be listed or a file cannot be read
     */
    @Override
    public void parse() throws IOException {
        Map<String, Path> files = new HashMap<>();
        Set<String> chromosomes = new HashSet<>();

        collectFiles(PHASTCONS, files, chromosomes);
        collectFiles(PHYLOP, files, chromosomes);

        // Now we can iterate over all the chromosomes found and process the files
        logger.debug("Chromosomes found {}", chromosomes.toString());
        for (String chr : chromosomes) {
            processIfPresent(chr, files.get(chr + PHASTCONS), PHASTCONS);
            processIfPresent(chr, files.get(chr + PHYLOP), PHYLOP);
        }
    }

    /**
     * Registers every file of {@code folder} under the key {@code chromosome + folder} and records
     * its chromosome. The directory stream is closed before returning.
     */
    private void collectFiles(String folder, Map<String, Path> files, Set<String> chromosomes) throws IOException {
        try (DirectoryStream<Path> directoryStream = Files.newDirectoryStream(conservedRegionPath.resolve(folder))) {
            for (Path path : directoryStream) {
                String chromosome = path.getFileName().toString().split("\\.")[0].replace("chr", "");
                chromosomes.add(chromosome);
                files.put(chromosome + folder, path);
            }
        }
    }

    /** Processes {@code file} when it exists; otherwise logs a warning instead of failing on null. */
    private void processIfPresent(String chr, Path file, String conservedType) throws IOException {
        if (file == null) {
            logger.warn("No {} file found for chromosome {}, skipping", conservedType, chr);
            return;
        }
        logger.debug("Processing chromosome {}, file {}", chr, file);
        processFile(file, conservedType);
    }

    /**
     * Reads one gzipped fixedStep file and serializes its values in chunks.
     *
     * <p>Each data line is treated as the score of the next consecutive position, i.e. a step of 1
     * is assumed (the {@code step} attribute of the header is not interpreted). A chunk is closed
     * when the next position falls into a different {@code position / CHUNKSIZE} bucket than the
     * chunk start, when a new {@code fixedStep} header begins, or at end of file. Every emitted
     * chunk holds exactly {@code end - start + 1} values; empty chunks are not emitted.
     *
     * @param inGzPath      gzipped wiggle file
     * @param conservedType label stored in every chunk (the folder name)
     * @throws IOException if the file cannot be read or a header lacks {@code chrom}/{@code start}
     */
    private void processFile(Path inGzPath, String conservedType) throws IOException {
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new GZIPInputStream(Files.newInputStream(inGzPath))))) {

            String chromosome = "";
            int start = 0;
            // Inclusive position of the last value stored in 'values'.
            int end = -1;
            List<Float> values = new ArrayList<>(CHUNKSIZE);

            String line;
            while ((line = br.readLine()) != null) {
                if (line.startsWith("fixedStep")) {
                    // New group: save whatever the previous group left pending, even when that
                    // group never reached a chunk boundary.
                    if (!values.isEmpty()) {
                        serializeChunk(chromosome, start, end, conservedType, values);
                    }

                    Map<String, String> attributes = parseFixedStepHeader(line);
                    chromosome = attributes.get("chrom").replace("chr", "");
                    start = Integer.parseInt(attributes.get("start"));
                    end = start - 1;
                    values = new ArrayList<>(CHUNKSIZE);
                    continue;
                }

                String text = line.trim();
                if (text.isEmpty()) {
                    continue;
                }

                int position = end + 1;
                if (position / CHUNKSIZE != start / CHUNKSIZE) {
                    serializeChunk(chromosome, start, end, conservedType, values);
                    // Fresh list so the chunk just handed over is never modified afterwards.
                    values = new ArrayList<>(CHUNKSIZE);
                    start = position;
                }

                values.add(Float.parseFloat(text));
                end = position;
            }

            // write last
            if (!values.isEmpty()) {
                serializeChunk(chromosome, start, end, conservedType, values);
            }
        }
    }

    /** Builds a chunk for the inclusive range {@code [start, end]} and passes it to the serializer. */
    private void serializeChunk(String chromosome, int start, int end, String conservedType, List<Float> values)
            throws IOException {
        ConservedRegionChunk conservedRegion =
                new ConservedRegionChunk(chromosome, start, end, conservedType, start / CHUNKSIZE, values);
        serializer.serialize(conservedRegion);
    }

    /**
     * Splits a {@code fixedStep key=value ...} header into a map with lower-cased keys. Tokens
     * without a {@code key=value} shape are ignored.
     *
     * @throws IOException if the header has no {@code chrom} or no {@code start} attribute
     */
    private static Map<String, String> parseFixedStepHeader(String line) throws IOException {
        Map<String, String> attributes = new HashMap<>();
        for (String attrField : line.split(" ")) {
            int separator = attrField.indexOf('=');
            if (separator > 0 && separator < attrField.length() - 1) {
                String[] attrKeyValue = attrField.split("=");
                attributes.put(attrKeyValue[0].toLowerCase(Locale.ROOT), attrKeyValue[1]);
            }
        }
        if (!attributes.containsKey("chrom") || !attributes.containsKey("start")) {
            throw new IOException("Malformed fixedStep header, 'chrom' and 'start' are required: " + line);
        }
        return attributes;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy