All downloads are free. The search and download functionality uses the official Maven repository.

org.opencb.cellbase.build.transform.WigParser Maven / Gradle / Ivy

The newest version!
package org.opencb.cellbase.build.transform;

import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.opencb.cellbase.build.transform.formats.ConservedRegionFeature;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

/**
 * Reads gzip-compressed wig files in {@code fixedStep} format and serializes their
 * values grouped into fixed-size chunks, one {@link ConservedRegionFeature} per chunk.
 *
 * <p>The files are looked up in the sub-folder named after {@code type} inside the
 * folder given to the constructor. Each file is expected to contain blocks introduced
 * by a header line such as {@code fixedStep chrom=chr1 start=10001 step=1}, followed
 * by one numeric value per line.
 *
 * <p>NOTE(review): the {@code step} value of the header is captured by the pattern but
 * never used; every data line advances the position by exactly one. This matches files
 * written with {@code step=1} - confirm before feeding files with another step.
 *
 * @author lcruz
 * @since 03/11/2014
 */
public class WigParser extends CellBaseParser{
    /** Header of a fixedStep block; groups: chromosome (without "chr"), start, step. */
    private static final Pattern FIXED_STEP_HEADER = Pattern.compile(".*chrom=chr(.*) start=(.*) step=(.*)");

    private final Path wigsFolder;
    private final int chunkSize;
    private final String type;

    /**
     * @param conservationFilesFolder folder that contains one sub-folder per conservation type
     * @param chunkSize number of positions stored in every serialized chunk; must be positive
     * @param type name of the sub-folder to read, also used as the source name of each chunk
     * @param serializer serializer that receives every finished chunk
     * @throws IllegalArgumentException if {@code chunkSize} is not positive
     */
    public WigParser(Path conservationFilesFolder, int chunkSize, String type, CellBaseSerializer serializer){
        super(serializer);
        if (chunkSize <= 0) {
            // A non-positive size would otherwise only fail later with a division by zero.
            throw new IllegalArgumentException("chunkSize must be positive, but was " + chunkSize);
        }
        this.wigsFolder = conservationFilesFolder;
        this.chunkSize = chunkSize;
        this.type = type;
    }

    /**
     * Processes every file found in the {@code type} sub-folder of the wig folder.
     * Does nothing when the wig folder itself does not exist.
     *
     * <p>The chromosome of a file is taken from its name: the text before the first
     * dot, with every occurrence of "chr" removed. When several files yield the same
     * chromosome only one of them is processed.
     *
     * @throws Exception if the sub-folder cannot be listed or a file cannot be processed
     */
    public void parse() throws Exception {
        if (!Files.exists(wigsFolder)) {
            return;
        }

        // Sorted map so that files are processed in a deterministic order.
        Map<String, Path> filesByChromosome = new TreeMap<>();

        // Reading all files in conservation folder; the stream holds an open directory
        // handle and must be closed.
        try (DirectoryStream<Path> directoryStream = Files.newDirectoryStream(wigsFolder.resolve(type))) {
            for (Path wigFile : directoryStream) {
                String chromosome = wigFile.getFileName().toString().split("\\.")[0].replace("chr", "");
                filesByChromosome.put(chromosome, wigFile);
            }
        }

        logger.debug("Chromosomes found {}", filesByChromosome.keySet().toString());
        for (Map.Entry<String, Path> entry : filesByChromosome.entrySet()) {
            logger.debug("Processing chromosome {}, file {}", entry.getKey(), entry.getValue());
            processFile(entry.getValue());
        }
    }

    /**
     * Reads one gzip-compressed fixedStep wig file and serializes its values chunk by chunk.
     *
     * <p>A chunk covers positions {@code [n * chunkSize, (n + 1) * chunkSize - 1]}. It is
     * serialized as soon as a value for a different chunk or a different chromosome
     * shows up, and once more at the end of the file for the last open chunk.
     * Positions of a chunk that receive no value stay {@code null}.
     *
     * @param inputFilePath gzip-compressed wig file to read
     * @throws IOException if the file cannot be read, or a value appears before any
     *                     fixedStep header
     */
    private void processFile(Path inputFilePath) throws IOException {
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new GZIPInputStream(Files.newInputStream(inputFilePath))))) {
            String line;
            boolean headerSeen = false;
            int position = -1;
            int previousChunk = -1;
            String chunkChromosome = "";
            ConservedRegionFeature currentRegion = null;

            while ((line = br.readLine()) != null) {
                if (line.startsWith("fixedStep")) {
                    // set position to one before of the start position
                    position = setNewPositionAfterJump(line);
                    String headerChromosome = setChromosomeAfterJump(line);
                    // The chromosome may change inside one file. Close the open chunk so
                    // that values of the new chromosome never land in a chunk of the old
                    // one, even when both share the same chunk index.
                    if (currentRegion != null && !headerChromosome.equals(chunkChromosome)) {
                        serializer.serialize(currentRegion);
                        currentRegion = null;
                    }
                    chunkChromosome = headerChromosome;
                    headerSeen = true;
                    continue;
                }

                String value = line.trim();
                if (value.isEmpty()) {
                    // Blank lines carry no value and must not advance the position.
                    continue;
                }
                if (!headerSeen) {
                    throw new IOException("Value found before any fixedStep header in " + inputFilePath
                            + ": '" + line + "'");
                }

                position++;
                int positionChunk = position / chunkSize;
                // if position belong to a new chunk print the position before and initialize a new one
                if (currentRegion == null || positionChunk != previousChunk) {
                    if (currentRegion != null) {
                        serializer.serialize(currentRegion);
                    }
                    previousChunk = positionChunk;

                    int start = positionChunk * chunkSize;
                    int end = start + chunkSize - 1;
                    currentRegion = new ConservedRegionFeature(chunkChromosome, start, end, positionChunk);

                    // Fixed-size list, pre-filled with null, one slot per position of the chunk.
                    Float[] values = new Float[chunkSize];
                    currentRegion.addSource(type, Arrays.asList(values));
                }
                currentRegion.getSources().get(0).getValues().set(position % chunkSize, Float.parseFloat(value));
            }

            // Flush the last open chunk; there is none when the file held no values.
            if (currentRegion != null) {
                serializer.serialize(currentRegion);
            }
        }
    }

    /**
     * Extracts the chromosome name (without the "chr" prefix) from a fixedStep header.
     *
     * @param line fixedStep header line
     * @return chromosome captured by the header pattern
     * @throws IllegalArgumentException if the line does not match the header pattern
     */
    private String setChromosomeAfterJump(String line) {
        return matchFixedStepHeader(line).group(1);
    }

    /**
     * Extracts the start position from a fixedStep header and returns the position just
     * before it, because the position is incremented before each value is stored.
     *
     * @param line fixedStep header line
     * @return start position of the block minus one
     * @throws IllegalArgumentException if the line does not match the header pattern
     * @throws NumberFormatException if the start value is not an integer
     */
    private Integer setNewPositionAfterJump(String line) {
        return Integer.parseInt(matchFixedStepHeader(line).group(2))-1;
    }

    /** Matches a header line against the fixedStep pattern, failing with a clear message. */
    private Matcher matchFixedStepHeader(String line) {
        Matcher matcher = FIXED_STEP_HEADER.matcher(line);
        if (!matcher.matches()) {
            throw new IllegalArgumentException("Malformed fixedStep header: '" + line + "'");
        }
        return matcher;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy