All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.dna.snp.io.ReadBedfile Maven / Gradle / Ivy

/*
 *  ReadBedfile
 *
 *  Created on Feb 15, 2017
 */
package net.maizegenetics.dna.snp.io;

import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeMap;
import com.google.common.collect.TreeRangeSet;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.map.PositionList;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;

import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import static java.util.stream.Collectors.collectingAndThen;

/**
 * Static helpers that read a BED file and expose its records as
 * {@link BedFileRange}s, a {@link PositionList}, or Guava range collections.
 *
 * <p>Every method goes through {@link #getRanges(String)}, which adds 1 to the
 * start and end columns read from the file. All coordinates handed out by this
 * class are therefore shifted up by one relative to the file contents.
 *
 * @author Terry Casstevens
 */
public class ReadBedfile {

    private static final Logger myLogger = Logger.getLogger(ReadBedfile.class);

    private ReadBedfile() {
        // utility
    }

    /**
     * Parses the BED file and creates a List of BedFileRanges, one per record, in file order.
     *
     * The positions stored in BedFileRange are 1-based inclusive exclusive.
     * This is done by adding 1 to both the start and end position from the BED file.
     *
     * Blank lines and comment lines (first non-whitespace character is '#') are skipped.
     * Columns must be tab separated; the optional fourth column is used as the range name
     * (null when absent or empty).
     *
     * @param bedFile name of the BED file; opened with Utils.getBufferedReader()
     * @return list of ranges in the order they appear in the file
     * @throws IllegalStateException if the file cannot be read or a record is malformed
     *         (fewer than 3 columns, non-integer start / end). The original failure is
     *         attached as the cause.
     */
    public static List<BedFileRange> getRanges(String bedFile) {

        List<BedFileRange> result = new ArrayList<>();

        // Declared outside the try so the catch block can report the offending line.
        String line = null;
        try (BufferedReader reader = Utils.getBufferedReader(bedFile)) {
            int lineNum = 0;
            while ((line = reader.readLine()) != null) {
                lineNum++;

                String trimmed = line.trim();
                if (trimmed.isEmpty() || trimmed.startsWith("#")) {
                    // Blank lines and '#' comments carry no record.
                    continue;
                }

                String[] tokens = trimmed.split("\t");
                if (tokens.length < 3) {
                    throw new IllegalStateException("getRanges: Expecting at least 3 columns on line: " + lineNum);
                }

                // tokens[0] is chromosome
                // tokens[1] is start position from bed file.
                // plus one because bed files are 0-base
                int startPos = Integer.parseInt(tokens[1]) + 1;

                // tokens[2] is end position from bed file.
                // plus one because bed files are 0-base
                int endPos = Integer.parseInt(tokens[2]) + 1;

                // tokens[3] is name from bed file (optional)
                String name = null;
                if (tokens.length > 3 && !tokens[3].isEmpty()) {
                    name = tokens[3];
                }

                result.add(new BedFileRange(tokens[0], startPos, endPos, name));
            }
        } catch (Exception e) {
            myLogger.debug(e.getMessage(), e);
            // Chain the cause so the column-count / number-format detail is not lost.
            throw new IllegalStateException("getRanges: problem reading: " + bedFile + " line: " + line, e);
        }

        return result;

    }

    /**
     * Gets position list from specified bed file. Every position from each
     * range's start (inclusive) up to its end (exclusive), as returned by
     * {@link #getRanges(String)}, is added to the list.
     *
     * @param bedfile name of the BED file
     * @return position list
     */
    public static PositionList getPositionList(String bedfile) {
        PositionListBuilder builder = new PositionListBuilder();
        for (BedFileRange range : getRanges(bedfile)) {
            for (int pos = range.start(); pos < range.end(); pos++) {
                builder.add(Position.of(range.chr(), pos));
            }
        }
        return builder.build();
    }

    /**
     * Function that returns the 1-based Position ranges from a BED file as a RangeSet of Positions.
     * NOTE: getRanges(bedFile) will be called which will shift the start and end positions in the BED file up by 1.
     *       Because of this the ranges returned will be 1-based Closed-Open(Inclusive-Exclusive).
     *       This is NOT returning ranges in BED specification(0-based Inclusive-Exclusive).
     *
     * @param bedfile name of the BED file
     * @return range set built from one closed-open range per BED record
     */
    public static RangeSet<Position> getRangesAsPositions(String bedfile) {
        return getRanges(bedfile).stream()
                //This needs to be closedOpen as we retain inclusive-exclusive.
                //Also using bedFileRange.myChr instead of bedFileRange.myChrInt as converting a String -> Int -> String is lossy when the string is non-numeric.
                .map(bedFileRange -> Range.closedOpen(Position.of(bedFileRange.myChr, bedFileRange.myStartPos),
                        Position.of(bedFileRange.myChr, bedFileRange.myEndPos)))
                .collect(collectingAndThen(Collectors.toSet(), TreeRangeSet::create));
    }

    /**
     * Function that returns the 1-based closed Position ranges from a BED file as a RangeSet of Positions.
     * NOTE: getRanges(bedFile) will be called which will shift the start and end in the BED file up by 1.
     *       The exclusive end is then reduced by 1 again, so the ranges returned will be
     *       1-based Closed(Inclusive-Inclusive).
     *       This is NOT returning ranges in BED specification(0-based Inclusive-Exclusive).
     *
     * Records whose start equals their end cover no positions and are skipped, because they
     * cannot be expressed as a closed range.
     *
     * @param bedfile name of the BED file
     * @return range set built from one closed range per non-empty BED record
     */
    public static RangeSet<Position> getClosedRangesAsPositions(String bedfile) {
        return getRanges(bedfile).stream()
                //A zero-length record would give Range.closed(start, start - 1), which Range.closed rejects.
                .filter(bedFileRange -> bedFileRange.myStartPos != bedFileRange.myEndPos)
                //Closed range: the inclusive upper bound is the exclusive end minus one.
                //Also using bedFileRange.myChr instead of bedFileRange.myChrInt as converting a String -> Int -> String is lossy when the string is non-numeric.
                .map(bedFileRange -> Range.closed(Position.of(bedFileRange.myChr, bedFileRange.myStartPos),
                        Position.of(bedFileRange.myChr, bedFileRange.myEndPos - 1)))
                .collect(collectingAndThen(Collectors.toSet(), TreeRangeSet::create));
    }

    /**
     * Function that returns the 1-based Position ranges from a BED file as a RangeMap of Positions to the annotated name of the region.
     * NOTE: getRanges(bedFile) will be called which will shift the start and end positions in the BED file up by 1.
     *       Because of this the ranges returned will be 1-based Closed-Open(Inclusive-Exclusive).
     *       This is NOT returning ranges in BED specification(0-based Inclusive-Exclusive).
     *
     * NOTE(review): records without a name column have a null name here; Guava's TreeRangeMap
     *       presumably rejects null values, so such files likely fail in this method - confirm.
     *
     * @param bedfile name of the BED file
     * @return range map from each record's closed-open range to its name
     */
    public static RangeMap<Position, String> getRangesAsPositionMap(String bedfile) {
        TreeRangeMap<Position, String> positionNameRangeMap = TreeRangeMap.create();
        for (BedFileRange bedFileRange : getRanges(bedfile)) {
            //This needs to be closedOpen as we retain inclusive-exclusive.
            //Also using bedFileRange.myChr instead of bedFileRange.myChrInt as converting a String -> Int -> String is lossy when the string is non-numeric.
            positionNameRangeMap.put(Range.closedOpen(Position.of(bedFileRange.myChr, bedFileRange.myStartPos),
                    Position.of(bedFileRange.myChr, bedFileRange.myEndPos)),
                    bedFileRange.myName);
        }
        return positionNameRangeMap;
    }

    /**
     * One record of a BED file: chromosome, start (inclusive), end (exclusive)
     * and an optional name. Instances are immutable.
     *
     * <p>The natural ordering compares chromosome, then start, then end; the
     * name does not take part in the comparison.
     */
    public static class BedFileRange implements Comparable<BedFileRange> {

        private final String myChr;
        // Chromosome parsed as an int, or -1 when the chromosome is not an integer.
        private final int myChrInt;
        private final int myStartPos;
        private final int myEndPos;
        private final String myName;

        /**
         * @param chr chromosome name as it appears in the file
         * @param startPos start position (inclusive)
         * @param endPos end position (exclusive)
         * @param name name of the range, may be null
         */
        public BedFileRange(String chr, int startPos, int endPos, String name) {
            myChr = chr;
            int temp;
            try {
                temp = Integer.parseInt(chr);
            } catch (NumberFormatException e) {
                // Non-numeric chromosome (e.g. "X", "chr1"); compared as a string instead.
                temp = -1;
            }
            myChrInt = temp;
            myStartPos = startPos;
            myEndPos = endPos;
            myName = name;
        }

        /**
         * Return chromosome
         *
         * @return chromosome
         */
        public String chr() {
            return myChr;
        }

        /**
         * Returns start position (inclusive)
         *
         * @return start position
         */
        public int start() {
            return myStartPos;
        }

        /**
         * Returns end position (exclusive)
         *
         * @return end position
         */
        public int end() {
            return myEndPos;
        }

        /**
         * Returns the name of this range
         *
         * @return name, or null if none was given
         */
        public String name() {
            return myName;
        }

        /**
         * Orders by chromosome, then start position, then end position.
         *
         * <p>Chromosomes that parse as integers are compared numerically and sort
         * before chromosomes that do not; two non-integer chromosomes are compared
         * as strings. Treating the mixed case explicitly keeps the ordering
         * antisymmetric (a.compareTo(b) and b.compareTo(a) have opposite signs).
         */
        @Override
        public int compareTo(BedFileRange o) {

            boolean thisNumeric = myChrInt != -1;
            boolean otherNumeric = o.myChrInt != -1;

            int chrOrder;
            if (thisNumeric && otherNumeric) {
                chrOrder = Integer.compare(myChrInt, o.myChrInt);
            } else if (thisNumeric != otherNumeric) {
                // Exactly one side is numeric: numeric chromosomes come first.
                chrOrder = thisNumeric ? -1 : 1;
            } else {
                chrOrder = myChr.compareTo(o.myChr);
            }
            if (chrOrder != 0) {
                return chrOrder;
            }

            int startOrder = Integer.compare(myStartPos, o.myStartPos);
            if (startOrder != 0) {
                return startOrder;
            }

            return Integer.compare(myEndPos, o.myEndPos);

        }

    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy