All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.nbio.genome.util.ChromosomeMappingTools Maven / Gradle / Ivy

package org.biojava.nbio.genome.util;

import com.google.common.collect.Range;

import org.biojava.nbio.genome.parsers.genename.ChromPos;
import org.biojava.nbio.genome.parsers.genename.GeneChromosomePosition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

/**
 *  A class that can map chromosomal positions to mRNA (coding sequence) positions.
 *
 *  @author Andreas Prlic
 */

public class ChromosomeMappingTools {

    private static final Logger logger = LoggerFactory.getLogger(ChromosomeMappingTools.class);


    private static final String newline = System.getProperty("line.separator");


    public static final String CHROMOSOME = "CHROMOSOME";
    public static final String CDS = "CDS";


    /** Pretty print the details of a GeneChromosomePosition to a String
     *
     * @param chromosomePosition
     * @return
     */
    public static String formatExonStructure(GeneChromosomePosition chromosomePosition ){
        if ( chromosomePosition.getOrientation() == '+')
            return formatExonStructureForward(chromosomePosition);

        return formatExonStructureReverse(chromosomePosition);

    }


    private static String formatExonStructureForward(GeneChromosomePosition chromPos) {

        StringWriter s = new StringWriter();

        List exonStarts = chromPos.getExonStarts();
        List exonEnds   = chromPos.getExonEnds();

        int cdsStart = chromPos.getCdsStart();
        int cdsEnd   = chromPos.getCdsEnd();

        boolean inCoding = false;
        int codingLength = 0;

        for (int i = 0; i < exonStarts.size(); i++) {

            int start = exonStarts.get(i);
            int end = exonEnds.get(i);

            if (start <= cdsStart +1 && end >= cdsStart+1) {

                inCoding = true;
                codingLength += (end - cdsStart);
                s.append("     UTR         : ").append(format(start)).append(" - ").append(format(cdsStart));
                s.append(newline);
                s.append(" ->  Exon        : ").append(format(cdsStart + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - cdsStart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
                s.append(newline);

            } else if (start <= cdsEnd && end >= cdsEnd) {
                //logger.debug(" <-- CDS end at: " + cdsEnd );
                inCoding = false;
                codingLength += (cdsEnd - start);

                s.append(" <-  Exon        : ").append(format(start + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
                s.append(newline);
                s.append("     UTR         : " + (cdsEnd +1) + " - " + format(end));
                s.append(newline);


            } else if (inCoding) {
                // full exon is coding
                codingLength += (end - start);

                s.append("     Exon        : ").append(format(start + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
                s.append(newline);
            }

        }
        s.append("Coding Length: ");
        s.append((codingLength-3)+"");
        s.append(newline);
        return s.toString();
    }



    private static String showGenePosLink(GeneChromosomePosition chromPos, Integer pos ) {

        String spos = format(pos);

        StringBuffer buf = new StringBuffer();
        buf.append("");
        buf.append(spos);
        buf.append("");

        return buf.toString();
    }


    private static String formatExonStructureReverse(GeneChromosomePosition chromPos) {
        StringWriter s = new StringWriter();

        List exonStarts = chromPos.getExonStarts();
        List exonEnds   = chromPos.getExonEnds();


        int cdsStart = chromPos.getCdsStart();
        int cdsEnd   = chromPos.getCdsEnd();

        // logger.debug("CDS START:" +format(cdsStart) + " - " + format(cdsEnd));

        boolean inCoding = false;
        int codingLength = 0;

        if (cdsEnd < cdsStart) {
            int tmp = cdsEnd;
            cdsEnd = cdsStart;
            cdsStart = tmp;
        }

        // map reverse
        for (int i = exonStarts.size() - 1; i >= 0; i--) {

            int end = exonStarts.get(i);
            int start = exonEnds.get(i);

            if (end < start) {
                int tmp = end;
                end = start;
                start = tmp;
            }

            if (start <= cdsEnd && end >= cdsEnd) {
                inCoding = true;

                int tmpstart = start;
                if (start < cdsStart) {
                    tmpstart = cdsStart;
                }
                codingLength += (cdsEnd - tmpstart);

                s.append("     UTR         :" + format(cdsEnd + 1) + " | " + format(end));
                s.append(newline);
                if (tmpstart == start)
                    s.append(" ->  ");
                else
                    s.append(" <-> ");
                s.append("Exon        :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - tmpstart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
                s.append(newline);
                // single exon with UTR on both ends
                if (tmpstart != start)
                    s.append("     UTR         :" + format(cdsStart ) + " - " + format(start + 1));
                s.append(newline);

            } else if (start <= cdsStart && end >= cdsStart) {
                inCoding = false;
                codingLength += (end - cdsStart);

                s.append(" <-  Exon        : " + format(cdsStart+1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
                s.append(newline);
                s.append("     UTR         : " + format(start+1) + " - " + format(cdsStart ));
                s.append(newline);


            } else if (inCoding) {
                // full exon is coding
                codingLength += (end - start);

                s.append("     Exon        : " + format(start+1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
                s.append(newline);
            } else {
                // e.g. see UBQLN3
                s.append(" no translation! UTR: ").append(format(start)).append(" - ").append(format(end));
                s.append(newline);
            }
        }

        s.append("CDS length: ").append(Integer.toString(codingLength - 3));
        s.append(newline);

        return s.toString();
    }

    /**
     * Get the length of the CDS in nucleotides.
     *
     * @param chromPos
     * @return length of the CDS in nucleotides.
     */
    public static int getCDSLength(GeneChromosomePosition chromPos) {

        logger.debug(chromPos.toString());

        logger.debug("chromosomal information: ");

        logger.debug("Gene:" + chromPos.getGeneName());
        logger.debug("  Transcription (including UTRs): " + chromPos.getTranscriptionStart() + " - " + chromPos.getTranscriptionEnd() + " (length:" + (chromPos.getTranscriptionEnd() - chromPos.getTranscriptionStart()) + ")");
        logger.debug("  Orientation: " + chromPos.getOrientation());
        logger.debug("  CDS: " + (chromPos.getCdsStart()) + " - " + chromPos.getCdsEnd() + " (length: " + (chromPos.getCdsEnd() - chromPos.getCdsStart()) + ")");


        List exonStarts = chromPos.getExonStarts();
        List exonEnds = chromPos.getExonEnds();

        logger.debug("Exons:" + exonStarts.size());

        int cdsStart = chromPos.getCdsStart();
        int cdsEnd = chromPos.getCdsEnd();


        int codingLength;
        if (chromPos.getOrientation().equals('+'))
            codingLength = ChromosomeMappingTools.getCDSLengthForward(exonStarts, exonEnds, cdsStart, cdsEnd);
        else
            codingLength = ChromosomeMappingTools.getCDSLengthReverse(exonStarts, exonEnds, cdsStart, cdsEnd);
        return codingLength;
    }


    /**
     * maps the position of a CDS nucleotide back to the genome
     *
     * @param cdsNucleotidePosition
     * @return a ChromPos object
     */
    public static ChromPos getChromosomePosForCDScoordinate(int cdsNucleotidePosition, GeneChromosomePosition chromPos) {

        logger.debug(" ? Checking chromosome position for CDS position " + cdsNucleotidePosition);

        List exonStarts = chromPos.getExonStarts();
        List exonEnds = chromPos.getExonEnds();

        logger.debug(" Exons:" + exonStarts.size());

        int cdsStart = chromPos.getCdsStart();
        int cdsEnd = chromPos.getCdsEnd();


        ChromPos chromosomePos = null;

        if (chromPos.getOrientation().equals('+'))

            chromosomePos = ChromosomeMappingTools.getChromPosForward(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd);
        else
            chromosomePos = ChromosomeMappingTools.getChromPosReverse(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd);

        logger.debug("=> CDS pos " + cdsNucleotidePosition + " for " + chromPos.getGeneName() + " is on chromosome at  " + chromosomePos);
        return chromosomePos;

    }

    /** returns a nicely formatted representation of the position
     *
     * @param chromosomePosition
     * @return
     */
    private static String format(int chromosomePosition){


        return String.format("%,d", chromosomePosition);
    }

    /**
     * Get the CDS position mapped on the chromosome position
     *
     * @param exonStarts
     * @param exonEnds
     * @param cdsStart
     * @param cdsEnd
     * @return
     */
    public static ChromPos getChromPosReverse(int cdsPos, List exonStarts,
                                              List exonEnds, int cdsStart, int cdsEnd) {

        boolean inCoding = false;
        int codingLength = 0;

        if (cdsEnd < cdsStart) {
            int tmp = cdsEnd;
            cdsEnd = cdsStart;
            cdsStart = tmp;
        }

        int lengthExons = 0;

        // map reverse
        for (int i = exonStarts.size() - 1; i >= 0; i--) {

            logger.debug("Exon #" + (i+1) + "/" + exonStarts.size());
            int end = exonStarts.get(i);
            int start = exonEnds.get(i);

            if (end < start) {
                int tmp = end;
                end = start;
                start = tmp;
            }
            lengthExons += end - start;

            logger.debug("     is " + cdsPos + " part of Reverse exon? " + format(start+1) + " - " + format(end) + " | " + (end - start+1));
            logger.debug("     CDS start: " + format(cdsStart+1) + "-" + format(cdsEnd) + " coding length counter:" + codingLength);

            if (start+1 <= cdsEnd && end >= cdsEnd) {


                // FIRST EXON
                inCoding = true;


                int tmpstart = start;
                if (start < cdsStart) {
                    tmpstart = cdsStart;
                }

                // here one of the few places where we don't say start+1
                int check = codingLength + cdsEnd - tmpstart ;

                logger.debug("First Exon    | " + (check) + " | " + format(start+1) + " " + format(end) + " | " + (cdsEnd - tmpstart) + " | " + cdsPos );


                if ( ( check > cdsPos)  )
                {

                    int tmp = cdsPos - codingLength ;


                    logger.debug(" -> found position in UTR exon:  " + format(cdsPos) + " " + format(tmpstart+1) + " tmp:" + format(tmp) + " cs:" + format(cdsStart+1) + " ce:" + format(cdsEnd) + " cl:" + codingLength);

                    return new ChromPos((cdsEnd - tmp), -1) ;
                }


                // don't add 1 here
                codingLength += (cdsEnd - tmpstart );

                boolean debug = logger.isDebugEnabled();


                if ( debug ) {

                    StringBuffer b = new StringBuffer();

                    b.append("     UTR         :" + format(cdsEnd + 1) + " - " + format(end) + newline);
                    if (tmpstart == start)
                        b.append(" ->  ");
                    else
                        b.append(" <-> ");
                    b.append("Exon        :" + format(tmpstart + 1) + " - " + (cdsEnd) + " | " + format(cdsEnd - tmpstart + 1) + " - " + codingLength + " | " + (codingLength % 3) + newline);

                    // single exon with UTR on both ends
                    if (tmpstart != start)
                        b.append("     UTR         :" + format(cdsStart) + " - " + format(start + 1) + newline);

                    logger.debug(b.toString());
                }
            } else if (start <= cdsStart && end >= cdsStart) {

                // LAST EXON
                inCoding = false;

                if (codingLength + end - cdsStart >= cdsPos) {

                    // how many remaining coding nucleotides?
                    int tmp = codingLength + end - cdsStart - cdsPos ;

                    logger.debug("cdl: " +codingLength + " tmp:" + tmp + " cdsStart: " + format(cdsStart));

                    logger.debug(" -> XXX found position noncoding exon:  cdsPos:" + cdsPos + " s:" + format(start + 1) + " tmp:" + format(tmp) + " cdsStart:" + (cdsStart + 1) + " codingLength:" + codingLength + " cdsEnd:" + format(cdsEnd));

                    return new ChromPos((cdsStart + tmp),-1);
                }

                codingLength += (end - cdsStart);

                logger.debug(" <-  Exon        : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3));
                logger.debug("     UTR         : " + format(start+1) + " - " + format(cdsStart ));

            } else if (inCoding) {


                if (codingLength + end - start -1  >= cdsPos) {

                    int tmp = cdsPos - codingLength ;

                    if ( tmp > (end - start ) ) {

                        tmp = (end - start );

                        logger.debug("changing tmp to " + tmp);

                    }

                    logger.debug("     " + cdsPos + " " + codingLength + " | " + (cdsPos - codingLength) + " | " + (end -start) + " | " + tmp);
                    logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3));
                    logger.debug(" ->  RRR found position coding exon:  " + cdsPos + " " + format(start+1) + " " + format(end) + " " + tmp + " " + format(cdsStart+1) + " " + codingLength);

                    return new ChromPos((end - tmp),cdsPos %3);
                }

                // full exon is coding
                codingLength += (end - start) ;

                logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start+1) + " | " + codingLength + " | " + (codingLength % 3));
            } else {
                // e.g. see UBQLN3

                logger.debug(" no translation!");
            }


            logger.debug("     coding length: " + codingLength + "(phase:" + (codingLength % 3) + ") CDS POS trying to map:" + cdsPos);


        }

        logger.debug("length exons: " + lengthExons);
        // could not map, or map over the full length??
        return new ChromPos(-1,-1);


    }

    /**
     * Get the CDS position mapped onto the chromosome position
     *
     * @param exonStarts
     * @param exonEnds
     * @param cdsStart
     * @param cdsEnd
     * @return
     */
    public static ChromPos getChromPosForward(int cdsPos, List exonStarts, List exonEnds,
                                              int cdsStart, int cdsEnd) {
        boolean inCoding = false;
        int codingLength = 0;

        int lengthExons = 0;
        // map forward
        for (int i = 0; i < exonStarts.size(); i++) {


            // start can include UTR
            int start = exonStarts.get(i);
            int end = exonEnds.get(i);

            lengthExons += end - start;


            if (start <= cdsStart +1 && end >= cdsStart+1) {
                // first exon with UTR
                if (codingLength + (end - cdsStart-1) >= cdsPos) {
                    // we are reaching our target position
                    int tmp = cdsPos - codingLength;


                    logger.debug(cdsStart + " | " + codingLength + " | " + tmp);
                    logger.debug(" -> found position in UTR exon:  #"+(i+1)+ " cdsPos:" + cdsPos +
                            " return:"+(cdsStart +1 + tmp) +" start:" + format(start + 1) + " " + format(tmp) + " " + cdsStart + " " + codingLength);

                    // we start 1 after cdsStart...
                    return new ChromPos((cdsStart +1 + tmp),-1);
                }
                inCoding = true;
                codingLength += (end - cdsStart);

                logger.debug("     UTR         : " + format(start+1) + " - " + (cdsStart ));
                logger.debug(" ->  Exon        : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));

            } else if (start+1 <= cdsEnd && end >= cdsEnd) {
                // LAST EXON with UTR
                //logger.debug(" <-- CDS end at: " + cdsEnd );
                inCoding = false;
                if (codingLength + (cdsEnd - start-1) >= cdsPos) {
                    int tmp = cdsPos - codingLength;


                    logger.debug(" <-  Exon        : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3));
                    logger.debug("     UTR         : " + format(cdsEnd + 1) + " - " + format(end));
                    logger.debug( codingLength + " | " + tmp + " | " + format(start+1));
                    logger.debug(" -> chromPosForward found position in non coding exon:  " + cdsPos + " " + format(start+1) + " " + format(tmp) + " " + format(cdsStart) + " " + codingLength);

                    return new ChromPos((start +1 + tmp),cdsPos%3);
                }
                codingLength += (cdsEnd - start-1);

                logger.debug(" <-  Exon        : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3));
                logger.debug("     UTR         : " + format(cdsEnd + 1) + " - " + format(end));


            } else if (inCoding) {
                // A standard coding Exon
                // tests for the maximum length of this coding exon
                if (codingLength + (end - start -1)  >= cdsPos) {

                    // we are within the range of this exon
                    int tmp = cdsPos - codingLength ;


                    logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + tmp + " | " + codingLength);
                    logger.debug(" -> found chr position in coding exon #" + (i+1) + ":  cdsPos:" + format(cdsPos) + " s:" + format(start) + "-" + format(end) + " tmp:" + format(tmp) + " cdsStart:" + format(cdsStart) + " codingLength:" + codingLength);

                    return new ChromPos((start +1 + tmp),cdsPos%3);
                }
                // full exon is coding
                codingLength += (end - start );

                logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3));
            }
            //			if ( inCoding )
            //				logger.debug("exon phase at end:" + (codingLength % 3));
            //
            //			logger.debug("   coding length: " + codingLength);


        }

        //logger.debug("length exons: " + lengthExons);
        //return codingLength - 3;

        // could not map!

        return new ChromPos(-1,-1);
    }


    /**
     * Get the length of the coding sequence
     *
     * @param exonStarts
     * @param exonEnds
     * @param cdsStart
     * @param cdsEnd
     * @return
     */
    public static int getCDSLengthReverse(List exonStarts,
                                          List exonEnds, int cdsStart, int cdsEnd) {

        boolean inCoding = false;
        int codingLength = 0;

        if (cdsEnd < cdsStart) {
            int tmp = cdsEnd;
            cdsEnd = cdsStart;
            cdsStart = tmp;
        }

        int lengthExons = 0;

        // map reverse
        for (int i = exonStarts.size() - 1; i >= 0; i--) {

            int end = exonStarts.get(i);
            int start = exonEnds.get(i);

            if (end < start) {
                int tmp = end;
                end = start;
                start = tmp;
            }
            lengthExons += end - start;

            if (start <= cdsEnd && end >= cdsEnd) {
                inCoding = true;


                int tmpstart = start;
                if (start < cdsStart) {
                    tmpstart = cdsStart;
                }
                codingLength += (cdsEnd - tmpstart);

                boolean debug = logger.isDebugEnabled();

                if ( debug) {

                    StringBuffer b = new StringBuffer();

                    b.append("     UTR         :" + (cdsEnd + 1) + " - " + (end) + newline);
                    if (tmpstart == start)
                        b.append(" ->  ");
                    else
                        b.append(" <-> ");
                    b.append("Exon        :" + tmpstart + " - " + cdsEnd + " | " + (cdsEnd - tmpstart) + " | " + codingLength + " | " + (codingLength % 3) + newline);
                    // single exon with UTR on both ends
                    if (tmpstart != start)
                        b.append("     UTR         :" + (cdsStart - 1) + " - " + start  + newline);
                    logger.debug(b.toString());

                }
            } else if (start <= cdsStart && end >= cdsStart) {
                inCoding = false;
                codingLength += (end - cdsStart);


                logger.debug(" <-  Exon        : " + (cdsStart+1) + " - " + end + " | " + (end - cdsStart) + " | " + (codingLength-3)  + " | " + (codingLength % 3));
                logger.debug("     UTR         : " + start + " - " + (cdsStart ));



            } else if (inCoding) {
                // full exon is coding
                codingLength += (end - start);

                logger.debug("     Exon        : " + start + " - " + end + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
            } else {
                // e.g. see UBQLN3

                logger.debug(" no translation!");
            }

        }

        logger.debug("length exons: " + lengthExons + " codin length: " + (codingLength - 3));
        return codingLength - 3;
    }

    /**
     * Get the length of the coding sequence
     *
     * @param exonStarts
     * @param exonEnds
     * @param cdsStart
     * @param cdsEnd
     * @return
     */
    public static int getCDSLengthForward(List exonStarts, List exonEnds,
                                          int cdsStart, int cdsEnd) {
        boolean inCoding = false;
        int codingLength = 0;

        int lengthExons = 0;
        // map forward
        for (int i = 0; i < exonStarts.size(); i++) {

            int start = exonStarts.get(i);
            int end = exonEnds.get(i);
            lengthExons += end - start;


            logger.debug("forward exon: " + (start+1) + " - " + end + " | " + (end - start));

            if (start+1 <= cdsStart +1 && end >= cdsStart+1) {

                inCoding = true;
                codingLength += (end - cdsStart);

                logger.debug("     UTR         : " + start + " - " + (cdsStart ));
                logger.debug(" ->  Exon        : " + (cdsStart+1) + " - " + end + " | " + (end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3));

            } else if (start+1 <= cdsEnd && end >= cdsEnd) {

                inCoding = false;
                codingLength += (cdsEnd - start);

                logger.debug(" <-  Exon        : " + (start +1)+ " - " + cdsEnd + " | " + (cdsEnd - start+1) + " | " + codingLength + " | " + (codingLength % 3));
                logger.debug("     UTR         : " + cdsEnd + 1 + " - " + end);


            } else if (inCoding) {
                // full exon is coding
                codingLength += (end - start);

                logger.debug("     Exon        :" + (start+1) + " - " + end + " | " + (end - start+1) + " | " + codingLength + " | " + (codingLength % 3));
            }


        }

        logger.debug("length exons: " + Integer.toString(lengthExons));
        logger.debug("CDS length:" + Integer.toString((codingLength-3)));

        return codingLength-3 ;
    }



    /** Extracts the exon boundaries in CDS coordinates. (needs to be divided by 3 to get AA positions)
     *
     * @param chromPos
     * @return
     */
    public static List> getCDSExonRanges(GeneChromosomePosition chromPos){
        if ( chromPos.getOrientation() == '+')

            return getCDSExonRangesForward(chromPos,CDS);

        return getCDSExonRangesReverse(chromPos,CDS);
    }


    /** Extracts the boundaries of the coding regions in chromosomal coordinates
     *
     * @param chromPos
     * @return
     */
    public static List> getChromosomalRangesForCDS(GeneChromosomePosition chromPos){
        if ( chromPos.getOrientation() == '+')
            return getCDSExonRangesForward(chromPos,CHROMOSOME);

        return getCDSExonRangesReverse(chromPos,CHROMOSOME);
    }

    private static List> getCDSExonRangesReverse(GeneChromosomePosition chromPos,
                                                                String responseType) {

        List exonStarts = chromPos.getExonStarts();
        List exonEnds   = chromPos.getExonEnds();

        List> data = new ArrayList<>();
        int cdsStart = chromPos.getCdsStart();
        int cdsEnd   = chromPos.getCdsEnd();

        boolean inCoding = false;
        int codingLength = 0;

        if (cdsEnd < cdsStart) {
            int tmp = cdsEnd;
            cdsEnd = cdsStart;
            cdsStart = tmp;
        }

        java.lang.StringBuffer s =null;

        boolean debug = logger.isDebugEnabled();


        if ( debug)
            s = new StringBuffer();

        int lengthExons = 0;

        // map reverse
        for (int i = exonStarts.size() - 1; i >= 0; i--) {

            int end = exonStarts.get(i);
            int start = exonEnds.get(i);

            if (end < start) {
                int tmp = end;
                end = start;
                start = tmp;
            }
            lengthExons += end - start;
            //s.append("Reverse exon: " + end + " - " + start + " | " + (end - start));
            //s.append(newline);

            if (start <= cdsEnd && end >= cdsEnd) {
                inCoding = true;


                int tmpstart = start;
                if (start < cdsStart) {
                    tmpstart = cdsStart;
                }
                codingLength += (cdsEnd - tmpstart);
                if ( debug ) {
                    s.append("     UTR         :").append(format(cdsEnd + 1)).append(" | ").append(format(end));
                    s.append(newline);
                    if (tmpstart == start)
                        s.append(" ->  ");
                    else
                        s.append(" <-> ");
                    s.append("Exon        :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(cdsEnd - tmpstart).append(" | ").append(codingLength).append(" | ").append(codingLength % 3);
                    s.append(newline);
                    // single exon with UTR on both ends
                    if (tmpstart != start)
                        s.append("     UTR         :").append(format(cdsStart)).append(" - ").append(format(start + 1));
                    s.append(newline);
                }


                Range r ;
                if ( responseType.equals(CDS))
                    r = Range.closed(0,codingLength);
                else
                    r = Range.closed(tmpstart,cdsEnd);

                data.add(r);

            } else if (start <= cdsStart && end >= cdsStart) {
                inCoding = false;

                Range r;
                if ( responseType.equals(CDS))
                    r = Range.closed(codingLength,codingLength+(end-cdsStart));
                else
                    r = Range.closed(cdsStart+1,end);

                data.add(r);

                codingLength += (end - cdsStart);
                if  (debug) {
                    s.append(" <-  Exon        : " + format(cdsStart + 1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
                    s.append(newline);
                    s.append("     UTR         : ").append(format(start + 1)).append(" - ").append(format(cdsStart));
                    s.append(newline);
                }


            } else if (inCoding) {
                // full exon is coding

                Range r;
                if ( responseType.equals(CDS))
                    r = Range.closed(codingLength,codingLength+(end-start));
                else
                    r = Range.closed(start,end);
                data.add(r);

                codingLength += (end - start);
                if  (debug) {
                    s.append("     Exon        : " + format(start + 1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
                    s.append(newline);
                }
            } else {
                // e.g. see UBQLN3
                if ( debug ) {
                    s.append(" no translation! UTR: " + format(start) + " - " + format(end));
                    s.append(newline);
                }
            }
        }
        if ( debug ) {
            s.append("CDS length: ").append(Integer.toString(codingLength - 3));
            s.append(newline);
            logger.debug(s.toString());
        }

        return data;
    }



    private static List> getCDSExonRangesForward(GeneChromosomePosition chromPos,
                                                                String responseType) {

        List> data = new ArrayList<>();
        List exonStarts = chromPos.getExonStarts();
        List exonEnds   = chromPos.getExonEnds();


        int cdsStart = chromPos.getCdsStart();
        int cdsEnd   = chromPos.getCdsEnd();

        boolean inCoding = false;
        int codingLength = 0;

        for (int i = 0; i < exonStarts.size(); i++) {

            int start = exonStarts.get(i);
            int end = exonEnds.get(i);

            if (start <= cdsStart && end >= cdsStart) {

                inCoding = true;
                codingLength += (end - cdsStart);
//


                Range r;
                if ( responseType.equals(CDS))
                    r = Range.closed(0,codingLength);
                else
                    r = Range.closed(cdsStart,end);
                data.add(r);

            } else if (start <= cdsEnd && end >= cdsEnd) {
                //logger.debug(" <-- CDS end at: " + cdsEnd );
                inCoding = false;


                Range r;
                if ( responseType.equals(CDS))
                    r = Range.closed(codingLength,codingLength+(cdsEnd-start));
                else
                    r = Range.closed(start,cdsEnd);
                data.add(r);
                codingLength += (cdsEnd - start);

            } else if (inCoding) {
                // full exon is coding

                Range r;
                if ( responseType.equals(CDS))
                    r = Range.closed(codingLength,codingLength+(end-start));
                else
                    r = Range.closed(start,end);
                data.add(r);
                codingLength += (end - start);

            }

        }

        return data;
    }
//

    /**
     * I have a genomic coordinate, where is it on the mRNA
     *
     * @param coordinate
     * @param chromosomePosition
     * @return
     */
    public static int getCDSPosForChromosomeCoordinate(int coordinate, GeneChromosomePosition chromosomePosition) {

        if ( chromosomePosition.getOrientation() == '+')
        	return getCDSPosForward(coordinate,
                    chromosomePosition.getExonStarts(),
                    chromosomePosition.getExonEnds(),
                    chromosomePosition.getCdsStart(),
                    chromosomePosition.getCdsEnd());

        return getCDSPosReverse(coordinate,
                chromosomePosition.getExonStarts(),
                chromosomePosition.getExonEnds(),
                chromosomePosition.getCdsStart(),
                chromosomePosition.getCdsEnd());
    }

	/** Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene
	 * living on the forward DNA strand.
	 *
	 * @param chromPos The genetic coordinate on a chromosome
     * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
     * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
     * @param cdsStart The start position of a coding region
     * @param cdsEnd The end position of a coding region
     *
     * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1)
	 *
	 * @author Yana Valasatava
	 */
    public static int getCDSPosForward(int chromPos, List exonStarts, List exonEnds,
            int cdsStart, int cdsEnd) {

    	// the genetic coordinate is not in a coding region
        if ( (chromPos < (cdsStart+1) ) || ( chromPos > (cdsEnd+1) ) ) {
        	logger.debug("The "+format(chromPos)+" position is not in a coding region");
            return -1;
        }

        logger.debug("looking for CDS position for " +format(chromPos));

        // remove exons that are fully landed in UTRs
        List tmpS = new ArrayList(exonStarts);
        List tmpE = new ArrayList(exonEnds);

        int j=0;
        for (int i = 0; i < tmpS.size(); i++) {
        	if ( ( tmpE.get(i) < cdsStart) || ( tmpS.get(i) > cdsEnd) ) {
        		exonStarts.remove(j);
        		exonEnds.remove(j);
        	}
        	else {
        		j++;
        	}
        }

        // remove untranslated regions from exons
        int nExons = exonStarts.size();
        exonStarts.remove(0);
        exonStarts.add(0, cdsStart);
        exonEnds.remove(nExons-1);
        exonEnds.add(cdsEnd);

        int codingLength = 0;
        int lengthExon = 0;

        // map the genetic coordinate on a stretch of a reverse strand
        for (int i = 0; i < nExons; i++) {

		    int start = exonStarts.get(i);
		    int end = exonEnds.get(i);

		    lengthExon = end - start;

		    if (start+1 <= chromPos && end >= chromPos ) {
		    	return codingLength + (chromPos-start);
		    }
	        else {
	        	codingLength += lengthExon;
	        }
        }
        return -1;
    }

	/** Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene
	 * living on the reverse DNA strand.
	 *
	 * @param chromPos The genetic coordinate on a chromosome
     * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
     * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
     * @param cdsStart The start position of a coding region
     * @param cdsEnd The end position of a coding region
     *
     * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1)
	 *
	 * @author Yana Valasatava
	 */
    public static int getCDSPosReverse(int chromPos, List exonStarts, List exonEnds,
            int cdsStart, int cdsEnd) {

    	// the genetic coordinate is not in a coding region
        if ( (chromPos < (cdsStart+1)) || ( chromPos > (cdsEnd+1) ) ) {
        	logger.debug("The "+format(chromPos)+" position is not in a coding region");
            return -1;
        }

        logger.debug("looking for CDS position for " +format(chromPos));

        // remove exons that are fully landed in UTRs
        List tmpS = new ArrayList(exonStarts);
        List tmpE = new ArrayList(exonEnds);

        int j=0;
        for (int i = tmpS.size() - 1; i >= 0; i--) {
        	if ( ( tmpE.get(i) < cdsStart) || ( tmpS.get(i) > cdsEnd) ) {
        		exonStarts.remove(j);
        		exonEnds.remove(j);
        	}
        	else {
        		j++;
        	}
        }

        // remove untranslated regions from exons
        int nExons = exonStarts.size();
        exonStarts.remove(0);
        exonStarts.add(0, cdsStart);
        exonEnds.remove(nExons-1);
        exonEnds.add(cdsEnd);

        int codingLength = 0;
        int lengthExon = 0;
        
        // map the genetic coordinate on a stretch of a reverse strand
        for (int i = nExons - 1; i >= 0; i--) {

		    int start = exonStarts.get(i);
		    int end = exonEnds.get(i);

		    lengthExon = end - start;
		    // +1 offset to be a base 1
		    if (start+1 <= chromPos && end >= chromPos ) {
		    	return codingLength + (end-chromPos+1);
		    }
	        else {
	        	codingLength += lengthExon;
	        }
        }
        return -1;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy