org.biojava.nbio.genome.util.ChromosomeMappingTools Maven / Gradle / Ivy
package org.biojava.nbio.genome.util;
import com.google.common.collect.Range;
import org.biojava.nbio.genome.parsers.genename.ChromPos;
import org.biojava.nbio.genome.parsers.genename.GeneChromosomePosition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
/**
* A class that can map chromosomal positions to mRNA (coding sequence) positions.
*
* @author Andreas Prlic
*/
public class ChromosomeMappingTools {
private static final Logger logger = LoggerFactory.getLogger(ChromosomeMappingTools.class);
private static final String newline = System.getProperty("line.separator");
public static final String CHROMOSOME = "CHROMOSOME";
public static final String CDS = "CDS";
/** Pretty print the details of a GeneChromosomePosition to a String
*
* @param chromosomePosition
* @return
*/
public static String formatExonStructure(GeneChromosomePosition chromosomePosition ){
if ( chromosomePosition.getOrientation() == '+')
return formatExonStructureForward(chromosomePosition);
return formatExonStructureReverse(chromosomePosition);
}
private static String formatExonStructureForward(GeneChromosomePosition chromPos) {
StringWriter s = new StringWriter();
List exonStarts = chromPos.getExonStarts();
List exonEnds = chromPos.getExonEnds();
int cdsStart = chromPos.getCdsStart();
int cdsEnd = chromPos.getCdsEnd();
boolean inCoding = false;
int codingLength = 0;
for (int i = 0; i < exonStarts.size(); i++) {
int start = exonStarts.get(i);
int end = exonEnds.get(i);
if (start <= cdsStart +1 && end >= cdsStart+1) {
inCoding = true;
codingLength += (end - cdsStart);
s.append(" UTR : ").append(format(start)).append(" - ").append(format(cdsStart));
s.append(newline);
s.append(" -> Exon : ").append(format(cdsStart + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - cdsStart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
s.append(newline);
} else if (start <= cdsEnd && end >= cdsEnd) {
//logger.debug(" <-- CDS end at: " + cdsEnd );
inCoding = false;
codingLength += (cdsEnd - start);
s.append(" <- Exon : ").append(format(start + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
s.append(newline);
s.append(" UTR : " + (cdsEnd +1) + " - " + format(end));
s.append(newline);
} else if (inCoding) {
// full exon is coding
codingLength += (end - start);
s.append(" Exon : ").append(format(start + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
s.append(newline);
}
}
s.append("Coding Length: ");
s.append((codingLength-3)+"");
s.append(newline);
return s.toString();
}
private static String showGenePosLink(GeneChromosomePosition chromPos, Integer pos ) {
String spos = format(pos);
StringBuffer buf = new StringBuffer();
buf.append("");
buf.append(spos);
buf.append("");
return buf.toString();
}
private static String formatExonStructureReverse(GeneChromosomePosition chromPos) {
StringWriter s = new StringWriter();
List exonStarts = chromPos.getExonStarts();
List exonEnds = chromPos.getExonEnds();
int cdsStart = chromPos.getCdsStart();
int cdsEnd = chromPos.getCdsEnd();
// logger.debug("CDS START:" +format(cdsStart) + " - " + format(cdsEnd));
boolean inCoding = false;
int codingLength = 0;
if (cdsEnd < cdsStart) {
int tmp = cdsEnd;
cdsEnd = cdsStart;
cdsStart = tmp;
}
// map reverse
for (int i = exonStarts.size() - 1; i >= 0; i--) {
int end = exonStarts.get(i);
int start = exonEnds.get(i);
if (end < start) {
int tmp = end;
end = start;
start = tmp;
}
if (start <= cdsEnd && end >= cdsEnd) {
inCoding = true;
int tmpstart = start;
if (start < cdsStart) {
tmpstart = cdsStart;
}
codingLength += (cdsEnd - tmpstart);
s.append(" UTR :" + format(cdsEnd + 1) + " | " + format(end));
s.append(newline);
if (tmpstart == start)
s.append(" -> ");
else
s.append(" <-> ");
s.append("Exon :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - tmpstart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
s.append(newline);
// single exon with UTR on both ends
if (tmpstart != start)
s.append(" UTR :" + format(cdsStart ) + " - " + format(start + 1));
s.append(newline);
} else if (start <= cdsStart && end >= cdsStart) {
inCoding = false;
codingLength += (end - cdsStart);
s.append(" <- Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
s.append(newline);
s.append(" UTR : " + format(start+1) + " - " + format(cdsStart ));
s.append(newline);
} else if (inCoding) {
// full exon is coding
codingLength += (end - start);
s.append(" Exon : " + format(start+1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
s.append(newline);
} else {
// e.g. see UBQLN3
s.append(" no translation! UTR: ").append(format(start)).append(" - ").append(format(end));
s.append(newline);
}
}
s.append("CDS length: ").append(Integer.toString(codingLength - 3));
s.append(newline);
return s.toString();
}
/**
* Get the length of the CDS in nucleotides.
*
* @param chromPos
* @return length of the CDS in nucleotides.
*/
public static int getCDSLength(GeneChromosomePosition chromPos) {
logger.debug(chromPos.toString());
logger.debug("chromosomal information: ");
logger.debug("Gene:" + chromPos.getGeneName());
logger.debug(" Transcription (including UTRs): " + chromPos.getTranscriptionStart() + " - " + chromPos.getTranscriptionEnd() + " (length:" + (chromPos.getTranscriptionEnd() - chromPos.getTranscriptionStart()) + ")");
logger.debug(" Orientation: " + chromPos.getOrientation());
logger.debug(" CDS: " + (chromPos.getCdsStart()) + " - " + chromPos.getCdsEnd() + " (length: " + (chromPos.getCdsEnd() - chromPos.getCdsStart()) + ")");
List exonStarts = chromPos.getExonStarts();
List exonEnds = chromPos.getExonEnds();
logger.debug("Exons:" + exonStarts.size());
int cdsStart = chromPos.getCdsStart();
int cdsEnd = chromPos.getCdsEnd();
int codingLength;
if (chromPos.getOrientation().equals('+'))
codingLength = ChromosomeMappingTools.getCDSLengthForward(exonStarts, exonEnds, cdsStart, cdsEnd);
else
codingLength = ChromosomeMappingTools.getCDSLengthReverse(exonStarts, exonEnds, cdsStart, cdsEnd);
return codingLength;
}
/**
* maps the position of a CDS nucleotide back to the genome
*
* @param cdsNucleotidePosition
* @return a ChromPos object
*/
public static ChromPos getChromosomePosForCDScoordinate(int cdsNucleotidePosition, GeneChromosomePosition chromPos) {
logger.debug(" ? Checking chromosome position for CDS position " + cdsNucleotidePosition);
List exonStarts = chromPos.getExonStarts();
List exonEnds = chromPos.getExonEnds();
logger.debug(" Exons:" + exonStarts.size());
int cdsStart = chromPos.getCdsStart();
int cdsEnd = chromPos.getCdsEnd();
ChromPos chromosomePos = null;
if (chromPos.getOrientation().equals('+'))
chromosomePos = ChromosomeMappingTools.getChromPosForward(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd);
else
chromosomePos = ChromosomeMappingTools.getChromPosReverse(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd);
logger.debug("=> CDS pos " + cdsNucleotidePosition + " for " + chromPos.getGeneName() + " is on chromosome at " + chromosomePos);
return chromosomePos;
}
/** returns a nicely formatted representation of the position
*
* @param chromosomePosition
* @return
*/
private static String format(int chromosomePosition){
return String.format("%,d", chromosomePosition);
}
/**
* Get the CDS position mapped on the chromosome position
*
* @param exonStarts
* @param exonEnds
* @param cdsStart
* @param cdsEnd
* @return
*/
public static ChromPos getChromPosReverse(int cdsPos, List exonStarts,
List exonEnds, int cdsStart, int cdsEnd) {
boolean inCoding = false;
int codingLength = 0;
if (cdsEnd < cdsStart) {
int tmp = cdsEnd;
cdsEnd = cdsStart;
cdsStart = tmp;
}
int lengthExons = 0;
// map reverse
for (int i = exonStarts.size() - 1; i >= 0; i--) {
logger.debug("Exon #" + (i+1) + "/" + exonStarts.size());
int end = exonStarts.get(i);
int start = exonEnds.get(i);
if (end < start) {
int tmp = end;
end = start;
start = tmp;
}
lengthExons += end - start;
logger.debug(" is " + cdsPos + " part of Reverse exon? " + format(start+1) + " - " + format(end) + " | " + (end - start+1));
logger.debug(" CDS start: " + format(cdsStart+1) + "-" + format(cdsEnd) + " coding length counter:" + codingLength);
if (start+1 <= cdsEnd && end >= cdsEnd) {
// FIRST EXON
inCoding = true;
int tmpstart = start;
if (start < cdsStart) {
tmpstart = cdsStart;
}
// here one of the few places where we don't say start+1
int check = codingLength + cdsEnd - tmpstart ;
logger.debug("First Exon | " + (check) + " | " + format(start+1) + " " + format(end) + " | " + (cdsEnd - tmpstart) + " | " + cdsPos );
if ( ( check > cdsPos) )
{
int tmp = cdsPos - codingLength ;
logger.debug(" -> found position in UTR exon: " + format(cdsPos) + " " + format(tmpstart+1) + " tmp:" + format(tmp) + " cs:" + format(cdsStart+1) + " ce:" + format(cdsEnd) + " cl:" + codingLength);
return new ChromPos((cdsEnd - tmp), -1) ;
}
// don't add 1 here
codingLength += (cdsEnd - tmpstart );
boolean debug = logger.isDebugEnabled();
if ( debug ) {
StringBuffer b = new StringBuffer();
b.append(" UTR :" + format(cdsEnd + 1) + " - " + format(end) + newline);
if (tmpstart == start)
b.append(" -> ");
else
b.append(" <-> ");
b.append("Exon :" + format(tmpstart + 1) + " - " + (cdsEnd) + " | " + format(cdsEnd - tmpstart + 1) + " - " + codingLength + " | " + (codingLength % 3) + newline);
// single exon with UTR on both ends
if (tmpstart != start)
b.append(" UTR :" + format(cdsStart) + " - " + format(start + 1) + newline);
logger.debug(b.toString());
}
} else if (start <= cdsStart && end >= cdsStart) {
// LAST EXON
inCoding = false;
if (codingLength + end - cdsStart >= cdsPos) {
// how many remaining coding nucleotides?
int tmp = codingLength + end - cdsStart - cdsPos ;
logger.debug("cdl: " +codingLength + " tmp:" + tmp + " cdsStart: " + format(cdsStart));
logger.debug(" -> XXX found position noncoding exon: cdsPos:" + cdsPos + " s:" + format(start + 1) + " tmp:" + format(tmp) + " cdsStart:" + (cdsStart + 1) + " codingLength:" + codingLength + " cdsEnd:" + format(cdsEnd));
return new ChromPos((cdsStart + tmp),-1);
}
codingLength += (end - cdsStart);
logger.debug(" <- Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3));
logger.debug(" UTR : " + format(start+1) + " - " + format(cdsStart ));
} else if (inCoding) {
if (codingLength + end - start -1 >= cdsPos) {
int tmp = cdsPos - codingLength ;
if ( tmp > (end - start ) ) {
tmp = (end - start );
logger.debug("changing tmp to " + tmp);
}
logger.debug(" " + cdsPos + " " + codingLength + " | " + (cdsPos - codingLength) + " | " + (end -start) + " | " + tmp);
logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3));
logger.debug(" -> RRR found position coding exon: " + cdsPos + " " + format(start+1) + " " + format(end) + " " + tmp + " " + format(cdsStart+1) + " " + codingLength);
return new ChromPos((end - tmp),cdsPos %3);
}
// full exon is coding
codingLength += (end - start) ;
logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start+1) + " | " + codingLength + " | " + (codingLength % 3));
} else {
// e.g. see UBQLN3
logger.debug(" no translation!");
}
logger.debug(" coding length: " + codingLength + "(phase:" + (codingLength % 3) + ") CDS POS trying to map:" + cdsPos);
}
logger.debug("length exons: " + lengthExons);
// could not map, or map over the full length??
return new ChromPos(-1,-1);
}
/**
* Get the CDS position mapped onto the chromosome position
*
* @param exonStarts
* @param exonEnds
* @param cdsStart
* @param cdsEnd
* @return
*/
public static ChromPos getChromPosForward(int cdsPos, List exonStarts, List exonEnds,
int cdsStart, int cdsEnd) {
boolean inCoding = false;
int codingLength = 0;
int lengthExons = 0;
// map forward
for (int i = 0; i < exonStarts.size(); i++) {
// start can include UTR
int start = exonStarts.get(i);
int end = exonEnds.get(i);
lengthExons += end - start;
if (start <= cdsStart +1 && end >= cdsStart+1) {
// first exon with UTR
if (codingLength + (end - cdsStart-1) >= cdsPos) {
// we are reaching our target position
int tmp = cdsPos - codingLength;
logger.debug(cdsStart + " | " + codingLength + " | " + tmp);
logger.debug(" -> found position in UTR exon: #"+(i+1)+ " cdsPos:" + cdsPos +
" return:"+(cdsStart +1 + tmp) +" start:" + format(start + 1) + " " + format(tmp) + " " + cdsStart + " " + codingLength);
// we start 1 after cdsStart...
return new ChromPos((cdsStart +1 + tmp),-1);
}
inCoding = true;
codingLength += (end - cdsStart);
logger.debug(" UTR : " + format(start+1) + " - " + (cdsStart ));
logger.debug(" -> Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
} else if (start+1 <= cdsEnd && end >= cdsEnd) {
// LAST EXON with UTR
//logger.debug(" <-- CDS end at: " + cdsEnd );
inCoding = false;
if (codingLength + (cdsEnd - start-1) >= cdsPos) {
int tmp = cdsPos - codingLength;
logger.debug(" <- Exon : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3));
logger.debug(" UTR : " + format(cdsEnd + 1) + " - " + format(end));
logger.debug( codingLength + " | " + tmp + " | " + format(start+1));
logger.debug(" -> chromPosForward found position in non coding exon: " + cdsPos + " " + format(start+1) + " " + format(tmp) + " " + format(cdsStart) + " " + codingLength);
return new ChromPos((start +1 + tmp),cdsPos%3);
}
codingLength += (cdsEnd - start-1);
logger.debug(" <- Exon : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3));
logger.debug(" UTR : " + format(cdsEnd + 1) + " - " + format(end));
} else if (inCoding) {
// A standard coding Exon
// tests for the maximum length of this coding exon
if (codingLength + (end - start -1) >= cdsPos) {
// we are within the range of this exon
int tmp = cdsPos - codingLength ;
logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + tmp + " | " + codingLength);
logger.debug(" -> found chr position in coding exon #" + (i+1) + ": cdsPos:" + format(cdsPos) + " s:" + format(start) + "-" + format(end) + " tmp:" + format(tmp) + " cdsStart:" + format(cdsStart) + " codingLength:" + codingLength);
return new ChromPos((start +1 + tmp),cdsPos%3);
}
// full exon is coding
codingLength += (end - start );
logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3));
}
// if ( inCoding )
// logger.debug("exon phase at end:" + (codingLength % 3));
//
// logger.debug(" coding length: " + codingLength);
}
//logger.debug("length exons: " + lengthExons);
//return codingLength - 3;
// could not map!
return new ChromPos(-1,-1);
}
/**
* Get the length of the coding sequence
*
* @param exonStarts
* @param exonEnds
* @param cdsStart
* @param cdsEnd
* @return
*/
public static int getCDSLengthReverse(List exonStarts,
List exonEnds, int cdsStart, int cdsEnd) {
boolean inCoding = false;
int codingLength = 0;
if (cdsEnd < cdsStart) {
int tmp = cdsEnd;
cdsEnd = cdsStart;
cdsStart = tmp;
}
int lengthExons = 0;
// map reverse
for (int i = exonStarts.size() - 1; i >= 0; i--) {
int end = exonStarts.get(i);
int start = exonEnds.get(i);
if (end < start) {
int tmp = end;
end = start;
start = tmp;
}
lengthExons += end - start;
if (start <= cdsEnd && end >= cdsEnd) {
inCoding = true;
int tmpstart = start;
if (start < cdsStart) {
tmpstart = cdsStart;
}
codingLength += (cdsEnd - tmpstart);
boolean debug = logger.isDebugEnabled();
if ( debug) {
StringBuffer b = new StringBuffer();
b.append(" UTR :" + (cdsEnd + 1) + " - " + (end) + newline);
if (tmpstart == start)
b.append(" -> ");
else
b.append(" <-> ");
b.append("Exon :" + tmpstart + " - " + cdsEnd + " | " + (cdsEnd - tmpstart) + " | " + codingLength + " | " + (codingLength % 3) + newline);
// single exon with UTR on both ends
if (tmpstart != start)
b.append(" UTR :" + (cdsStart - 1) + " - " + start + newline);
logger.debug(b.toString());
}
} else if (start <= cdsStart && end >= cdsStart) {
inCoding = false;
codingLength += (end - cdsStart);
logger.debug(" <- Exon : " + (cdsStart+1) + " - " + end + " | " + (end - cdsStart) + " | " + (codingLength-3) + " | " + (codingLength % 3));
logger.debug(" UTR : " + start + " - " + (cdsStart ));
} else if (inCoding) {
// full exon is coding
codingLength += (end - start);
logger.debug(" Exon : " + start + " - " + end + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
} else {
// e.g. see UBQLN3
logger.debug(" no translation!");
}
}
logger.debug("length exons: " + lengthExons + " codin length: " + (codingLength - 3));
return codingLength - 3;
}
/**
* Get the length of the coding sequence
*
* @param exonStarts
* @param exonEnds
* @param cdsStart
* @param cdsEnd
* @return
*/
public static int getCDSLengthForward(List exonStarts, List exonEnds,
int cdsStart, int cdsEnd) {
boolean inCoding = false;
int codingLength = 0;
int lengthExons = 0;
// map forward
for (int i = 0; i < exonStarts.size(); i++) {
int start = exonStarts.get(i);
int end = exonEnds.get(i);
lengthExons += end - start;
logger.debug("forward exon: " + (start+1) + " - " + end + " | " + (end - start));
if (start+1 <= cdsStart +1 && end >= cdsStart+1) {
inCoding = true;
codingLength += (end - cdsStart);
logger.debug(" UTR : " + start + " - " + (cdsStart ));
logger.debug(" -> Exon : " + (cdsStart+1) + " - " + end + " | " + (end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3));
} else if (start+1 <= cdsEnd && end >= cdsEnd) {
inCoding = false;
codingLength += (cdsEnd - start);
logger.debug(" <- Exon : " + (start +1)+ " - " + cdsEnd + " | " + (cdsEnd - start+1) + " | " + codingLength + " | " + (codingLength % 3));
logger.debug(" UTR : " + cdsEnd + 1 + " - " + end);
} else if (inCoding) {
// full exon is coding
codingLength += (end - start);
logger.debug(" Exon :" + (start+1) + " - " + end + " | " + (end - start+1) + " | " + codingLength + " | " + (codingLength % 3));
}
}
logger.debug("length exons: " + Integer.toString(lengthExons));
logger.debug("CDS length:" + Integer.toString((codingLength-3)));
return codingLength-3 ;
}
/** Extracts the exon boundaries in CDS coordinates. (needs to be divided by 3 to get AA positions)
*
* @param chromPos
* @return
*/
public static List> getCDSExonRanges(GeneChromosomePosition chromPos){
if ( chromPos.getOrientation() == '+')
return getCDSExonRangesForward(chromPos,CDS);
return getCDSExonRangesReverse(chromPos,CDS);
}
/** Extracts the boundaries of the coding regions in chromosomal coordinates
*
* @param chromPos
* @return
*/
public static List> getChromosomalRangesForCDS(GeneChromosomePosition chromPos){
if ( chromPos.getOrientation() == '+')
return getCDSExonRangesForward(chromPos,CHROMOSOME);
return getCDSExonRangesReverse(chromPos,CHROMOSOME);
}
private static List> getCDSExonRangesReverse(GeneChromosomePosition chromPos,
String responseType) {
List exonStarts = chromPos.getExonStarts();
List exonEnds = chromPos.getExonEnds();
List> data = new ArrayList<>();
int cdsStart = chromPos.getCdsStart();
int cdsEnd = chromPos.getCdsEnd();
boolean inCoding = false;
int codingLength = 0;
if (cdsEnd < cdsStart) {
int tmp = cdsEnd;
cdsEnd = cdsStart;
cdsStart = tmp;
}
java.lang.StringBuffer s =null;
boolean debug = logger.isDebugEnabled();
if ( debug)
s = new StringBuffer();
int lengthExons = 0;
// map reverse
for (int i = exonStarts.size() - 1; i >= 0; i--) {
int end = exonStarts.get(i);
int start = exonEnds.get(i);
if (end < start) {
int tmp = end;
end = start;
start = tmp;
}
lengthExons += end - start;
//s.append("Reverse exon: " + end + " - " + start + " | " + (end - start));
//s.append(newline);
if (start <= cdsEnd && end >= cdsEnd) {
inCoding = true;
int tmpstart = start;
if (start < cdsStart) {
tmpstart = cdsStart;
}
codingLength += (cdsEnd - tmpstart);
if ( debug ) {
s.append(" UTR :").append(format(cdsEnd + 1)).append(" | ").append(format(end));
s.append(newline);
if (tmpstart == start)
s.append(" -> ");
else
s.append(" <-> ");
s.append("Exon :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(cdsEnd - tmpstart).append(" | ").append(codingLength).append(" | ").append(codingLength % 3);
s.append(newline);
// single exon with UTR on both ends
if (tmpstart != start)
s.append(" UTR :").append(format(cdsStart)).append(" - ").append(format(start + 1));
s.append(newline);
}
Range r ;
if ( responseType.equals(CDS))
r = Range.closed(0,codingLength);
else
r = Range.closed(tmpstart,cdsEnd);
data.add(r);
} else if (start <= cdsStart && end >= cdsStart) {
inCoding = false;
Range r;
if ( responseType.equals(CDS))
r = Range.closed(codingLength,codingLength+(end-cdsStart));
else
r = Range.closed(cdsStart+1,end);
data.add(r);
codingLength += (end - cdsStart);
if (debug) {
s.append(" <- Exon : " + format(cdsStart + 1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
s.append(newline);
s.append(" UTR : ").append(format(start + 1)).append(" - ").append(format(cdsStart));
s.append(newline);
}
} else if (inCoding) {
// full exon is coding
Range r;
if ( responseType.equals(CDS))
r = Range.closed(codingLength,codingLength+(end-start));
else
r = Range.closed(start,end);
data.add(r);
codingLength += (end - start);
if (debug) {
s.append(" Exon : " + format(start + 1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
s.append(newline);
}
} else {
// e.g. see UBQLN3
if ( debug ) {
s.append(" no translation! UTR: " + format(start) + " - " + format(end));
s.append(newline);
}
}
}
if ( debug ) {
s.append("CDS length: ").append(Integer.toString(codingLength - 3));
s.append(newline);
logger.debug(s.toString());
}
return data;
}
private static List> getCDSExonRangesForward(GeneChromosomePosition chromPos,
String responseType) {
List> data = new ArrayList<>();
List exonStarts = chromPos.getExonStarts();
List exonEnds = chromPos.getExonEnds();
int cdsStart = chromPos.getCdsStart();
int cdsEnd = chromPos.getCdsEnd();
boolean inCoding = false;
int codingLength = 0;
for (int i = 0; i < exonStarts.size(); i++) {
int start = exonStarts.get(i);
int end = exonEnds.get(i);
if (start <= cdsStart && end >= cdsStart) {
inCoding = true;
codingLength += (end - cdsStart);
//
Range r;
if ( responseType.equals(CDS))
r = Range.closed(0,codingLength);
else
r = Range.closed(cdsStart,end);
data.add(r);
} else if (start <= cdsEnd && end >= cdsEnd) {
//logger.debug(" <-- CDS end at: " + cdsEnd );
inCoding = false;
Range r;
if ( responseType.equals(CDS))
r = Range.closed(codingLength,codingLength+(cdsEnd-start));
else
r = Range.closed(start,cdsEnd);
data.add(r);
codingLength += (cdsEnd - start);
} else if (inCoding) {
// full exon is coding
Range r;
if ( responseType.equals(CDS))
r = Range.closed(codingLength,codingLength+(end-start));
else
r = Range.closed(start,end);
data.add(r);
codingLength += (end - start);
}
}
return data;
}
//
/**
* I have a genomic coordinate, where is it on the mRNA
*
* @param coordinate
* @param chromosomePosition
* @return
*/
public static int getCDSPosForChromosomeCoordinate(int coordinate, GeneChromosomePosition chromosomePosition) {
if ( chromosomePosition.getOrientation() == '+')
return getCDSPosForward(coordinate,
chromosomePosition.getExonStarts(),
chromosomePosition.getExonEnds(),
chromosomePosition.getCdsStart(),
chromosomePosition.getCdsEnd());
return getCDSPosReverse(coordinate,
chromosomePosition.getExonStarts(),
chromosomePosition.getExonEnds(),
chromosomePosition.getCdsStart(),
chromosomePosition.getCdsEnd());
}
/** Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene
* living on the forward DNA strand.
*
* @param chromPos The genetic coordinate on a chromosome
* @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
* @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
* @param cdsStart The start position of a coding region
* @param cdsEnd The end position of a coding region
*
* @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1)
*
* @author Yana Valasatava
*/
public static int getCDSPosForward(int chromPos, List exonStarts, List exonEnds,
int cdsStart, int cdsEnd) {
// the genetic coordinate is not in a coding region
if ( (chromPos < (cdsStart+1) ) || ( chromPos > (cdsEnd+1) ) ) {
logger.debug("The "+format(chromPos)+" position is not in a coding region");
return -1;
}
logger.debug("looking for CDS position for " +format(chromPos));
// remove exons that are fully landed in UTRs
List tmpS = new ArrayList(exonStarts);
List tmpE = new ArrayList(exonEnds);
int j=0;
for (int i = 0; i < tmpS.size(); i++) {
if ( ( tmpE.get(i) < cdsStart) || ( tmpS.get(i) > cdsEnd) ) {
exonStarts.remove(j);
exonEnds.remove(j);
}
else {
j++;
}
}
// remove untranslated regions from exons
int nExons = exonStarts.size();
exonStarts.remove(0);
exonStarts.add(0, cdsStart);
exonEnds.remove(nExons-1);
exonEnds.add(cdsEnd);
int codingLength = 0;
int lengthExon = 0;
// map the genetic coordinate on a stretch of a reverse strand
for (int i = 0; i < nExons; i++) {
int start = exonStarts.get(i);
int end = exonEnds.get(i);
lengthExon = end - start;
if (start+1 <= chromPos && end >= chromPos ) {
return codingLength + (chromPos-start);
}
else {
codingLength += lengthExon;
}
}
return -1;
}
/** Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene
* living on the reverse DNA strand.
*
* @param chromPos The genetic coordinate on a chromosome
* @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
* @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
* @param cdsStart The start position of a coding region
* @param cdsEnd The end position of a coding region
*
* @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1)
*
* @author Yana Valasatava
*/
public static int getCDSPosReverse(int chromPos, List exonStarts, List exonEnds,
int cdsStart, int cdsEnd) {
// the genetic coordinate is not in a coding region
if ( (chromPos < (cdsStart+1)) || ( chromPos > (cdsEnd+1) ) ) {
logger.debug("The "+format(chromPos)+" position is not in a coding region");
return -1;
}
logger.debug("looking for CDS position for " +format(chromPos));
// remove exons that are fully landed in UTRs
List tmpS = new ArrayList(exonStarts);
List tmpE = new ArrayList(exonEnds);
int j=0;
for (int i = tmpS.size() - 1; i >= 0; i--) {
if ( ( tmpE.get(i) < cdsStart) || ( tmpS.get(i) > cdsEnd) ) {
exonStarts.remove(j);
exonEnds.remove(j);
}
else {
j++;
}
}
// remove untranslated regions from exons
int nExons = exonStarts.size();
exonStarts.remove(0);
exonStarts.add(0, cdsStart);
exonEnds.remove(nExons-1);
exonEnds.add(cdsEnd);
int codingLength = 0;
int lengthExon = 0;
// map the genetic coordinate on a stretch of a reverse strand
for (int i = nExons - 1; i >= 0; i--) {
int start = exonStarts.get(i);
int end = exonEnds.get(i);
lengthExon = end - start;
// +1 offset to be a base 1
if (start+1 <= chromPos && end >= chromPos ) {
return codingLength + (end-chromPos+1);
}
else {
codingLength += lengthExon;
}
}
return -1;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy