
org.biojava.nbio.genome.util.ChromosomeMappingTools Maven / Gradle / Ivy
The newest version!
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.genome.util;
import com.google.common.collect.Range;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.biojava.nbio.core.sequence.template.SequenceView;
import org.biojava.nbio.genome.parsers.genename.ChromPos;
import org.biojava.nbio.genome.parsers.genename.GeneChromosomePosition;
import org.biojava.nbio.genome.parsers.twobit.TwoBitFacade;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
/**
* A class that can map chromosomal positions to mRNA (coding sequence) positions.
*
* @author Andreas Prlic
*/
public class ChromosomeMappingTools {
private static final Logger logger = LoggerFactory.getLogger(ChromosomeMappingTools.class);
private static final String newline = System.getProperty("line.separator");
public static final String CHROMOSOME = "CHROMOSOME";
public static final String CDS = "CDS";
private static int base = 1;
public static void setCoordinateSystem(int baseInt) {
base = baseInt;
}
/**
* Pretty print the details of a GeneChromosomePosition to a String
*
* @param chromosomePosition
* @return
*/
public static String formatExonStructure(GeneChromosomePosition chromosomePosition ){
if ( chromosomePosition.getOrientation() == '+')
return formatExonStructureForward(chromosomePosition);
return formatExonStructureReverse(chromosomePosition);
}
private static String formatExonStructureForward(GeneChromosomePosition chromPos) {
StringWriter s = new StringWriter();
List exonStarts = chromPos.getExonStarts();
List exonEnds = chromPos.getExonEnds();
int cdsStart = chromPos.getCdsStart();
int cdsEnd = chromPos.getCdsEnd();
boolean inCoding = false;
int codingLength = 0;
for (int i = 0; i < exonStarts.size(); i++) {
int start = exonStarts.get(i);
int end = exonEnds.get(i);
if (start <= cdsStart +1 && end >= cdsStart+1) {
inCoding = true;
codingLength += (end - cdsStart);
s.append(" UTR : ").append(format(start)).append(" - ").append(format(cdsStart));
s.append(newline);
s.append(" -> Exon : ").append(format(cdsStart + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - cdsStart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
s.append(newline);
} else if (start <= cdsEnd && end >= cdsEnd) {
//logger.debug(" <-- CDS end at: " + cdsEnd );
inCoding = false;
codingLength += (cdsEnd - start);
s.append(" <- Exon : ").append(format(start + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
s.append(newline);
s.append(" UTR : " + (cdsEnd +1) + " - " + format(end));
s.append(newline);
} else if (inCoding) {
// full exon is coding
codingLength += (end - start);
s.append(" Exon : ").append(format(start + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
s.append(newline);
}
}
s.append("Coding Length: ");
s.append((codingLength-3)+"");
s.append(newline);
return s.toString();
}
private static String formatExonStructureReverse(GeneChromosomePosition chromPos) {
StringWriter s = new StringWriter();
List exonStarts = chromPos.getExonStarts();
List exonEnds = chromPos.getExonEnds();
int cdsStart = chromPos.getCdsStart();
int cdsEnd = chromPos.getCdsEnd();
// logger.debug("CDS START:" +format(cdsStart) + " - " + format(cdsEnd));
boolean inCoding = false;
int codingLength = 0;
if (cdsEnd < cdsStart) {
int tmp = cdsEnd;
cdsEnd = cdsStart;
cdsStart = tmp;
}
// map reverse
for (int i = exonStarts.size() - 1; i >= 0; i--) {
int end = exonStarts.get(i);
int start = exonEnds.get(i);
if (end < start) {
int tmp = end;
end = start;
start = tmp;
}
if (start <= cdsEnd && end >= cdsEnd) {
inCoding = true;
int tmpstart = start;
if (start < cdsStart) {
tmpstart = cdsStart;
}
codingLength += (cdsEnd - tmpstart);
s.append(" UTR :" + format(cdsEnd + 1) + " | " + format(end));
s.append(newline);
if (tmpstart == start)
s.append(" -> ");
else
s.append(" <-> ");
s.append("Exon :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - tmpstart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
s.append(newline);
// single exon with UTR on both ends
if (tmpstart != start)
s.append(" UTR :" + format(cdsStart ) + " - " + format(start + 1));
s.append(newline);
} else if (start <= cdsStart && end >= cdsStart) {
inCoding = false;
codingLength += (end - cdsStart);
s.append(" <- Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
s.append(newline);
s.append(" UTR : " + format(start+1) + " - " + format(cdsStart ));
s.append(newline);
} else if (inCoding) {
// full exon is coding
codingLength += (end - start);
s.append(" Exon : " + format(start+1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
s.append(newline);
} else {
// e.g. see UBQLN3
s.append(" no translation! UTR: ").append(format(start)).append(" - ").append(format(end));
s.append(newline);
}
}
s.append("CDS length: ").append(Integer.toString(codingLength - 3));
s.append(newline);
return s.toString();
}
/**
* Get the length of the CDS in nucleotides.
*
* @param chromPos
* @return length of the CDS in nucleotides.
*/
public static int getCDSLength(GeneChromosomePosition chromPos) {
List exonStarts = chromPos.getExonStarts();
List exonEnds = chromPos.getExonEnds();
int cdsStart = chromPos.getCdsStart();
int cdsEnd = chromPos.getCdsEnd();
int codingLength;
if (chromPos.getOrientation().equals('+'))
codingLength = ChromosomeMappingTools.getCDSLengthForward(exonStarts, exonEnds, cdsStart, cdsEnd);
else
codingLength = ChromosomeMappingTools.getCDSLengthReverse(exonStarts, exonEnds, cdsStart, cdsEnd);
return codingLength;
}
/**
* Maps the position of a CDS nucleotide back to the genome
*
* @param cdsNucleotidePosition
* @return a ChromPos object
*/
public static ChromPos getChromosomePosForCDScoordinate(int cdsNucleotidePosition, GeneChromosomePosition chromPos) {
logger.debug(" ? Checking chromosome position for CDS position {}", cdsNucleotidePosition);
List exonStarts = chromPos.getExonStarts();
List exonEnds = chromPos.getExonEnds();
logger.debug(" Exons:{}", exonStarts.size());
int cdsStart = chromPos.getCdsStart();
int cdsEnd = chromPos.getCdsEnd();
ChromPos chromosomePos = null;
if (chromPos.getOrientation().equals('+'))
chromosomePos = ChromosomeMappingTools.getChromPosForward(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd);
else
chromosomePos = ChromosomeMappingTools.getChromPosReverse(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd);
logger.debug("=> CDS pos " + cdsNucleotidePosition + " for " + chromPos.getGeneName() + " is on chromosome at " + chromosomePos);
return chromosomePos;
}
/**
* Returns a nicely formatted representation of the position
*
* @param chromosomePosition
* @return
*/
private static String format(int chromosomePosition){
return String.format("%,d", chromosomePosition);
}
/**
* Get the CDS position mapped on the chromosome position
*
* @param exonStarts
* @param exonEnds
* @param cdsStart
* @param cdsEnd
* @return
*/
public static ChromPos getChromPosReverse(int cdsPos, List exonStarts, List exonEnds, int cdsStart, int cdsEnd) {
boolean inCoding = false;
int codingLength = 0;
if (cdsEnd < cdsStart) {
int tmp = cdsEnd;
cdsEnd = cdsStart;
cdsStart = tmp;
}
int lengthExons = 0;
// map reverse
for (int i = exonStarts.size() - 1; i >= 0; i--) {
logger.debug("Exon #" + (i+1) + "/" + exonStarts.size());
int end = exonStarts.get(i);
int start = exonEnds.get(i);
if (end < start) {
int tmp = end;
end = start;
start = tmp;
}
lengthExons += end - start;
logger.debug(" is " + cdsPos + " part of Reverse exon? " + format(start+1) + " - " + format(end) + " | " + (end - start+1));
logger.debug(" CDS start: " + format(cdsStart+1) + "-" + format(cdsEnd) + " coding length counter:" + codingLength);
if (start+1 <= cdsEnd && end >= cdsEnd) {
// FIRST EXON
inCoding = true;
int tmpstart = start;
if (start < cdsStart) {
tmpstart = cdsStart;
}
// here one of the few places where we don't say start+1
int check = codingLength + cdsEnd - tmpstart ;
logger.debug("First Exon | " + (check) + " | " + format(start+1) + " " + format(end) + " | " + (cdsEnd - tmpstart) + " | " + cdsPos );
if ( ( check > cdsPos) ) {
int tmp = cdsPos - codingLength ;
logger.debug(" -> found position in UTR exon: " + format(cdsPos) + " " + format(tmpstart+1) + " tmp:" + format(tmp) + " cs:" + format(cdsStart+1) + " ce:" + format(cdsEnd) + " cl:" + codingLength);
return new ChromPos((cdsEnd - tmp), -1) ;
}
// don't add 1 here
codingLength += (cdsEnd - tmpstart );
boolean debug = logger.isDebugEnabled();
if ( debug ) {
StringBuffer b = new StringBuffer();
b.append(" UTR :" + format(cdsEnd + 1) + " - " + format(end) + newline);
if (tmpstart == start)
b.append(" -> ");
else
b.append(" <-> ");
b.append("Exon :" + format(tmpstart + 1) + " - " + (cdsEnd) + " | " + format(cdsEnd - tmpstart + 1) + " - " + codingLength + " | " + (codingLength % 3) + newline);
// single exon with UTR on both ends
if (tmpstart != start)
b.append(" UTR :" + format(cdsStart) + " - " + format(start + 1) + newline);
logger.debug(b.toString());
}
} else if (start <= cdsStart && end >= cdsStart) {
// LAST EXON
inCoding = false;
if (codingLength + end - cdsStart >= cdsPos) {
// how many remaining coding nucleotides?
int tmp = codingLength + end - cdsStart - cdsPos ;
logger.debug("cdl: " +codingLength + " tmp:" + tmp + " cdsStart: " + format(cdsStart));
logger.debug(" -> XXX found position noncoding exon: cdsPos:" + cdsPos + " s:" + format(start + 1) + " tmp:" + format(tmp) + " cdsStart:" + (cdsStart + 1) + " codingLength:" + codingLength + " cdsEnd:" + format(cdsEnd));
return new ChromPos((cdsStart + tmp),-1);
}
codingLength += (end - cdsStart);
logger.debug(" <- Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3));
logger.debug(" UTR : " + format(start+1) + " - " + format(cdsStart ));
} else if (inCoding) {
if (codingLength + end - start -1 >= cdsPos) {
int tmp = cdsPos - codingLength ;
if ( tmp > (end - start ) ) {
tmp = (end - start );
logger.debug("changing tmp to {}", tmp);
}
logger.debug(" " + cdsPos + " " + codingLength + " | " + (cdsPos - codingLength) + " | " + (end -start) + " | " + tmp);
logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3));
logger.debug(" -> RRR found position coding exon: " + cdsPos + " " + format(start+1) + " " + format(end) + " " + tmp + " " + format(cdsStart+1) + " " + codingLength);
return new ChromPos((end - tmp),cdsPos %3);
}
// full exon is coding
codingLength += (end - start) ;
logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start+1) + " | " + codingLength + " | " + (codingLength % 3));
} else {
// e.g. see UBQLN3
logger.debug(" no translation!");
}
logger.debug(" coding length: " + codingLength + "(phase:" + (codingLength % 3) + ") CDS POS trying to map:" + cdsPos);
}
logger.debug("length exons: {}", lengthExons);
// could not map, or map over the full length??
return new ChromPos(-1,-1);
}
/**
* Get the CDS position mapped onto the chromosome position
*
* @param exonStarts
* @param exonEnds
* @param cdsStart
* @param cdsEnd
* @return
*/
public static ChromPos getChromPosForward(int cdsPos, List exonStarts, List exonEnds, int cdsStart, int cdsEnd) {
boolean inCoding = false;
int codingLength = 0;
@SuppressWarnings("unused")
int lengthExons = 0;
// map forward
for (int i = 0; i < exonStarts.size(); i++) {
// start can include UTR
int start = exonStarts.get(i);
int end = exonEnds.get(i);
lengthExons += end - start;
if (start <= cdsStart +1 && end >= cdsStart+1) {
// first exon with UTR
if (codingLength + (end - cdsStart-1) >= cdsPos) {
// we are reaching our target position
int tmp = cdsPos - codingLength;
logger.debug(cdsStart + " | " + codingLength + " | " + tmp);
logger.debug(" -> found position in UTR exon: #"+(i+1)+ " cdsPos:" + cdsPos +
" return:"+(cdsStart +1 + tmp) +" start:" + format(start + 1) + " " + format(tmp) + " " + cdsStart + " " + codingLength);
// we start 1 after cdsStart...
return new ChromPos((cdsStart +1 + tmp),-1);
}
inCoding = true;
codingLength += (end - cdsStart);
logger.debug(" UTR : " + format(start+1) + " - " + (cdsStart ));
logger.debug(" -> Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
} else if (start+1 <= cdsEnd && end >= cdsEnd) {
// LAST EXON with UTR
//logger.debug(" <-- CDS end at: " + cdsEnd );
inCoding = false;
if (codingLength + (cdsEnd - start-1) >= cdsPos) {
int tmp = cdsPos - codingLength;
logger.debug(" <- Exon : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3));
logger.debug(" UTR : " + format(cdsEnd + 1) + " - " + format(end));
logger.debug( codingLength + " | " + tmp + " | " + format(start+1));
logger.debug(" -> chromPosForward found position in non coding exon: " + cdsPos + " " + format(start+1) + " " + format(tmp) + " " + format(cdsStart) + " " + codingLength);
return new ChromPos((start +1 + tmp),cdsPos%3);
}
codingLength += (cdsEnd - start-1);
logger.debug(" <- Exon : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3));
logger.debug(" UTR : " + format(cdsEnd + 1) + " - " + format(end));
} else if (inCoding) {
// A standard coding Exon
// tests for the maximum length of this coding exon
if (codingLength + (end - start -1) >= cdsPos) {
// we are within the range of this exon
int tmp = cdsPos - codingLength ;
logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + tmp + " | " + codingLength);
logger.debug(" -> found chr position in coding exon #" + (i+1) + ": cdsPos:" + format(cdsPos) + " s:" + format(start) + "-" + format(end) + " tmp:" + format(tmp) + " cdsStart:" + format(cdsStart) + " codingLength:" + codingLength);
return new ChromPos((start +1 + tmp),cdsPos%3);
}
// full exon is coding
codingLength += (end - start );
logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3));
}
}
return new ChromPos(-1,-1);
}
/**
* Get the length of the coding sequence
*
* @param exonStarts
* @param exonEnds
* @param cdsStart
* @param cdsEnd
* @return
*/
public static int getCDSLengthReverse(List exonStarts, List exonEnds, int cdsStart, int cdsEnd) {
int codingLength = 0;
if (cdsEnd < cdsStart) {
int tmp = cdsEnd;
cdsEnd = cdsStart;
cdsStart = tmp;
}
cdsStart = cdsStart + base;
// map reverse
for (int i = exonStarts.size() - 1; i >= 0; i--) {
int end = exonStarts.get(i);
int start = exonEnds.get(i);
if (end < start) {
int tmp = end;
end = start;
start = tmp;
}
start = start + base;
if ((start < cdsStart && end < cdsStart) || (start > cdsEnd && end > cdsEnd))
continue;
if (start < cdsStart)
start = cdsStart;
if (end > cdsEnd)
end = cdsEnd;
codingLength += (end - start + 1);
}
return codingLength - 3;
}
/**
* Get the length of the coding sequence
*
* @param exonStarts
* @param exonEnds
* @param cdsStart
* @param cdsEnd
* @return
*/
public static int getCDSLengthForward(List exonStarts, List exonEnds, int cdsStart, int cdsEnd) {
int codingLength = 0;
for (int i = 0; i < exonStarts.size(); i++) {
int start = exonStarts.get(i)+base;
int end = exonEnds.get(i);
if ( (start < cdsStart+base && end < cdsStart) || (start > cdsEnd && end > cdsEnd) )
continue;
if (start < cdsStart+base)
start = cdsStart+base;
if (end > cdsEnd)
end = cdsEnd;
codingLength += (end - start + 1);
}
return codingLength-3 ;
}
/**
* Extracts the exon boundaries in CDS coordinates. (needs to be divided by 3 to get AA positions)
*
* @param chromPos
* @return
*/
public static List> getCDSExonRanges(GeneChromosomePosition chromPos){
if ( chromPos.getOrientation() == '+')
return getCDSExonRangesForward(chromPos,CDS);
return getCDSExonRangesReverse(chromPos,CDS);
}
/** Extracts the boundaries of the coding regions in chromosomal coordinates
*
* @param chromPos
* @return
*/
public static List> getChromosomalRangesForCDS(GeneChromosomePosition chromPos){
if ( chromPos.getOrientation() == '+')
return getCDSExonRangesForward(chromPos,CHROMOSOME);
return getCDSExonRangesReverse(chromPos,CHROMOSOME);
}
private static List> getCDSExonRangesReverse(GeneChromosomePosition chromPos, String responseType) {
List exonStarts = chromPos.getExonStarts();
List exonEnds = chromPos.getExonEnds();
List> data = new ArrayList<>();
int cdsStart = chromPos.getCdsStart();
int cdsEnd = chromPos.getCdsEnd();
boolean inCoding = false;
int codingLength = 0;
if (cdsEnd < cdsStart) {
int tmp = cdsEnd;
cdsEnd = cdsStart;
cdsStart = tmp;
}
java.lang.StringBuffer s =null;
boolean debug = logger.isDebugEnabled();
if ( debug)
s = new StringBuffer();
//int lengthExons = 0;
// map reverse
for (int i = exonStarts.size() - 1; i >= 0; i--) {
int end = exonStarts.get(i);
int start = exonEnds.get(i);
if (end < start) {
int tmp = end;
end = start;
start = tmp;
}
//lengthExons += end - start;
//s.append("Reverse exon: " + end + " - " + start + " | " + (end - start));
//s.append(newline);
if (start <= cdsEnd && end >= cdsEnd) {
inCoding = true;
int tmpstart = start;
if (start < cdsStart) {
tmpstart = cdsStart;
}
codingLength += (cdsEnd - tmpstart);
if ( debug ) {
s.append(" UTR :").append(format(cdsEnd + 1)).append(" | ").append(format(end));
s.append(newline);
if (tmpstart == start)
s.append(" -> ");
else
s.append(" <-> ");
s.append("Exon :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(cdsEnd - tmpstart).append(" | ").append(codingLength).append(" | ").append(codingLength % 3);
s.append(newline);
// single exon with UTR on both ends
if (tmpstart != start)
s.append(" UTR :").append(format(cdsStart)).append(" - ").append(format(start + 1));
s.append(newline);
}
Range r ;
if ( responseType.equals(CDS))
r = Range.closed(0,codingLength);
else
r = Range.closed(tmpstart,cdsEnd);
data.add(r);
} else if (start <= cdsStart && end >= cdsStart) {
inCoding = false;
Range r;
if ( responseType.equals(CDS))
r = Range.closed(codingLength,codingLength+(end-cdsStart));
else
r = Range.closed(cdsStart+1,end);
data.add(r);
codingLength += (end - cdsStart);
if (debug) {
s.append(" <- Exon : " + format(cdsStart + 1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
s.append(newline);
s.append(" UTR : ").append(format(start + 1)).append(" - ").append(format(cdsStart));
s.append(newline);
}
} else if (inCoding) {
// full exon is coding
Range r;
if ( responseType.equals(CDS))
r = Range.closed(codingLength,codingLength+(end-start));
else
r = Range.closed(start,end);
data.add(r);
codingLength += (end - start);
if (debug) {
s.append(" Exon : " + format(start + 1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
s.append(newline);
}
} else {
// e.g. see UBQLN3
if ( debug ) {
s.append(" no translation! UTR: " + format(start) + " - " + format(end));
s.append(newline);
}
}
}
if ( debug ) {
s.append("CDS length: ").append(Integer.toString(codingLength - 3));
s.append(newline);
logger.debug(s.toString());
}
return data;
}
private static List> getCDSExonRangesForward(GeneChromosomePosition chromPos, String responseType) {
List> data = new ArrayList<>();
List exonStarts = chromPos.getExonStarts();
List exonEnds = chromPos.getExonEnds();
int cdsStart = chromPos.getCdsStart();
int cdsEnd = chromPos.getCdsEnd();
boolean inCoding = false;
int codingLength = 0;
for (int i = 0; i < exonStarts.size(); i++) {
int start = exonStarts.get(i);
int end = exonEnds.get(i);
if (start <= cdsStart && end >= cdsStart) {
inCoding = true;
codingLength += (end - cdsStart);
Range r;
if ( responseType.equals(CDS))
r = Range.closed(0,codingLength);
else
r = Range.closed(cdsStart,end);
data.add(r);
} else if (start <= cdsEnd && end >= cdsEnd) {
//logger.debug(" <-- CDS end at: " + cdsEnd );
inCoding = false;
Range r;
if ( responseType.equals(CDS))
r = Range.closed(codingLength,codingLength+(cdsEnd-start));
else
r = Range.closed(start,cdsEnd);
data.add(r);
codingLength += (cdsEnd - start);
} else if (inCoding) {
// full exon is coding
Range r;
if ( responseType.equals(CDS))
r = Range.closed(codingLength,codingLength+(end-start));
else
r = Range.closed(start,end);
data.add(r);
codingLength += (end - start);
}
}
return data;
}
/**
* I have a genomic coordinate, where is it on the mRNA
*
* @param coordinate
* @param chromosomePosition
* @return
*/
public static int getCDSPosForChromosomeCoordinate(int coordinate, GeneChromosomePosition chromosomePosition) {
if ( chromosomePosition.getOrientation() == '+')
return getCDSPosForward(coordinate,
chromosomePosition.getExonStarts(),
chromosomePosition.getExonEnds(),
chromosomePosition.getCdsStart(),
chromosomePosition.getCdsEnd());
return getCDSPosReverse(coordinate,
chromosomePosition.getExonStarts(),
chromosomePosition.getExonEnds(),
chromosomePosition.getCdsStart(),
chromosomePosition.getCdsEnd());
}
/**
* Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene
* living on the forward DNA strand.
*
* @param chromPos The genetic coordinate on a chromosome
* @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
* @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
* @param cdsStart The start position of a coding region
* @param cdsEnd The end position of a coding region
*
* @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1)
*
* @author Yana Valasatava
*/
public static int getCDSPosForward(int chromPos, List exonStarts, List exonEnds,
int cdsStart, int cdsEnd) {
// the genetic coordinate is not in a coding region
if ( (chromPos < (cdsStart+base) ) || ( chromPos > (cdsEnd+base) ) ) {
logger.debug("The {} position is not in a coding region", format(chromPos));
return -1;
}
logger.debug("looking for CDS position for {}", format(chromPos));
// map the genetic coordinates of coding region on a stretch of a reverse strand
List> cdsRegions = getCDSRegions(exonStarts, exonEnds, cdsStart, cdsEnd);
int codingLength = 0;
int lengthExon = 0;
for (Range range : cdsRegions) {
int start = range.lowerEndpoint();
int end = range.upperEndpoint();
lengthExon = end - start;
if (start+base <= chromPos && end >= chromPos ) {
return codingLength + (chromPos-start);
}
else {
codingLength += lengthExon;
}
}
return -1;
}
/**
* Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene
* living on the reverse DNA strand.
*
* @param chromPos The genetic coordinate on a chromosome
* @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
* @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
* @param cdsStart The start position of a coding region
* @param cdsEnd The end position of a coding region
*
* @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1)
*
* @author Yana Valasatava
*/
public static int getCDSPosReverse(int chromPos, List exonStarts, List exonEnds,
int cdsStart, int cdsEnd) {
// the genetic coordinate is not in a coding region
if ( (chromPos < (cdsStart+base)) || ( chromPos > (cdsEnd+base) ) ) {
logger.debug("The {} position is not in a coding region", format(chromPos));
return -1;
}
logger.debug("looking for CDS position for {}", format(chromPos));
// map the genetic coordinate on a stretch of a reverse strand
List> cdsRegions = getCDSRegions(exonStarts, exonEnds, cdsStart, cdsEnd);
int codingLength = 0;
int lengthExon = 0;
for ( int i=cdsRegions.size()-1; i>=0; i-- ) {
int start = cdsRegions.get(i).lowerEndpoint();
int end = cdsRegions.get(i).upperEndpoint();
lengthExon = end - start;
// +1 offset to be a base 1
if (start+base <= chromPos && end >= chromPos ) {
return codingLength + (end-chromPos+1);
}
else {
codingLength += lengthExon;
}
}
return -1;
}
/**
* Extracts the exons boundaries in CDS coordinates corresponding to the forward DNA strand.
*
* @param origExonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
* @param origExonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
* @param cdsStart The start position of a coding region
* @param cdsEnd The end position of a coding region
*
* @return the list of genetic positions corresponding to the exons boundaries in CDS coordinates
*/
public static List> getCDSRegions(List origExonStarts, List origExonEnds, int cdsStart, int cdsEnd) {
// remove exons that are fully landed in UTRs
List exonStarts = new ArrayList<>(origExonStarts);
List exonEnds = new ArrayList<>(origExonEnds);
int j=0;
for (int i = 0; i < origExonStarts.size(); i++) {
if ( ( origExonEnds.get(i) < cdsStart) || ( origExonStarts.get(i) > cdsEnd) ) {
exonStarts.remove(j);
exonEnds.remove(j);
}
else {
j++;
}
}
// remove untranslated regions from exons
int nExons = exonStarts.size();
exonStarts.remove(0);
exonStarts.add(0, cdsStart);
exonEnds.remove(nExons-1);
exonEnds.add(cdsEnd);
List> cdsRegion = new ArrayList<>();
for ( int i=0; i r = Range.closed(exonStarts.get(i), exonEnds.get(i));
cdsRegion.add(r);
}
return cdsRegion;
}
/**
* Extracts the DNA sequence transcribed from the input genetic coordinates.
*
* @param twoBitFacade the facade that provide an access to a 2bit file
* @param gcp The container with chromosomal positions
*
* @return the DNA sequence transcribed from the input genetic coordinates
*/
public static DNASequence getTranscriptDNASequence(TwoBitFacade twoBitFacade, GeneChromosomePosition gcp) throws Exception {
return getTranscriptDNASequence(twoBitFacade,gcp.getChromosome(),gcp.getExonStarts(), gcp.getExonEnds(), gcp.getCdsStart(), gcp.getCdsEnd(), gcp.getOrientation());
}
/**
* Extracts the DNA sequence transcribed from the input genetic coordinates.
*
* @param chromosome the name of the chromosome
* @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
* @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
* @param cdsStart The start position of a coding region
* @param cdsEnd The end position of a coding region
* @param orientation The orientation of the strand where the gene is living
*
* @return the DNA sequence transcribed from the input genetic coordinates
*/
public static DNASequence getTranscriptDNASequence(TwoBitFacade twoBitFacade, String chromosome, List exonStarts, List exonEnds, int cdsStart, int cdsEnd, Character orientation) throws Exception {
List> cdsRegion = getCDSRegions(exonStarts, exonEnds, cdsStart, cdsEnd);
StringBuilder dnaSequence = new StringBuilder();
for (Range range : cdsRegion) {
String exonSequence = twoBitFacade.getSequence(chromosome,range.lowerEndpoint(), range.upperEndpoint());
dnaSequence.append(exonSequence);
}
if (orientation.equals('-')) {
dnaSequence = new StringBuilder(new StringBuilder(dnaSequence.toString()).reverse().toString());
DNASequence dna = new DNASequence(dnaSequence.toString());
SequenceView compliment = dna.getComplement();
dnaSequence = new StringBuilder(compliment.getSequenceAsString());
}
return new DNASequence(dnaSequence.toString().toUpperCase());
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy