All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.interval.Transcript Maven / Gradle / Ivy

The newest version!
package org.snpeff.interval;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.snpeff.interval.codonChange.CodonChange;
import org.snpeff.serializer.MarkerSerializer;
import org.snpeff.snpEffect.Config;
import org.snpeff.snpEffect.EffectType;
import org.snpeff.snpEffect.VariantEffect.ErrorWarningType;
import org.snpeff.snpEffect.VariantEffects;
import org.snpeff.stats.ObservedOverExpectedCpG;
import org.snpeff.util.Gpr;
import org.snpeff.util.GprSeq;

/**
 * Codon position
 * @author pcingola
 */
class CodonPosition {

	public int codonNum = -1;
	public int codonIndex = -1;
}

/**
 * Interval for a transcript, as well as some other information: exons, utrs, cds, etc.
 *
 * @author pcingola
 */
public class Transcript extends IntervalAndSubIntervals {

	private static final long serialVersionUID = -2665025617916107311L;

	boolean aaCheck; // Has this transcript been checked against a protein sequence?
	boolean canonical; // Is this a canonical transcript?
	boolean corrected; // Have coordinates been corrected? (e.g. frame correction)
	boolean dnaCheck; // Has this transcript been checked against a CDS/cDNA sequence?
	boolean proteinCoding; // Is this a protein-coding transcript?
	boolean ribosomalSlippage; // Ribosomal slippage causes changes in reading frames. This might be represented as negative length introns (overlapping exons).
	int cdsStart, cdsEnd; // CDS start and end coordinates. Note: If the transcript is in reverse strand, then cdsStart > cdsEnd
	int spliceSiteSize, spliceRegionExonSize, spliceRegionIntronMin, spliceRegionIntronMax; // Splice sizes
	int upDownLength; // Upstream and downstream size
	BioType bioType; // Transcript biotype
	String cds; // Coding sequence
	String mRna; // mRna sequence (includes 5'UTR and 3'UTR)
	String protein; // Protein sequence
	String version = ""; // Transcript version
	List utrs; // UTRs
	List cdss; // CDS information
	List introns; // Intron markers
	Upstream upstream; // Upstream interval
	Downstream downstream; // Downstream interval
	Exon firstCodingExon; // First coding exon. I.e. where transcription start site (TSS) is.
	int cds2pos[], aa2pos[];
	TranscriptSupportLevel transcriptSupportLevel = null;

	public Transcript() {
		super();
		utrs = new ArrayList<>();
		cdss = new ArrayList<>();
		type = EffectType.TRANSCRIPT;
	}

	public Transcript(Gene gene, int start, int end, boolean strandMinus, String id) {
		super(gene, start, end, strandMinus, id);
		type = EffectType.TRANSCRIPT;
	}

	/**
	 * Calculate chromosome position as function of Amino Acid number
	 *
	 * @returns An array mapping 'pos[aaNumber] = chromosmalPos'
	 */
	public synchronized int[] aaNumber2Pos() {
		if (aa2pos != null) return aa2pos;

		calcCdsStartEnd();

		aa2pos = new int[protein().length()];
		for (int i = 0; i < aa2pos.length; i++)
			aa2pos[i] = -1;

		int cdsMin = Math.min(cdsStart, cdsEnd);
		int cdsMax = Math.max(cdsStart, cdsEnd);

		// For each exon, add CDS position to array
		int aaNum = 0;
		int step = isStrandPlus() ? 1 : -1;
		int codonFrame = 0;
		for (Exon exon : sortedStrand()) {
			int min = isStrandPlus() ? exon.getStart() : exon.getEnd();

			int aaIdxStart = -1, aaIdxEnd = -1;

			for (int pos = min; exon.intersects(pos) && aaNum < aa2pos.length; pos += step) {
				// Is this within a CDS?
				if ((cdsMin <= pos) && (pos <= cdsMax)) {
					// Update AA indexes for this exon
					if (aaIdxStart < 0) aaIdxStart = aaNum;
					aaIdxEnd = aaNum;

					// First codon base? Add to map
					if (codonFrame == 0) aa2pos[aaNum] = pos;

					// Last base in this codon? Increment AA number
					if (codonFrame == 2) aaNum++;

					// Update codon base
					codonFrame = (codonFrame + 1) % 3;
				}
			}

			// Update exons' AA indexes
			if (aaIdxStart >= 0) exon.setAaIdx(aaIdxStart, aaIdxEnd);
		}

		return aa2pos;
	}

	/**
	 * Find a genomic position of the first base in a Amino Acid 'aaNum'
	 */
	public int aaNumber2Pos(int aaNum) {
		aaNumber2Pos();

		return -1;
	}

	/**
	 * Add a CDS
	 */
	public void add(Cds cdsInt) {
		cdss.add(cdsInt);
		cds = null;
	}

	/**
	 * Add an intron
	 */
	public void add(Intron intron) {
		if (introns == null) introns = new ArrayList<>();
		introns.add(intron);

		// Introns should be sorted by strand
		if (isStrandPlus()) Collections.sort(introns);
		else Collections.sort(introns, Collections.reverseOrder());
	}

	/**
	 * Add a SpliceSite
	 */
	public void add(SpliceSite spliceSite) {
		for (Exon ex : this)
			if (ex.intersects(spliceSite)) ex.add(spliceSite);

		for (Intron intr : introns())
			if (intr.intersects(spliceSite)) intr.add(spliceSite);
	}

	/**
	 * Add a UTR
	 */
	public void add(Utr utr) {
		utrs.add(utr);
		cds = null;
	}

	/**
	 * Add missing UTRs. See utrFromCds() method.
	 */
	boolean addMissingUtrs(Markers missingUtrs, boolean verbose) {
		missingUtrs.sort(false, isStrandMinus());

		// Get min/max CDS positions
		int minCds = Integer.MAX_VALUE;
		int maxCds = 0;
		for (Cds c : cdss) {
			minCds = Math.min(minCds, c.getStart());
			maxCds = Math.max(maxCds, c.getEnd());
		}

		if (verbose) {
			System.out.println("Transcript '" + id + "' has missing UTRs." //
					+ " Strand: " + (strandMinus ? "-" : "+") //
					+ " (minCds: " + minCds //
					+ " , maxCds: " + maxCds + "):" //
			);
		}

		// Add intervals
		boolean retVal = false;
		for (Marker mu : missingUtrs) {
			Exon exon = queryExon(mu);
			if (exon == null) throw new RuntimeException("Cannot find exon for UTR: " + mu);
			Utr toAdd = null;

			if (isStrandPlus()) {
				if (mu.getEnd() <= minCds) toAdd = new Utr5prime(exon, mu.getStart(), mu.getEnd(), strandMinus, mu.getId());
				else if (mu.getStart() >= maxCds) toAdd = new Utr3prime(exon, mu.getStart(), mu.getEnd(), strandMinus, mu.getId());
			} else {
				if (mu.getStart() >= maxCds) toAdd = new Utr5prime(exon, mu.getStart(), mu.getEnd(), strandMinus, mu.getId());
				else if (mu.getEnd() <= minCds) toAdd = new Utr3prime(exon, mu.getStart(), mu.getEnd(), strandMinus, mu.getId());
			}

			// OK?
			if (toAdd != null) {
				add(toAdd);
				if (verbose) System.out.println("\tAdding " + toAdd);
				retVal = true;
			}
		}

		return retVal;
	}

	/**
	 * Adjust transcript coordinates
	 */
	public boolean adjust() {
		boolean changed = false;
		int strandSumTr = 0;
		int newStart = Integer.MAX_VALUE;
		int newEnd = Integer.MIN_VALUE;

		int countStrandPlus = 0, countStrandMinus = 0;
		for (Exon exon : sortedStrand()) {
			newStart = Math.min(newStart, exon.getStart());
			newEnd = Math.max(newEnd, exon.getEnd());

			// Common exon strand
			if (exon.isStrandPlus()) countStrandPlus++;
			else countStrandMinus++;
		}

		// UTRs
		for (Utr utr : getUtrs()) {
			newStart = Math.min(newStart, utr.getStart());
			newEnd = Math.max(newEnd, utr.getEnd());
		}

		// Sanity check
		strandSumTr = countStrandPlus - countStrandMinus; // Some exons have incorrect strands, we use the strand indicated by most exons
		boolean newStrandMinus = strandSumTr < 0;
		if ((countStrandPlus > 0) && (countStrandMinus > 0)) Gpr.debug("Transcript '" + id + "' has " + countStrandPlus + " exons on the plus and " + countStrandMinus + " exons on the minus strand! This should never happen!");

		// Change transcript strand?
		if (strandMinus != newStrandMinus) {
			changed = true;
			setStrandMinus(newStrandMinus); // Change strand
		}

		// Changed? Update values
		if (newStart < Integer.MAX_VALUE && newEnd > Integer.MIN_VALUE) {
			// Change start?
			if (start != newStart) {
				setStart(newStart);
				changed = true;
			}

			// Change end?
			if (end != newEnd) {
				setEnd(newEnd);
				changed = true;
			}
		}

		return changed;
	}

	/**
	 * Create a new transcript after applying changes in variant
	 *
	 * Note: If this transcript is unaffected, no new transcript is created (same transcript is returned)
	 */
	@Override
	public Transcript apply(Variant variant) {
		// Variant after this marker: No effect
		if (!shouldApply(variant)) return this;

		//---
		// Create new transcript
		//---
		Transcript newTr = (Transcript) super.apply(variant);
		if (newTr == null) return null;

		//---
		// Add changed UTRs
		//---
		for (Utr utr : utrs) {
			Utr newUtr = (Utr) utr.apply(variant);
			if (newUtr != null) {
				Exon newExon = newTr.findExon(newUtr);

				if (newExon != null) {
					newUtr.setParent(newExon);
					newTr.utrs.add(newUtr);
				} else {
					// This might happen when a duplication affecting part of an exon
					// E.g. If the duplication affects the coding part and NOT the 3'UTR then the UTR doesn't have a
					if (Config.get().isDebug()) Gpr.debug("WARNING: applying variant: Could not find 'new' parent exon for 'new' UTR" //
							+ "\n\t\tVariant           : " + variant //
							+ "\n" //
							+ "\n\t\tUTR        (ori) :" + utr //
							+ "\n" //
							+ "\n\t\tTranscript (ori) :" + this //
							+ "\n" //
							+ "\n\t\tUTR        (new) :" + newUtr //
							+ "\n" //
							+ "\n\t\tTranscript (new) :" + newTr //
					);
				}
			}
		}

		// Rank exons: if a variant deletes one or more exons, ranks will change
		newTr.rankExons();

		// Up & Down stream
		newTr.createUpDownStream(upDownLength);

		// Introns
		newTr.introns();

		// Splice sites
		newTr.createSpliceSites(spliceSiteSize, spliceRegionExonSize, spliceRegionIntronMin, spliceRegionIntronMax);

		return newTr;
	}

	/**
	 * Find base at genomic coordinate 'pos'
	 */
	public String baseAt(int pos) {
		calcCdsStartEnd();
		Exon ex = findExon(pos);
		if (ex == null) return null;
		return ex.basesAt(pos - ex.getStart(), 1);
	}

	/**
	 * Calculate distance from transcript start to a position
	 * mRNA is roughly the same than cDNA. Strictly speaking mRNA
	 * has a poly-A tail and 5'cap.
	 */
	public synchronized int baseNumber2MRnaPos(int pos) {
		int count = 0;
		for (Exon eint : sortedStrand()) {
			if (eint.intersects(pos)) {
				// Intersect this exon? Calculate the number of bases from the beginning
				int dist = 0;
				if (isStrandPlus()) dist = pos - eint.getStart();
				else dist = eint.getEnd() - pos;

				// Sanity check
				if (dist < 0) throw new RuntimeException("Negative distance for position " + pos + ". This should never happen!\n" + this);

				return count + dist;
			}

			count += eint.size();
		}
		return -1;
	}

	/**
	 * Calculate base number in a CDS where 'pos' maps
	 *
	 * @param usePrevBaseIntron: When 'pos' is intronic this method returns:
	 * 			- if( usePrevBaseIntron== false)  => The first base in the exon after 'pos' (i.e. first coding base after intron)
	 * 			- if( usePrevBaseIntron== true)   => The last base in the  exon before 'pos'  (i.e. last coding base before intron)
	 *
	 * @returns Base number or '-1' if it does not map to a coding base
	 */
	public synchronized int baseNumberCds(int pos, boolean usePrevBaseIntron) {
		// Doesn't hit this transcript?
		if (!intersects(pos)) return -1;

		// Is it in UTR instead of CDS?
		if (isUtr(pos)) return -1;

		// Calculate cdsStart and cdsEnd (if not already done)
		calcCdsStartEnd();

		// All exons..
		int firstCdsBaseInExon = 0; // Where the exon maps to the CDS (i.e. which CDS base number does the first base in this exon maps to).
		for (Exon eint : sortedStrand()) {
			if (eint.intersects(pos)) {
				int cdsBaseInExon; // cdsBaseInExon: base number relative to the beginning of the coding part of this exon (i.e. excluding 5'UTRs)
				if (isStrandPlus()) cdsBaseInExon = pos - Math.max(eint.getStart(), cdsStart);
				else cdsBaseInExon = Math.min(eint.getEnd(), cdsStart) - pos;

				cdsBaseInExon = Math.max(0, cdsBaseInExon);

				return firstCdsBaseInExon + cdsBaseInExon;
			} else {
				// Before exon begins?
				if ((isStrandPlus() && (pos < eint.getStart())) // Before exon begins (positive strand)?
						|| (isStrandMinus() && (pos > eint.getEnd()))) // Before exon begins (negative strand)?
					return firstCdsBaseInExon - (usePrevBaseIntron ? 1 : 0);
			}

			if (isStrandPlus()) firstCdsBaseInExon += Math.max(0, eint.getEnd() - Math.max(eint.getStart(), cdsStart) + 1);
			else firstCdsBaseInExon += Math.max(0, Math.min(cdsStart, eint.getEnd()) - eint.getStart() + 1);
		}

		return firstCdsBaseInExon - 1;
	}

	/**
	 * Return a codon that includes 'cdsBaseNumber'
	 */
	public String baseNumberCds2Codon(int cdsBaseNumber) {
		int codonNum = cdsBaseNumber / CodonChange.CODON_SIZE;
		int min = codonNum * CodonChange.CODON_SIZE;
		int max = codonNum * CodonChange.CODON_SIZE + CodonChange.CODON_SIZE;
		if ((min >= 0) && (max <= cds().length())) return cds().substring(min, max).toUpperCase();
		return null;
	}

	/**
	 * Calculate chromosome position as function of CDS number
	 *
	 * @returns An array mapping 'cds2pos[cdsBaseNumber] = chromosmalPos'
	 */
	public synchronized int[] baseNumberCds2Pos() {
		if (cds2pos != null) return cds2pos;

		calcCdsStartEnd();

		cds2pos = new int[cds().length()];
		for (int i = 0; i < cds2pos.length; i++)
			cds2pos[i] = -1;

		int cdsMin = Math.min(cdsStart, cdsEnd);
		int cdsMax = Math.max(cdsStart, cdsEnd);

		// For each exon, add CDS position to array
		int cdsBaseNum = 0;
		for (Exon exon : sortedStrand()) {
			int min = isStrandPlus() ? exon.getStart() : exon.getEnd();
			int step = isStrandPlus() ? 1 : -1;
			for (int pos = min; exon.intersects(pos) && cdsBaseNum < cds2pos.length; pos += step)
				if ((cdsMin <= pos) && (pos <= cdsMax)) cds2pos[cdsBaseNum++] = pos;
		}

		return cds2pos;
	}

	/**
	 * Calculate CDS start and CDS end
	 */
	synchronized void calcCdsStartEnd() {
		// Do we need to calculate these values?
		// Note: In circular genomes, one of cdsStart / cdsEnd might be less
		//       than zero (we must check both)
		if ((cdsStart < 0) && (cdsEnd < 0)) {
			// Calculate coding start (after 5 prime UTR)

			if (utrs.isEmpty()) {
				// No UTRs => Use all exons
				cdsStart = (isStrandPlus() ? end : start); // cdsStart is the position of the first base in the CDS (i.e. the first base after all 5'UTR)
				cdsEnd = (isStrandPlus() ? start : end); // cdsEnd is the position of the last base in the CDS (i.e. the first base before all 3'UTR)

				for (Exon ex : this) {
					if (isStrandPlus()) {
						cdsStart = Math.min(cdsStart, ex.getStart());
						cdsEnd = Math.max(cdsEnd, ex.getEnd());
					} else {
						cdsStart = Math.max(cdsStart, ex.getEnd());
						cdsEnd = Math.min(cdsEnd, ex.getStart());
					}
				}
			} else {
				// We have to take into account UTRs
				cdsStart = (isStrandPlus() ? start : end); // cdsStart is the position of the first base in the CDS (i.e. the first base after all 5'UTR)
				cdsEnd = (isStrandPlus() ? end : start); // cdsEnd is the position of the last base in the CDS (i.e. the first base before all 3'UTR)
				int cdsStartNotExon = cdsStart;

				for (Utr utr : utrs) {
					if (utr instanceof Utr5prime) {
						if (isStrandPlus()) cdsStart = Math.max(cdsStart, utr.getEnd() + 1);
						else cdsStart = Math.min(cdsStart, utr.getStart() - 1);
					} else if (utr instanceof Utr3prime) {
						if (isStrandPlus()) cdsEnd = Math.min(cdsEnd, utr.getStart() - 1);
						else cdsEnd = Math.max(cdsEnd, utr.getEnd() + 1);
					}
				}

				// Make sure cdsStart and cdsEnd lie within an exon
				if (isStrandPlus()) {
					cdsStart = firstExonPositionAfter(cdsStart);
					cdsEnd = lastExonPositionBefore(cdsEnd);
				} else {
					cdsStart = lastExonPositionBefore(cdsStart);
					cdsEnd = firstExonPositionAfter(cdsEnd);
				}

				// We were not able to find cdsStart & cdsEnd within exon limits.
				// Probably there is something wrong with the database and the transcript does
				// not have a single coding base (e.g. all of it is UTR).
				if (cdsStart < 0 || cdsEnd < 0) cdsStart = cdsEnd = cdsStartNotExon;
			}
		}
	}

	/**
	 * Retrieve coding sequence
	 */
	public synchronized String cds() {
		if (cds != null) return cds;

		// Concatenate all exons
		List exons = sortedStrand();
		StringBuilder sequence = new StringBuilder();
		int utr5len = 0, utr3len = 0;

		// 5 prime UTR length
		for (Utr utr : get5primeUtrs())
			utr5len += utr.size();

		// Append all exon sequences
		boolean missingSequence = false;
		for (Exon exon : exons) {
			missingSequence |= !exon.hasSequence(); // If there is no sequence, we are in trouble
			sequence.append(exon.getSequence());
		}

		if (missingSequence) cds = ""; // One or more exons does not have sequence. Nothing to do
		else {
			// OK, all exons have sequences

			// 3 prime UTR length
			for (Utr utr : get3primeUtrs())
				utr3len += utr.size();

			// Cut 5 prime UTR and 3 prime UTR points
			int subEnd = sequence.length() - utr3len;

			if (utr5len > subEnd) cds = "";
			else cds = sequence.substring(utr5len, subEnd);
		}

		return cds;
	}

	/**
	 * Create a marker of the coding region in this transcript
	 */
	public Marker cdsMarker() {
		return isStrandPlus() //
				? new Marker(this, getCdsStart(), getCdsEnd()) //
				: new Marker(this, getCdsEnd(), getCdsStart()) //
		;
	}

	@Override
	public Transcript cloneShallow() {
		Transcript clone = (Transcript) super.cloneShallow();

		clone.proteinCoding = proteinCoding;
		clone.canonical = canonical;
		clone.bioType = bioType;
		clone.aaCheck = aaCheck;
		clone.dnaCheck = dnaCheck;
		clone.corrected = corrected;
		clone.ribosomalSlippage = ribosomalSlippage;

		return clone;
	}

	/**
	 * Return an array of 3 genomic positions where amino acid number 'aaNum' maps
	 *
	 * @return aa2pos[0], aa2pos[1], aa2pos[2] are the coordinates (within the chromosome)
	 *         of the three bases conforming codon 'aaNum'. Any aa2pos[i] = -1 means that
	 *         it could a base in the codon could not be mapped.
	 */
	public int[] codonNumber2Pos(int codonNum) {
		if (cds2pos == null) baseNumberCds2Pos();

		// Initialize
		int codon[] = new int[3];
		int step = isStrandPlus() ? 1 : -1;
		int idxStart = isStrandPlus() ? 0 : 2;
		for (int i = idxStart, j = 3 * codonNum; (i < codon.length) && (i >= 0) && j < cds2pos.length; i += step, j++) {
			codon[i] = cds2pos[j];
		}

		return codon;
	}

	/**
	 * Collapses exons having gaps of zero (i.e. exons that followed by other exons).
	 * Does the same for CDSs and UTRs.
	 * @return true of any exon in the transcript was 'collapsed'
	 */
	public boolean collapseZeroGap() {
		if (ribosomalSlippage) return false; // Overlapping exons are representing ribosomal slippage, so they are not annotations errors and must not be corrected.

		boolean ret = false;
		introns = null; // These need to be recalculated

		//---
		// Collapse Exons
		//---
		Markers markers = new Markers();
		markers.addAll(subIntervals());
		Map collapse = MarkerUtil.collapseZeroGap(markers); // Create a map of collapsed exons

		// Replace exons
		for (Marker exon : collapse.keySet()) {
			Exon collapsedExon = (Exon) collapse.get(exon);

			// Is this exon to be replaced? (i.e. collapseZeroGap returns different coordinates)
			if (exon.size() != collapsedExon.size() //
					|| exon.getStart() != collapsedExon.getStart() //
					|| exon.getEnd() != collapsedExon.getEnd() //
			) {
				ret = true;

				// Show debugging information
				if (Config.get().isDebug()) System.err.println("\t\t\tTranscript " + getId() + ": Collapsing exon " + exon.getId() + "\t[ " + exon.getStart() + " - " + exon.getEnd() + " ]\t=>\t[ " + collapsedExon.getStart() + " - " + collapsedExon.getEnd() + " ]");

				// Replace exon
				remove((Exon) exon);
				if (!containsId(collapsedExon.getId())) add(collapsedExon); // Add collapsedExon. Make sure we don't add it twice (since many exons can be collapsed into one).

				// Change parent exon in UTRs
				for (Marker m : getUtrs()) {
					Utr utr = (Utr) m;
					if (utr.getParent() == exon) utr.setParent(collapsedExon);
				}
			}
		}

		//---
		// Collapse CDS
		//---
		collapse = MarkerUtil.collapseZeroGap(new Markers(cdss));
		cdss = new ArrayList<>(); // Re-create CDSs list
		Markers uniqCollapsedCds = new Markers(collapse.values()).unique(); // Create a set of unique CDSs and add them to CDSs list
		for (Marker cds : uniqCollapsedCds)
			cdss.add((Cds) cds);

		//---
		// Collapse UTRs
		//---
		collapse = MarkerUtil.collapseZeroGap(new Markers(utrs));
		Markers uniqCollapsedUtrs = new Markers(collapse.values()).unique(); // Create a set of unique UTRs, and add them to the list
		utrs = new ArrayList<>(); // Re-generate UTRs list
		for (Marker utr : uniqCollapsedUtrs)
			utrs.add((Utr) utr);

		return ret;
	}

	/**
	 * Calculate CpG bias: number of CpG / expected[CpG]
	 */
	public double cpgExonBias() {
		ObservedOverExpectedCpG oe = new ObservedOverExpectedCpG();
		return oe.oe(this);
	}

	/**
	 * Count total CpG in this transcript's exons
	 */
	public int cpgExons() {
		ObservedOverExpectedCpG oe = new ObservedOverExpectedCpG();
		return oe.observed(this);
	}

	/**
	 * Find all splice sites.
	 */
	public void createSpliceSites(int spliceSiteSize, int spliceRegionExonSize, int spliceRegionIntronMin, int spliceRegionIntronMax) {
		this.spliceSiteSize = spliceSiteSize;
		this.spliceRegionExonSize = spliceRegionExonSize;
		this.spliceRegionIntronMin = spliceRegionIntronMin;
		this.spliceRegionIntronMax = spliceRegionIntronMax;

		// Create spliceSiteRegion on the Exon side
		ArrayList exons = (ArrayList) sortedStrand();
		if (exons.size() > 0) {
			for (int i = 0; i < exons.size(); i++) { // Iterate like this to check rank
				Exon exon = exons.get(i);

				if (i > 0) exon.createSpliceSiteRegionStart(spliceRegionExonSize); // Splice site region at the start
				if (i < (exon.size() - 1)) exon.createSpliceSiteRegionEnd(spliceRegionExonSize); // Splice site region at the end

				// Sanity check
				int rank = i + 1;
				if (exon.getRank() != rank) {
					String msg = "Rank numbers do not march: " + rank + " != " + exon.getRank() + "\n\tTranscript: " + this;
					throw new RuntimeException(msg);
				}
			}
		}

		// Create spliceSite (donor/acceptor) and spliceSiteRegion on the Intron side
		List introns = introns();
		if (introns != null) {
			for (int i = 0; i < introns.size(); i++) {
				Intron intron = introns.get(i);
				intron.createSpliceSiteAcceptor(spliceSiteSize); // Acceptor splice site
				intron.createSpliceSiteDonor(spliceSiteSize); // Acceptor splice site

				// Splice region
				intron.createSpliceSiteRegionStart(spliceRegionIntronMin, spliceRegionIntronMax);
				intron.createSpliceSiteRegionEnd(spliceRegionIntronMin, spliceRegionIntronMax);
			}
		}
	}

	/**
	 * Creates a list of UP/DOWN stream regions (for each transcript)
	 * Upstream (downstream) stream is defined as upDownLength before (after) transcript
	 */
	public void createUpDownStream(int upDownLength) {
		this.upDownLength = upDownLength;

		Chromosome chr = getChromosome();
		int chrMin = chr.getStart(), chrMax = chr.getEnd();

		// Create up/down stream intervals and add them to the list
		int beforeStart = Math.max(start - upDownLength, chrMin);
		int beforeEnd = Math.max(start - 1, chrMin);
		int afterStart = Math.min(end + 1, chrMax);
		int afterEnd = Math.min(end + upDownLength, chrMax);

		if (isStrandPlus()) {
			if (beforeStart < beforeEnd) upstream = new Upstream(this, beforeStart, beforeEnd, false, id);
			if (afterStart < afterEnd) downstream = new Downstream(this, afterStart, afterEnd, false, id);
		} else {
			if (afterStart < afterEnd) upstream = new Upstream(this, afterStart, afterEnd, false, id);
			if (beforeStart < beforeEnd) downstream = new Downstream(this, beforeStart, beforeEnd, false, id);
		}
	}

	/**
	 * Deletes redundant exons (i.e. exons that are totally included in other exons).
	 * Does the same for CDSs.
	   Does the same for UTRs.
	 */
	public boolean deleteRedundant() {
		boolean ret = false;
		introns = null; // These need to be recalculated

		//---
		// Delete redundant exons
		//---
		Map includedIn = MarkerUtil.redundant(subIntervals());
		for (Marker exon : includedIn.keySet()) {
			ret = true;
			remove((Exon) exon);

			// Change parent exon in UTRs
			for (Marker m : getUtrs()) {
				Utr utr = (Utr) m;
				if (utr.getParent() == exon) utr.setParent(includedIn.get(exon));
			}
		}

		//---
		// Delete redundant CDS
		//---
		includedIn = MarkerUtil.redundant(cdss);
		for (Marker cds : includedIn.keySet())
			cdss.remove(cds);

		//---
		// Delete redundant UTRs
		//---
		includedIn = MarkerUtil.redundant(utrs);
		for (Marker utr : includedIn.keySet())
			utrs.remove(utr);

		return ret;
	}

	/**
	 * Find a CDS that matches exactly the exon
	 */
	public Cds findCds(Exon exon) {
		for (Cds cds : cdss)
			if (exon.includes(cds)) return cds;
		return null;
	}

	/**
	 * Return the an exon that intersects 'pos'
	 */
	public Exon findExon(int pos) {
		for (Exon exon : this)
			if (exon.intersects(pos)) return exon;
		return null;
	}

	/**
	 * Return an exon intersecting 'marker' (first exon found)
	 */
	public Exon findExon(Marker marker) {
		for (Exon exon : this)
			if (exon.intersects(marker)) return exon;
		return null;
	}

	/**
	 * Return an intron overlapping position 'pos'
	 */
	public Intron findIntron(int pos) {
		// Is 'pos' in intron?
		for (Intron intron : introns())
			if (intron.intersects(pos)) return intron;
		return null;
	}

	/**
	 * Return the UTR that hits position 'pos'
	 * @return An UTR intersecting 'pos' (null if not found)
	 */
	public Utr findUtr(int pos) {
		// Is it in UTR?
		for (Utr utr : utrs)
			if (utr.intersects(pos)) return utr;
		return null;
	}

	/**
	 * Return the UTR that intersects 'marker' (null if not found)
	 */
	public List findUtrs(Marker marker) {
		List utrs = new LinkedList<>();

		// Is it in UTR instead of CDS?
		for (Utr utr : utrs)
			if (utr.intersects(marker)) utrs.add(utr);

		return utrs.isEmpty() ? null : utrs;
	}

	/**
	 * Find the first position after 'pos' within an exon
	 */
	int firstExonPositionAfter(int pos) {
		for (Exon ex : sorted()) {
			if (pos <= ex.getStart()) return ex.getStart();
			if (pos <= ex.getEnd()) return pos;
		}

		System.err.println("WARNING: Cannot find first exonic position after " + pos + " for transcript '" + id + "'");
		return -1;
	}

	/**
	 * Correct exons based on frame information.
	 *
	 * E.g. if the frame information (form a genomic
	 * database file, such as a GTF) does not
	 * match the calculated frame, we correct exon's
	 * boundaries to make them match.
	 *
	 * This is performed in two stages:
	 *    i) First exon is corrected by adding a fake 5'UTR
	 *    ii) Other exons are corrected by changing the start (or end) coordinates.
	 */
	public synchronized boolean frameCorrection() {
		// Copy frame information form CDSs to Exons (if missing)
		frameFromCds();

		// First exon is corrected by adding a fake 5'UTR
		boolean changedFirst = frameCorrectionFirstCodingExon();

		// Other exons are corrected by changing the start (or end) coordinates.
		// boolean changedNonFirst = false;
		// Gpr.debug("UNCOMMENT!");
		boolean changedNonFirst = frameCorrectionNonFirstCodingExon();

		boolean changed = changedFirst || changedNonFirst;

		// We have to reset cached CDS data if anything changed
		if (changed) {
			resetCache();
			corrected = true;
		}

		// Return true if there was any adjustment
		return changed;
	}

	/**
	 * Fix transcripts having non-zero frames in first exon
	 *
	 * Transcripts whose first exon has a non-zero frame indicate problems.
	 * We add a 'fake' UTR5 to compensate for reading frame.
	 */
	synchronized boolean frameCorrectionFirstCodingExon() {
		List exons = sortedStrand();

		// No exons? Nothing to do
		if ((exons == null) || exons.isEmpty()) return false;

		Exon exonFirst = getFirstCodingExon(); // Get first exon
		// Exon exonFirst =  exons.get(0); // Get first exon
		if (exonFirst.getFrame() <= 0) return false; // Frame OK (or missing), nothing to do

		// First exon is not zero? => Create a UTR5 prime to compensate
		Utr5prime utr5 = null;
		int frame = exonFirst.getFrame();

		if (isStrandPlus()) {
			int end = exonFirst.getStart() + (frame - 1);
			utr5 = new Utr5prime(exonFirst, exonFirst.getStart(), end, isStrandMinus(), exonFirst.getId());
		} else {
			int start = exonFirst.getEnd() - (frame - 1);
			utr5 = new Utr5prime(exonFirst, start, exonFirst.getEnd(), isStrandMinus(), exonFirst.getId());
		}

		// Reset frame, since it was already corrected
		exonFirst.setFrame(0);
		Cds cds = findCds(exonFirst);
		if (cds != null) cds.frameCorrection(cds.getFrame());

		// Add UTR5'
		add(utr5);

		// Update counter
		return true;
	}

	/**
	 * Correct exons according to frame information
	 */
	synchronized boolean frameCorrectionNonFirstCodingExon() {
		boolean corrected = false;

		// Concatenate all exons to create a CDS
		List exons = sortedStrand();
		StringBuilder sequence = new StringBuilder();

		// We don't need to correct if there is no sequence! (the problem only exists due to sequence frame)
		for (Exon exon : exons)
			if (!exon.hasSequence()) return false;

		// 5'UTR length
		int utr5Start = Integer.MAX_VALUE, utr5End = -1;
		for (Utr utr : get5primeUtrs()) {
			utr.size();
			utr5Start = Math.min(utr5Start, utr.getStart());
			utr5End = Math.max(utr5End, utr.getEnd());
		}

		// Create UTR
		Marker utr5 = utr5End >= 0 ? new Marker(this, utr5Start, utr5End, strandMinus, "") : null;

		// Append all exon sequences
		for (Exon exon : exons) {
			String seq = "";
			int utrOverlap = 0;
			boolean utrIncluded = false;

			// Check if exon overlaps UTR
			if (utr5 != null && utr5.includes(exon)) {
				// The whole exon is included => No sequence change
				seq = "";
				utrIncluded = true;
			} else {
				// Add sequence
				seq = exon.getSequence();
				if (utr5 != null && utr5.intersects(exon)) {
					utrOverlap = utr5.intersectSize(exon);
					if (utrOverlap > 0) {
						if (utrOverlap < seq.length()) seq = seq.substring(utrOverlap);
						else seq = "";
					}
				}
			}

			//---
			// Frame check
			//---
			if (exon.getFrame() < 0) {
				// Nothing to do (assume current frame is right)
			} else if (utrIncluded) {
				// Exon does not code, frame information is meaningless
				if (exon.getFrame() >= 0) exon.setFrame(-1);
			} else {
				// Calculate frame
				// We use GFF style frame calculation
				// References: http://mblab.wustl.edu/GTF22.html
				int frameReal = FrameType.GFF.frameFromLength(sequence.length());

				// Does calculated frame match?
				if (frameReal != exon.getFrame()) {
					if (utrOverlap > 0) {
						throw new RuntimeException("Fatal Error: First exon needs correction: This should never happen!"//
								+ "\n\tThis method is supposed to be called AFTER method"//
								+ "\n\tSnpEffPredictorFactory.frameCorrectionFirstCodingExon(), which"//
								+ "\n\tshould have taken care of this problem." //
								+ "\n\t" + this //
						);
					} else {
						if (Config.get().isDebug()) {
							System.err.println("\t\tFrame correction: " //
									+ "Position " + toStr() //
									+ "Transcript '" + getId() + "'" //
									+ "\tExon rank " + exon.getRank() //
									+ "\tExpected frame: " + frameReal //
									+ "\tExon frame: " + exon.getFrame() //
									+ "\tSequence len: " + sequence.length() //
							);
						}
						// Find matching CDS
						Cds cdsToCorrect = findCds(exon);

						// Correct exon until we get the expected frame
						for (boolean ok = true; ok && frameReal != exon.getFrame();) {
							// Correct both Exon and CDS
							ok &= exon.frameCorrection(1);
							if (cdsToCorrect != null) cdsToCorrect.frameCorrection(1);
							corrected = true;
						}

						// Get new exon's sequence
						seq = exon.getSequence();
					}
				}
			}

			// Append sequence
			sequence.append(seq);
		}

		return corrected;
	}

	/**
	 * Copy frame info from CDSs into Exons
	 */
	void frameFromCds() {
		for (Exon ex : this) {
			// No frame info? => try to find matching CDS
			if (ex.getFrame() < 0) {
				// Check a CDS that matches an exon
				for (Cds cds : getCds()) {
					// CDS matches the exon coordinates? => Copy frame info
					if (isStrandPlus() && (ex.getStart() == cds.getStart())) {
						ex.setFrame(cds.getFrame());
						break;
					} else if (isStrandMinus() && (ex.getEnd() == cds.getEnd())) {
						ex.setFrame(cds.getFrame());
						break;
					}
				}
			}
		}

	}

	/**
	 * Create a list of 3 prime UTRs
	 */
	public List get3primeUtrs() {
		ArrayList list = new ArrayList<>();
		for (Utr utr : utrs)
			if (utr instanceof Utr3prime) list.add((Utr3prime) utr);
		return list;
	}

	public List get3primeUtrsSorted() {
		List list = get3primeUtrs();
		Collections.sort(list);
		return list;
	}

	/**
	 * Create a list of 5 prime UTRs
	 */
	public List get5primeUtrs() {
		ArrayList list = new ArrayList<>();
		for (Utr utr : utrs)
			if (utr instanceof Utr5prime) list.add((Utr5prime) utr);
		return list;
	}

	public List get5primeUtrsSorted() {
		List list = get5primeUtrs();
		Collections.sort(list);
		return list;
	}

	public BioType getBioType() {
		return bioType;
	}

	/**
	 * Get all CDSs
	 */
	public List getCds() {
		return cdss;
	}

	public int getCdsEnd() {
		calcCdsStartEnd();
		return cdsEnd;
	}

	public int getCdsStart() {
		calcCdsStartEnd();
		return cdsStart;
	}

	public Downstream getDownstream() {
		return downstream;
	}

	/**
	 * Get first coding exon
	 */
	public synchronized Exon getFirstCodingExon() {
		if (firstCodingExon == null) {
			// Get transcription start position
			long cstart = getCdsStart();

			// Pick exon intersecting cdsStart (TSS)
			for (Exon exon : sortedStrand())
				if (exon.intersects(cstart)) firstCodingExon = exon;

			// Sanity check
			if (firstCodingExon == null) throw new RuntimeException("Error: Cannot find first coding exon for transcript:\n" + this);
		}
		return firstCodingExon;
	}

	public TranscriptSupportLevel getTranscriptSupportLevel() {
		return transcriptSupportLevel;
	}

	/**
	 * Create a TSS marker
	 */
	public Marker getTss() {
		calcCdsStartEnd();
		Marker tss = new Marker(this, start + (isStrandPlus() ? 0 : -1), start + (isStrandPlus() ? 1 : 0), false, "TSS_" + id);
		return tss;
	}

	public Upstream getUpstream() {
		return upstream;
	}

	/**
	 * Get all UTRs
	 */
	public List getUtrs() {
		return utrs;
	}

	public String getVersion() {
		return version;
	}

	/**
	 * Does this transcript have any errors?
	 */
	public boolean hasError() {
		return isErrorProteinLength() || isErrorStartCodon() || isErrorStopCodonsInCds();
	}

	/**
	 * Does this transcript have any errors?
	 */
	public boolean hasErrorOrWarning() {
		return hasError() || hasWarning();
	}

	public boolean hasTranscriptSupportLevelInfo() {
		return (transcriptSupportLevel != null) && (transcriptSupportLevel != TranscriptSupportLevel.TSL_NA);
	}

	/**
	 * Does this transcript have any errors?
	 */
	public boolean hasWarning() {
		return isWarningStopCodon() // All possible warnings
		;
	}

	/**
	 * Get all introns (lazy init)
	 */
	public synchronized List introns() {
		if (introns == null) {
			introns = new ArrayList<>();

			Exon exBefore = null;
			for (Exon ex : sortedStrand()) {
				if (exBefore != null) {
					// Create intron
					Intron intron;
					int rank = introns.size() + 1;

					// Find intron start and end
					int start, end;
					if (isStrandPlus()) {
						start = exBefore.getEnd() + 1;
						end = ex.getStart() - 1;
					} else {
						start = ex.getEnd() + 1;
						end = exBefore.getStart() - 1;
					}

					int size = end - start + 1;
					if (size > 0) {
						// Add intron to list
						intron = new Intron(this, start, end, strandMinus, id + "_intron_" + rank, exBefore, ex);

						intron.setRank(rank);
						introns.add(intron);
					}
				}

				exBefore = ex;
			}
		}
		return introns;
	}

	public boolean isAaCheck() {
		return aaCheck;
	}

	@Override
	protected boolean isAdjustIfParentDoesNotInclude(Marker parent) {
		return true;
	}

	public boolean isCanonical() {
		return canonical;
	}

	/**
	 * Is this variant in the CDS part of this transcript?
	 */
	boolean isCds(Variant variant) {
		calcCdsStartEnd();

		int cs = cdsStart;
		int ce = cdsEnd;

		if (isStrandMinus()) {
			cs = cdsEnd;
			ce = cdsStart;
		}

		return (variant.getEnd() >= cs) && (variant.getStart() <= ce);
	}

	/**
	 * Has this transcript been checked against CDS/DNA/AA sequences?
	 */
	public boolean isChecked() {
		return aaCheck || dnaCheck;
	}

	public boolean isCorrected() {
		return corrected;
	}

	public boolean isDnaCheck() {
		return dnaCheck;
	}

	public boolean isDownstream(int pos) {
		return downstream != null && downstream.intersects(pos);
	}

	/**
	 * Check if coding length is multiple of 3 in protein coding transcripts
	 * @return true on Error
	 */
	public boolean isErrorProteinLength() {
		if (!Config.get().isTreatAllAsProteinCoding() && !isProteinCoding()) return false;
		return (cds().length() % 3) != 0;
	}

	/**
	 * Is the first codon a START codon?
	 */
	public boolean isErrorStartCodon() {
		if (!Config.get().isTreatAllAsProteinCoding() && !isProteinCoding()) return false;

		// Not even one codon in this protein? Error
		String cds = cds();
		if (cds.length() < 3) return true;

		String codon = cds.substring(0, 3);
		return !codonTable().isStart(codon);
	}

	/**
	 * Check if protein sequence has STOP codons in the middle of the coding sequence
	 * @return true on Error
	 */
	public boolean isErrorStopCodonsInCds() {
		if (!Config.get().isTreatAllAsProteinCoding() && !isProteinCoding()) return false;

		// Get protein sequence
		String prot = protein();
		if (prot == null) return false;

		// Any STOP codon before the end?
		char bases[] = prot.toCharArray();
		int max = bases.length - 1;
		int countErrs = 0;
		for (int i = 0; i < max; i++)
			if (bases[i] == '*') {
				countErrs++;
				// We allow up to one STOP codon because it can be a RARE_AMINO_ACID which is coded as a STOP codon.
				// More than one STOP codon is not "normal", so it's probably an error in the genomic annotations (e.g. ENSEMBL or UCSC)
				if (countErrs > 1) return true;
			}

		// OK
		return false;
	}

	public boolean isIntron(int pos) {
		return findIntron(pos) != null;
	}

	public boolean isProteinCoding() {
		return proteinCoding;
	}

	public boolean isRibosomalSlippage() {
		return ribosomalSlippage;
	}

	public boolean isUpstream(int pos) {
		return upstream != null && upstream.intersects(pos);
	}

	public boolean isUtr(int pos) {
		return findUtr(pos) != null;
	}

	public boolean isUtr(Marker marker) {
		return findUtrs(marker) != null;
	}

	public boolean isUtr3(int pos) {
		Utr utr = findUtr(pos);
		return utr != null && utr instanceof Utr3prime;
	}

	public boolean isUtr5(int pos) {
		Utr utr = findUtr(pos);
		return utr != null && utr instanceof Utr5prime;
	}

	/**
	 * Is the last codon a STOP codon?
	 */
	public boolean isWarningStopCodon() {
		if (!Config.get().isTreatAllAsProteinCoding() && !isProteinCoding()) return false;

		// Not even one codon in this protein? Error
		String cds = cds();
		if (cds.length() < 3) return true;

		String codon = cds.substring(cds.length() - 3);
		return !codonTable().isStop(codon);
	}

	/**
	 * Find the last position before 'pos' within an exon
	 */
	int lastExonPositionBefore(int pos) {
		int last = -1;
		for (Exon ex : sorted()) {
			if (pos < ex.getStart()) {
				// Nothing found?
				if (last < 0) {
					System.err.println("WARNING: Cannot find last exonic position before " + pos + " for transcript '" + id + "'");
					return -1;
				}
				return last;
			} else if (pos <= ex.getEnd()) return pos;
			last = ex.getEnd();
		}

		if (last < 0) System.err.println("WARNING: Cannot find last exonic position before " + pos + " for transcript '" + id + "'");
		return pos;
	}

	/**
	 * A list of all markers in this transcript
	 */
	@Override
	public Markers markers() {
		Markers markers = new Markers();
		markers.addAll(subIntervals());
		markers.addAll(utrs);
		markers.addAll(cdss);
		markers.add(upstream);
		markers.add(downstream);
		markers.addAll(introns());
		return markers;
	}

	/**
	 * Retrieve coding sequence AND the UTRs (mRNA = 5'UTR + CDS + 3'UTR)
	 * I.e. Concatenate all exon sequences
	 */
	public synchronized String mRna() {
		if (mRna != null) return mRna;

		List exons = sortedStrand();

		// Concatenate all exons
		StringBuilder sequence = new StringBuilder();
		for (Exon ex : exons)
			sequence.append(ex.getSequence());

		mRna = sequence.toString();
		return mRna;
	}

	/**
	 * Protein sequence (amino acid sequence produced by this transcripts)
	 */
	public String protein() {
		if (protein == null) {
			if (!(Config.get() != null && Config.get().isTreatAllAsProteinCoding()) && !isProteinCoding()) protein = "";
			else protein = codonTable().aa(cds(), true);
		}
		return protein;
	}

	/**
	 * Query all genomic regions that intersect 'marker'
	 */
	@Override
	public Markers query(Marker marker) {
		Set results = new HashSet<>();

		// Add exons
		for (Exon ex : this)
			if (ex.intersects(marker)) {
				results.add(ex);

				// Query deeper
				for (Marker ee : ex.query(marker))
					results.add(ee);
			}

		// Ad UTRs
		for (Utr u : utrs)
			if (u.intersects(marker)) results.add(u);

		// Add CDSs
		for (Cds m : cdss)
			if (m.intersects(marker)) results.add(m);

		// Add introns
		for (Intron intr : introns())
			if (intr.intersects(marker)) {
				results.add(intr);

				// Query deeper
				for (Marker ee : intr.query(marker))
					results.add(ee);
			}

		// Get results into markers
		Markers markers = new Markers();
		markers.addAll(results);

		return markers;
	}

	/**
	 * Return the first exon that intersects 'interval' (null if not found)
	 */
	public Exon queryExon(Marker interval) {
		for (Exon ei : this)
			if (ei.intersects(interval)) return ei;
		return null;
	}

	/**
	 * Assign ranks to exons
	 */
	public boolean rankExons() {
		boolean changed = false;
		int rank = 1;
		for (Exon exon : sortedStrand()) {
			if (rank != exon.getRank()) {
				exon.setRank(rank);
				changed = true;
			}
			rank++;
		}
		return changed;
	}

	@Override
	public void reset() {
		super.reset();

		utrs = new ArrayList<>();
		cdss = new ArrayList<>();
		introns = null;
		upstream = null;
		downstream = null;
		resetCache();
	}

	public void resetCache() {
		cdsStart = -1;
		cdsEnd = -1;
		firstCodingExon = null;
		cds = null;
		cds2pos = null;
		aa2pos = null;
		mRna = null;
		protein = null;
	}

	public void resetExons() {
		super.reset();
		resetCache();
	}

	/**
	 * Perfom some baseic chekcs, return error type, if any
	 */
	public ErrorWarningType sanityCheck(Variant variant) {
		if (isErrorStopCodonsInCds()) return ErrorWarningType.WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS;
		if (isErrorProteinLength()) return ErrorWarningType.WARNING_TRANSCRIPT_INCOMPLETE;
		if (isErrorStartCodon()) return ErrorWarningType.WARNING_TRANSCRIPT_NO_START_CODON;
		if (isWarningStopCodon()) return ErrorWarningType.WARNING_TRANSCRIPT_NO_STOP_CODON;
		return null;
	}

	/**
	 * Parse a line from a serialized file
	 */
	@Override
	public void serializeParse(MarkerSerializer markerSerializer) {
		super.serializeParse(markerSerializer);
		bioType = BioType.parse(markerSerializer.getNextField());
		proteinCoding = markerSerializer.getNextFieldBoolean();
		dnaCheck = markerSerializer.getNextFieldBoolean();
		aaCheck = markerSerializer.getNextFieldBoolean();
		corrected = markerSerializer.getNextFieldBoolean();
		ribosomalSlippage = markerSerializer.getNextFieldBoolean();
		transcriptSupportLevel = TranscriptSupportLevel.parse(markerSerializer.getNextField());
		version = markerSerializer.getNextField();

		upstream = (Upstream) markerSerializer.getNextFieldMarker();
		downstream = (Downstream) markerSerializer.getNextFieldMarker();

		for (Marker m : markerSerializer.getNextFieldMarkers())
			utrs.add((Utr) m);

		for (Marker m : markerSerializer.getNextFieldMarkers())
			cdss.add((Cds) m);
	}

	/**
	 * Create a string to serialize to a file
	 */
	@SuppressWarnings({ "unchecked", "rawtypes" })
	@Override
	public String serializeSave(MarkerSerializer markerSerializer) {
		return super.serializeSave(markerSerializer) //
				+ "\t" + bioType //
				+ "\t" + proteinCoding //
				+ "\t" + dnaCheck //
				+ "\t" + aaCheck //
				+ "\t" + corrected //
				+ "\t" + ribosomalSlippage //
				+ "\t" + (transcriptSupportLevel == null ? "" : transcriptSupportLevel.toString()) //
				+ "\t" + version //
				+ "\t" + markerSerializer.save(upstream) //
				+ "\t" + markerSerializer.save(downstream) //
				+ "\t" + markerSerializer.save((Iterable) utrs)//
				+ "\t" + markerSerializer.save((Iterable) cdss)//
		;
	}

	public void setAaCheck(boolean aaCheck) {
		this.aaCheck = aaCheck;
	}

	public void setBioType(BioType bioType) {
		this.bioType = bioType;
	}

	public void setCanonical(boolean canonical) {
		this.canonical = canonical;
	}

	public void setDnaCheck(boolean dnaCheck) {
		this.dnaCheck = dnaCheck;
	}

	public void setProteinCoding(boolean proteinCoding) {
		this.proteinCoding = proteinCoding;
	}

	public void setRibosomalSlippage(boolean ribosomalSlippage) {
		this.ribosomalSlippage = ribosomalSlippage;
	}

	public void setTranscriptSupportLevel(TranscriptSupportLevel transcriptSupportLevel) {
		this.transcriptSupportLevel = transcriptSupportLevel;
	}

	public void setVersion(String version) {
		this.version = version;
	}

	public void sortCds() {
		Collections.sort(cdss);
		resetCache();
	}

	public List spliceSites() {
		List sslist = new ArrayList<>();

		for (Exon ex : this)
			sslist.addAll(ex.getSpliceSites());

		for (Intron intr : introns())
			sslist.addAll(intr.getSpliceSites());

		return sslist;
	}

	@Override
	public String toString() {
		return toString(false);
	}

	public String toString(boolean full) {
		StringBuilder sb = new StringBuilder();

		sb.append(getChromosomeName() + ":" + start + "-" + end);
		sb.append(", strand: " + (isStrandPlus() ? "+" : "-"));
		if ((id != null) && (id.length() > 0)) sb.append(", id:" + id);
		if ((bioType != null) && (bioType != null)) sb.append(", bioType:" + bioType);
		if (isProteinCoding()) sb.append(", Protein");
		if (isAaCheck()) sb.append(", AA check");
		if (isDnaCheck()) sb.append(", DNA check");

		if (numChilds() > 0) {
			sb.append("\n");
			for (Utr utr : get5primeUtrsSorted())
				sb.append("\t\t5'UTR   :\t" + utr + "\n");

			sb.append("\t\tExons:\n");
			for (Exon exon : sorted())
				sb.append("\t\t" + exon + "\n");

			for (Utr utr : get3primeUtrsSorted())
				sb.append("\t\t3'UTR   :\t" + utr + "\n");

			// We may show CDS
			if (isProteinCoding()) {
				sb.append("\t\tCDS     :\t" + cds() + "\n");
				sb.append("\t\tProtein :\t" + protein() + "\n");
			}
		}

		if (full) {
			// Show errors or warnings
			String warn = "";
			if (isErrorStopCodonsInCds()) warn += ErrorWarningType.WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS + " ";
			if (isErrorProteinLength()) warn += ErrorWarningType.WARNING_TRANSCRIPT_INCOMPLETE + " ";
			if (isErrorStartCodon()) warn += ErrorWarningType.WARNING_TRANSCRIPT_NO_START_CODON + " ";
			if (isWarningStopCodon()) warn += ErrorWarningType.WARNING_TRANSCRIPT_NO_STOP_CODON + " ";
			if (!warn.isEmpty()) sb.append("\tWarnings  :" + warn);

			// Sequence checks
			sb.append("\t\tCDS check : " + (isDnaCheck() ? "OK" : "Failed (or missing)") + "\n");
			sb.append("\t\tAA check  : " + (isAaCheck() ? "OK" : "Failed (or missing)") + "\n");
		}

		return sb.toString();
	}

	/**
	 * Show a transcript as an ASCII Art
	 */
	public String toStringAsciiArt(boolean full) {

		//---
		// ASCII art for transcript
		//---
		char art[] = new char[size()];
		for (int i = start, j = 0; i <= end; i++, j++) {
			Utr utr = findUtr(i);
			if (utr != null) art[j] = utr.isUtr5prime() ? '5' : '3';
			else {
				Exon exon = findExon(i);
				if (exon != null) {
					art[j] = exon.isStrandPlus() ? '>' : '<';
				} else art[j] = '-';
			}
		}

		// Only 'basic' ASCII art?
		if (!full) return new String(art);

		//---
		// DNA Sequence
		//---
		StringBuilder seq = new StringBuilder();
		for (int i = start; i <= end; i++) {
			Exon exon = findExon(i);
			if (exon != null) {
				String s = exon.getSequence().toLowerCase();
				if (exon.isStrandMinus()) s = GprSeq.reverseWc(s);

				s = GprSeq.padN(s, exon.size());
				seq.append(s);
				i += s.length() - 1;
			} else seq.append('.');
		}

		//---
		// AA Sequence and frame
		//---
		StringBuilder aa = new StringBuilder();
		StringBuilder frameSb = new StringBuilder();
		StringBuilder pos1sb = new StringBuilder();
		StringBuilder pos10sb = new StringBuilder();
		StringBuilder pos100sb = new StringBuilder();
		StringBuilder pos1000sb = new StringBuilder();

		int pos = start;
		if (isProteinCoding()) {
			char codon[] = new char[3];
			int step = isStrandPlus() ? 1 : -1;
			int frame = 0;
			for (int i = (isStrandPlus() ? 0 : art.length - 1), j = 0; (i >= 0) && (i < art.length); i += step) {
				if (art[i] == '3' || art[i] == '5') {
					// 5'UTR or 3'UTR
					aa.append(' ');
					frameSb.append(' ');
				} else {
					char b = seq.charAt(i);
					if (b == 'a' || b == 'c' || b == 'g' || b == 't') {
						// Coding sequence
						codon[j++] = b;
						if (j >= 3) {
							j = 0;
							String cod = new String(codon);
							if (isStrandMinus()) cod = GprSeq.wc(cod); // Bases are already reversed, we only need WC complement

							boolean translateStart = (aa.length() <= 0); // First codon? Translate start coddon accordingly
							aa.append(" " + codonTable().aa(cod, translateStart) + " ");
						}

						// Update frame
						frameSb.append(frame);
						frame = (frame + 1) % 3;

					} else {
						// Intron
						aa.append(' ');
						frameSb.append(' ');
					}
				}

				// Position
				pos1sb.append(pos % 10);

				if (pos % 10 == 0) pos10sb.append((pos % 100) / 10);
				else pos10sb.append(' ');

				if (pos % 100 == 0) pos100sb.append((pos % 1000) / 100);
				else pos100sb.append(' ');

				if (pos % 1000 == 0) pos1000sb.append((pos % 10000) / 1000);
				else pos1000sb.append(' ');

				pos++;
			}
		}

		String aaStr = isStrandPlus() ? aa.toString() : aa.reverse().toString();
		String frameStr = isStrandPlus() ? frameSb.toString() : frameSb.reverse().toString();

		//---
		// Coordinates
		//---

		// Create 'vertical lines'
		StringBuilder lines = new StringBuilder();
		if (isStrandPlus()) lines.append(' ');
		int prev = start;
		for (Exon ex : this.sorted()) {
			lines.append(Gpr.repeat(' ', ex.getStart() - prev - 1) + "|");
			prev = ex.getStart();

			lines.append(Gpr.repeat(' ', ex.getEnd() - prev - 1) + "|");
			prev = ex.getEnd();
		}

		StringBuilder coords = new StringBuilder();
		coords.append(lines + "\n");

		ArrayList exons = new ArrayList<>();
		exons.addAll(subIntervals());
		Collections.sort(exons, new IntervalComparatorByStart(true)); // Sort by reverse position

		int n, len;
		for (Exon ex : exons) {
			// Right-side coordinate
			n = ex.getEnd();
			len = n - start + 1;
			coords.append((len > 0 ? lines.subSequence(0, len) : "") + "^" + n + "\n");

			// Left-side coordinate
			n = ex.getStart();
			len = n - start + 1;
			coords.append((len > 0 ? lines.subSequence(0, len) : "") + "^" + n + "\n");
		}

		// Result
		return "" //
				+ (pos1000sb.toString().trim().isEmpty() ? "" : "\n" + pos1000sb) //
				+ (pos100sb.toString().trim().isEmpty() ? "" : "\n" + pos100sb) //
				+ (pos10sb.toString().trim().isEmpty() ? "" : "\n" + pos10sb) //
				+ "\n" + pos1sb //
				+ "\n" + seq //
				+ (isProteinCoding() ? "\n" + aaStr + "\n" + frameStr : "") //
				+ "\n" + new String(art) //
				+ "\n" + coords //
		;
	}

	/**
	 * Calculate UTR regions from CDSs
	 */
	public boolean utrFromCds(boolean verbose) {
		if (cdss.size() <= 0) return false; // Cannot do this if we don't have CDS information

		// All exons minus all UTRs and CDS should give us the missing UTRs
		Markers exons = new Markers();
		Markers minus = new Markers();

		// Add all exons
		for (Exon e : this)
			exons.add(e);

		// Add all UTRs and CDSs to the 'minus' set
		for (Utr uint : getUtrs())
			minus.add(uint);

		for (Cds cint : cdss)
			minus.add(cint);

		Markers missingUtrs = exons.minus(minus); // Perform interval minus
		if (!missingUtrs.isEmpty()) return addMissingUtrs(missingUtrs, verbose); // Anything left? => There was a missing UTR
		return false;
	}

	/**
	 * Get some details about the effect on this transcript
	 */
	@Override
	public boolean variantEffect(Variant variant, VariantEffects variantEffects) {
		if (!intersects(variant)) return false; // Sanity check

		// Large structural variant including the whole transcript?
		if (variant.includes(this) && variant.isStructural()) {
			CodonChange codonChange = CodonChange.factory(variant, this, variantEffects);
			codonChange.codonChange();
			return true;
		}

		//---
		// Structural variants may affect more than one exon
		//---
		boolean mayAffectSeveralExons = variant.isStructural() || variant.isMixed() || variant.isMnp();
		if (mayAffectSeveralExons) {
			int countExon = 0;
			for (Exon ex : this)
				if (ex.intersects(variant)) countExon++;

			// More than one exon?
			if (countExon > 1) {
				CodonChange codonChange = CodonChange.factory(variant, this, variantEffects);
				codonChange.codonChange();
				return true;
			}
		}

		//---
		// Does it hit an exon?
		// Note: This only adds spliceSites effects, for detailed codon
		//       changes effects we use 'CodonChange' class
		//---
		boolean exonAnnotated = false;
		for (Exon ex : this)
			if (ex.intersects(variant)) {
				exonAnnotated |= ex.variantEffect(variant, variantEffects);
			}

		//---
		// Hits a UTR region?
		//---
		boolean included = false;
		for (Utr utr : utrs)
			if (utr.intersects(variant)) {
				// Calculate the effect
				utr.variantEffect(variant, variantEffects);
				included |= utr.includes(variant); // Is this variant fully included in the UTR?
			}
		if (included) return true; // Variant fully included in the UTR? => We are done.

		//---
		// Does it hit an intron?
		//---
		for (Intron intron : introns())
			if (intron.intersects(variant)) {
				intron.variantEffect(variant, variantEffects);
				included |= intron.includes(variant); // Is this variant fully included in this intron?
			}
		if (included) return true; // Variant fully included? => We are done.

		//---
		// No annotations from exons? => Add transcript
		//---
		if (!exonAnnotated) {
			variantEffects.add(variant, this, EffectType.TRANSCRIPT, "");
			return true;
		}

		return exonAnnotated;
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy