All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.snpEffect.HgvsDna Maven / Gradle / Ivy

The newest version!
package org.snpeff.snpEffect;

import org.snpeff.interval.CytoBands;
import org.snpeff.interval.Exon;
import org.snpeff.interval.Intron;
import org.snpeff.interval.Marker;
import org.snpeff.interval.Markers;
import org.snpeff.interval.VariantBnd;
import org.snpeff.util.Gpr;
import org.snpeff.util.GprSeq;

/**
 * Coding DNA reference sequence
 *
 * References http://www.hgvs.org/mutnomen/recs.html
 *
 * Nucleotide numbering:
 * 	- there is no nucleotide 0
 * 	- nucleotide 1 is the A of the ATG-translation initiation codon
 * 	- the nucleotide 5' of the ATG-translation initiation codon is -1, the previous -2, etc.
 * 	- the nucleotide 3' of the translation stop codon is *1, the next *2, etc.
 * 	- intronic nucleotides (coding DNA reference sequence only)
 * 		- beginning of the intron; the number of the last nucleotide of the preceding exon, a plus sign and the position in the intron, like c.77+1G, c.77+2T, ....
 * 		- end of the intron; the number of the first nucleotide of the following exon, a minus sign and the position upstream in the intron, like ..., c.78-2A, c.78-1G.
 * 		- in the middle of the intron, numbering changes from "c.77+.." to "c.78-.."; for introns with an uneven number of nucleotides the central nucleotide is the last described with a "+" (see Discussion)
 *
 * Genomic reference sequence
 * 		- nucleotide numbering starts with 1 at the first nucleotide of the sequence
 * 		  NOTE: the sequence should include all nucleotides covering the sequence (gene) of interest and should start well 5' of the promoter of a gene
 * 		- no +, - or other signs are used
 * 		- when the complete genomic sequence is not known, a coding DNA reference sequence should be used
 * 		- for all descriptions the most 3' position possible is arbitrarily assigned to have been changed (see Exception)
 */

public class HgvsDna extends Hgvs {

	public static boolean debug = false;

	public HgvsDna(VariantEffect variantEffect) {
		super(variantEffect);
	}

	protected String alt() {
		return variant.getAlt();
	}

	/**
	 * DNA level base changes
	 */
	protected String dnaBaseChange() {

		switch (variant.getVariantType()) {
		case SNP:
			if (strandPlus) return ref() + ">" + alt();
			return GprSeq.reverseWc(ref()) + ">" + GprSeq.reverseWc(alt());

		case MNP:
			String ref, alt;
			if (strandPlus) {
				ref = ref();
				alt = alt();
			} else {
				ref = GprSeq.reverseWc(ref());
				alt = GprSeq.reverseWc(alt());
			}
			return "del" + ref + "ins" + alt;

		case DEL:
		case DUP:
		case INS:
			if (variant.size() > MAX_SEQUENCE_LEN_HGVS) return "";
			String netChange = variant.netChange(false);
			if (strandPlus) return netChange;
			return GprSeq.reverseWc(netChange);

		case MIXED:
			if (strandPlus) return "del" + ref() + "ins" + alt();
			return "del" + GprSeq.reverseWc(ref()) + "ins" + GprSeq.reverseWc(alt());

		case INV:
			// Inversions are designated by "inv" after an indication of the
			// first and last nucleotides affected by the inversion.
			// Reference: http://www.hgvs.org/mutnomen/recs-DNA.html#inv
			// => No base changes are used
			return "";

		case INTERVAL:
			return "";

		case BND:
			return "";

		default:
			throw new RuntimeException("Unimplemented method for variant type " + variant.getVariantType());
		}
	}

	/**
	 * Is this position downstream?
	 */
	boolean isDownstream(int pos) {
		if (tr.isStrandPlus()) return tr.getEnd() < pos;
		return pos < tr.getStart();
	}

	/**
	 * Is this a duplication?
	 */
	protected boolean isDuplication() {
		// Only insertion cause duplications
		// Reference: Here is a discussion of a possible new term ('los') as an analogous
		//            to 'dup' for deletions:
		//                http://www.hgvs.org/mutnomen/disc.html#loss
		//            So, it's still not decided if there is an analogous 'dup' term
		//            for deletions.
		if (!variant.isIns()) return false;

		// Extract sequence from genomic coordinates before variant
		String seq = null;

		// Get sequence at the 3'end of the variant
		int sstart, send;
		int len = alt().length();

		if (strandPlus) {
			sstart = Math.max(0, variant.getStart() - len);
			send = variant.getStart() - 1;
		} else {
			sstart = variant.getStart();
			send = sstart + (len - 1);
		}

		// Maybe we can just use exonic sequences (it's faster)
		// Create a marker and check that is comprised within exon boundaries
		Marker m = new Marker(variant.getParent(), sstart, send, false, "");

		Exon ex = variantEffect.getExon();
		if (ex != null && ex.includes(m)) {
			if (debug) Gpr.debug("Variant: " + variant + "\n\tmarker: " + m.toStr() + "\tsstart:" + sstart + "\tsend: " + send + "\n\texon: " + ex + "\n\tstrand: " + (strandPlus ? "+" : "-"));
			seq = ex.getSequence(m);
			if (debug) Gpr.debug("Sequence (Exon)  [ " + sstart + " , " + send + " ]: '" + seq + "'\talt: '" + alt() + "'\tsequence (+ strand): " + (ex.isStrandPlus() ? ex.getSequence() : GprSeq.reverseWc(ex.getSequence())));
		}

		// May be it is not completely in the exon. Use genomic sequences
		if (seq == null) {
			seq = genome.getGenomicSequences().querySequence(m);
			if (debug) Gpr.debug("Sequence (Genome) [ " + sstart + " , " + send + " ]: '" + seq + "'\talt: '" + alt() + "'\tsequence (+ strand): " + seq);
		}

		// Compare to ALT sequence
		if (seq == null) return false; // Cannot compare

		return seq.equalsIgnoreCase(alt());
	}

	/**
	 * Is this position upstream?
	 */
	boolean isUpstream(int pos) {
		if (tr.isStrandPlus()) return pos < tr.getStart();
		return tr.getEnd() < pos;
	}

	/**
	 * Genomic position for exonic variants
	 */
	protected String pos() {
		int posStart = -1, posEnd = -1;

		int variantPosStart = strandPlus ? variant.getStart() : variant.getEnd();

		switch (variant.getVariantType()) {
		case SNP:
			posStart = posEnd = variantPosStart;
			break;

		case MNP:
			posStart = variantPosStart;
			posEnd = posStart + (strandPlus ? 1 : -1) * (variant.size() - 1);
			break;

		case INS:
			posStart = variantPosStart;
			if (duplication) {
				// Duplication coordinates
				int lenAlt = alt().length();
				if (lenAlt == 1) {
					// One base duplications do not require end positions:
					// Reference: http://www.hgvs.org/mutnomen/disc.html#dupins
					// Example: c.7dupT (or c.7dup) denotes the duplication (insertion) of a T at position 7 in the sequence ACTTACTGCC to ACTTACTTGCC
					posStart += strandPlus ? -1 : 0; // The 'previous base' is duplicated, we have to decrement the position
					posEnd = posStart;
				} else {
					// Duplication coordinates
					if (strandPlus) {
						posEnd = posStart - 1;
						posStart -= lenAlt;
					} else {
						// Insert is 'before' variant position, so we must shift one base (compared to plus strand)
						posEnd = posStart;
						posStart += lenAlt - 1;
					}
				}
			} else {
				// Other insertions must list both positions:
				// Reference: http://www.hgvs.org/mutnomen/disc.html#ins
				//            ...to prevent confusion, both flanking residues have to be listed.
				// Example: c.6_7dup (or c.6_7dupTG) denotes a TG duplication (TG insertion) in the sequence ACATGTGCC to ACATGTGTGCC
				if (strandPlus) {
					// Insert before current posStart
					posEnd = posStart;
					posStart -= 1;
				} else {
					// Insert before current posStart (negative strand)
					posEnd = posStart - 1;
				}
			}
			break;

		case BND:
		case DEL:
		case DUP:
		case INV:
		case MIXED:
			if (strandPlus) {
				posStart = variant.getStart();
				posEnd = variant.getEnd();
			} else {
				posStart = variant.getEnd();
				posEnd = variant.getStart();
			}
			break;

		case INTERVAL:
			return "";

		default:
			throw new RuntimeException("Unimplemented method for variant type " + variant.getVariantType());
		}

		// Single base
		if (posStart == posEnd) return pos(posStart);

		// Base range
		String ps = pos(posStart);
		String pe = pos(posEnd);
		if (ps == null || pe == null) return null;
		return ps + "_" + pe;
	}

	/**
	 * HGVS position base on genomic coordinates (chr is assumed to be the same as in transcript/marker).
	 */
	protected String pos(int pos) {
		// Cannot do much if there is no transcript
		if (tr == null) return Integer.toString(pos + 1);

		// Are we in an exon?
		// Note: This may come from an intron-exon boundary variant (intron side, walked in a duplication).
		//       In that case, the exon marker won't be available from 'variantEffect.marker'.
		Exon ex = tr.findExon(pos);
		if (ex != null) return posExon(pos);

		Intron intron = tr.findIntron(pos);
		if (intron != null) return posIntron(pos, intron);

		if (isDownstream(pos)) return posDownstream(pos);
		if (isUpstream(pos)) return posUpstream(pos);

		if (debug) Gpr.debug("Unknown HGVS position " + pos + ", transcript " + tr);
		return null;
	}

	//	/**
	//	 * Position downstream of the transcript
	//	 */
	//	protected String posDownstream(int pos) {
	//		int baseNumCdsEnd = tr.getCdsEnd();
	//		int idx = Math.abs(pos - baseNumCdsEnd);
	//
	//		return "*" + idx; // We are after stop codon, coordinates must be '*1', '*2', etc.
	//	}

	/**
	 * Position downstream of the transcript
	 */
	protected String posDownstream(int pos) {
		int trEnd = tr.isStrandPlus() ? tr.getEnd() : tr.getStart();
		int baseNumTrEnd = tr.baseNumber2MRnaPos(trEnd);
		int baseNumCdsEnd = tr.baseNumber2MRnaPos(tr.getCdsEnd());
		int basesFromCdsEndToTrEnd = Math.abs(baseNumTrEnd - baseNumCdsEnd);

		int idx = basesFromCdsEndToTrEnd + Math.abs(pos - trEnd);
		return "*" + idx; // We are after stop codon, coordinates must be '*1', '*2', etc.
	}

	/**
	 * Convert genomic position to HGVS compatible (DNA) position
	 */
	protected String posExon(int pos) {
		if (tr.isUtr3(pos)) return posUtr3(pos);
		if (tr.isUtr5(pos)) return posUtr5(pos);

		int idx = tr.baseNumberCds(pos, false) + 1; // Coding Exon: just use CDS position

		// Could not find dna position in transcript?
		if (idx <= 0) return null;
		return "" + idx;
	}

	/**
	 * Intronic position
	 */
	protected String posIntron(int pos, Intron intron) {
		// Jump to closest exon position
		// Ref:
		//		Beginning of the intron; the number of the last nucleotide of the
		//      preceding exon, a plus sign and the position in the intron, like
		//      c.77+1G, c.77+2T, etc.
		// 		End of the intron; the number of the first nucleotide of the following
		//      exon, a minus sign and the position upstream in the intron, like c.78-1G.
		int posExon = -1;
		String posExonStr = "";
		int distanceLeft = Math.max(0, pos - intron.getStart()) + 1;
		int distanceRight = Math.max(0, intron.getEnd() - pos) + 1;
		if (distanceLeft < distanceRight) {
			posExon = intron.getStart() - 1;
			posExonStr = (intron.isStrandPlus() ? "+" : "-");
		} else if (distanceRight < distanceLeft) {
			posExon = intron.getEnd() + 1;
			posExonStr = (intron.isStrandPlus() ? "-" : "+");
		} else {
			// Reference: in the middle of the intron, numbering changes from "c.77+.." to "c.78-.."; for introns with an uneven number of nucleotides the central nucleotide is the last described with a "+"
			posExonStr = "+";

			if (strandPlus) posExon = intron.getStart() - 1;
			else posExon = intron.getEnd() + 1;
		}

		// Distance to closest exonic base
		int exonDistance = Math.abs(posExon - pos);

		// Closest exonic base within coding region?
		int cdsLeft = Math.min(tr.getCdsStart(), tr.getCdsEnd());
		int cdsRight = Math.max(tr.getCdsStart(), tr.getCdsEnd());
		if ((posExon >= cdsLeft) && (posExon <= cdsRight)) {
			int distExonBase = tr.baseNumberCds(posExon, false) + 1;
			return distExonBase + (exonDistance > 0 ? posExonStr + exonDistance : "");
		}

		// Left side of coding part
		int cdnaPos = tr.baseNumber2MRnaPos(posExon);
		if (posExon < cdsLeft) {
			int cdnaStart = tr.baseNumber2MRnaPos(cdsLeft); // tr.getCdsStart());
			int utrDistance = Math.abs(cdnaStart - cdnaPos);
			String utrStr = strandPlus ? "-" : "*";
			return utrStr + utrDistance + (exonDistance > 0 ? posExonStr + exonDistance : "");
		}

		// Right side of coding part
		int cdnaEnd = tr.baseNumber2MRnaPos(cdsRight); // tr.getCdsEnd());
		int utrDistance = Math.abs(cdnaEnd - cdnaPos);
		String utrStr = strandPlus ? "*" : "-";
		return utrStr + utrDistance + (exonDistance > 0 ? posExonStr + exonDistance : "");
	}

	//	/**
	//	 * Position upstream of the transcript
	//	 */
	//	protected String posUpstream(int pos) {
	//		int tss = tr.getCdsStart();
	//		int idx = Math.abs(pos - tss);
	//
	//		if (idx <= 0) return null;
	//		return "-" + idx; // 5'UTR: We are before TSS, coordinates must be '-1', '-2', etc.
	//	}

	/**
	 * Position upstream of the transcript
	 * 
	 * Note: How to calculate Upstream position:
	 * If strand is '-' as for NM_016176.3, "genomicTxStart" being the rightmost tx coord:
	 *     cDotUpstream = -(cdsStart + variantPos - genomicTxStart)
	 *     
	 * Instead of "-(variantPos - genomicCdsStart)":
	 * 
	 * The method that stays in transcript space until extending beyond the transcript is
	 * correct because of these statements on http://varnomen.hgvs.org/bg-material/numbering/:
	 * 
	 *     * nucleotides upstream (5') of the ATG-translation initiation
	 *       codon (start) are marked with a "-" (minus) and numbered c.-1,
	 *       c.-2, c.-3, etc. (i.e. going further upstream)
	 *
	 *     * Question: When the ATG translation initiation codon is in
	 *       exon 2, and we find a variant in exon 1, should we include
	 *       intron 1 (upstream of c.-14) in nucleotide
	 *       numbering? (Isabelle Touitou, Montpellier, France)
	 *
	 *       Answer: Nucleotides in introns 5' of the ATG translation
	 *       initiation codon (i.e. in the 5'UTR) are numbered as
	 *       introns in the protein coding sequence (see coding DNA
	 *       numbering). In your example, based on a coding DNA
	 *       reference sequence, the intron is present between
	 *       nucleotides c.-15 and c.-14. The nucleotides for this
	 *       intron are numbered as c.-15+1, c.-15+2, c.-15+3, ....,
	 *       c.-14-3, c.-14-2, c.-14-1. Consequently, regarding the
	 *       question, when a coding DNA reference sequence is used,
	 *       the intronic nucleotides are not counted.
	 *
	 */
	protected String posUpstream(int pos) {
		int baseNumTss = tr.baseNumber2MRnaPos(tr.getCdsStart());
		int trStart = tr.isStrandPlus() ? tr.getStart() : tr.getEnd();
		int idx = baseNumTss + Math.abs(pos - trStart);

		if (idx <= 0) return null;
		return "-" + idx; // 5'UTR: We are before TSS, coordinates must be '-1', '-2', etc.
	}

	/**
	 * Position within 3'UTR
	 */
	protected String posUtr3(int pos) {
		int baseNum = tr.baseNumber2MRnaPos(pos);
		int baseNumCdsEnd = tr.baseNumber2MRnaPos(tr.getCdsEnd());
		int idx = Math.abs(baseNum - baseNumCdsEnd);

		if (idx <= 0) return null;
		return "*" + idx; // 3'UTR: We are after stop codon, coordinates must be '*1', '*2', etc.
	}

	/**
	 * Position within 5'UTR
	 */
	protected String posUtr5(int pos) {
		int baseNum = tr.baseNumber2MRnaPos(pos);
		int baseNumTss = tr.baseNumber2MRnaPos(tr.getCdsStart());
		int idx = Math.abs(baseNum - baseNumTss);

		if (idx <= 0) return null;
		return "-" + idx; // 5'UTR: We are before TSS, coordinates must be '-1', '-2', etc.
	}

	/**
	 * Translocation nomenclature.
	 * From HGVS:
	 * 		Translocations are described at the molecular level using the
	 * 		format "t(X;4)(p21.2;q34)", followed by the usual numbering, indicating
	 * 		the position translocation breakpoint. The sequences of the translocation
	 * 		breakpoints need to be submitted to a sequence database (Genbank, EMBL,
	 * 		DDJB) and the accession.version numbers should be given (see Discussion).
	 * 		E.g.:
	 * 			t(X;4)(p21.2;q35)(c.857+101_857+102) denotes a translocation breakpoint
	 * 			in the intron between coding DNA nucleotides 857+101 and 857+102, joining
	 * 			chromosome bands Xp21.2 and 4q34
	 */
	protected String prefixTranslocation() {
		VariantBnd vtr = (VariantBnd) variant;

		// Chromosome part
		String chrCoords = "(" //
				+ vtr.getChromosomeName() //
				+ ";" //
				+ vtr.getEndPoint().getChromosomeName() //
				+ ")" //
		;

		// Get cytobands
		String band1 = "";
		CytoBands cytoBands = genome.getCytoBands();
		Markers bands1 = cytoBands.query(vtr);
		if (!bands1.isEmpty()) band1 = bands1.get(0).getId(); // Get first match

		String band2 = "";
		Markers bands2 = cytoBands.query(vtr.getEndPoint());
		if (!bands2.isEmpty()) band2 = bands2.get(0).getId(); // Get first match

		String bands = "(" + band1 + ";" + band2 + ")";

		return "t" + chrCoords + bands + "(";
	}

	protected String ref() {
		return variant.getReference();
	}

	@Override
	public String toString() {
		if (variant == null || genome == null) return null;

		// Is this a duplication?
		if (variant.isIns()) duplication = isDuplication();

		String type = "", prefix = "", suffix = "";
		switch (variant.getVariantType()) {
		case INS:
			type = duplication ? "dup" : "ins";
			break;

		case DEL:
			type = "del";
			break;

		case MNP:
			type = "";
			break;

		case SNP:
		case MIXED:
		case INTERVAL:
			type = "";
			break;

		case INV:
			type = "inv";
			break;

		case DUP:
			type = "dup";
			break;

		case BND:
			prefix = prefixTranslocation();
			type = "";
			suffix = ")";
			break;

		default:
			throw new RuntimeException("Unimplemented method for variant type " + variant.getVariantType());
		}

		// HGVS formatted Position
		String pos = pos();
		if (pos == null) return null;

		// SNPs using old HGVS notation?
		if (Config.get().isHgvsOld() && type.isEmpty()) {
			String ref, alt;
			if (strandPlus) {
				ref = ref();
				alt = alt();
			} else {
				ref = GprSeq.reverseWc(ref());
				alt = GprSeq.reverseWc(alt());
			}

			// Use 'c.G123T' instead of 'c.123G>T'
			return prefix + typeOfReference() + ref + pos + alt + suffix;
		}

		return prefix + typeOfReference() + pos + type + dnaBaseChange() + suffix;
	}

	/**
	 * Prefix for coding or non-coding sequences
	 */
	protected String typeOfReference() {
		if (tr == null) return "n.";

		// Is the transcript protein coding?
		String prefix = tr.isProteinCoding() ? "c." : "n.";

		// Not using transcript ID?
		if (!hgvsTrId) return prefix;

		StringBuilder sb = new StringBuilder();
		sb.append(tr.getId());

		String ver = tr.getVersion();
		if (!ver.isEmpty()) sb.append("." + ver);

		sb.append(':');
		sb.append(prefix);

		return sb.toString();
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy