
org.snpeff.snpEffect.HgvsProtein Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package org.snpeff.snpEffect;
import org.snpeff.codons.CodonTable;
import org.snpeff.interval.Transcript;
import org.snpeff.interval.VariantBnd;
import org.snpeff.util.Gpr;
/**
* Coding change in HGVS notation (amino acid changes)
* References: http://www.hgvs.org/mutnomen/recs.html
*/
public class HgvsProtein extends Hgvs {
public static boolean debug = false;
int codonNum, aaPos;
String aaNew, aaOld;
boolean hgvsOneLetterAa;
boolean hgvsOld;
int lettersPerAa;
char stop;
public HgvsProtein(VariantEffect variantEffect) {
super(variantEffect);
codonNum = variantEffect.getCodonNum();
hgvsOneLetterAa = Config.get().isHgvs1LetterAA();
hgvsOld = Config.get().isHgvsOld();
stop = hgvsOld ? 'X' : '*';
lettersPerAa = hgvsOneLetterAa ? 1 : 3;
// No marker? Nothing to do
if (marker != null) {
// Codon numbering
// HGVS: the translation initiator Methionine is numbered as +1
if (codonNum >= 0) aaPos = codonNum + 1;
aaNew = aaCode(variantEffect.getAaAlt());
aaOld = aaCode(variantEffect.getAaRef());
} else {
aaPos = -1;
aaNew = aaOld = "";
}
}
protected String aaCode(char aa1Letter) {
return aaCode(Character.toString(aa1Letter));
}
/**
* Use one letter / three letter AA codes
* Most times we want to vonvert to 3 letter code
* HGVS: the three-letter amino acid code is prefered (see Discussion), with "*" designating a translation
* termination codon; for clarity we this page describes changes using the three-letter amino acid
*/
protected String aaCode(String aa1Letter) {
if (aa1Letter == null || aa1Letter.isEmpty() || aa1Letter.equals("-")) return "";
if (hgvsOld) return aa1Letter.replace('*', 'X');
if (hgvsOneLetterAa) return aa1Letter;
return marker.codonTable().aaThreeLetterCode(aa1Letter);
}
/**
* Deletions remove one or more amino acid residues from the protein and are described using "del"
* after an indication of the first and last amino acid(s) deleted separated by a "_" (underscore).
* Deletions remove either a small internal segment of the protein (in-frame deletion), part of
* the N-terminus of the protein (initiation codon change) or the entire C-terminal part of the
* protein (nonsense change). A nonsense change is a special type of deletion removing the entire
* C-terminal part of a protein starting at the site of the variant (specified 2013-03-16).
*
* 1) in-frame deletions - are described using "del" after an indication of the first and last amino acid(s) deleted separated, by a "_" (underscore).
*
* p.Gln8del in the sequence MKMGHQQQCC denotes a Glutamine-8 (Gln, Q) deletion to MKMGHQQCC
*
* p.(Cys28_Met30del) denotes RNA nor protein was analysed but the predicted change is a deletion of three amino acids, from Cysteine-28 to Methionine-30
*
* 2) initiating methionine change (Met1) causing a N-terminal deletion (see Discussion, see Examples)
* NOTE: changes extending the N-terminal protein sequence are described as an extension
* p.0 - no protein is produced (experimental data should be available)
* NOTE: this change is not described as p.Met1_Leu833del, i.e. as a deletion removing the entire protein coding sequence
*
* p.Met1? - denotes that amino acid Methionine-1 (translation initiation site) is changed and that it is unclear what the consequence of this change is
*
* p.Met1_Lys45del - a new translation initiation site is activated (at Met46)
*
* 3) nonsense variant - are a special type of amino acid deletion removing the entire C-terminal
* part of a protein starting at the site of the variant. A nonsense change is described using
* the format p.Trp26Ter (alternatively p.Trp26*). The description does not include the deletion
* at protein level from the site of the change to the C-terminal end of the protein (stop codon)
* like p.Trp26_Leu833del (the deletion of amino acid residue Trp26 to the last amino acid of
* the protein Leu833).
*
* p.(Trp26Ter) indicates RNA nor protein was analysed but amino acid Tryptophan26 (Trp, W) is predicted to change to a stop codon (Ter) (alternatively p.(W26*) or p.(Trp26*))
*/
protected String del() {
return "del";
}
/**
* Mixed variants
* Deletion/insertions (indels) replace one or more amino acid residues with one or more other
* amino acid residues. Deletion/insertions are described using "delins" as a deletion followed
* by an insertion after an indication of the amino acid(s) flanking the site of the
* deletion/insertion separated by a "_" (underscore, see Discussion). Frame shifts are a special
* type of amino acid deletion/insertion affecting an amino acid between the first (initiation, ATG)
* and last codon (termination, stop), replacing the normal C-terminal sequence with one encoded
* by another reading frame (specified 2013-10-11). A frame shift is described using "fs" after
* the first amino acid affected by the change. Descriptions either use a short ("fs") or long
* ("fsTer#") description. The description of frame shifts does not include the deletion at
* protein level from the site of the frame shift to the natural end of the protein (stop codon).
* The inserted amino acid residues are not described, only the total length of the new shifted
* frame is given (i.e. including the first amino acid changed).
*/
protected String delins() {
return "delins" + aaNew;
}
/**
* Duplications
*/
protected String dup() {
return "dup";
}
/**
* Frame shifts are a special type of amino acid deletion/insertion affecting an amino acid
* between the first (initiation, ATG) and last codon (termination, stop), replacing the
* normal C-terminal sequence with one encoded by another reading frame (specified 2013-10-11).
* A frame shift is described using "fs" after the first amino acid affected by the change.
* Descriptions either use a short ("fs") or long ("fsTer#") description
*/
protected String fs() {
return "fs";
}
/**
* Insertions
* Insertions add one or more amino acid residues between two existing amino acids and this
* insertion is not a copy of a sequence immediately 5'-flanking (see Duplication). Insertions
* are described using "ins" after an indication of the amino acids flanking the insertion
* site, separated by a "_" (underscore) and followed by a description of the amino acid(s)
* inserted. Since for large insertions the amino acids can be derived from the DNA and/or RNA
* descriptions they need not to be described exactly but the total number may be given (like "ins17").
*
* Examples:
* 1) p.Lys2_Met3insGlnSerLys denotes that the sequence GlnSerLys (QSK) was inserted between amino acids Lysine-2 (Lys, K) and Methionine-3 (Met, M), changing MKMGHQQQCC to MKQSKMGHQQQCC
* 2) p.Trp182_Gln183ins17 describes a variant that inserts 17 amino acids between amino acids Trp182 and Gln183
*
* NOTE: it must be possible to deduce the 17 inserted amino acids from the description given at DNA or RNA level
*/
protected String ins() {
return "ins" + aaNew;
}
/**
* Is this a 'pure' deletion?
* E.g.:
* - A 'CODON_DELETION' is a pure deletion (form HGVS' perspective)
* - A 'CODON_CHANGE_PLUS_CODON_DELETION' is not a 'pure' deletion for HGVS (it's a 'delins')
*/
boolean isDel() {
return !aaOld.isEmpty() && aaNew.isEmpty();
}
/**
* Is this variant a duplication
*
* Reference: http://www.hgvs.org/mutnomen/disc.html#dupins
* ...the description "dup" (see Standards) may by definition only be used
* when the additional copy is directly 3'-flanking of the original copy (tandem
* duplication)
*/
protected boolean isDuplication() {
//---
// Simple duplications can be obtained by looking into AA.Ref / AA.Alt
//---
String aaRef = variantEffect.getAaRef().toUpperCase();
String aaAlt = variantEffect.getAaAlt().toUpperCase();
// Compare to ALT sequence
String dupAaRef = aaRef + aaRef;
if (debug) Gpr.debug("AA.Ref: '" + aaRef + "'\tAA.Alt: '" + aaAlt);
if (aaAlt.equals(dupAaRef)) return true;
//---
// Duplications need to look into the protein sequence
//---
// Extract sequence from genomic coordinates before variant
String protein = tr.protein();
if (protein == null) return false; // Cannot calculate duplication
// Calculate net amino acid change
aaAlt = variantEffect.getAaNetChange();
// Get previous AA sequence
int aaEnd = variantEffect.getCodonNum();
int aaStart = aaEnd - aaAlt.length();
if (aaStart < 0 || aaEnd > protein.length()) return false;
aaRef = protein.substring(aaStart, aaEnd);
// Compare to ALT sequence
boolean dup = aaRef.equalsIgnoreCase(aaAlt);
if (debug) Gpr.debug("SEQUENCE [ " + aaStart + " , " + aaEnd + " ]: '" + aaRef + "'" //
+ "\n\tAA Ref : '" + variantEffect.getAaRef() + "'" //
+ "\n\tAA Alt : '" + variantEffect.getAaAlt() + "'" //
+ "\n\tAA Alt (net) : '" + aaAlt + "'" //
+ "\n\tDup? : " + dup);
return dup;
}
/**
* Is this a frameShift variant?
*/
boolean isFs() {
return variantEffect.hasEffectType(EffectType.FRAME_SHIFT) //
|| variantEffect.hasEffectType(EffectType.FRAME_SHIFT_BEFORE_CDS_START) //
|| variantEffect.hasEffectType(EffectType.FRAME_SHIFT_AFTER_CDS_END) //
;
}
/**
* Is this a 'pure' insertion?
* E.g.:
* - A 'CODON_INSERTION' is a pure insertion (form HGVS' perspective)
* - A 'CODON_CHANGE_PLUS_CODON_INSERTION' is not a 'pure' insertion for HGVS (it's a 'delins')
*/
boolean isIns() {
return aaOld.isEmpty() && !aaNew.isEmpty();
}
/**
* Protein position
*/
protected String pos(int codonNum) {
Transcript tr = variantEffect.getTranscript();
return pos(tr, codonNum);
}
protected String pos(int start, int end) {
Transcript tr = variantEffect.getTranscript();
return pos(tr, start, end);
}
/**
* Protein position
*/
protected String pos(Transcript tr, int codonNum) {
if (codonNum < 0 || tr == null) return null;
// Sanity check: Longer than protein?
String protSeq = tr.protein();
if (codonNum >= protSeq.length()) return null;
// Get AA code
String aa = aaCode(protSeq.charAt(codonNum));
return aa + (codonNum + 1);
}
/**
* Position string given two coordinates
*/
protected String pos(Transcript tr, int start, int end) {
// Only one position needed?
String posStart = pos(tr, start);
if (posStart == null) return null;
if (start == end) return posStart;
// Both position needed
String posEnd = pos(tr, end);
if (posEnd == null) return null;
return posStart + "_" + posEnd;
}
/**
* Position for deletions
*/
protected String posDel() {
String posStart = pos(codonNum);
if (posStart == null) return null;
if (aaOld == null || aaOld.isEmpty() || aaOld.equals("-")) return null;
if (aaOld.length() == lettersPerAa) return posStart; // Single AA deleted
int end = codonNum + (aaOld.length() - aaNew.length()) / lettersPerAa - 1;
String posEnd = pos(end);
if (posEnd == null) return null;
return posStart + "_" + posEnd;
}
/**
* Position for 'delins'
*/
protected String posDelIns() {
String posStart = pos(codonNum);
if (posStart == null) return null;
// Frame shifts ....are described using ... the change of the first amino acid affected
// ... the description does not include a description of the deletion from the site of the change
if (variantEffect.hasEffectType(EffectType.FRAME_SHIFT)) return posStart;
if (aaOld == null || aaOld.isEmpty() || aaOld.equals("-")) aaOld = "";
if (aaOld.length() == lettersPerAa) return posStart; // Single AA deleted
int end = codonNum + (aaOld.length() / lettersPerAa) - 1;
String posEnd = pos(end);
if (posEnd == null) return null;
return posStart + "_" + posEnd;
}
/**
* Position for 'duplications' (a special kind of insertion)
*/
protected String posDup() {
int start, end;
int netLen = aaNew.length() / lettersPerAa;
if (netLen == 1) {
start = end = codonNum - 1;
} else {
end = codonNum - 1;
start = end - (netLen - 1);
}
return pos(start, end);
}
/**
* Frame shifts ....are described using ... the change of the first amino acid affected
* ... the description does not include a description of the deletion from the site of the change
*/
protected String posFs() {
if (codonNum < 0) return null;
// Sanity check: Longer than protein?
Transcript tr = variantEffect.getTranscript();
String protSeq = tr.protein();
if (codonNum > protSeq.length()) return null;
if (codonNum == protSeq.length()) return aaCode(CodonTable.TERMINATION_CODON_1) + codonNum;
// NOTE: the changes observed should be described on protein level and not try to
// incorporate any knowledge regarding the change at DNA-level.
// Thus, p.His150Hisfs*10 is not correct, but p.Gln151Thrfs*9 is.
// Reference: http://www.hgvs.org/mutnomen/recs-prot.html#del
if (!variant.isMixed()) { // Apply is not supported on MIXED variants at the moment
Transcript newTr = tr.apply(variant);
String newProtSeq = newTr.protein();
// Find the first difference in the protein sequences and use that
// one as AA number
int len = Math.min(protSeq.length(), newProtSeq.length());
for (int cn = codonNum; cn < len; cn++) {
char aa = protSeq.charAt(cn);
char newAa = newProtSeq.charAt(cn);
if (aa != newAa) return aaCode(aa) + (cn + 1);
}
}
return aaCode(protSeq.charAt(codonNum)) + (codonNum + 1);
}
/**
* Position for insertions
*/
protected String posIns() {
int start = codonNum - 1;
int end = codonNum;
return pos(start, end);
}
/**
* Position: SNP or NMP
*/
protected String posSnpOrMnp() {
return pos(codonNum);
}
/**
* Can we simplify AAs?
*/
void simplifyAminoAcids() {
simplifyAminoAcidsLeft();
simplifyAminoAcidsRight();
}
/**
* Remove same AA from left side
*/
void simplifyAminoAcidsLeft() {
// Can we simplify AAs?
while (!aaOld.isEmpty() && !aaNew.isEmpty()) {
// Get the first AA
String ao = aaOld.substring(0, lettersPerAa);
String an = aaNew.substring(0, lettersPerAa);
// Both share the same AA? => Remove it
if (ao.equals(an)) {
aaOld = aaOld.substring(lettersPerAa);
aaNew = aaNew.substring(lettersPerAa);
codonNum++;
aaPos++;
} else return;
}
}
/**
* Remove same AA form right side
*/
void simplifyAminoAcidsRight() {
// Can we simplify AAs?
while (!aaOld.isEmpty() && !aaNew.isEmpty()) {
// Get the last AA
int lastAaOldIdx = aaOld.length() - lettersPerAa;
String ao = aaOld.substring(lastAaOldIdx);
int lastAaNewIdx = aaNew.length() - lettersPerAa;
String an = aaNew.substring(lastAaNewIdx);
// Both share the same AA? => Remove it
if (ao.equals(an)) {
aaOld = aaOld.substring(0, lastAaOldIdx);
aaNew = aaNew.substring(0, lastAaNewIdx);
} else return;
}
}
/**
* SNP or MNP changes
*/
protected String snpOrMnp() {
// No codon change information? only codon number?
if (variantEffect.getAaRef().isEmpty() && variantEffect.getAaAlt().isEmpty()) {
if (codonNum >= 0) return "" + (codonNum + 1);
return null;
}
// Stop gained
// Reference: http://www.hgvs.org/mutnomen/recs-prot.html#del
// Nonsense variant are a special type of amino acid deletion removing the entire C-terminal part of a
// protein starting at the site of the variant. A nonsense change is described using the format
// p.Trp26Ter (alternatively p.Trp26*).
if (variantEffect.hasEffectType(EffectType.STOP_GAINED)) return aaOld + aaPos + stop;
// Stop codon mutations
// Reference: http://www.hgvs.org/mutnomen/recs-prot.html#extp
// A change affecting the translation termination codon (Ter/*) introducing a new downstream termination
// codon extending the C-terminus of the encoded protein described using "extTer#" (alternatively "ext*#")
// where "#" is the position of the new stop codon (Ter# / *#)
// E.g.:
// p.*327Argext*? (alternatively p.Ter327ArgextTer? or p.*327Rext*?) describes a variant in the stop
// codon (Ter/*) at position 327, changing it to a codon for Arginine (Arg, R) and adding a tail of
// new amino acids of unknown length since the shifted frame does not contain a new stop codon.
if (variantEffect.hasEffectType(EffectType.STOP_LOST)) return aaOld + aaPos + aaNew + "ext" + stop + "?";
// Start codon lost
// Reference : http://www.hgvs.org/mutnomen/disc.html#Met
// Currently, variants in the translation initiating Methionine (M1) are usually described as a substitution, e.g. p.Met1Val.
// This is not correct. Either no protein is produced (p.0) or a new translation initiation site up- or downstream is used (e.g. p.Met1ValextMet-12 or p.Met1_Lys45del resp.).
// Unless experimental proof is available, it is probably best to report the effect on protein level as "p.Met1?" (unknown).
// When experimental data show that no protein is made, the description "p.0" is recommended (see Examples).
//
// We use the same for SYNONYMOUS_START since we cannot rally predict if the new start codon will actually be functioning as a start codon (since the Kozak sequence changed)
// Ditto for NON_SYNONYMOUS_START
if (variantEffect.hasEffectType(EffectType.START_LOST) //
|| variantEffect.hasEffectType(EffectType.SYNONYMOUS_START) //
|| variantEffect.hasEffectType(EffectType.NON_SYNONYMOUS_START) //
) return aaOld + "1?";
// Synonymous changes
// Description of so called "silent" changes in the format p.Leu54Leu (or p.L54L) is not allowed; descriptions
// should be given at DNA level, it is non-informative and not unequivocal (there are five possibilities
// at DNA level which may underlie p.Leu54Leu); correct description has the format c.162C>G.
if ((variantEffect.hasEffectType(EffectType.SYNONYMOUS_CODING)) //
|| (variantEffect.hasEffectType(EffectType.SYNONYMOUS_STOP)) //
) return aaOld + aaPos + aaNew;
return aaOld + aaPos + aaNew;
}
@Override
public String toString() {
if (variant == null || marker == null) return null;
// Deleted transcript produces no protein.
if (variantEffect.getEffectType() == EffectType.TRANSCRIPT_DELETED) return typeOfReference() + "0?";
// Can we simplify amino acids in aaNew/aaOld?
// if (!variant.isSnp() && !variant.isMnp()) simplifyAminoAcids();
if (!variant.isSnp()) simplifyAminoAcids();
String pos = "", protChange = "", prefix = "", suffix = "";
switch (variant.getVariantType()) {
case INTERVAL:
// Not a variant? Nothing to do
return "";
case SNP:
case MNP:
// SNP or MNP
protChange = snpOrMnp();
pos = "";
break;
case INV:
// Inversion description not used at protein level
// Reference: http://www.hgvs.org/mutnomen/examplesAA.html
return "";
case BND:
return translocation();
default:
if (isFs()) {
// Frame shifts
protChange = fs();
pos = posFs();
} else if (isIns()) {
// This is a 'pure' insertion
duplication = isDuplication(); // Is it a duplication?
if (duplication) {
protChange = dup();
pos = posDup();
} else {
protChange = ins();
pos = posIns();
}
} else if (isDel()) {
// A deletion
protChange = del();
pos = posDel();
} else {
// A mixture of insertion and deletion
protChange = delins();
pos = posDelIns();
}
}
if (protChange == null || pos == null) return null;
return prefix + typeOfReference() + pos + protChange + suffix;
}
/**
* Translocation nomenclature.
* From HGVS:
* Translocations at protein level occur when a translocation at DNA level
* leads to the production of a fusion protein, joining the N-terminal
* end of the protein on one chromosome to the C-terminal end of the protein
* on the other chromosome (and vice versa). No recommendations have been
* made sofar to describe protein translocations.
*
* t(X;17)(DMD:p.Met1_Val1506; SGCA:p.Val250_*387)
* describes a fusion protein resulting from a translocation between the
* chromosomes X and 17; the fusion protein contains an N-terminal segment
* of DMD (dystrophin, amino acids Methionine-1 to Valine-1506), and a
* C-terminal segment of SGCA (alpha-sarcoglycan, amino acids Valine-250
* to the stop codon at 387)
*/
protected String translocation() {
if (!(variantEffect instanceof VariantEffectFusion)) return "";
VariantBnd vtr = (VariantBnd) variant;
VariantEffectFusion veffFusion = (VariantEffectFusion) variantEffect;
if (veffFusion.getTrLeft() == null && veffFusion.getTrRight() == null) return "";
// Chromosome part
String chrCoords = "(" //
+ vtr.getChromosomeName() //
+ ";" //
+ vtr.getEndPoint().getChromosomeName() //
+ ")" //
;
// Left transcript coordinates
String trLeftStr = "";
if (veffFusion.getTrLeft() != null) {
trLeftStr = veffFusion.getTrLeft().getId() //
+ ":" //
+ pos(veffFusion.getTrLeft(), veffFusion.getAaNumLeftStart(), veffFusion.getAaNumLeftEnd());
}
// Right transcript coordinates
String trRightStr = "";
if (veffFusion.getTrRight() != null) {
trRightStr = veffFusion.getTrRight().getId() //
+ ":" //
+ pos(veffFusion.getTrRight(), veffFusion.getAaNumRightStart(), veffFusion.getAaNumRightEnd());
}
return "t" + chrCoords + "(" + trLeftStr + ";" + trRightStr + ")";
}
/**
* Return "p." string with/without transcript ID, according to user command line options.
*/
protected String typeOfReference() {
if (!hgvsTrId || tr == null) return "p.";
String ver = tr.getVersion();
return tr.getId() + (ver.isEmpty() ? "" : "." + ver) + ":p.";
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy