
org.snpeff.interval.Gene Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package org.snpeff.interval;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Set;
import org.snpeff.interval.tree.IntervalTree;
import org.snpeff.interval.tree.Itree;
import org.snpeff.serializer.MarkerSerializer;
import org.snpeff.snpEffect.Config;
import org.snpeff.snpEffect.EffectType;
import org.snpeff.snpEffect.VariantEffect;
import org.snpeff.snpEffect.VariantEffect.ErrorWarningType;
import org.snpeff.snpEffect.VariantEffects;
import org.snpeff.stats.ObservedOverExpectedCpG;
import org.snpeff.util.Gpr;
import org.snpeff.util.Timer;
/**
* Interval for a gene, as well as transcripts
*
* @author pcingola
*
*/
public class Gene extends IntervalAndSubIntervals implements Serializable {
public enum GeneType {
CODING, NON_CODING, UNKNOWN
}
public static final String CIRCULAR_GENE_ID = "_circ";
private static final long serialVersionUID = 8419206759034068147L;
String geneName;
BioType bioType;
Itree intervalTreeGene;
public Gene() {
super();
geneName = "";
bioType = null;
type = EffectType.GENE;
}
public Gene(Marker parent, int start, int end, boolean strandMinus, String id, String geneName, BioType bioType) {
super(parent, start, end, strandMinus, id);
this.geneName = geneName;
this.bioType = bioType;
type = EffectType.GENE;
}
/**
* Add a gene dependent marker
*/
public void addPerGene(Marker marker) {
if (intervalTreeGene == null) intervalTreeGene = new IntervalTree();
intervalTreeGene.add(marker);
}
/**
* Adjust start, end and strand values
* @return true if any adjustment was done
*/
public boolean adjust() {
boolean changed = false;
int strandSumGene = 0;
int newStart = Integer.MAX_VALUE;
int newEnd = Integer.MIN_VALUE;
for (Transcript tr : this) {
newStart = Math.min(newStart, tr.getStart());
newEnd = Math.max(newEnd, tr.getEnd());
for (Exon exon : tr.sortedStrand()) {
newStart = Math.min(newStart, exon.getStart());
newEnd = Math.max(newEnd, exon.getEnd());
strandSumGene += exon.isStrandMinus() ? -1 : 1; // Some exons have incorrect strands, we use the strand indicated by most exons
}
for (Utr utr : tr.getUtrs()) {
newStart = Math.min(newStart, utr.getStart());
newEnd = Math.max(newEnd, utr.getEnd());
}
}
// Change gene strand?
boolean newStrandMinus = strandSumGene < 0;
if (strandMinus != newStrandMinus) {
strandMinus = newStrandMinus;
changed = true;
}
if (newStart < newEnd) {
// Change start?
if (start != newStart) {
start = newStart;
changed = true;
}
// Change end?
if (end != newEnd) {
end = newEnd;
changed = true;
}
} else if (Config.get().isDebug()) {
Gpr.debug("Gene '" + id + "' (name:'" + geneName + "') not adjusted: " + this);
}
return changed;
}
/**
* Build gene dependent interval tree
*/
public void buildPerGene() {
if (intervalTreeGene != null) intervalTreeGene.build();
}
/**
* Get canonical transcript
* Canonical transcripts are defined as the longest CDS of amongst the protein coding transcripts.
* If none of the transcripts is protein coding, then it is the longest cDNA.
*/
public Transcript canonical() {
Transcript canonical = null;
int canonicalLen = 0;
if (isProteinCoding()) {
// Find canonical transcript in protein coding gene (longest CDS)
for (Transcript t : this) {
int tlen = t.cds().length();
// Compare coding length. If both lengths are equal, compare IDs
if (t.isProteinCoding() //
&& ((canonical == null) // No canonical selected so far? => Select this one
|| (canonicalLen < tlen) // Longer? => Update
|| ((canonicalLen == tlen) && (t.getId().compareTo(canonical.getId()) < 0)) // Same length? Compare IDs
) //
) {
canonical = t;
canonicalLen = tlen;
}
}
} else {
// Find canonical transcript in non-protein coding gene (longest mRNA)
for (Transcript t : this) {
int tlen = t.mRna().length();
if (canonicalLen <= tlen //
&& ((canonical == null) // No canonical selected so far? => Select this one
|| (canonicalLen < tlen) // Longer? => Update
|| ((canonicalLen == tlen) && (t.getId().compareTo(canonical.getId()) < 0)) // Same length? Compare IDs
) //
) {
canonical = t;
canonicalLen = tlen;
}
}
}
// Found canonincal transcript? Set canonical flag
if (canonical != null) canonical.setCanonical(true);
return canonical;
}
/**
* In a circular genome, a gene can have negative coordinates or crosses
* over chromosome end. These genes are mirrored to the opposite end of
* the chromosome so that they can be referenced by both circular
* coordinates.
*/
public Gene circularClone() {
Gene newGene = (Gene) clone();
// Change IDs
newGene.setId(id + CIRCULAR_GENE_ID);
for (Transcript tr : newGene) {
tr.setId(tr.getId() + CIRCULAR_GENE_ID);
for (Exon ex : tr)
ex.setId(ex.getId() + CIRCULAR_GENE_ID);
}
// Shift coordinates
Chromosome chr = getChromosome();
int shift = 0;
if (start < 0) {
shift = chr.size();
} else if (end > chr.getEnd()) {
shift = -chr.size();
} else throw new RuntimeException("Sanity check: This should neve happen!");
newGene.shiftCoordinates(shift);
if (Config.get().isVerbose()) {
Timer.showStdErr("Gene '" + id + "' spans across coordinate zero: Assuming circular chromosome, creating mirror gene at the end." //
+ "\n\tGene :" + toStr() //
+ "\n\tNew gene :" + newGene.toStr() //
+ "\n\tChrsomosome :" + chr.toStr() //
);
}
return newGene;
}
@Override
public Gene cloneShallow() {
Gene clone = (Gene) super.cloneShallow();
clone.bioType = bioType;
clone.geneName = geneName;
return clone;
}
/**
* Calculate CpG bias: number of CpG / expected[CpG]
*/
public double cpgExonBias() {
ObservedOverExpectedCpG oe = new ObservedOverExpectedCpG();
return oe.oe(this);
}
/**
* Filter transcripts by TSL
*/
public void filterTranscriptSupportLevel(TranscriptSupportLevel maxTsl) {
ArrayList toDelete = new ArrayList<>();
// Mark transcripts for removal
for (Transcript tr : this)
if (!tr.hasTranscriptSupportLevelInfo() //
|| (tr.getTranscriptSupportLevel().compareTo(maxTsl) > 0) //
) toDelete.add(tr);
// Remove transcripts
for (Transcript t : toDelete)
remove(t);
}
public GeneType geneType() {
if (bioType != null) {
if (bioType.isProteinCoding()) return GeneType.CODING;
return GeneType.NON_CODING;
}
return GeneType.UNKNOWN;
}
public BioType getBioType() {
return bioType;
}
public String getGeneName() {
return geneName;
}
/**
* Is any of the transcripts protein coding?
*/
public boolean isProteinCoding() {
for (Transcript tr : this)
if (tr.isProteinCoding()) return true;
return false;
}
@Override
protected boolean isShowWarningIfParentDoesNotInclude() {
return false;
}
/**
* Remove all transcripts in trIds
* @return : Number of transcripts removed
*/
public int keepTranscripts(Set trIds) {
// Find transcripts in trIds
ArrayList toDelete = new ArrayList<>();
for (Transcript t : this) {
String trId = t.getId();
// Sometimes the provided list does not have version numbers
// (e.g. NM_005157 instead of NM_005157.4)
String trIdNoVersion = t.getId();
int versionIdx = trIdNoVersion.indexOf('.');
if (versionIdx > 0) trIdNoVersion = trIdNoVersion.substring(0, versionIdx);
// Transcript not in the list? => Remove it
if (!trIds.contains(trId) && !trIds.contains(trIdNoVersion)) toDelete.add(t);
}
// Remove them
for (Transcript t : toDelete)
remove(t);
return toDelete.size();
}
/**
* Keep only protein coding transcripts
* @return : Number of transcripts removed
*/
public int keepTranscriptsProteinCoding() {
// Find transcripts in trIds
ArrayList toDelete = new ArrayList<>();
for (Transcript t : this)
if (!t.isProteinCoding()) toDelete.add(t);
// Remove them
for (Transcript t : toDelete)
remove(t);
return toDelete.size();
}
@Override
public Markers markers() {
Markers markers = new Markers();
for (Transcript tr : this) {
markers.add(tr);
markers.add(tr.markers());
}
return markers;
}
/**
* Remove all non-canonical transcripts
*/
public void removeNonCanonical(String trId) {
// User transcript 'trId' or find canonical transcript
Transcript canonical;
if (trId != null) {
canonical = get(trId);
if (canonical == null) throw new RuntimeException("Canonical transcript '" + trId + "' not found! Gene name: '" + getGeneName() + "', Gene ID: '" + getId() + "'");
} else canonical = canonical();
// Found canonical? => Remove all others
if (canonical != null) {
// Remove all other transcripts
ArrayList toDelete = new ArrayList<>();
toDelete.addAll(subIntervals());
toDelete.remove(canonical); // Do not remove canonical transcript.
// Remove all other transcripts
for (Transcript t : toDelete)
remove(t);
}
}
/**
* Remove unverified or corrected transcripts
* @return : True if ALL transcripts have been removed
*/
public boolean removeUnverified() {
// Mark unchecked transcripts for deletion
ArrayList toDelete = new ArrayList<>();
int countRemoved = 0;
for (Transcript tr : this) {
if (!tr.isChecked() || tr.isCorrected()) {
toDelete.add(tr);
countRemoved++;
}
}
if (Config.get().isDebug()) Gpr.debug("Gene '', removing " + countRemoved + " / " + numChilds() + " unchecked transcript.");
// Remove
for (Transcript t : toDelete)
remove(t);
return numChilds() <= 0;
}
/**
* Parse a line from a serialized file
*/
@Override
public void serializeParse(MarkerSerializer markerSerializer) {
super.serializeParse(markerSerializer);
geneName = markerSerializer.getNextField();
bioType = BioType.parse(markerSerializer.getNextField());
}
/**
* Create a string to serialize to a file
*/
@Override
public String serializeSave(MarkerSerializer markerSerializer) {
return super.serializeSave(markerSerializer) //
+ "\t" + geneName //
+ "\t" + bioType;
}
public void setBioType(BioType bioType) {
this.bioType = bioType;
}
/**
* Size of a genetic region for a given gene
*/
public int sizeof(String type) {
// Calculate size
EffectType eff = EffectType.valueOf(type.toUpperCase());
Markers all = new Markers();
int len = 0;
switch (eff) {
case GENE:
return size();
case EXON:
// Add all exons
for (Transcript tr : this)
for (Exon ex : tr)
all.add(ex);
break;
case CDS:
// Add all cds
for (Transcript tr : this)
for (Cds cds : tr.getCds())
all.add(cds);
break;
case TRANSCRIPT:
// Add all transcripts
for (Transcript tr : this)
all.add(tr);
break;
case INTRON:
return Math.max(0, sizeof("TRANSCRIPT") - sizeof("EXON"));
case UTR_3_PRIME:
// Add all Utr3prime
for (Transcript tr : this)
for (Utr3prime utr : tr.get3primeUtrs())
all.add(utr);
break;
case UTR_5_PRIME:
// Add all Utr3prime
for (Transcript tr : this)
for (Utr5prime utr : tr.get5primeUtrs())
all.add(utr);
break;
case UPSTREAM:
for (Transcript tr : this)
all.add(tr.getUpstream());
break;
case DOWNSTREAM:
for (Transcript tr : this)
all.add(tr.getDownstream());
break;
case SPLICE_SITE_ACCEPTOR:
// Add all exons
for (Transcript tr : this)
for (Exon ex : tr)
for (SpliceSite ss : ex.getSpliceSites())
if (ss instanceof SpliceSiteAcceptor) all.add(ss);
break;
case SPLICE_SITE_BRANCH:
// Add all exons
for (Transcript tr : this)
for (SpliceSite ss : tr.spliceSites())
if (ss instanceof SpliceSiteBranch) all.add(ss);
break;
case SPLICE_SITE_DONOR:
// Add all exons
for (Transcript tr : this)
for (Exon ex : tr)
for (SpliceSite ss : ex.getSpliceSites())
if (ss instanceof SpliceSiteDonor) all.add(ss);
break;
case SPLICE_SITE_REGION:
// Add all exons
for (Transcript tr : this) {
for (Exon ex : tr) {
for (SpliceSite ss : ex.getSpliceSites())
if (ss instanceof SpliceSiteRegion) all.add(ss);
}
for (Intron intron : tr.introns()) {
for (SpliceSite ss : intron.getSpliceSites())
if (ss instanceof SpliceSiteRegion) all.add(ss);
}
}
case INTRAGENIC:
// We have to perform a set minus operation between this gene and all the transcripts
Markers gene = new Markers();
gene.add(this);
// Create transcripts
Markers trans = new Markers();
for (Transcript tr : this)
trans.add(tr);
all = gene.minus(trans);
break;
case NONE:
return 0;
default:
throw new RuntimeException("Unimplemented sizeof('" + type + "')");
}
// Merge and calculate total length
Markers merged = all.merge();
for (Marker m : merged)
len += m.size();
return len;
}
@Override
public String toString() {
return toString(true);
}
public String toString(boolean showTr) {
StringBuilder sb = new StringBuilder();
sb.append(getChromosomeName() + ":" + start + "-" + end);
sb.append(", strand:" + (strandMinus ? "-1" : "1"));
if ((id != null) && (id.length() > 0)) sb.append(", id:" + id);
if ((geneName != null) && (geneName.length() > 0)) sb.append(", name:" + geneName);
if ((bioType != null) && (bioType != null)) sb.append(", bioType:" + bioType);
sb.append("\n");
if (showTr && numChilds() > 0) {
sb.append("Transcipts:\n");
for (Transcript tint : sorted())
sb.append("\t" + tint + "\n");
}
return sb.toString();
}
/**
* Get some details about the effect on this gene
*/
@Override
public boolean variantEffect(Variant variant, VariantEffects variantEffects) {
if (!intersects(variant)) return false; // Sanity check
//---
// Shift variant towards the most 3-prime end
// This is done in order to comply with HGVS notation
//---
boolean shifted3prime = false;
if (Config.get().isHgvsShift()) {
// Keep track of the original variants, just in case it is changed
Variant variantOri = variant;
// Do we need to 'walk and roll'? I.e. align variant towards the most 3-prime
// end of the transcript? Note that VCF request variants to be aligned towards
// the 'leftmost' coordinate, so this re-alignment is only required for variants
// within transcripts on the positive strand.
if (!variant.isSnp() && Config.get().isHgvsShift() && isStrandPlus()) {
// Get sequence information. Might have to load sequences from database
variant = variant.realignLeft();
// Created a new variant? => It was shifted towards the left (i.e. 3-prime)
shifted3prime = (variant != variantOri);
}
}
//---
// Calculate effects
//---
// Calculate effect for each transcript
boolean hitTranscript = false;
for (Transcript tr : this) {
// Apply sequence change to create new 'reference'?
if (variant.isNonRef()) {
Variant vref = ((VariantNonRef) variant).getVariantRef();
tr = tr.apply(vref); // Note: Do we need to check on null transcript? e.g. huge deletion removing the whole transcript
}
// Calculate effects
hitTranscript |= tr.intersects(variant);
tr.variantEffect(variant, variantEffects);
}
// May be none of the transcripts are actually hit
if (!hitTranscript) {
variantEffects.add(variant, this, EffectType.INTRAGENIC, "");
return true;
}
// Do we have 'per gene' information?
variantEffectGene(variant, variantEffects);
//---
// Do we need to add INFO_SHIFT_3_PRIME warning message?
//---
if (shifted3prime) {
for (VariantEffect ve : variantEffects) {
// Is this effect using the shifted variant? => Mark as shifted
if (ve.getVariant() == variant) ve.addErrorWarningInfo(ErrorWarningType.INFO_REALIGN_3_PRIME);
}
}
return true;
}
/**
* Add gene-specific annotations
*/
protected void variantEffectGene(Variant variant, VariantEffects variantEffects) {
if (intervalTreeGene == null) return;
Markers res = intervalTreeGene.query(variant);
for (Marker m : res)
m.variantEffect(variant, variantEffects);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy