
org.snpeff.snpEffect.factory.SnpEffPredictorFactoryFeatures Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package org.snpeff.snpEffect.factory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.snpeff.genBank.Feature;
import org.snpeff.genBank.Feature.Type;
import org.snpeff.genBank.FeatureCoordinates;
import org.snpeff.genBank.Features;
import org.snpeff.genBank.FeaturesFile;
import org.snpeff.interval.Cds;
import org.snpeff.interval.Chromosome;
import org.snpeff.interval.CircularCorrection;
import org.snpeff.interval.Exon;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Transcript;
import org.snpeff.snpEffect.Config;
import org.snpeff.snpEffect.SnpEffectPredictor;
import org.snpeff.util.Gpr;
import org.snpeff.util.GprSeq;
/**
* This class creates a SnpEffectPredictor from a 'features' file.
* This includes derived formats as GenBank and Embl.
*
* References:
* http://www.ebi.ac.uk/embl/Documentation/User_manual/printable.html
* http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
*
*
* @author pcingola
*/
public abstract class SnpEffPredictorFactoryFeatures extends SnpEffPredictorFactory {
public static final int OFFSET = 1;
Chromosome chromosome; // It is assumed that there is only one 'Chromosome' (i.e. only one 'SOURCE' feature)
FeaturesFile featuresFile;
Map proteinByTrId;
public SnpEffPredictorFactoryFeatures(Config config) {
super(config, OFFSET);
proteinByTrId = new HashMap<>();
}
/**
* Add CDS and protein coding information
*/
Transcript addCds(Feature fcds, Gene geneLatest, List trLatest) {
// Find (or create) transcript
Transcript tr = findTrForCds(fcds, geneLatest, trLatest);
//
// Mark transcript as protein coding
if (fcds.getAasequence() != null) tr.setProteinCoding(true);
// Check and set ribosomal slippage
if (fcds.get("ribosomal_slippage") != null) tr.setRibosomalSlippage(true);
// Add exons?
if (fcds.hasMultipleCoordinates()) {
for (FeatureCoordinates fc : fcds) {
int cdsStart = fc.start - inOffset;
int cdsEnd = fc.end - inOffset;
Cds cds = new Cds(tr, cdsStart, cdsEnd, fcds.isComplement(), "CDS_" + tr.getId());
add(cds);
}
// Circular correction
CircularCorrection cc = new CircularCorrection(tr);
cc.setCorrectLargeGap(circularCorrectLargeGap);
cc.correct();
} else {
Cds cds = new Cds(tr, fcds.getStart() - inOffset, fcds.getEnd() - inOffset, fcds.isComplement(), "CDS_" + tr.getId());
add(cds);
}
// Add transcript - protein sequence mapping
proteinByTrId.put(tr.getId(), fcds.getAasequence());
return tr;
}
/**
* Add all features
*/
protected void addFeatures(Features features) {
//---
// Add chromosome
//---
for (Feature f : features.getFeatures()) {
// Convert coordinates to zero-based
int start = f.getStart() - inOffset;
int end = f.getEnd() - inOffset;
// Add chromosome
if (f.getType() == Type.SOURCE) {
if (chromosome == null) {
String chrName = chromoName(features, f);
chromosome = new Chromosome(genome, start, end, chrName);
add(chromosome);
} else {
if (debug) System.err.println("Warnign: 'SOURCE' already assigned to chromosome. Ignoring feature:\n" + f);
}
}
}
// No 'SOURCE' entry found? May be locusName is available.
if (chromosome == null) {
String chrName = chromoName(features, null);
int chrSize = sequence(features).length();
chromosome = new Chromosome(genome, 0, chrSize, chrName);
add(chromosome);
}
// Sanity check
if (chromosome == null) throw new RuntimeException("Could not find SOURCE feature");
if (verbose) System.err.println("Chromosome: '" + chromosome.getId() + "'\tlength: " + chromosome.size());
//---
// Add a genes, transcripts and CDSs
//---
Gene geneLatest = null;
List trLatest = null;
for (Feature f : features.getFeatures()) {
if (f.getType() == Type.GENE) {
// Add gene
geneLatest = findOrCreateGene(f, chromosome, false);
trLatest = null;
} else {
Transcript trl = null;
// Add feature
if (f.getType() == Type.MRNA) trl = addMrna(f, geneLatest);
else if (f.getType() == Type.CDS) trl = addCds(f, geneLatest, trLatest);
// Added transcript?
if (trl != null) {
// If we are using another gene, then 'geneLatest' should change
if (geneLatest == null //
|| trLatest == null //
|| !trl.getParent().getId().equals(geneLatest.getId()) // New gene? i.e. gen IDs do not math
) {
// Create new transcripts list
trLatest = new ArrayList<>();
trLatest.add(trl);
} else trLatest.add(trl);
geneLatest = (Gene) trl.getParent();
}
}
}
}
/**
* Add transcript information
*/
Transcript addMrna(Feature f, Gene geneLatest) {
if (debug) Gpr.debug("Feature:" + f);
// Convert coordinates to zero-based
int start = f.getStart() - inOffset;
int end = f.getEnd() - inOffset;
// Get gene: Make sure the transcript actually refers to the latest gene.
Gene gene = null;
if ((geneLatest != null) && geneLatest.intersects(start, end)) gene = geneLatest;
else gene = findOrCreateGene(f, chromosome, false); // Find or create gene
// Add transcript
String trId = f.getTranscriptId();
Transcript tr = new Transcript(gene, start, end, f.isComplement(), trId);
// Add exons?
if (f.hasMultipleCoordinates()) {
int exNum = 1;
for (FeatureCoordinates fc : f) {
Exon e = new Exon(tr, fc.start - inOffset, fc.end - inOffset, fc.complement, tr.getId() + "_" + exNum, exNum);
tr.add(e);
exNum++;
}
}
add(tr);
return tr;
}
/**
* Does the transcript match the latest gene?
*/
boolean cdsMatchesGene(Feature fcds, Gene gene) {
if (gene == null) return false;
// Convert coordinates to zero-based
int start = fcds.getStart() - inOffset;
int end = fcds.getEnd() - inOffset;
// CDS cannot be outside gene coordinates
if (start < gene.getStart() || gene.getEnd() < end) return false;
// Sanity check: Do gene names match?
String geneName = fcds.getGeneName();
// Gene names do no match, we are not in the same transcript
return geneName != null //
&& gene != null //
&& gene.getGeneName().equals(geneName) // No match?
;
}
/**
* Does this CDS feature match the latest transcript ?
*/
boolean cdsMatchesTr(Feature fcds, Transcript tr) {
if (tr == null) return false;
// Convert coordinates to zero-based
int start = fcds.getStart() - inOffset;
int end = fcds.getEnd() - inOffset;
// CDS cannot be outside transcript coordinates
if (start < tr.getStart() || tr.getEnd() < end) return false;
// If multiple coordinates are available (and transcript has exons), try to match exons
if (fcds.hasMultipleCoordinates() && !tr.subIntervals().isEmpty()) return cdsMatchesTrExons(fcds, tr);
return true;
}
/**
* Does this CDS feature match the transcript coordinates?
*/
boolean cdsMatchesTrExons(Feature fcds, Transcript tr) {
// Sorted list of exons from CDS
List cdsExons = new ArrayList<>();
for (FeatureCoordinates fc : fcds) {
Exon e = new Exon(tr, fc.start - inOffset, fc.end - inOffset, fc.complement, "", -1);
cdsExons.add(e);
}
Collections.sort(cdsExons);
// Sorted list of exons from transcript
List trExons = new ArrayList<>();
trExons.addAll(tr.subIntervals());
Collections.sort(trExons);
// Do CDS exons match transcript exons?
if (cdsExons.size() > trExons.size()) return false; // Transcript must have at least as many exons as CDS
// Do exons match
return cdsMatchesTrExons(cdsExons, trExons, tr);
}
/**
* Do CDS exon match transcript exons?
* @param cdsExons : List of CDS exons sorted by start position
* @param trExons : List of transcript exons sorted by start position
* @param tr : Transcript
* @return true if the list of CDS exons fits in transcript exons
*/
boolean cdsMatchesTrExons(List cdsExons, List trExons, Transcript tr) {
// Special case: CDS has only one exon
if (cdsExons.size() == 1) {
Exon cdsEx = cdsExons.get(0);
Exon trEx = tr.findExon(cdsEx);
return (trEx != null) && (trEx.includes(cdsEx));
}
int cdsExIdx = 0, trExIdx = 0;
Exon cdsEx = cdsExons.get(cdsExIdx);
Exon trEx = trExons.get(trExIdx);
// Skip transcript exons before CDs
while (cdsEx.getStart() > trEx.getEnd()) {
trExIdx++;
if (trExIdx >= trExons.size()) return false; // We run out of transcript's exons and none matched
trEx = trExons.get(trExIdx);
}
// First CDS exon can differ only on the left side.
if ((trEx.getStart() > cdsEx.getStart()) || (trEx.getEnd() != cdsEx.getEnd())) return false;
// Exons after the first (and before the last one) must match exactly
while (cdsExIdx < cdsExons.size() - 2) {
cdsExIdx++;
trExIdx++;
if (trExIdx >= trExons.size()) return false; // We run out of transcript's exons and none matched
cdsEx = cdsExons.get(cdsExIdx);
trEx = trExons.get(trExIdx);
if ((trEx.getStart() != cdsEx.getStart()) || (trEx.getEnd() != cdsEx.getEnd())) return false;
}
// Compare last CDS exon
cdsExIdx++;
trExIdx++;
if (cdsExIdx >= cdsExons.size()) return true; // Finished comparing
if (trExIdx >= trExons.size()) return false; // We run out of transcript's exons
cdsEx = cdsExons.get(cdsExIdx);
trEx = trExons.get(trExIdx);
// Last CDS exon can differ only on the right side.
if ((trEx.getStart() != cdsEx.getStart()) || (trEx.getEnd() < cdsEx.getEnd())) return false;
return true; // OK, all CDS exons match
}
/**
* Find or create a chromosome name for a feature
*/
String chromoName(Features features, Feature sourceFeature) {
// Try 'chromosome' from SOURCE feature
if (sourceFeature != null) {
if (sourceFeature.getType() != Type.SOURCE) throw new RuntimeException("Cannot find chromosome name in a non-SOURCE feature");
String chrName = sourceFeature.get("chromosome");
if (chrName != null) return chrName;
}
// Try locusName
String chrName = features.getLocusName();
if (chrName != null) return chrName;
return genome.getId();
}
@Override
public SnpEffectPredictor create() {
// Read gene intervals from a file
try {
// Iterate over all features
for (Features features : featuresFile) {
chromosome = null; // Make sure we create a new source for each file
addFeatures(features);
// Some clean-up before reading exon sequences
beforeExonSequences();
// Get exon sequences
String sequence = sequence(features);
addSequences(chromosome.getId(), sequence);
}
// Finish up (fix problems, add missing info, etc.)
finishUp();
} catch (Exception e) {
if (verbose) e.printStackTrace();
throw new RuntimeException("Error reading file '" + fileName + "'\n" + e);
}
return snpEffectPredictor;
}
/**
* Find (or create) a gene from a feature
*/
Gene findOrCreateGene(Feature f, Chromosome chr, boolean warn) {
int start = f.getStart() - inOffset;
int end = f.getEnd() - inOffset;
String geneId = geneId(f, start, end);
String geneName = geneName(f, start, end);
Gene gene = findGene(geneId);
if (gene == null) {
gene = new Gene(chr, start, end, f.isComplement(), geneId, geneName, null);
add(gene);
if (debug) System.err.println("WARNING: Gene '" + geneId + "' not found: created.");
}
return gene;
}
/**
* Find (or create) a transcript for this CDS feature
*/
Transcript findTrForCds(Feature fcds, Gene geneLatest, List trLatest) {
// Try to find the 'latest' gene / transcript
Transcript trLatestMatch = findTrFromLatest(fcds, geneLatest, trLatest);
if (trLatestMatch != null) return trLatestMatch;
// Try to find transcript by id
String trId = fcds.getTranscriptId();
Transcript tr = findTranscript(trId);
if (tr != null) return tr;
// Nothing found, create gene and transcript
int start = fcds.getStart() - inOffset; // Convert coordinates to zero-based
int end = fcds.getEnd() - inOffset;
// Create gene (or use latest)
Gene gene = null;
if (cdsMatchesGene(fcds, geneLatest)) gene = geneLatest;
else gene = findOrCreateGene(fcds, chromosome, false);
// Create transcript
if (trId == null) trId = "Tr_" + start + "_" + end;
tr = findTranscript(trId);
if (tr == null) {
if (debug) System.err.println("Transcript '" + trId + "' not found. Creating new transcript for gene '" + gene.getId() + "'.\n" + fcds);
tr = new Transcript(gene, start, end, fcds.isComplement(), trId);
add(tr);
}
return tr;
}
/**
* Does the CDS feature match the latest gene / transcript?
* @return Matching transcript, null if not found
*/
Transcript findTrFromLatest(Feature fcds, Gene geneLatest, List trLatest) {
if (trLatest == null) return null;
// Does CDS match latest gene?
if (cdsMatchesGene(fcds, geneLatest)) {
// Find first matching transcript without CDS information
for (Transcript tr : trLatest) {
if (tr.getCds().isEmpty() && cdsMatchesTr(fcds, tr)) return tr;
}
}
return null; // No match
}
/**
* Try to get geneIDs
*/
protected String geneId(Feature f, int start, int end) {
// Try 'locus'...
String geneId = f.getGeneId();
if (geneId != null) return geneId;
return "Gene_" + start + "_" + end;
}
/**
* Get gene name from feature
*/
protected String geneName(Feature f, int start, int end) {
// Try 'gene'...
String geneName = f.getGeneName();
if (geneName != null) return geneName;
return "Gene_" + start + "_" + end;
}
@Override
public Map getProteinByTrId() {
return proteinByTrId;
}
/**
* Get sequence either from features or from FASTA file
*/
String sequence(Features features) {
String seq = features.getSequence();
if ((seq != null) && !seq.isEmpty()) return seq;
if (verbose) System.out.println("No sequence found in feature file.");
// No sequence information in 'features' file? => Try to read a sequence from a fasta file
for (String fastaFile : config.getFileListGenomeFasta()) {
if (verbose) System.out.println("\tTrying fasta file '" + fastaFile + "'");
if (Gpr.canRead(fastaFile)) {
seq = GprSeq.fastaSimpleRead(fastaFile);
if ((seq != null) && !seq.isEmpty()) return seq;
}
}
throw new RuntimeException("Cannot find sequence for '" + config.getGenome().getVersion() + "'");
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy