
org.snpeff.snpEffect.factory.SnpEffPredictorFactoryGff Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package org.snpeff.snpEffect.factory;
import java.io.BufferedReader;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import org.snpeff.interval.BioType;
import org.snpeff.interval.Cds;
import org.snpeff.interval.Chromosome;
import org.snpeff.interval.Exon;
import org.snpeff.interval.FrameType;
import org.snpeff.interval.Gene;
import org.snpeff.interval.GffMarker;
import org.snpeff.interval.GffType;
import org.snpeff.interval.IntergenicConserved;
import org.snpeff.interval.IntronConserved;
import org.snpeff.interval.Marker;
import org.snpeff.interval.Transcript;
import org.snpeff.interval.TranscriptSupportLevel;
import org.snpeff.interval.Utr3prime;
import org.snpeff.interval.Utr5prime;
import org.snpeff.snpEffect.Config;
import org.snpeff.snpEffect.SnpEffectPredictor;
import org.snpeff.util.Gpr;
/**
* This class creates a SnpEffectPredictor from a GFF file.
* This includes derived formats as GTF.
*
* References: http://gmod.org/wiki/GFF3
*
* @author pcingola
*/
public abstract class SnpEffPredictorFactoryGff extends SnpEffPredictorFactory {
public static final String FASTA_DELIMITER = "##FASTA";
String version = "";
boolean mainFileHasFasta = false; // Are sequences in the GFF file or in a separate FASTA file?
public SnpEffPredictorFactoryGff(Config config) {
super(config, 1);
markersById = new HashMap<>();
genesById = new HashMap<>();
transcriptsById = new HashMap<>();
fileName = config.getBaseFileNameGenes() + ".gff";
frameCorrection = true;
frameType = FrameType.GFF;
}
/**
* Create a new exon
*/
protected Exon addExon(Transcript tr, GffMarker gffMarker, String exonId) {
int rank = 0; // Rank information is added later
Exon ex = new Exon(tr, gffMarker.getStart(), gffMarker.getEnd(), gffMarker.isStrandMinus(), exonId, rank);
ex.setFrame(gffMarker.getFrame());
return add(ex);
}
/**
* Create and add a new exon
*/
protected List addExons(GffMarker gffMarker) {
String id = gffMarker.getId();
List list = new LinkedList<>();
// Add exon to each parent (can belong to more than one transcript)
String parentIds[] = gffMarker.getGffParentIds();
if (parentIds == null) {
if (debug) warning("Cannot find parent ID for exon:\n\t" + gffMarker);
return null;
}
for (String parentId : parentIds) {
parentId = parentId.trim();
// Exon's parent (should be a transcript)
Transcript tr = findTranscript(parentId, id);
Gene gene = findGene(parentId);
// Is exon's parent a gene instead of a transcript?
if ((tr == null) && (gene != null)) {
// Create a transcript from the gene
String trId = GffType.TRANSCRIPT + "_" + gene.getId(); // Transcript ID
tr = findTranscript(trId);
if (tr == null) {
tr = addTranscript(gene, gffMarker, trId);
if (debug) warning("Exon's parent '" + parentId + "' is a Gene instead of a transcript. Created transcript '" + tr.getId() + "' for this exon.");
}
}
// Try to find the gene
if (gene == null) gene = findGene(gffMarker.getGeneId());
// No transcript found? => Try creating one
if (tr == null) {
// No gene? Create one
if (gene == null) gene = addGene(gffMarker);
// Create transcript
String trId = parentId.isEmpty() ? GffType.TRANSCRIPT + "_" + id : parentId; // Transcript ID
tr = addTranscript(gene, gffMarker, trId);
// Add gene & transcript
if (debug) warning("Cannot find transcript '" + parentId + "'. Created transcript '" + tr.getId() + "' and gene '" + gene.getId() + "' for this exon");
}
// This can be added in different ways
int rank = 0; // Rank information is added later
switch (gffMarker.getGffType()) {
case EXON:
Exon ex = new Exon(tr, gffMarker.getStart(), gffMarker.getEnd(), gffMarker.isStrandMinus(), id, rank);
ex.setFrame(gffMarker.getFrame());
ex = add(ex);
list.add(ex);
break;
case CDS:
Cds cds = new Cds(tr, gffMarker.getStart(), gffMarker.getEnd(), gffMarker.isStrandMinus(), id);
cds.setFrame(gffMarker.getFrame());
add(cds);
break;
case START_CODON:
case STOP_CODON:
ex = new Exon(tr, gffMarker.getStart(), gffMarker.getEnd(), gffMarker.isStrandMinus(), id, rank);
ex.setFrame(gffMarker.getFrame());
add(ex);
cds = new Cds(tr, gffMarker.getStart(), gffMarker.getEnd(), gffMarker.isStrandMinus(), gffMarker.getGffType() + "_" + id);
cds.setFrame(gffMarker.getFrame());
add(cds);
break;
default:
throw new RuntimeException("Unsupported type " + gffMarker.getGffType());
}
}
return list.isEmpty() ? null : list;
}
/**
* Create and add a gene based on GffMarker
*/
protected Gene addGene(GffMarker gffMarker) {
BioType bioType = gffMarker.getGeneBiotype();
Gene gene = new Gene(gffMarker.getChromosome() //
, gffMarker.getStart() //
, gffMarker.getEnd() //
, gffMarker.isStrandMinus() //
, gffMarker.getGeneId() //
, gffMarker.getGeneName() //
, bioType);
add(gene);
return gene;
}
/**
* Add an intergenic conserved region
*/
protected IntergenicConserved addIntergenicConserved(GffMarker gffMarker) {
IntergenicConserved intergenicConserved = new IntergenicConserved(gffMarker.getChromosome(), gffMarker.getStart(), gffMarker.getEnd(), gffMarker.isStrandMinus(), gffMarker.getId());
add(intergenicConserved);
return intergenicConserved;
}
/**
* Add interval based on GffMarker data
* @return true if on success
*/
protected boolean addInterval(GffMarker gffMarker) {
switch (gffMarker.getGffType()) {
case GENE:
// Sanity check: Have we already added this one?
String geneId = gffMarker.getGeneId();
if ((geneId != null) && (findGene(geneId) != null)) {
warning("Gene '" + geneId + "' already added");
return false;
}
return findOrCreateGene(gffMarker) != null;
case TRANSCRIPT:
// Sanity check: Have we already added this one?
String trId = gffMarker.getTranscriptId();
if ((trId != null) && (findTranscript(trId) != null)) {
warning("Transcript '" + trId + "' already added");
return false;
}
return findOrCreateTranscript(gffMarker) != null;
case CDS:
return addExons(gffMarker) != null;
case EXON:
case STOP_CODON:
case START_CODON:
return addExons(gffMarker) != null;
case UTR5:
return addUtr5(gffMarker) != null;
case UTR3:
return addUtr3(gffMarker) != null;
case INTRON_CONSERVED:
return addIntronConserved(gffMarker) != null;
case INTERGENIC_CONSERVED:
return addIntergenicConserved(gffMarker) != null;
default:
return false;
}
}
/**
* Add an intron conserved region
*/
protected IntronConserved addIntronConserved(GffMarker gffMarker) {
String trId = gffMarker.getTranscriptId();
// Find transcript
Transcript tr = findTranscript(trId);
if (tr == null) tr = findTranscript(gffMarker.getId());
if (tr == null) {
warning("Could not find transcript '" + trId + "'");
return null;
}
IntronConserved intronConserved = new IntronConserved(tr, gffMarker.getStart(), gffMarker.getEnd(), gffMarker.isStrandMinus(), gffMarker.getId());
add(intronConserved);
return intronConserved;
}
/**
* Create and add transcript
*/
Transcript addTranscript(Gene gene, GffMarker gffMarker, String trId) {
Transcript tr = new Transcript(gene, gffMarker.getStart(), gffMarker.getEnd(), gffMarker.isStrandMinus(), trId);
// Set protein coding (if available)
if (gffMarker.isProteingCoding()) tr.setProteinCoding(true);
// Biotype
tr.setBioType(gffMarker.getTranscriptBiotype());
// Transcript support level (TSL)
String tslStr = gffMarker.getAttr("transcript_support_level");
if (tslStr != null) tr.setTranscriptSupportLevel(TranscriptSupportLevel.parse(tslStr));
// Transcript version
String ver = gffMarker.getTranscriptVersion();
if (ver != null) tr.setVersion(ver);
// Add transcript
add(tr);
//---parse
// Sanity check and updates for gene
//---
// Update gene bio-type (if needed)
BioType geneBioType = gffMarker.getGeneBiotype();
if (gene.getBioType() == null && geneBioType != null) gene.setBioType(geneBioType);
// Check that gene and transcript are in the same chromosome
if (!gene.getChromosomeName().equals(tr.getChromosomeName())) {
error("Trying to assign Transcript to a Gene in a different chromosome!" //
+ "\n\tTranscript : " + tr.toStr()//
+ "\n\tGene : " + gene.toStr() //
);
}
return tr;
}
/**
* Create new UTR3primes
*/
protected List addUtr3(GffMarker gffMarker) {
List list = new LinkedList<>();
// Add to each parent (can belong to more than one transcript)
for (String parentId : gffMarker.getGffParentIds()) {
// Find exon & transcript
Exon exon = findOrCreateExon(parentId, gffMarker);
if (exon != null) {
Transcript tr = (Transcript) exon.getParent();
// Create UTR
Utr3prime u3 = new Utr3prime(exon, gffMarker.getStart(), gffMarker.getEnd(), gffMarker.isStrandMinus(), gffMarker.getId());
tr.add(u3);
add(u3);
list.add(u3);
} else warning("Could not add UTR");
}
return list.isEmpty() ? null : list;
}
/**
* Create UTR5primes
*/
protected List addUtr5(GffMarker gffMarker) {
List list = new LinkedList<>();
// Add to each parent (can belong to more than one transcript)
for (String parentId : gffMarker.getGffParentIds()) {
// Find exon & transcript
Exon exon = findOrCreateExon(parentId, gffMarker);
if (exon != null) {
Transcript tr = (Transcript) exon.getParent();
// Create UTR
Utr5prime u5 = new Utr5prime(exon, gffMarker.getStart(), gffMarker.getEnd(), gffMarker.isStrandMinus(), gffMarker.getId());
tr.add(u5);
add(u5);
list.add(u5);
} else warning("Could not add UTR");
}
return list.isEmpty() ? null : list;
}
@Override
public SnpEffectPredictor create() {
// Read gene intervals from a file
if (verbose) System.out.println("Reading " + version + " data file : '" + fileName + "'");
try {
readGff();
// Some clean-up before reading exon sequences
beforeExonSequences();
if (readSequences) readExonSequences();
else if (createRandSequences) createRandSequences();
if (verbose) System.out.println("\tTotal: " + totalSeqsAdded + " sequences added, " + totalSeqsIgnored + " sequences ignored.");
// Finish up (fix problems, add missing info, etc.)
finishUp();
if (verbose) System.out.println(config.getGenome());
} catch (Exception e) {
if (verbose) e.printStackTrace();
throw new RuntimeException("Error reading file '" + fileName + "'\n" + e);
}
return snpEffectPredictor;
}
/**
* Find an exon for a given parentId
*/
protected Exon findOrCreateExon(String parentId, GffMarker gffMarker) {
// Get transcript
Transcript tr = findTranscript(parentId);
if (tr == null) tr = findTranscript(gffMarker.getTranscriptId());
if (tr == null) {
warning("Cannot find transcript '" + parentId + "'");
return null;
}
// Find exon using coordinates
Marker utr = new Marker(tr, gffMarker.getStart(), gffMarker.getEnd(), gffMarker.isStrandMinus(), gffMarker.getId());
Exon exon = tr.queryExon(utr);
if (exon != null) return exon;
// Nothing found? Create exon
exon = addExon(tr, gffMarker, gffMarker.getId());
if (debug) warning("Cannot find exon for UTR: '" + utr.getId() + "'. Creating exon '" + gffMarker.getId() + "'");
return exon;
}
/**
* Find or create a gene based on GffMarker
*/
protected Gene findOrCreateGene(GffMarker gffMarker) {
// Find gene
Gene gene = findGene(gffMarker.getGeneId());
if (gene == null) gene = findGene(gffMarker.getId());
// Add gene if needed
if (gene == null) gene = addGene(gffMarker);
return gene;
}
/**
* Create and add a transcript based on GffMarker
*/
protected Transcript findOrCreateTranscript(GffMarker gffMarker) {
// Add transcript
String trId = gffMarker.getTranscriptId();
Transcript tr = findTranscript(trId);
if (tr == null) tr = findTranscript(gffMarker.getId());
if (tr == null) {
// Add gene if needed
Gene gene = findOrCreateGene(gffMarker);
tr = addTranscript(gene, gffMarker, trId);
}
return tr;
}
/**
* Parse a line
* @return true if a line was parsed
*/
protected boolean parse(String line) {
GffMarker gffMarker = new GffMarker(genome, line);
return addInterval(gffMarker);
}
@Override
protected void readExonSequences() {
// Read chromosome sequences and set exon sequences
if (verbose) System.out.print("\tReading sequences :\n");
if (mainFileHasFasta) readExonSequencesGff(fileName); // Read from GFF file (it has a '##FASTA' delimiter)
else super.readExonSequences(); // Read them from FASTA file
}
/**
* Read chromosome sequence from GFF3 file and extract exons' sequences
*/
protected void readExonSequencesGff(String gffFileName) {
try {
BufferedReader reader = Gpr.reader(gffFileName);
// Get to fasta part of the file
for (lineNum = 1; reader.ready(); lineNum++) {
line = reader.readLine();
if (line.equals(FASTA_DELIMITER)) {
mainFileHasFasta = true;
break;
}
}
// Read fasta sequence
String chromoName = null;
StringBuffer chromoSb = new StringBuffer();
for (; reader.ready(); lineNum++) {
line = reader.readLine();
if (line.startsWith(">")) { // New fasta sequence
// Set chromosome sequences and length (create it if it doesn't exist)
if (chromoName != null) addSequences(chromoName, chromoSb.toString()); // Add all sequences
// Get sequence name
int idxSpace = line.indexOf(' ');
if (idxSpace > 0) line = line.substring(0, idxSpace);
chromoName = Chromosome.simpleName(line.substring(1).trim()); // New chromosome name
chromoNamesReference.add(chromoName);
// Initialize buffer
chromoSb = new StringBuffer();
if (verbose) System.out.println("\t\tReading sequence '" + chromoName + "'");
} else chromoSb.append(line.trim());
}
// Last chromosome
// Set chromosome sequneces and length (create it if it doesn't exist)
if (chromoName != null) addSequences(chromoName, chromoSb.toString()); // Add all sequences
else warning("Ignoring sequences for '" + chromoName + "'. Cannot find chromosome"); // Chromosome not found
reader.close();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/**
* Read GFF file from the beginning looking for 'typeToRead' elements
*/
protected void readGff() throws Exception {
int count = 0;
BufferedReader reader = Gpr.reader(fileName);
if (reader == null) return; // Error
// Parsing GFF3 (reference: http://gmod.org/wiki/GFF3)
try {
for (lineNum = 1; reader.ready(); lineNum++) {
line = reader.readLine();
// Are we done?
if (line == null || line.isEmpty()) {
// Ignore
} else if (line.equals(FASTA_DELIMITER)) {
mainFileHasFasta = true;
break;
} else if (line.startsWith("#")) {
// Ignore this line
} else if (parse(line)) {
count++;
if (verbose) Gpr.showMark(count, MARK, "\t\t");
}
}
} catch (Exception e) {
error("Offending line (lineNum: " + lineNum + "): '" + line + "'", e);
}
reader.close();
if (verbose) System.out.println((count > 0 ? "\n" : "") + "\tTotal: " + count + " markers added.");
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy