
org.snpeff.snpEffect.factory.SnpEffPredictorFactoryRefSeq Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package org.snpeff.snpEffect.factory;
import java.io.BufferedReader;
import java.util.List;
import org.snpeff.collections.MultivalueHashMap;
import org.snpeff.interval.BioType;
import org.snpeff.interval.Cds;
import org.snpeff.interval.Chromosome;
import org.snpeff.interval.Exon;
import org.snpeff.interval.FrameType;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Marker;
import org.snpeff.interval.Transcript;
import org.snpeff.snpEffect.Config;
import org.snpeff.snpEffect.SnpEffectPredictor;
import org.snpeff.util.Gpr;
/**
* This class creates a SnpEffectPredictor from a TXT file dumped using UCSC table browser
*
* RefSeq table schema: http://genome.ucsc.edu/cgi-bin/hgTables
*
* field example SQL type info description
* bin 585 smallint(5) range Indexing field to speed chromosome range queries.
* name NR_026818 varchar(255) values Name of gene (usually transcript_id from GTF)
* chrom chr1 varchar(255) values Reference sequence chromosome or scaffold
* strand - char(1) values + or - for strand
* txStart 34610 int(10) range Transcription start position
* txEnd 36081 int(10) range Transcription end position
* cdsStart 36081 int(10) range Coding region start
* cdsEnd 36081 int(10) range Coding region end
* exonCount 3 int(10) range Number of exons
* exonStarts 34610,35276,35720, longblob Exon start positions
* exonEnds 35174,35481,36081, longblob Exon end positions
* score 0 int(11) range
* name2 FAM138A varchar(255) values Alternate name (e.g. gene_id from GTF)
* cdsStartStat unk enum('none', 'unk', 'incmpl', 'cmpl') values enum('none','unk','incmpl','cmpl')
* cdsEndStat unk enum('none', 'unk', 'incmpl', 'cmpl') values enum('none','unk','incmpl','cmpl')
* exonFrames -1,-1,-1, longblob Exon frame {0,1,2}, or -1 if no frame for exon
*
* Refseq Accession format (i.e. NM_ NR_ codes) : http://www.ncbi.nlm.nih.gov/RefSeq/key.html
*
* Accession Molecule Method Note
* AC_123456 Genomic Mixed Alternate complete genomic molecule. This prefix is used for records that are provided to reflect an alternate assembly or annotation. Primarily used for viral, prokaryotic records.
* AP_123456 Protein Mixed Protein products; alternate protein record. This prefix is used for records that are provided to reflect an alternate assembly or annotation. The AP_ prefix was originally designated for bacterial proteins but this usage was changed.
* NC_123456 Genomic Mixed Complete genomic molecules including genomes, chromosomes, organelles, plasmids.
* NG_123456 Genomic Mixed Incomplete genomic region; supplied to support the NCBI genome annotation pipeline. Represents either non-transcribed pseudogenes, or larger regions representing a gene cluster that is difficult to annotate via automatic methods.
* NM_123456789 mRNA Mixed Transcript products; mature messenger RNA (mRNA) transcripts.
* NP_123456789 Protein Mixed Protein products; primarily full-length precursor products but may include some partial proteins and mature peptide products.
* NR_123456 RNA Mixed Non-coding transcripts including structural RNAs, transcribed pseudogenes, and others.
* NT_123456 Genomic Automated Intermediate genomic assemblies of BAC and/or Whole Genome Shotgun sequence data.
* NW_123456789 Genomic Automated Intermediate genomic assemblies of BAC or Whole Genome Shotgun sequence data.
* NZ_ABCD12345678 Genomic Automated A collection of whole genome shotgun sequence data for a project. Accessions are not tracked between releases. The first four characters following the underscore (e.g. 'ABCD') identifies a genome project.
* XM_123456789 mRNA Automated Transcript products; model mRNA provided by a genome annotation process; sequence corresponds to the genomic contig.
* XP_123456789 Protein Automated Protein products; model proteins provided by a genome annotation process; sequence corresponds to the genomic contig.
* XR_123456 RNA Automated Transcript products; model non-coding transcripts provided by a genome annotation process; sequence corresponds to the genomic contig.
* YP_123456789 Protein Mixed Protein products; no corresponding transcript record provided. Primarily used for bacterial, viral, and mitochondrial records.
* ZP_12345678 Protein Automated Protein products; annotated on NZ_ accessions (often via computational methods).
* NS_123456 Genomic Automated Genomic records that represent an assembly which does not reflect the structure of a real biological molecule. The assembly may represent an unordered assembly of unplaced scaffolds, or it may represent an assembly of DNA sequences generated from a biological sample that may not represent a single organism.
*
* $ zcat genes.txt.gz | cut -f 2 | cut -b 1,2 | sort | uniq -c
* 34466 NM
* 6548 NR
*
* @author pcingola
*/
public class SnpEffPredictorFactoryRefSeq extends SnpEffPredictorFactory {
public static final String CDS_STAT_COMPLETE = "cmpl";
MultivalueHashMap genesByName;
public SnpEffPredictorFactoryRefSeq(Config config) {
super(config, 0); // Zero values coordinates
genesByName = new MultivalueHashMap<>();
frameType = FrameType.UCSC;
frameCorrection = true;
}
/**
* Get biotype based on ID
* Reference http://www.ncbi.nlm.nih.gov/RefSeq/key.html
*/
BioType bioType(String id) {
if (id.length() < 2) return null;
String key = id.substring(0, 2);
switch (key) {
case "NM":
case "NP":
case "XM":
case "XP":
case "YP":
case "ZP":
return BioType.protein_coding;
case "AC":
case "AP":
case "NC":
case "NG":
case "NR":
case "NT":
case "NW":
case "NZ":
case "XR":
case "NS":
return BioType.pseudogene;
default:
return null;
}
}
@Override
public SnpEffectPredictor create() {
try {
// Read gene intervals from a file
if (fileName == null) fileName = config.getBaseFileNameGenes() + ".refseq";
if (verbose) System.out.print("Reading gene intervals file : '" + fileName + "'\n\t\t");
readRefSeqFile(); // Read gene info
beforeExonSequences(); // Some clean-up before readng exon sequences
// Read chromosome sequences and set exon sequences
if (readSequences) readExonSequences();
else if (createRandSequences) createRandSequences();
finishUp(); // Perform adjustments
if (verbose) System.out.println(config.getGenome());
} catch (Exception e) {
if (verbose) e.printStackTrace();
throw new RuntimeException("Error reading file '" + fileName + "'\n" + e);
}
return snpEffectPredictor;
}
/**
* Find (or create) a gene for this transcript
*/
Gene findOrCreateGene(String geneName, String trId, Chromosome chromo, int start, int end, boolean strandMinus) {
Marker tr = new Marker(chromo, start, end, strandMinus, trId);
List genes = genesByName.get(geneName);
int geneIndex = 0;
if (genes != null) {
for (Gene gene : genes) {
if (gene.intersects(tr)) {
// Do we need to update gene length?
if (start < gene.getStart()) gene.setStart(start);
if (gene.getEnd() < end) gene.setEnd(end);
return gene;
}
}
geneIndex = genes.size() + 1;
}
// Need to create a new gene
String geneId = geneName + (geneIndex > 0 ? "." + geneIndex : "");
Gene gene = new Gene(chromo, start, end, strandMinus, geneId, geneName, bioType(trId));
genesByName.add(geneName, gene);
add(gene);
return gene;
}
/**
* Is this protein coding?
*/
boolean isProteinCoding(String id) {
BioType biotype = bioType(id);
return biotype != null && biotype.isProteinCoding();
}
/**
* Read and parse RefSeq file
*/
protected void readRefSeqFile() {
try {
int count = 0;
BufferedReader reader = Gpr.reader(fileName);
if (reader == null) return; // Error
for (lineNum = 1; reader.ready(); lineNum++) {
line = reader.readLine();
// Skip headers
if ((lineNum == 1) || line.startsWith("#")) continue;
String fields[] = line.split("\t");
if (fields.length >= 16) {
// Parse fields
// String bin= fields[0]; // Not used
String id = fields[1];
String chromoName = fields[2];
boolean strandMinus = fields[3].equals("-");
int txstart = parsePosition(fields[4]);
int txend = parsePosition(fields[5]) - 1; // Our internal database representations of coordinates always have a zero-based start and a one-based end (Reference: http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 )
int cdsStart = parsePosition(fields[6]);
int cdsEnd = parsePosition(fields[7]) - 1; // Our internal database representations of coordinates always have a zero-based start and a one-based end (Reference: http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 )
int exonCount = Gpr.parseIntSafe(fields[8]);
String exonStarts = fields[9];
String exonEnds = fields[10]; // Our internal database representations of coordinates always have a zero-based start and a one-based end (Reference: http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 )
// String score= fields[11]; // Not used
String geneName = fields[12];
String cdsStartStat = fields[13];
String cdsEndStat = fields[14];
String exonFrames = fields[15];
//---
// Create
//----
Chromosome chromo = getOrCreateChromosome(chromoName);
// Create IDs
String trId = uniqueTrId(id);
// Get or create gene
Gene gene = findOrCreateGene(geneName, trId, chromo, txstart, txend, strandMinus);
// Create transcript
Transcript tr = new Transcript(gene, txstart, txend, strandMinus, trId);
boolean markAsCoding = isProteinCoding(trId) && cdsStartStat.equals(CDS_STAT_COMPLETE) && cdsEndStat.equals(CDS_STAT_COMPLETE); // If CDS start or end are not 'complete', don't mark as protein coding
tr.setProteinCoding(markAsCoding);
add(tr);
// Add Exons and CDS
String exStartStr[] = exonStarts.split(",");
String exEndStr[] = exonEnds.split(",");
String exFrameStr[] = exonFrames.split(",");
for (int i = 0; i < exonCount; i++) {
// Exons
int exStart = parsePosition(exStartStr[i]);
int exEnd = parsePosition(exEndStr[i]) - 1; // Our internal database representations of coordinates always have a zero-based start and a one-based end (Reference: http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 )
int exFrame = Gpr.parseIntSafe(exFrameStr[i]);
String exId = trId + ".ex." + (i + 1);
Exon ex = new Exon(tr, exStart, exEnd, strandMinus, exId, i);
exFrame = frameType.convertFrame(exFrame);
ex.setFrame(exFrame);
ex = add(ex);
// CDS (ony if intersects)
if ((exStart <= cdsEnd) && (exEnd >= cdsStart)) {
Cds cds = new Cds(tr, Math.max(cdsStart, exStart), Math.min(cdsEnd, exEnd), strandMinus, exId);
add(cds);
}
}
count++;
if (verbose) Gpr.showMark(count, MARK, "\t\t");
}
}
reader.close();
} catch (Exception e) {
Gpr.debug("Offending line (lineNum: " + lineNum + "): '" + line + "'");
throw new RuntimeException(e);
}
}
/**
* Create a new (unique) transcript ID
*/
String uniqueTrId(String id) {
if (!transcriptsById.containsKey(id)) return id;
for (int i = 2; true; i++) {
String trId = id + "." + i;
if (!transcriptsById.containsKey(trId)) return trId;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy