
org.snpeff.snpEffect.commandLine.SnpEffCmdCds Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package org.snpeff.snpEffect.commandLine;
import java.util.HashMap;
import java.util.List;
import org.snpeff.SnpEff;
import org.snpeff.align.SmithWaterman;
import org.snpeff.codons.CodonTable;
import org.snpeff.codons.CodonTables;
import org.snpeff.fileIterator.FastaFileIterator;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Transcript;
import org.snpeff.snpEffect.Config;
import org.snpeff.util.Gpr;
import org.snpeff.util.Timer;
/**
* Command line: Calculate coding sequences from a file and compare them to the ones calculated from our data structures
*
* Note: This is done in order to see potential incompatibility
* errors between genome sequence and annotation.
*
* @author pcingola
*/
public class SnpEffCmdCds extends SnpEff {
public static boolean onlyOneError = false; // This is used in some test-cases
public static double maxErrorPercentage = 0.01; // Maximum allowed error is 1% (otherwise test fails)
public static int MAX_ALIGN_LENGTH = 33000;
boolean storeAlignments; // Store alignments (used for some test cases)
boolean checkNumOk = true;
int totalErrors = 0;
int totalOk = 0;
int totalWarnings = 0;
int totalNotFound = 0;
String cdsFile = "";
HashMap cdsByTrId;
public SnpEffCmdCds() {
}
public SnpEffCmdCds(Config config) {
this.config = config;
cdsFile = config.getFileNameCds();
}
public SnpEffCmdCds(String genomeVer, String configFile, String cdsFile) {
this.configFile = configFile;
this.genomeVer = genomeVer;
this.cdsFile = cdsFile;
}
void add(String trId, String seq, int lineNum, boolean check) {
// Repeated transcript Id? => Check that Protein is the same
if (check && (cdsByTrId.get(trId) != null) && (!cdsByTrId.get(trId).equals(seq))) //
System.err.println("ERROR: Different CDS for the same transcript ID. This should never happen!!!"//
+ "\n\tLine number : " + lineNum //
+ "\n\tTranscript ID : '" + trId + "'"//
+ "\n\tProtein : " + cdsByTrId.get(trId) //
+ "\n\tProtein (new) : " + seq //
);
// Use whole trId
cdsByTrId.put(trId, seq); // Add it to the hash
if (debug) Gpr.debug("Adding cdsByTrId{'" + trId + "'} :\t" + seq);
}
/**
* Compare all CDS
*/
double cdsCompare() {
int i = 0;
if (verbose) {
// Show labels
System.err.println("\tLabels:");
System.err.println("\t\t'+' : OK");
System.err.println("\t\t'.' : Missing");
System.err.println("\t\t'*' : Error");
System.err.print("\t");
}
// Compare all genes
for (Gene gene : config.getGenome().getGenes())
for (Transcript tr : gene) {
char status = ' ';
boolean ok = false;
String cds = tr.cds().toUpperCase();
String mRna = tr.mRna().toUpperCase();
String cdsReference = cdsByTrId.get(tr.getId());
if (cdsReference != null) cdsReference = cdsReference.toUpperCase();
if (cdsReference == null) {
status = '.';
if (debug) System.err.println("\nWARNING:Cannot find reference CDS for transcript '" + tr.getId() + "'");
} else if (cds.isEmpty()) {
status = '.';
if (debug) System.err.println("\nWARNING:Empty CDS for transcript '" + tr.getId() + "'");
} else if (cds.equals(cdsReference)) {
status = '+';
// Sanity check: Start and stop codons
if ((cds != null) && (cds.length() >= 3)) {
CodonTable ctable = CodonTables.getInstance().getTable(config.getGenome(), tr.getChromosomeName());
// Check start codon
String startCodon = cds.substring(0, 3);
if (!ctable.isStart(startCodon)) {
if (debug) System.err.println("\nWARNING: CDS for transcript '" + tr.getId() + "' does not start with a start codon:\t" + startCodon + "\t" + cds);
totalWarnings++;
}
// Check stop codon
String stopCodon = cds.substring(cds.length() - 3, cds.length());
if (!ctable.isStop(stopCodon)) {
if (debug) System.err.println("\nWARNING: CDS for transcript '" + tr.getId() + "' does not end with a stop codon:\t" + stopCodon + "\t" + cds);
totalWarnings++;
}
}
} else if (mRna.equals(cdsReference)) { // May be the file has mRNA instead of CDS?
status = '+';
} else if ((mRna.length() < cdsReference.length()) // CDS longer than mRNA? May be it is actually an mRNA + poly-A tail (instead of a CDS)
&& cdsReference.substring(mRna.length()).replace('A', ' ').trim().isEmpty() // May be it is an mRNA and it has a ploy-A tail added
&& cdsReference.substring(0, mRna.length()).equals(mRna) // Compare cutting poly-A tail
) {
// OK, it was a mRNA + polyA
status = '+';
} else if ((mRna.length() > cdsReference.length()) // PolyA in the reference?
&& mRna.substring(cdsReference.length()).replace('A', ' ').trim().isEmpty() //
&& mRna.substring(0, cdsReference.length()).equals(mRna) //
) {
// OK, it was a mRNA + polyA
status = '+';
} else if (cdsReference.indexOf(cds) >= 0) { // CDS fully included in reference?
status = '+';
} else {
status = '*';
if (debug || storeAlignments || onlyOneError) {
// Create a string indicating differences
SmithWaterman sw = new SmithWaterman(cds, cdsReference);
if (Math.max(cds.length(), cdsReference.length()) < MAX_ALIGN_LENGTH) sw.align();
int maxScore = Math.min(cds.length(), cdsReference.length());
int score = sw.getAlignmentScore();
if (debug || onlyOneError) {
System.err.println("\nERROR: CDSs do not match for transcript " + tr.getId() //
+ "\tStrand:" + tr.isStrandMinus()//
+ "\tExons: " + tr.numChilds() //
+ "\n" //
+ String.format("\tSnpEff CDS (%6d) : '%s'\n", cds.length(), cds.toLowerCase()) //
+ String.format("\tReference (%6d) : '%s'\n", cdsReference.length(), cdsReference.toLowerCase()) //
+ "\tAlignment (Snpeff CDS vs Reference CDS)." //
+ "\tScore: " + score //
+ "\tMax. possible score: " + maxScore //
+ "\tDiff: " + (maxScore - score) //
+ "\n" + sw //
);
System.err.println("Transcript details:\n" + tr);
}
if (onlyOneError) {
System.err.println("Transcript details:\n" + tr);
throw new RuntimeException("Showing only one error!");
}
}
}
// Update counters
switch (status) {
case '.':
totalNotFound++;
break;
case '+':
totalOk++;
ok = true;
break;
case '*':
totalErrors++;
break;
case ' ':
break;
default:
throw new RuntimeException("Unknown status '" + status + "'");
}
// Update transcript DnaCheck status
if (ok) tr.setDnaCheck(true);
// Show a mark
if (verbose && (status != ' ')) {
if (status == '.' && !tr.isProteinCoding()) {
// OK, may be this is a non-protein coding transcript and that's why we could not find the CDS
} else {
System.out.print(status);
i++;
if (i % 100 == 0) System.out.print("\n\t");
}
}
}
double perc = ((double) totalErrors) / ((double) (totalErrors + totalOk));
System.out.println("\n\tCDS check:\t" //
+ config.getGenome().getVersion() //
+ "\tOK: " + totalOk //
+ "\tWarnings: " + totalWarnings //
+ "\tNot found: " + totalNotFound //
+ "\tErrors: " + totalErrors //
+ "\tError percentage: " + (100 * perc) + "%" //
);
// Sanity check
if (checkNumOk && totalOk <= 0) fatalErrorNoTranscriptsChecked();
return perc;
}
/**
* Show an error message that actually helps to solve the problem
*/
void fatalErrorNoTranscriptsChecked() {
StringBuilder sb = new StringBuilder();
// Show some transcript IDs
int maxTrIds = 20;
sb.append("Transcript IDs from database (sample):\n" + sampleTrIds(maxTrIds));
sb.append("Transcript IDs from database (fasta file):\n" + sampleTrIdsFasta(maxTrIds));
fatalError("No CDS checked. This is might be caused by differences in FASTA file transcript IDs respect to database's transcript's IDs.\n" + sb);
}
/**
* Parse command line arguments
*/
@Override
public void parseArgs(String[] args) {
for (int i = 0; i < args.length; i++) {
// Argument starts with '-'?
if (args[i].startsWith("-")) {
if ((args[i].equals("-c") || args[i].equalsIgnoreCase("-config"))) {
if ((i + 1) < args.length) configFile = args[++i];
else usage("Option '-c' without config file argument");
} else if (args[i].equals("-v") || args[i].equalsIgnoreCase("-debug")) {
debug = true;
} else usage("Unknown option '" + args[i] + "'");
} else if (genomeVer.isEmpty()) genomeVer = args[i];
else if (cdsFile.isEmpty()) cdsFile = args[i];
else usage("Unknown parameter '" + args[i] + "'");
}
// Check: Do we have all required parameters?
if (genomeVer.isEmpty()) usage("Missing genomer_version parameter");
if (cdsFile.isEmpty()) usage("Missing cds_file parameter");
}
/**
* Read a file that has all CDS
*/
void readCdsFile() {
cdsByTrId = new HashMap<>();
if (cdsFile.endsWith("txt") || cdsFile.endsWith("txt.gz")) readCdsFileTxt();
else readCdsFileFasta();
if (cdsByTrId.isEmpty()) fatalError("CDS file is empty!");
}
/**
* Read CDSs from a file
* Format: Tab-separated format, containing "sequence \t transcriptId"
*/
void readCdsFileFasta() {
// Load file
FastaFileIterator ffi = new FastaFileIterator(cdsFile);
for (String seq : ffi) {
String trId = ffi.getName();
add(trId, seq, ffi.getLineNum(), true);
// Also try processing header line using different separators
List ids = ffi.fastaHeader2Ids();
for (String id : ids) {
// We don't check for uniqueness here since many items in this
// list are tokens that are expected to be repeated
add(id, seq, ffi.getLineNum(), false);
}
}
}
/**
* Read CDSs from a file
* Format: Tab-separated format, containing "sequence \t transcriptId"
*/
void readCdsFileTxt() {
// Load file
String cdsData = Gpr.readFile(cdsFile);
String cdsLines[] = cdsData.split("\n");
// Parse each line
int lineNum = 1;
for (String cdsLine : cdsLines) {
// Split tab separated fields
String field[] = cdsLine.split("\\s+");
// Parse fields
if (field.length >= 2) {
// OK Parse fields
String seq = field[1].trim();
String trId = field[0].trim();
// Repeated transcript Id? => Check that CDS is the same
if ((cdsByTrId.get(trId) != null) && (!cdsByTrId.get(trId).equals(seq))) System.err.println("ERROR: Different CDS for the same transcript ID. This should never happen!!!\n\tLine number: " + lineNum + "\n\tTranscript ID:\t" + trId + "\n\tCDS:\t\t" + cdsByTrId.get(trId) + "\n\tCDS (new):\t" + seq);
cdsByTrId.put(trId, seq); // Add it to the hash
}
lineNum++;
}
}
/**
* Run command
*/
@Override
public boolean run() {
if (verbose) Timer.showStdErr("Checking database using CDS sequences");
// Load config
if (config == null) loadConfig();
// Read CDS form file
if (verbose) Timer.showStdErr("Reading CDSs from file '" + cdsFile + "'...");
readCdsFile(); // Load CDS
if (verbose) Timer.showStdErr("done (" + cdsByTrId.size() + " CDSs).");
// Load predictor
if (config.getSnpEffectPredictor() == null) loadDb();
// Compare CDS
if (verbose) Timer.showStdErr("Comparing CDS...");
cdsCompare();
if (verbose) Timer.showStdErr("done");
return true;
}
/**
* Show same Transcript IDs
*/
String sampleTrIds(int maxTrIds) {
StringBuilder sb = new StringBuilder();
int count = 0;
for (Gene g : config.getGenome().getGenes())
for (Transcript tr : g) {
sb.append("\t'" + tr.getId() + "'\n");
if (count++ > maxTrIds) return sb.toString();
}
return sb.toString();
}
/**
* Show same Transcript IDs from FASTA file
*/
String sampleTrIdsFasta(int maxTrIds) {
StringBuilder sb = new StringBuilder();
int count = 0;
for (String trid : cdsByTrId.keySet()) {
sb.append("\t'" + trid + "'\n");
if (count++ > maxTrIds) return sb.toString();
}
return sb.toString();
}
public void setCheckNumOk(boolean checkNumOk) {
this.checkNumOk = checkNumOk;
}
public void setStoreAlignments(boolean storeAlignments) {
this.storeAlignments = storeAlignments;
}
/**
* Show usage and exit
*/
@Override
public void usage(String message) {
if (message != null) System.err.println("Error: " + message + "\n");
System.err.println("snpEff version " + SnpEff.VERSION);
System.err.println("Usage: snpEff cds [options] genome_version cds_file");
System.err.println("\nOptions:");
System.err.println("\t-c , -config : Specify config file");
System.err.println("\t-noLog : Do not report usage statistics to server");
System.err.println("\t-v : Verbose mode");
System.err.println("\t-d : Debug mode");
System.exit(-1);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy