
org.snpeff.snpEffect.commandLine.SnpEffCmdProtein Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package org.snpeff.snpEffect.commandLine;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.snpeff.SnpEff;
import org.snpeff.align.SmithWaterman;
import org.snpeff.codons.CodonTable;
import org.snpeff.codons.CodonTables;
import org.snpeff.collections.AutoHashMap;
import org.snpeff.fileIterator.FastaFileIterator;
import org.snpeff.genBank.EmblFile;
import org.snpeff.genBank.Feature;
import org.snpeff.genBank.Feature.Type;
import org.snpeff.genBank.Features;
import org.snpeff.genBank.FeaturesFile;
import org.snpeff.genBank.GenBankFile;
import org.snpeff.interval.Chromosome;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Transcript;
import org.snpeff.snpEffect.Config;
import org.snpeff.snpEffect.factory.SnpEffPredictorFactoryEmbl;
import org.snpeff.snpEffect.factory.SnpEffPredictorFactoryGenBank;
import org.snpeff.util.Gpr;
import org.snpeff.util.Timer;
/**
* Command line: Read protein sequences from a file and compare them to the ones calculated from our data structures
*
* Note: This is done in order to see potential incompatibility
* errors between genome sequence and annotation.
*
* @author pcingola
*/
public class SnpEffCmdProtein extends SnpEff {
public static boolean onlyOneError = false; // This is used in some test-cases
public static double MAX_ERROR_RATE = 0.05; // Maximum allowed error is 1% (otherwise test fails)
boolean checkNumOk = true;
boolean codonTables;
boolean storeAlignments; // Store alignments (used for some test cases)
int totalErrors = 0;
int totalOk = 0;
int totalWarnings = 0;
int totalNotFound = 0;
String configFile = Config.DEFAULT_CONFIG_FILE;
String proteinFile = "";
Map proteinByTrId;
AutoHashMap> trByChromo;
HashMap alignmentByTrId = new HashMap<>();
/**
* Count number of differences between strings
*/
public static int diffCount(String s1, String s2) {
int minLen = Math.min(s1.length(), s2.length());
int count = 0;
for (int j = 0; j < minLen; j++)
if (s1.charAt(j) != s2.charAt(j)) count++;
return count;
}
/**
* Show difference between two strings
*/
public static String diffStr(String s1, String s2) {
// Create a string indicating differences
int minLen = Math.min(s1.length(), s2.length());
char diff[] = new char[minLen];
for (int j = 0; j < minLen; j++) {
if (s1.charAt(j) != s2.charAt(j)) {
diff[j] = '|';
} else diff[j] = ' ';
}
return new String(diff);
}
public SnpEffCmdProtein() {
}
public SnpEffCmdProtein(Config config) {
this.config = config;
proteinFile = config.getFileNameProteins();
}
public SnpEffCmdProtein(Config config, String proteinFile) {
this.config = config;
this.proteinFile = proteinFile;
}
public SnpEffCmdProtein(String genomeVer, String configFile, String proteinFile) {
this.configFile = configFile;
this.genomeVer = genomeVer;
this.proteinFile = proteinFile;
}
void add(String trId, String seq, int lineNum, boolean check) {
// Repeated transcript Id? => Check that Protein is the same
if (check && (proteinByTrId.get(trId) != null) && (!proteinByTrId.get(trId).equals(seq))) //
System.err.println("ERROR: Different protein for the same transcript ID. This should never happen!!!"//
+ "\n\tLine number : " + lineNum //
+ "\n\tTranscript ID : '" + trId + "'"//
+ "\n\tProtein : " + proteinByTrId.get(trId) //
+ "\n\tProtein (new) : " + seq //
);
// Use whole trId
proteinByTrId.put(trId, seq); // Add it to the hash
if (debug) Gpr.debug("Adding proteinByTrId{'" + trId + "'} :\t" + seq);
}
/**
* Check proteins using all possible codon tables
*/
void checkCodonTables() {
if (verbose) Timer.showStdErr("Comparing Proteins...");
createTrByChromo(); // Create lists of transcripts by chromosome
// For each chromosome...
for (Chromosome chromo : genome) {
String chr = chromo.getId();
// Check against each codon table
for (CodonTable codonTable : CodonTables.getInstance()) {
setCodonTable(chromo, codonTable);
proteinCompare(chr, false, false);
}
}
if (verbose) Timer.showStdErr("done");
}
/**
* Check proteins
*/
void checkProteins() {
if (verbose) Timer.showStdErr("Comparing Proteins...");
if (codonTables) {
// Compare proteins using ALL codon tables
checkCodonTables();
} else {
// Compare proteins
proteinCompare(null, true, true);
}
}
void createTrByChromo() {
trByChromo = new AutoHashMap<>(new ArrayList());
for (Gene gene : genome.getGenes()) {
for (Transcript tr : gene) {
String chr = tr.getChromosomeName();
trByChromo.getOrCreate(chr).add(tr);
}
}
}
/**
* Compare two protein sequences
*/
boolean equals(String protein, String proteinRef) {
if (protein.isEmpty() && proteinRef.isEmpty()) return true;
protein = proteinFormat(protein);
proteinRef = proteinFormat(proteinRef);
if (protein.equals(proteinRef)) return true;
// Remove last AA in Ref sequence (e.g. when last AA in reference by is 'unknown')
String refUnk = "";
if (proteinRef.length() > 0) {
refUnk = proteinRef.substring(0, proteinRef.length() - 1);
if (protein.equals(refUnk)) return true;
}
// Replace First AA by 'Met'? Start codon may be translated as Met even if it normally encodes other AA.
// Compare everything but start codon.
String proteinNoStart = "", proteinRefNoStart = "";
if ((protein.length() > 0) && (proteinRef.length() > 0)) {
proteinNoStart = protein.substring(1);
proteinRefNoStart = proteinRef.substring(1);
if (proteinNoStart.equals(proteinRefNoStart)) return true;
}
// Rare amino acid translations (U)
String proteinU = protein.replace('*', 'U');
if (proteinU.equals(proteinRef) || proteinU.equals(refUnk)) return true;
String proteinNoStartU = proteinNoStart.replace('*', 'U');
if (proteinNoStartU.equals(proteinRefNoStart)) return true;
String proteinUnknownAa = replaceUnknownAa(proteinNoStartU, proteinRefNoStart);
if (proteinUnknownAa.equals(proteinRefNoStart)) return true;
return false;
}
/**
* Show an error message that actually helps to solve the problem
*/
void fatalErrorNoTranscriptsChecked() {
StringBuilder sb = new StringBuilder();
// Show some transcript IDs
int maxTrIds = 20;
sb.append("Transcript IDs from database (sample):\n" + sampleTrIds(maxTrIds));
sb.append("Transcript IDs from database (fasta file):\n" + sampleTrIdsFasta(maxTrIds));
fatalError("No proteins checked. This is might be caused by differences in FASTA file transcript IDs respect to database's transcript's IDs.\n" + sb);
}
public HashMap getAlignmentByTrId() {
return alignmentByTrId;
}
public int getTotalErrors() {
return totalErrors;
}
public int getTotalOk() {
return totalOk;
}
/**
* Parse command line arguments
*/
@Override
public void parseArgs(String[] args) {
for (int i = 0; i < args.length; i++) {
String arg = args[i];
// Argument starts with '-'?
if (isOpt(arg)) {
if (arg.equalsIgnoreCase("-codonTables")) codonTables = true;
else usage("Unknown option '" + arg + "'"); // Options
} else if (genomeVer.isEmpty()) genomeVer = arg;
else if (proteinFile.isEmpty()) proteinFile = arg;
else usage("Unknown parameter '" + arg + "'");
}
// Check: Do we have all required parameters?
if (genomeVer.isEmpty()) usage("Missing genomer_version parameter");
if (proteinFile.isEmpty()) usage("Missing protein_file parameter");
}
/**
* Compare list of proteins
*/
double proteinCompare(String chr, boolean addTotals, boolean updateTranscriptAaCheck) {
List trList = null;
// No chromosome name specified? => Use all transcripts
if (chr == null) {
trList = new ArrayList<>();
for (Gene g : genome.getGenes())
for (Transcript tr : g)
trList.add(tr);
} else trList = trByChromo.get(chr);
// No transcripts in the list? We are done
if (trList.isEmpty()) return 0;
int i = 1;
if (verbose) {
// Show labels
System.err.println("\tLabels:");
System.err.println("\t\t'+' : OK");
System.err.println("\t\t'.' : Missing");
System.err.println("\t\t'*' : Error");
System.out.print((chr != null ? chr : "") + "\t");
}
// Check each transcript
int countNotFound = 0, countOk = 0, countErrors = 0;
for (Transcript tr : trList) {
char status = ' ';
String protein = tr.protein();
String proteinReference = proteinByTrId.get(tr.getId());
if (proteinReference == null) {
if (tr.isProteinCoding()) {
status = '.';
if (debug) System.err.println("\nWARNING:Cannot find Protein for transcript " + tr.getId());
}
} else if (equals(protein, proteinReference)) {
status = '+';
} else {
status = '*';
if (debug || storeAlignments || onlyOneError) {
protein = proteinFormat(protein);
proteinReference = proteinFormat(proteinReference);
SmithWaterman sw = new SmithWaterman(protein, proteinReference);
if (Math.max(protein.length(), proteinReference.length()) < SnpEffCmdCds.MAX_ALIGN_LENGTH) sw.align();
if (storeAlignments) alignmentByTrId.put(tr.getId(), sw);
int maxScore = Math.min(protein.length(), proteinReference.length());
int score = sw.getAlignmentScore();
if (debug || onlyOneError) {
System.err.println("\nERROR: Proteins do not match for transcript " + tr.getId() //
+ "\tStrand:" + (tr.isStrandPlus() ? "+" : "-") //
+ "\tExons: " + tr.numChilds() //
+ "\n" //
+ String.format("\tSnpEff protein (%6d) : '%s'\n", protein.length(), protein) //
+ String.format("\tReference protein (%6d) : '%s'\n", proteinReference.length(), proteinReference) //
+ "\tAlignment (Snpeff protein vs Reference protein)." //
+ "\tScore: " + score //
+ "\tMax. possible score: " + maxScore //
+ "\tDiff: " + (maxScore - score) //
+ "\n" + sw //
);
System.err.println("Transcript details:\n" + tr);
}
}
if (onlyOneError) {
System.err.println("Transcript details:\n" + tr);
throw new RuntimeException("DIE");
}
}
// Update counters
boolean ok = false;
switch (status) {
case '.':
countNotFound++;
break;
case '+':
countOk++;
ok = true;
break;
case '*':
countErrors++;
break;
case ' ':
break;
default:
throw new RuntimeException("Unknown status '" + status + "'");
}
// Update transcript status
if (ok && updateTranscriptAaCheck) tr.setAaCheck(true);
// Show a mark
if (verbose && (status != ' ')) {
System.out.print(status);
i++;
if (i % 100 == 0) System.out.print("\n\t");
}
}
// Relative error rate
double errorRate = ((double) countErrors) / ((double) (countErrors + countOk));
if (verbose) System.out.println("\n");
System.out.println("\tProtein check:" //
+ "\t" + genome.getVersion() //
+ (chr != null ? "\tChromosome: " + chr : "") //
+ (chr != null ? "\tCodon table: " + CodonTables.getInstance().getTable(genome, chr).getName() : "") //
+ "\tOK: " + countOk //
+ "\tNot found: " + countNotFound //
+ "\tErrors: " + countErrors //
+ "\tError percentage: " + (100 * errorRate) + "%" //
);
// Add to totals
if (addTotals) {
totalNotFound += countNotFound;
totalOk += countOk;
totalErrors += countErrors;
} else if (checkNumOk && totalOk <= 0) {
fatalErrorNoTranscriptsChecked();
}
return errorRate;
}
/**
* Format proteins to make them easier to compare
*/
String proteinFormat(String protein) {
if (protein.isEmpty()) return "";
// We use upper case letters
protein = protein.toUpperCase();
// Stop codon is trimmed
int idxLast = protein.length() - 1;
char lastChar = protein.charAt(idxLast);
if ((lastChar == '*') || (lastChar == '?')) protein = protein.substring(0, idxLast);
// We use '?' as unknown protein
protein = protein.replace('X', '?');
// Remove staring '?' codons
if (protein.startsWith("?")) protein = protein.substring(1);
return protein;
}
/**
* Read a file that has all proteins in fasta format
*/
void readProteinFile() {
if (verbose) Timer.showStdErr("Reading proteins from file '" + proteinFile + "'...");
proteinByTrId = new HashMap<>();
if (proteinFile.endsWith("txt") || proteinFile.endsWith("txt.gz")) readProteinFileTxt();
else if (proteinFile.endsWith(SnpEffPredictorFactoryGenBank.EXTENSION_GENBANK)) readProteinFileGenBank();
else if (proteinFile.endsWith(SnpEffPredictorFactoryEmbl.EXTENSION_EMBL)) readProteinFileEmbl();
else readProteinFileFasta();
if (verbose) Timer.showStdErr("done (" + proteinByTrId.size() + " Proteins).");
}
/**
* Read proteins from EMBL file
*/
void readProteinFileEmbl() {
FeaturesFile featuresFile = new EmblFile(proteinFile);
readProteinFileFeatures(featuresFile);
}
/**
* Read Proteins from a file
* Format: Tab-separated format, containing "sequence \t transcriptId"
*/
void readProteinFileFasta() {
// Load file
FastaFileIterator ffi = new FastaFileIterator(proteinFile);
for (String seq : ffi) {
String trId = ffi.getTranscriptId();
add(trId, seq, ffi.getLineNum(), true);
// Also try processing header line using different separators
List ids = ffi.fastaHeader2Ids();
for (String id : ids) {
// We don't check for uniqueness here since many items in this
// list are tokens that are expected to be repeated
add(id, seq, ffi.getLineNum(), false);
}
}
}
/**
* Read sequences from features file
*/
void readProteinFileFeatures(FeaturesFile featuresFile) {
for (Features features : featuresFile) {
String trIdPrev = null;
for (Feature f : features.getFeatures()) { // Find all CDS
if (f.getType() == Type.GENE) {
// Clean up trId
trIdPrev = null;
} else if (f.getType() == Type.MRNA) {
// Save trId, so that next CDS record can find it
trIdPrev = f.getTranscriptId();
} else if (f.getType() == Type.CDS) { // Add CDS 'translation' record
// Try using the transcript ID found in the previous record
String trId = trIdPrev;
if (trId == null) trId = f.getTranscriptId();
String seq = f.getAasequence();
if (debug) Gpr.debug(trId + "\t" + seq);
if ((trId != null) && (seq != null)) add(trId, seq, -1, true);
}
}
}
}
/**
* Read proteins from geneBank file
*/
void readProteinFileGenBank() {
FeaturesFile featuresFile = new GenBankFile(proteinFile);
readProteinFileFeatures(featuresFile);
}
/**
* Read Proteins from a file
* Format: Tab-separated format, containing "sequence \t transcriptId"
*/
void readProteinFileTxt() {
// Load file
String proteinData = Gpr.readFile(proteinFile);
String proteinLines[] = proteinData.split("\n");
// Parse each line
int lineNum = 1;
for (String proteinLine : proteinLines) {
// Split tab separated fields
String field[] = proteinLine.split("\\s+");
// Parse fields
if (field.length >= 2) {
// OK Parse fields
String seq = field[1].trim();
String trId = field[0].trim();
add(trId, seq, lineNum, true);
}
lineNum++;
}
}
/**
* Replace unknown AA using the reference sequence
* @return A new protein sequence including AAs from the Reference sequence in "Unknown sites"
*/
String replaceUnknownAa(String protein, String proteinRef) {
char p[] = protein.toCharArray();
char pref[] = proteinRef.toCharArray();
if (p.length != pref.length) return protein; // Nothing done
for (int i = 0; i < p.length; i++) {
if (p[i] == '?' && pref[i] != '?') p[i] = pref[i];
}
return new String(p);
}
/**
* Run command
*/
@Override
public boolean run() {
if (verbose) Timer.showStdErr("Checking database using protein sequences");
loadConfig(); // Load config
if (proteinByTrId == null) readProteinFile(); // Read proteins
loadDb(); // Load database
checkProteins(); // Compare proteins
return true;
}
/**
* Show same Transcript IDs
*/
String sampleTrIds(int maxTrIds) {
StringBuilder sb = new StringBuilder();
int count = 0;
for (Gene g : config.getGenome().getGenes())
for (Transcript tr : g) {
sb.append("\t'" + tr.getId() + "'\n");
if (count++ > maxTrIds) return sb.toString();
}
return sb.toString();
}
/**
* Show same Transcript IDs from FASTA file
*/
String sampleTrIdsFasta(int maxTrIds) {
StringBuilder sb = new StringBuilder();
int count = 0;
for (String trid : proteinByTrId.keySet()) {
sb.append("\t'" + trid + "'\n");
if (count++ > maxTrIds) return sb.toString();
}
return sb.toString();
}
public void setCheckNumOk(boolean checkNumOk) {
this.checkNumOk = checkNumOk;
}
/**
* Set codon table for a given chromosome
*/
void setCodonTable(Chromosome chromo, CodonTable codonTable) {
CodonTables.getInstance().set(genome, chromo, codonTable); // Set codon tables
// Reset all protein translations for this chromosome
for (Transcript tr : trByChromo.get(chromo.getId()))
tr.resetCache();
}
public void setProteinByTrId(Map proteinByTrId) {
this.proteinByTrId = proteinByTrId;
}
public void setStoreAlignments(boolean storeAlignments) {
this.storeAlignments = storeAlignments;
}
/**
* Show usage and exit
*/
@Override
public void usage(String message) {
if (message != null) System.err.println("Error: " + message + "\n");
System.err.println("snpEff version " + SnpEff.VERSION);
System.err.println("Usage: snpEff protein [options] genome_version proteing_file");
System.err.println("\nOptions:");
System.err.println("\t-codonTables : Try all codon tables on each chromosome and calculate error rates.");
System.exit(-1);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy