
org.snpeff.snpEffect.commandLine.SnpEffCmdEff Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package org.snpeff.snpEffect.commandLine;
import java.io.*;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.snpeff.SnpEff;
import org.snpeff.fileIterator.BedFileIterator;
import org.snpeff.fileIterator.VariantFileIterator;
import org.snpeff.fileIterator.VcfFileIterator;
import org.snpeff.filter.VariantEffectFilter;
import org.snpeff.interval.Marker;
import org.snpeff.interval.Markers;
import org.snpeff.interval.Transcript;
import org.snpeff.interval.Variant;
import org.snpeff.interval.VariantNonRef;
import org.snpeff.interval.tree.IntervalForest;
import org.snpeff.outputFormatter.BedAnnotationOutputFormatter;
import org.snpeff.outputFormatter.BedOutputFormatter;
import org.snpeff.outputFormatter.OutputFormatter;
import org.snpeff.outputFormatter.VcfOutputFormatter;
import org.snpeff.snpEffect.EffectType;
import org.snpeff.snpEffect.SnpEffectPredictor;
import org.snpeff.snpEffect.VariantEffect;
import org.snpeff.snpEffect.VariantEffect.EffectImpact;
import org.snpeff.snpEffect.VariantEffects;
import org.snpeff.snpEffect.VcfAnnotator;
import org.snpeff.stats.CountByType;
import org.snpeff.stats.VariantEffectStats;
import org.snpeff.stats.VariantStats;
import org.snpeff.stats.VcfStats;
import org.snpeff.util.Gpr;
import org.snpeff.util.Timer;
import org.snpeff.util.Tuple;
import org.snpeff.vcf.EffFormatVersion;
import org.snpeff.vcf.Pedigree;
import org.snpeff.vcf.VcfEntry;
import freemarker.template.Configuration;
import freemarker.template.DefaultObjectWrapper;
import freemarker.template.Template;
import freemarker.template.TemplateException;
/**
* Command line program: Predict variant effects
*
* @author Pablo Cingolani
*/
public class SnpEffCmdEff extends SnpEff implements VcfAnnotator {
public static final String SUMMARY_TEMPLATE = "snpEff_summary.ftl"; // Summary template file name
public static final String SUMMARY_CSV_TEMPLATE = "snpEff_csv_summary.ftl"; // Summary template file name
public static final String SUMMARY_GENES_TEMPLATE = "snpEff_genes.ftl"; // Genes template file name
public static final String DEFAULT_SUMMARY_HTML_FILE = "snpEff_summary.html";
public static final String DEFAULT_SUMMARY_CSV_FILE = "snpEff_summary.csv";
public static final String DEFAULT_SUMMARY_GENES_FILE = "snpEff_genes.txt";
public static final int SHOW_EVERY = 10 * 1000;
boolean anyCancerSample;
boolean cancer = false; // Perform cancer comparisons
boolean chromoPlots = true; // Create mutations by chromosome plots?
boolean createSummaryCsv = false;
boolean createSummaryHtml = true;
boolean lossOfFunction = true; // Create loss of function LOF tag?
boolean useGeneId = false; // Use gene ID instead of gene name (VCF output)
boolean useLocalTemplate = false; // Use template from 'local' file instead of 'jar' (this is only used for
// development and debugging)
boolean useOicr = false; // Use OICR tag
boolean useSequenceOntology = true; // Use Sequence Ontology terms
int totalErrs = 0;
int countVcfEntries = 0;
long countInputLines = 0;
long countVariants = 0;
long countEffects = 0;
String cancerSamples = null;
String chrStr = "";
String inputFile = ""; // Input file
String fastaProt = null;
String summaryFileCsv; // HTML Summary file name
String summaryFileHtml; // CSV Summary file name
String summaryGenesFile; // Gene table file
InputFormat inputFormat = InputFormat.VCF; // Format use in input files
OutputFormat outputFormat = OutputFormat.VCF; // Output format
VariantEffectFilter variantEffectResutFilter; // Filter prediction results
ArrayList filterIntervalFiles;// Files used for filter intervals
ArrayList inputFiles;
IntervalForest filterIntervals; // Filter only variants that match these intervals
VariantStats variantStats;
VariantEffectStats variantEffectStats;
SnpEffectPredictor snpEffectPredictor;
VcfStats vcfStats;
List vcfEntriesDebug = null; // Use for debugging or testing (in some test-cases)
EffFormatVersion formatVersion = EffFormatVersion.DEFAULT_FORMAT_VERSION;
Pedigree pedigree;
CountByType errByType, warnByType;
OutputFormatter outputFormatter = null;
Timer annotateTimer;
public SnpEffCmdEff() {
super();
chrStr = ""; // Default: Don't show 'chr' before chromosome
inputFile = ""; // variant input file
variantEffectResutFilter = new VariantEffectFilter(); // Filter prediction results
filterIntervalFiles = new ArrayList<>(); // Files used for filter intervals
summaryFileHtml = DEFAULT_SUMMARY_HTML_FILE;
summaryFileCsv = DEFAULT_SUMMARY_CSV_FILE;
summaryGenesFile = DEFAULT_SUMMARY_GENES_FILE;
}
public void setOutputWriter(BufferedWriter writer) {
this.outputFormatter.setOutputWriter(writer);
}
@Override
public boolean addHeaders(VcfFileIterator vcfFile) {
// This is done by VcfOutputFormatter, so there is nothing to do here.
return false;
}
/**
* Annotate: Calculate the effect of variants and show results
*/
public boolean annotate(String inputFile, String outputFile) {
// Initialize
annotateInit(outputFile);
VcfFileIterator vcf = null;
// Iterate over input files
switch (inputFormat) {
case VCF:
// FIXME: Need to fix multithreding mode
// vcf = (multiThreaded ? annotateVcfMulti(inputFile, outputFormatter) :
// :annotateVcf(inputFile));
vcf = annotateVcf(inputFile);
break;
case BED:
annotateBed(inputFile, outputFormatter);
break;
default:
throw new RuntimeException("Cannot create variant file iterator on input format '" + inputFormat + "'");
}
outputFormatter.close();
// Create reports and finish up
boolean err = annotateFinish(vcf);
return !err;
}
/**
* Annotate a VCF entry
*/
@Override
public boolean annotate(VcfEntry vcfEntry) {
boolean printed = false;
boolean filteredOut = false;
try {
countInputLines++;
countVcfEntries++;
// Find if there is a pedigree and if it has any 'derived' entry
if (cancer) {
VcfFileIterator vcfFile = vcfEntry.getVcfFileIterator();
if (vcfFile.isHeadeSection()) {
pedigree = readPedigree(vcfFile);
anyCancerSample = pedigree.anyDerived();
}
}
// Sample vcf entry
if (createSummaryHtml || createSummaryCsv)
vcfStats.sample(vcfEntry);
// Skip if there are filter intervals and they are not matched
if ((filterIntervals != null) && (filterIntervals.query(vcfEntry).isEmpty())) {
filteredOut = true;
return false;
}
// Create new 'section'
outputFormatter.startSection(vcfEntry);
// ---
// Analyze all changes in this VCF entry
// Note, this is the standard analysis.
// Next section deals with cancer: Somatic vs Germline comparisons
// ---
boolean impactLowOrHigher = false; // Does this entry have an impact (other than MODIFIER)?
List variants = vcfEntry.variants();
for (Variant variant : variants) {
// Show progress
showProgress();
// Annotate variant
impactLowOrHigher |= annotateVariant(variant);
}
// Perform cancer annotations
if (anyCancerSample && impactLowOrHigher)
annotateVariantCancer(variants, vcfEntry);
// Finish up this section
outputFormatter.printSection(vcfEntry);
printed = true;
} catch (Throwable t) {
totalErrs++;
error(t, "Error while processing VCF entry :\n\t" + vcfEntry + "\n"
+ t);
} finally {
if (!printed && !filteredOut)
outputFormatter.printSection(vcfEntry);
}
return true;
}
/**
* Iterate on all inputs and calculate effects. Note: This is used for all input
* formats except VCF, which has a different iteration modality
*/
void annotateBed(String inputFile, OutputFormatter outputFormatter) {
SnpEffectPredictor snpEffectPredictor = config.getSnpEffectPredictor();
// Create an input file iterator
VariantFileIterator variantFileIterator = new BedFileIterator(inputFile, config.getGenome());
// ---
// Iterate over input file
// ---
for (Variant variant : variantFileIterator) {
try {
countInputLines++;
countVariants++;
if (verbose && (countVariants % SHOW_EVERY == 0))
Timer.showStdErr("\t" + countVariants + " variants");
// Does it pass the filter? => Analyze
// Skip if there are filter intervals and they are not matched
if ((filterIntervals != null) && (filterIntervals.stab(variant).size() <= 0))
continue;
// Perform basic statistics about this variant
if (createSummaryHtml || createSummaryCsv)
variantStats.sample(variant);
// Calculate effects
VariantEffects variantEffects = snpEffectPredictor.variantEffect(variant);
// Create new 'section'
outputFormatter.startSection(variant);
// Show results
for (VariantEffect variantEffect : variantEffects) {
variantEffectStats.sample(variantEffect); // Perform basic statistics about this result
outputFormatter.add(variantEffect);
countEffects++;
}
// Finish up this section
outputFormatter.printSection(variant);
} catch (Throwable t) {
totalErrs++;
error(t, "Error while processing variant (line " + variantFileIterator.getLineNum() + ") :\n\t"
+ variant + "\n" + t);
}
}
// Close file iterator (not really needed, but just in case)
variantFileIterator.close();
}
/**
* Finish annotations and create reports
*/
@Override
public boolean annotateFinish(VcfFileIterator vcfFile) {
boolean ok = true;
if (vcfFile != null)
vcfFile.close();
// Creates a summary output file
if (createSummaryCsv) {
if (verbose)
Timer.showStdErr("Creating summary file: " + summaryFileCsv);
ok &= summary(SUMMARY_CSV_TEMPLATE, summaryFileCsv, true);
}
if (createSummaryHtml) {
if (verbose)
Timer.showStdErr("Creating summary file: " + summaryFileHtml);
ok &= summary(SUMMARY_TEMPLATE, summaryFileHtml, false);
}
// Creates genes output file
if (createSummaryHtml || createSummaryCsv) {
if (verbose)
Timer.showStdErr("Creating genes file: " + summaryGenesFile);
ok &= summary(SUMMARY_GENES_TEMPLATE, summaryGenesFile, true);
}
if (totalErrs > 0)
System.err.println(totalErrs + " errors.");
return !ok;
}
/**
* Calculate the effect of variants and show results
*/
public void annotateInit(String outputFile) {
snpEffectPredictor = config.getSnpEffectPredictor();
// Reset all counters
totalErrs = 0;
countInputLines = countVariants = countEffects = 0; // = countVariantsFilteredOut = 0;
anyCancerSample = false;
pedigree = null;
errByType = new CountByType();
warnByType = new CountByType();
countVcfEntries = 0;
annotateTimer = new Timer();
// Create 'stats' objects
variantStats = new VariantStats(config.getGenome());
variantEffectStats = new VariantEffectStats(config.getGenome());
variantEffectStats.setUseSequenceOntology(useSequenceOntology);
vcfStats = new VcfStats();
if (fastaProt != null) {
if ((new File(fastaProt)).delete() && verbose) {
Timer.showStdErr("Deleted protein fasta output file '" + fastaProt + "'");
}
}
// ---
// Create output formatter
// ---
outputFormatter = null;
switch (outputFormat) {
case VCF:
VcfOutputFormatter vof = new VcfOutputFormatter(vcfEntriesDebug);
vof.setFormatVersion(formatVersion);
vof.setLossOfFunction(lossOfFunction);
vof.setConfig(config);
outputFormatter = vof;
break;
case GATK:
outputFormatter = new VcfOutputFormatter(vcfEntriesDebug);
((VcfOutputFormatter) outputFormatter).setGatk(true);
break;
case BED:
outputFormatter = new BedOutputFormatter();
break;
case BEDANN:
outputFormatter = new BedAnnotationOutputFormatter();
break;
default:
throw new RuntimeException("Unknown output format '" + outputFormat + "'");
}
outputFormatter.setVersion(VERSION_AUTHOR);
outputFormatter.setCommandLineStr(commandLineStr(false));
outputFormatter.setVariantEffectResutFilter(variantEffectResutFilter);
outputFormatter.setSupressOutput(suppressOutput);
outputFormatter.setChrStr(chrStr);
outputFormatter.setUseSequenceOntology(useSequenceOntology);
outputFormatter.setUseOicr(useOicr);
outputFormatter.setUseHgvs(hgvs);
outputFormatter.setUseGeneId(useGeneId);
outputFormatter.setOutputFile(outputFile);
}
@Override
public boolean annotateInit(VcfFileIterator vcfFile) {
if (inputFormat != InputFormat.VCF || outputFormat != OutputFormat.VCF)
throw new RuntimeException();
annotateInit((String) null);
return false;
}
/**
* Annotate a single variant
*
* @param variant
* @return true if there is any impact 'Low' or higher
*/
boolean annotateVariant(Variant variant) {
// Calculate effects: By default do not annotate non-variant sites
if (!variant.isVariant())
return false;
boolean impactModerateOrHigh = false; // Does this entry have a 'MODERATE' or 'HIGH' impact?
boolean impactLowOrHigher = false; // Does this entry have an impact (other than MODIFIER)?
// Perform basic statistics about this variant
if (createSummaryHtml || createSummaryCsv)
variantStats.sample(variant);
VariantEffects variantEffects = snpEffectPredictor.variantEffect(variant);
// Create new 'section'
outputFormatter.startSection(variant);
// Show results
for (VariantEffect variantEffect : variantEffects) {
if (createSummaryHtml || createSummaryCsv)
variantEffectStats.sample(variantEffect); // Perform basic statistics about this result
// Any errors or warnings?
if (variantEffect.hasError())
errByType.inc(variantEffect.getError());
if (variantEffect.hasWarning())
warnByType.inc(variantEffect.getWarning());
// Does this entry have an impact (other than MODIFIER)?
impactLowOrHigher |= (variantEffect.getEffectImpact() != EffectImpact.MODIFIER);
impactModerateOrHigh |= (variantEffect.getEffectImpact() == EffectImpact.MODERATE)
|| (variantEffect.getEffectImpact() == EffectImpact.HIGH);
outputFormatter.add(variantEffect);
countEffects++;
}
// Finish up this section
outputFormatter.printSection(variant);
// Output protein changes to FASTA file
if (fastaProt != null && impactModerateOrHigh)
proteinAltSequence(variant, variantEffects);
return impactLowOrHigher;
}
/**
* Compare two genotypes
*/
void annotateVariantCancer(List variants, int altGtNum, int refGtNum) {
VariantNonRef varNonRef = variantCancer(variants, altGtNum, refGtNum);
// No net variation? Skip
if (!varNonRef.isVariant())
return;
// Calculate effects
VariantEffects variantEffects = snpEffectPredictor.variantEffect(varNonRef);
// Create new 'section'
outputFormatter.startSection(varNonRef);
// Show results (note, we don't add these to the statistics)
for (VariantEffect variantEffect : variantEffects)
outputFormatter.add(variantEffect);
// Finish up this section
outputFormatter.printSection(varNonRef);
}
/**
* Do we analyze cancer samples? Here we deal with Somatic vs Germline
* comparisons
*/
void annotateVariantCancer(List variants, VcfEntry vcfEntry) {
if (!shouldAnnotateVariantCancer(variants, vcfEntry))
return;
// Calculate all required comparisons
Set> comparisons = pedigree.compareCancerGenotypes(vcfEntry);
// Analyze each comparison
for (Tuple comp : comparisons) {
// We have to compare comp.first vs comp.second
int altGtNum = comp.first; // comp.first is 'derived' (our new ALT)
int refGtNum = comp.second; // comp.second is 'original' (our new REF)
annotateVariantCancer(variants, altGtNum, refGtNum);
}
}
/**
* Iterate on all inputs (VCF) and calculate effects. Note: This is used only on
* input format VCF, which has a different iteration modality
*/
VcfFileIterator annotateVcf(String inputFile) {
return annotateVcf(new VcfFileIterator(inputFile, genome));
}
public VcfFileIterator annotateVcf(VcfFileIterator vcfFile) {
// Open VCF file
vcfFile.setDebug(debug);
// Iterate over VCF entries
for (VcfEntry vcfEntry : vcfFile)
annotate(vcfEntry);
// Empty file? Show at least the header
if (countVcfEntries == 0)
outputFormatter.print(vcfFile.getVcfHeader().toString());
// Show errors and warnings
if (verbose) {
if (!errByType.isEmpty())
System.err.println(
"\nERRORS: Some errors were detected\nError type\tNumber of errors\n" + errByType + "\n");
if (!warnByType.isEmpty())
System.err.println("\nWARNINGS: Some warning were detected\nWarning type\tNumber of warnings\n"
+ warnByType + "\n");
}
return vcfFile;
}
// /**
// * Multi-threaded iteration on VCF inputs and calculates effects.
// * Note: This is used only on input format VCF, which has a different
// iteration modality
// */
// VcfFileIterator annotateVcfMulti(String inputFile, final OutputFormatter
// outputFormatter) {
// if (verbose) Timer.showStdErr("Running multi-threaded mode (numThreads=" +
// numWorkers + ").");
//
// outputFormatter.setShowHeader(false); // Master process takes care of the
// header (instead of outputFormatter). Otherwise you get the header printed one
// time per worker.
//
// // We need final variables for the inner class
// final SnpEffectPredictor snpEffectPredictor = config.getSnpEffectPredictor();
// final VcfOutputFormatter vcfOutForm = (VcfOutputFormatter) outputFormatter;
// final SnpEffCmdEff snpEffCmdEff = this;
//
// VcfFileIterator vcf = new VcfFileIterator(inputFile, config.getGenome());
//
// // Master factory
// Props props = new Props(new UntypedActorFactory() {
//
// private static final long serialVersionUID = 1L;
//
// @Override
// public Actor create() {
// MasterEff master = new MasterEff(numWorkers, snpEffCmdEff,
// snpEffectPredictor, outputFormatter, filterIntervals);
// master.setAddHeader(vcfOutForm.getNewHeaderLines().toArray(new String[0]));
// return master;
// }
// });
//
// // Create and run queue
// int batchSize = 10;
// VcfWorkQueue vcfWorkQueue = new VcfWorkQueue(inputFile, config, batchSize,
// -1, props);
// vcfWorkQueue.run(true);
//
// return vcf;
// }
public VariantEffectStats getChangeEffectResutStats() {
return variantEffectStats;
}
public int getTotalErrs() {
return totalErrs;
}
public VariantStats getvariantStats() {
return variantStats;
}
/**
* Create a suitable output file name
*/
String outputFile(String inputFile) {
// Remove GZ extension
String base = Gpr.baseName(inputFile, ".gz");
// Remove extension according to input format
switch (inputFormat) {
case VCF:
base = Gpr.baseName(inputFile, ".vcf");
break;
case BED:
base = Gpr.baseName(inputFile, ".bed");
break;
default:
throw new RuntimeException("Unimplemented option for input file type " + inputFormat);
}
String outputFile = Gpr.dirName(inputFile) + "/" + base + ".eff";
// Add extension according to output format
switch (outputFormat) {
case BED:
case BEDANN:
outputFile += ".bed";
break;
case VCF:
case GATK:
outputFile += ".vcf";
break;
default:
throw new RuntimeException("Unimplemented option for output file type " + outputFormat);
}
// Create summary file names
if (createSummaryCsv)
summaryFileCsv = Gpr.dirName(inputFile) + "/" + base + "_summary.csv";
if (createSummaryHtml)
summaryFileHtml = Gpr.dirName(inputFile) + "/" + base + "_summary.html";
summaryGenesFile = Gpr.dirName(inputFile) + "/" + base + "_genes.txt";
return outputFile;
}
/**
* Parse command line arguments
*/
@Override
public void parseArgs(String[] args) {
if (args == null)
return;
boolean isFileList = false;
this.args = args;
for (int i = 0; i < args.length; i++) {
String arg = args[i];
// Is it a command line option?
// Note: Generic options (such as config, verbose, debug, quiet, etc.) are
// parsed by SnpEff class
// ---
if (isOpt(arg)) {
if (arg.equalsIgnoreCase("-fileList"))
isFileList = true;
else {
arg = arg.toLowerCase();
switch (arg) {
// ---
// Output options
// ---
case "-chr":
chrStr = args[++i];
break;
case "-csvstats":
createSummaryCsv = true; // Create a CSV formatted summary file.
if ((i + 1) < args.length) {
summaryFileCsv = args[++i];
String base = Gpr.baseName(summaryFileCsv, ".csv");
String dir = Gpr.dirName(summaryFileCsv);
summaryGenesFile = (dir != null ? dir + "/" : "") + base + ".genes.txt";
} else
usage("Missing parameter: CSV stats file name ");
break;
case "-fastaprot":
if ((i + 1) < args.length)
fastaProt = args[++i]; // Output protein sequences in fasta files
else
usage("Missing -cancerSamples argument");
break;
case "-nochromoplots":
chromoPlots = false;
break;
case "-nostats":
createSummaryHtml = createSummaryCsv = false;
break;
case "-o": // Output format
if ((i + 1) < args.length) {
String outFor = args[++i].toUpperCase();
// if (outFor.equals("TXT")) outputFormat = OutputFormat.TXT;
if (outFor.equals("VCF"))
outputFormat = OutputFormat.VCF;
else if (outFor.equals("GATK")) {
outputFormat = OutputFormat.GATK;
useSequenceOntology = false;
hgvs = false;
nextProt = false;
motif = false;
// GATK doesn't support SPLICE_REGION at the moment.
// Set parameters to zero so that splcie regions are not created.
spliceRegionExonSize = spliceRegionIntronMin = spliceRegionIntronMax = 0;
} else if (outFor.equals("BED")) {
outputFormat = OutputFormat.BED;
lossOfFunction = false;
} else if (outFor.equals("BEDANN")) {
outputFormat = OutputFormat.BEDANN;
lossOfFunction = false;
} else if (outFor.equals("TXT"))
usage("Output format 'TXT' has been deprecated. Please use 'VCF' instead.\nYou can extract VCF fields to a TXT file using 'SnpSift extractFields' (http://snpeff.sourceforge.net/SnpSift.html#Extract).");
else
usage("Unknown output file format '" + outFor + "'");
}
break;
case "-s":
case "-stats":
case "-htmlstats":
createSummaryHtml = true;
if ((i + 1) < args.length) {
summaryFileHtml = args[++i];
String base = Gpr.baseName(summaryFileHtml, ".html");
String dir = Gpr.dirName(summaryFileHtml);
summaryGenesFile = (dir != null ? dir + "/" : "") + base + ".genes.txt";
} else
usage("Missing parameter: HTML stats file name ");
break;
case "-uselocaltemplate": // Undocumented option (only used for development & debugging)
useLocalTemplate = true;
break;
// ---
// Annotation options
// ---
case "-cancer":
cancer = true; // Perform cancer comparisons
break;
case "-cancersamples":
if ((i + 1) < args.length)
cancerSamples = args[++i]; // Read cancer samples from TXT files
else
usage("Missing -cancerSamples argument");
break;
case "-classic":
useSequenceOntology = false;
formatVersion = EffFormatVersion.FORMAT_EFF_4;
hgvs = hgvsForce;
break;
case "-formateff":
formatVersion = EffFormatVersion.FORMAT_EFF_4;
break;
case "-geneid":
useGeneId = true; // Use gene ID instead of gene name
break;
case "-lof":
lossOfFunction = true; // Add LOF tag
break;
case "-nohgvs":
hgvs = false; // Do not use HGVS notation
hgvsShift = false;
break;
case "-noshifthgvs":
case "-no_shift_hgvs":
hgvsShift = false;
break;
case "-nolof":
lossOfFunction = false; // Do not add LOF tag
break;
case "-oicr":
useOicr = true; // Use OICR tag
break;
case "-sequenceontology":
useSequenceOntology = true; // Use SO temrs
break;
// ---
// Input options
// ---
case "-fi":
case "-filterinterval":
if ((i + 1) < args.length)
filterIntervalFiles.add(args[++i]);
else
usage("Option '-fi' without config filter_interval_file argument");
break;
case "-i":
// Input format
if ((i + 1) < args.length) {
String inFor = args[++i].toUpperCase();
if (inFor.equals("VCF")) {
inputFormat = InputFormat.VCF;
outputFormat = OutputFormat.VCF;
} else if (inFor.equals("BED")) {
inputFormat = InputFormat.BED;
outputFormat = OutputFormat.BED;
lossOfFunction = false;
} else if (inFor.equals("TXT"))
usage("Input format 'TXT' has been deprecated. Please use 'VCF' instead.");
else
usage("Unknown input file format '" + inFor + "'");
} else
usage("Missing input format in command line option '-i'");
break;
// ---
// Filters
// ---
case "-no-downstream":
variantEffectResutFilter.add(EffectType.DOWNSTREAM);
break;
case "-no-upstream":
variantEffectResutFilter.add(EffectType.UPSTREAM);
break;
case "-no-intergenic":
variantEffectResutFilter.add(EffectType.INTERGENIC);
break;
case "-no-intron":
variantEffectResutFilter.add(EffectType.INTRON);
break;
case "-no-utr":
variantEffectResutFilter.add(EffectType.UTR_3_PRIME);
variantEffectResutFilter.add(EffectType.UTR_3_DELETED);
variantEffectResutFilter.add(EffectType.UTR_5_PRIME);
variantEffectResutFilter.add(EffectType.UTR_5_DELETED);
break;
case "-no":
String filterOut = "";
if ((i + 1) < args.length)
filterOut = args[++i];
String filterOutArray[] = filterOut.split(",");
for (String filterStr : filterOutArray) {
if (filterStr.equalsIgnoreCase("utr")) {
variantEffectResutFilter.add(EffectType.UTR_3_PRIME);
variantEffectResutFilter.add(EffectType.UTR_3_DELETED);
variantEffectResutFilter.add(EffectType.UTR_5_PRIME);
variantEffectResutFilter.add(EffectType.UTR_5_DELETED);
} else if (filterStr.equalsIgnoreCase("None"))
; // OK, nothing to do
else
variantEffectResutFilter.add(EffectType.valueOf(filterStr.toUpperCase()));
}
break;
default:
usage("Unknown option '" + arg + "'");
}
}
} else if (genomeVer.isEmpty())
genomeVer = arg;
else if (inputFile.isEmpty())
inputFile = arg;
else
usage("Unknown parameter '" + arg + "'");
}
// ---
// Sanity checks
// ---
// Check: Do we have all required parameters?
if (genomeVer == null || genomeVer.isEmpty())
usage("Missing genomer_version parameter");
// Check input file
if (inputFile.isEmpty())
inputFile = "-"; // Use STDIN as default
else if (!Gpr.canRead(inputFile))
usage("Cannot read input file '" + inputFile + "'");
// Read input files from file list?
if (isFileList) {
inputFiles = new ArrayList<>();
for (String file : readFile(inputFile).split("\n"))
inputFiles.add(file);
}
// Sanity checks for VCF output format
boolean isOutVcf = (outputFormat == OutputFormat.VCF) || (outputFormat == OutputFormat.GATK);
if (isOutVcf && (inputFormat != InputFormat.VCF))
usage("Output in VCF format is only supported when the input is also in VCF format");
if (!isOutVcf && lossOfFunction)
usage("Loss of function annotation is only supported when when output is in VCF format");
if (!isOutVcf && cancer)
usage("Canccer annotation is only supported when when output is in VCF format");
// Sanity check for multi-threaded version
if (multiThreaded) {
createSummaryHtml = false; // This is implied ( '-t' => '-noStats' )
createSummaryCsv = false;
}
if (multiThreaded && cancer)
usage("Cancer analysis is currently not supported in multi-threaded mode.");
if (multiThreaded && !isOutVcf)
usage("Multi-threaded option is only supported when when output is in VCF format");
if (multiThreaded && (createSummaryHtml || createSummaryCsv))
usage("Multi-threaded option should be used with 'noStats'.");
}
/**
* Append ALT protein sequence to 'fastaProt' file
*/
void proteinAltSequence(Variant var, VariantEffects variantEffects) {
Set doneTr = new HashSet<>();
for (VariantEffect varEff : variantEffects) {
Transcript tr = varEff.getTranscript();
if (tr == null || doneTr.contains(tr))
continue;
// Calculate sequence after applying variant
Transcript trAlt = tr.apply(var);
// Build fasta entries and append to file
StringBuilder sb = new StringBuilder();
sb.append(">" + tr.getId() + " Ref\n" + tr.protein() + "\n");
sb.append(">" + tr.getId() + " Variant " //
+ var.getChromosomeName() //
+ ":" + (var.getStart() + 1) //
+ "-" + (var.getEnd() + 1) //
+ " Ref:" + var.getReference() //
+ " Alt:" + var.getAlt() //
+ " HGVS.p:" + varEff.getHgvsProt() //
+ "\n" //
+ trAlt.protein() + "\n" //
);
Gpr.toFile(fastaProt, sb, true);
doneTr.add(tr);
}
}
/**
* Read a file after checking for some common error conditions
*/
String readFile(String fileName) {
File file = new File(fileName);
if (!file.exists())
fatalError("No such file '" + fileName + "'");
if (!file.canRead())
fatalError("Cannot open file '" + fileName + "'");
return Gpr.readFile(fileName);
}
/**
* Read a filter custom interval file
*/
int readFilterIntFile(String intFile) {
Markers markers = loadMarkers(intFile);
if (filterIntervals == null)
filterIntervals = new IntervalForest();
for (Marker filterInterval : markers)
filterIntervals.add(filterInterval);
return markers.size();
}
/**
* Read pedigree either from VCF header or from cancerSample file
*/
Pedigree readPedigree(VcfFileIterator vcfFile) {
if (cancerSamples != null)
return new Pedigree(vcfFile, cancerSamples);
return new Pedigree(vcfFile);
}
@Override
public HashMap reportValues() {
HashMap report = super.reportValues();
if (variantStats != null)
report.put("variants", variantStats.getCount() + "");
return report;
}
/**
* Run according to command line options
*/
@Override
public boolean run() {
return run(false) != null;
}
/**
* Run according to command line options
*/
public List run(boolean createList) {
// ---
// Prepare to run
// ---
// Nothing to filter out => don't waste time
if (!variantEffectResutFilter.anythingSet())
variantEffectResutFilter = null;
filterIntervals = null;
loadConfig(); // Read config file
loadDb(); // Load database
// Check if we can open the input file (no need to check if it is STDIN)
if (!Gpr.canRead(inputFile))
usage("Cannot open input file '" + inputFile + "'");
// Read filter interval files
for (String filterIntFile : filterIntervalFiles) {
if (filterIntervals == null)
filterIntervals = new IntervalForest();
if (verbose)
Timer.showStdErr("Reading filter interval file '" + filterIntFile + "'");
int count = readFilterIntFile(filterIntFile);
if (verbose)
Timer.showStdErr("done (" + count + " intervals loaded). ");
}
// Build interval forest for filter (if any)
if (filterIntervals != null) {
if (verbose)
Timer.showStdErr("Building filter interval forest");
filterIntervals.build();
if (verbose)
Timer.showStdErr("done.");
}
// Store VCF results in a list?
if (createList)
vcfEntriesDebug = new ArrayList<>();
// Predict
boolean ok = true;
if (verbose)
Timer.showStdErr("Predicting variants");
if (inputFiles == null) {
// Single input file, output to STDOUT (typical usage)
ok = annotate(inputFile, null);
} else {
// Multiple input and output files
for (String inputFile : inputFiles) {
String outputFile = outputFile(inputFile);
if (verbose)
Timer.showStdErr("Analyzing file" //
+ "\n\tInput : '" + inputFile + "'" //
+ "\n\tOutput : '" + outputFile + "'" //
+ (createSummaryHtml ? "\n\tSummary (HTML): '" + summaryFileHtml + "'" : "") //
+ (createSummaryCsv ? "\n\tSummary (CSV) : '" + summaryFileCsv + "'" : "") //
);
ok &= annotate(inputFile, outputFile);
}
}
if (verbose)
Timer.showStdErr("done.");
if (!ok)
return null;
if (vcfEntriesDebug == null)
return new ArrayList<>();
return vcfEntriesDebug;
}
public void setFormatVersion(EffFormatVersion formatVersion) {
this.formatVersion = formatVersion;
}
/**
* Should we annotate cancer variants?
*/
boolean shouldAnnotateVariantCancer(List variants, VcfEntry vcfEntry) {
if (vcfEntry.isMultiallelic())
return true;
// Bi-allelic are analyzed only if there are "back to REF" mutation
return pedigree.anyBackToRef(vcfEntry);
}
/**
* Show annotation progress
*/
void showProgress() {
countVariants++;
if (verbose && (countVariants % SHOW_EVERY == 0)) {
int millisec = ((int) annotateTimer.elapsed());
int secs = millisec / 1000;
if (secs > 0) {
int varsPerSec = (int) (countVariants * 1000.0 / millisec);
Timer.showStdErr("\t" + countVariants + " variants (" + varsPerSec + " variants per second), "
+ countVcfEntries + " VCF entries");
}
}
}
/**
* Creates a summary output file (using freeMarker and a template)
*/
boolean summary(String templateFile, String outputFile, boolean noCommas) {
try {
// Configure FreeMaker
Configuration cfg = new Configuration();
// Specify the data source where the template files come from
if (useLocalTemplate)
cfg.setDirectoryForTemplateLoading(new File("./templates/")); // Use local 'template' directory
else
cfg.setClassForTemplateLoading(SnpEffCmdEff.class, "/"); // Use current directory in JAR file
cfg.setObjectWrapper(new DefaultObjectWrapper()); // Specify how templates will see the data-model. This is
// an advanced topic...
cfg.setLocale(java.util.Locale.US);
if (noCommas)
cfg.setNumberFormat("0.######");
// Create the root hash (where data objects are)
HashMap root = summaryCreateHash();
// Get the template
Template temp = cfg.getTemplate(templateFile);
// Process the template
Writer out = new OutputStreamWriter(new FileOutputStream(new File(outputFile)));
temp.process(root, out);
out.flush();
out.close();
} catch (IOException e) {
error(e, "Error creating summary: " + e.getMessage());
return false;
} catch (TemplateException e) {
error(e, "Error creating summary: " + e.getMessage());
return false;
}
return true;
}
/**
* Create a hash with all variables needed for creating summary pages
*/
HashMap summaryCreateHash() {
// Create the root hash (where data objects are)
HashMap root = new HashMap<>();
root.put("args", commandLineStr(createSummaryCsv ? false : true));
root.put("changeStats", variantEffectStats);
root.put("chromoPlots", chromoPlots);
root.put("countEffects", countEffects);
root.put("countInputLines", countInputLines);
root.put("countVariants", countVariants);
root.put("date", String.format("%1$TY-%1$Tm-%1$Td %1$TH:%1$TM", new Date()));
root.put("genesFile", Gpr.baseName(summaryGenesFile, ""));
root.put("genome", config.getGenome());
root.put("genomeVersion", genomeVer);
root.put("variantEffectResutFilter", variantEffectResutFilter);
root.put("variantStats", variantStats);
root.put("snpEffectPredictor", config.getSnpEffectPredictor());
root.put("vcfStats", vcfStats);
root.put("version", SnpEff.VERSION); // Version used
return root;
}
/**
* Show 'usage;' message and exit with an error code '-1'
*
* @param message
*/
@Override
public void usage(String message) {
if (message != null) {
System.err.println("Error :\t" + message);
System.err.println("Command line :\t" + commandLineStr(false) + "\n");
}
System.err.println("snpEff version " + VERSION);
System.err.println("Usage: snpEff [eff] [options] genome_version [input_file]");
System.err.println("\n");
System.err.println("\tvariants_file : Default is STDIN");
System.err.println("\n");
System.err.println("\nOptions:");
System.err.println(
"\t-chr : Prepend 'string' to chromosome name (e.g. 'chr1' instead of '1'). Only on TXT output.");
System.err.println(
"\t-classic : Use old style annotations instead of Sequence Ontology and Hgvs.");
System.err.println("\t-csvStats : Create CSV summary file.");
System.err.println(
"\t-download : Download reference genome if not available. Default: " + download);
System.err.println("\t-i : Input format [ vcf, bed ]. Default: VCF.");
System.err.println("\t-fileList : Input actually contains a list of files to process.");
System.err
.println("\t-o : Ouput format [ vcf, gatk, bed, bedAnn ]. Default: VCF.");
System.err.println("\t-s , -stats, -htmlStats : Create HTML summary file. Default is '"
+ DEFAULT_SUMMARY_HTML_FILE + "'");
System.err.println("\t-noStats : Do not create stats (summary) file");
System.err.println("\nResults filter options:");
System.err.println(
"\t-fi , -filterInterval : Only analyze changes that intersect with the intervals specified in this file (you may use this option many times)");
System.err.println("\t-no-downstream : Do not show DOWNSTREAM changes");
System.err.println("\t-no-intergenic : Do not show INTERGENIC changes");
System.err.println("\t-no-intron : Do not show INTRON changes");
System.err.println("\t-no-upstream : Do not show UPSTREAM changes");
System.err.println("\t-no-utr : Do not show 5_PRIME_UTR or 3_PRIME_UTR changes");
System.err.println(
"\t-no : Do not show 'EffectType'. This option can be used several times.");
System.err.println("\nAnnotations options:");
System.err.println(
"\t-cancer : Perform 'cancer' comparisons (Somatic vs Germline). Default: "
+ cancer);
System.err.println(
"\t-cancerSamples : Two column TXT file defining 'oringinal \\t derived' samples.");
System.err.println(
"\t-fastaProt : Create an output file containing the resulting protein sequences.");
System.err.println(
"\t-formatEff : Use 'EFF' field compatible with older versions (instead of 'ANN').");
System.err
.println("\t-geneId : Use gene ID instead of gene name (VCF output). Default: "
+ useGeneId);
System.err.println(
"\t-hgvs : Use HGVS annotations for amino acid sub-field. Default: " + hgvs);
System.err.println("\t-hgvsOld : Use old HGVS notation. Default: " + hgvsOld);
System.err.println(
"\t-hgvs1LetterAa : Use one letter Amino acid codes in HGVS notation. Default: "
+ hgvsOneLetterAa);
System.err.println(
"\t-hgvsTrId : Use transcript ID in HGVS notation. Default: " + hgvsTrId);
System.err.println(
"\t-lof : Add loss of function (LOF) and Nonsense mediated decay (NMD) tags.");
System.err.println("\t-noHgvs : Do not add HGVS annotations.");
System.err.println("\t-noLof : Do not add LOF and NMD annotations.");
System.err.println(
"\t-noShiftHgvs : Do not shift variants according to HGVS notation (most 3prime end).");
System.err.println("\t-oicr : Add OICR tag in VCF file. Default: " + useOicr);
System.err.println(
"\t-sequenceOntology : Use Sequence Ontology terms. Default: " + useSequenceOntology);
usageGenericAndDb();
System.exit(-1);
}
/**
* Create a cancer variant using alt and ref genotypes
*/
VariantNonRef variantCancer(List variants, int altGtNum, int refGtNum) {
// Is this a "back to reference" variant?
// Example:
// Germline: 'A' -> 'T'
// Somatic: 'T' -> 'A' (i.e. reverses the gerline mutation to the genome
// reference)
if (altGtNum == 0) {
Variant variantRef = variants.get(refGtNum - 1); // After applying this variant, we get the new 'reference'
Variant variantAlt = variantRef.reverse(); // This our new 'variant'. The effect of this variant is "back to
// reference"
return new VariantNonRef(variantAlt, variantRef);
}
Variant variantRef = variants.get(refGtNum - 1); // After applying this variant, we get the new 'reference'
Variant variantAlt = variants.get(altGtNum - 1); // This our new 'variant'
return new VariantNonRef(variantAlt, variantRef);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy