
org.snpeff.snpEffect.commandLine.SnpEffCmdBuild Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package org.snpeff.snpEffect.commandLine;
import java.io.File;
import java.util.Collection;
import java.util.Map;
import org.snpeff.RegulationConsensusMultipleBed;
import org.snpeff.RegulationFileConsensus;
import org.snpeff.RegulationFileSplitBytType;
import org.snpeff.SnpEff;
import org.snpeff.codons.FindRareAaIntervals;
import org.snpeff.fileIterator.MotifFileIterator;
import org.snpeff.fileIterator.RegulationFileIterator;
import org.snpeff.fileIterator.RegulationGffFileIterator;
import org.snpeff.interval.ExonSpliceCharacterizer;
import org.snpeff.interval.Markers;
import org.snpeff.interval.Motif;
import org.snpeff.interval.RareAminoAcid;
import org.snpeff.motif.Jaspar;
import org.snpeff.snpEffect.SnpEffectPredictor;
import org.snpeff.snpEffect.factory.SnpEffPredictorFactory;
import org.snpeff.snpEffect.factory.SnpEffPredictorFactoryEmbl;
import org.snpeff.snpEffect.factory.SnpEffPredictorFactoryGenBank;
import org.snpeff.snpEffect.factory.SnpEffPredictorFactoryGenesFile;
import org.snpeff.snpEffect.factory.SnpEffPredictorFactoryGff2;
import org.snpeff.snpEffect.factory.SnpEffPredictorFactoryGff3;
import org.snpeff.snpEffect.factory.SnpEffPredictorFactoryGtf22;
import org.snpeff.snpEffect.factory.SnpEffPredictorFactoryKnownGene;
import org.snpeff.snpEffect.factory.SnpEffPredictorFactoryRefSeq;
import org.snpeff.util.Gpr;
import org.snpeff.util.Timer;
/**
* Command line program: Build database
*
* @author pcingola
*/
public class SnpEffCmdBuild extends SnpEff {
GeneDatabaseFormat geneDatabaseFormat; // Database format (only used if 'buildDb' is active)
boolean checkNumOk = true;
boolean storeAlignments; // Store alignments (used for some test cases)
boolean storeSequences = false; // Store full sequences
boolean regSortedByType = false;
String cellType = null;
SnpEffCmdProtein snpEffCmdProtein;
SnpEffCmdCds snpEffCmdCds;
public SnpEffCmdBuild() {
super();
geneDatabaseFormat = null; // GeneDatabaseFormat.GTF22; // Database format (only used if 'buildDb' is active)
}
/**
* Check if database is OK
*/
void checkDb(SnpEffPredictorFactory snpEffectPredictorFactory) {
//---
// Check using CDS file
//---
String cdsFile = config.getFileNameCds();
if (Gpr.canRead(cdsFile)) {
// Use FASTA format
if (verbose) Timer.showStdErr("CDS check (FASTA file): '" + cdsFile + "'\n");
snpEffCmdCds = new SnpEffCmdCds(config);
snpEffCmdCds.setVerbose(verbose);
snpEffCmdCds.setDebug(debug);
snpEffCmdCds.setStoreAlignments(storeAlignments);
snpEffCmdCds.setCheckNumOk(checkNumOk);
snpEffCmdCds.run();
} else if (debug) Timer.showStdErr("\tOptional file '" + cdsFile + "' not found, nothing done.");
//---
// Check using proteins file
//---
String protFile = null;
Map proteinByTrId = snpEffectPredictorFactory.getProteinByTrId(); // Some factories provide the transcritpId -> protein_sequence mapping
if (geneDatabaseFormat == GeneDatabaseFormat.GENBANK) {
// GenBank format
protFile = config.getBaseFileNameGenes() + SnpEffPredictorFactoryGenBank.EXTENSION_GENBANK;
} else if (geneDatabaseFormat == GeneDatabaseFormat.EMBL) {
// EMBL format
protFile = config.getBaseFileNameGenes() + SnpEffPredictorFactoryEmbl.EXTENSION_EMBL;
} else {
// Protein fasta file
protFile = config.getFileNameProteins();
}
if (Gpr.canRead(protFile)) {
if (verbose) Timer.showStdErr("Protein check file: '" + protFile + "'\n");
snpEffCmdProtein = new SnpEffCmdProtein(config, protFile);
snpEffCmdProtein.setVerbose(verbose);
snpEffCmdProtein.setDebug(debug);
snpEffCmdProtein.setStoreAlignments(storeAlignments);
snpEffCmdProtein.setCheckNumOk(checkNumOk);
snpEffCmdProtein.setProteinByTrId(proteinByTrId);
snpEffCmdProtein.run();
} else if (debug) Timer.showStdErr("\tOptional file '" + protFile + "' not found, nothing done.");
}
/**
* Create SnpEffectPredictor
*/
SnpEffPredictorFactory createSnpEffPredictorFactory() {
if (geneDatabaseFormat == null) geneDatabaseFormat = guessGenesFormat();
// Create factory
SnpEffPredictorFactory factory = null;
if (geneDatabaseFormat == GeneDatabaseFormat.GTF22) factory = new SnpEffPredictorFactoryGtf22(config);
else if (geneDatabaseFormat == GeneDatabaseFormat.GFF3) factory = new SnpEffPredictorFactoryGff3(config);
else if (geneDatabaseFormat == GeneDatabaseFormat.GFF2) factory = new SnpEffPredictorFactoryGff2(config);
else if (geneDatabaseFormat == GeneDatabaseFormat.REFSEQ) factory = new SnpEffPredictorFactoryRefSeq(config);
else if (geneDatabaseFormat == GeneDatabaseFormat.KNOWN_GENES) factory = new SnpEffPredictorFactoryKnownGene(config);
else if (geneDatabaseFormat == GeneDatabaseFormat.GENBANK) factory = new SnpEffPredictorFactoryGenBank(config);
else if (geneDatabaseFormat == GeneDatabaseFormat.EMBL) factory = new SnpEffPredictorFactoryEmbl(config);
else if (geneDatabaseFormat == GeneDatabaseFormat.BIOMART) factory = new SnpEffPredictorFactoryGenesFile(config);
else throw new RuntimeException("Unimplemented format " + geneDatabaseFormat);
// Create SnpEffPredictor
factory.setVerbose(verbose);
factory.setDebug(debug);
factory.setStoreSequences(storeSequences);
return factory;
}
/**
* Does either 'path' or 'path'+'.gz' exist?
*/
protected boolean fileExists(String path) {
return Gpr.exists(path) || Gpr.exists(path + ".gz");
}
public SnpEffCmdCds getSnpEffCmdCds() {
return snpEffCmdCds;
}
public SnpEffCmdProtein getSnpEffCmdProtein() {
return snpEffCmdProtein;
}
/**
* Try to guess database format by checking which file type is present
*/
protected GeneDatabaseFormat guessGenesFormat() {
String genesBase = config.getBaseFileNameGenes();
if (fileExists(genesBase + ".gtf")) return GeneDatabaseFormat.GTF22;
if (fileExists(genesBase + ".gff")) return GeneDatabaseFormat.GFF3;
if (fileExists(genesBase + ".gff2")) return GeneDatabaseFormat.GFF2;
if (fileExists(genesBase + ".gbk")) return GeneDatabaseFormat.GENBANK;
if (fileExists(genesBase + ".embl")) return GeneDatabaseFormat.EMBL;
if (fileExists(genesBase + ".refseq")) return GeneDatabaseFormat.REFSEQ;
if (fileExists(genesBase + ".kg")) return GeneDatabaseFormat.KNOWN_GENES;
if (fileExists(genesBase + ".biomart")) return GeneDatabaseFormat.BIOMART;
if (geneDatabaseFormat == null) fatalError("Cannot guess input database format for genome '" + genomeVer + "'. No genes file found '" + genesBase + ".*'");
return null;
}
/**
* Parse command line arguments
*/
@Override
public void parseArgs(String[] args) {
this.args = args;
for (int i = 0; i < args.length; i++) {
String arg = args[i];
// It is an argument?
if (isOpt(arg)) {
switch (arg.toLowerCase()) {
case "-gff3":
geneDatabaseFormat = GeneDatabaseFormat.GFF3;
break;
case "-gff2":
geneDatabaseFormat = GeneDatabaseFormat.GFF2;
break;
case "-gtf22":
geneDatabaseFormat = GeneDatabaseFormat.GTF22;
break;
case "-refseq":
geneDatabaseFormat = GeneDatabaseFormat.REFSEQ;
break;
case "-genbank":
geneDatabaseFormat = GeneDatabaseFormat.GENBANK;
break;
case "-knowngenes":
geneDatabaseFormat = GeneDatabaseFormat.KNOWN_GENES;
break;
case "-embl":
geneDatabaseFormat = GeneDatabaseFormat.EMBL;
break;
case "-txt":
geneDatabaseFormat = GeneDatabaseFormat.BIOMART;
break;
case "-storeseqs":
storeSequences = true;
break;
case "-nostoreseqs":
storeSequences = false;
break;
case "-onlyreg":
onlyRegulation = true;
break;
case "-celltype":
if ((i + 1) < args.length) cellType = args[++i];
else usage("Missing 'regType' argument");
break;
case "-regsortedbytype":
regSortedByType = true;
break;
default:
usage("Unknown option '" + arg + "'");
}
} else if (genomeVer.length() <= 0) genomeVer = arg;
else usage("Unknown parameter '" + arg + "'");
}
// Check: Do we have all required parameters?
if (genomeVer.isEmpty()) usage("Missing genomer_version parameter");
}
/**
* Calculate and add annotations for rare amino acids
*/
void rareAa(SnpEffectPredictor snpEffectPredictor) {
if (verbose) Timer.showStdErr("[Optional] Rare amino acid annotations");
String proteinsFile = config.getFileNameProteins();
try {
// Find a list of 'rare' amino acids
FindRareAaIntervals findRare = new FindRareAaIntervals(snpEffectPredictor.getGenome());
findRare.setVerbose(verbose);
Collection raas = findRare.findRareAa(proteinsFile);
// Add them all
for (RareAminoAcid raa : raas) {
if (verbose) System.err.println("\tAdding: " + raa);
snpEffectPredictor.add(raa);
}
if (verbose) Timer.showStdErr("Done.");
} catch (Throwable t) {
// If file does not exists, no problem
if (verbose) Timer.showStdErr("Warning: Cannot read optional protein sequence file '" + proteinsFile + "', nothing done.");
if (debug) t.printStackTrace();
}
}
/**
* Read regulatory elements from multiple BED files
*/
void readRegulationBed() {
if (verbose) Timer.showStdErr("[Optional] Reading regulation elements: BED ");
String inDir = config.getDirRegulationBed();
String outDir = config.getDirDataGenomeVersion();
// Is the directory present?
File dir = new File(inDir);
if (!dir.exists() || !dir.isDirectory()) {
if (verbose) Timer.showStdErr("Cannot find optional regulation dir '" + inDir + "', nothing done.");
return;
}
RegulationConsensusMultipleBed regBeds = new RegulationConsensusMultipleBed(inDir, outDir);
regBeds.setVerbose(verbose);
regBeds.setCellType(cellType);
regBeds.run();
}
/**
* Read regulation elements (only GFF3 file supported)
*/
void readRegulationGff() {
if (verbose) Timer.showStdErr("[Optional] Reading regulation elements: GFF");
String regulationFileName = config.getBaseFileNameRegulation() + ".gff";
// If file does not exists, no problem
if (!Gpr.canRead(regulationFileName)) {
if (verbose) Timer.showStdErr("Warning: Cannot read optional regulation file '" + regulationFileName + "', nothing done.");
return;
}
// Split large GFF files into smaller ones
RegulationFileIterator regulationFileIterator = new RegulationGffFileIterator(regulationFileName);
RegulationFileSplitBytType regSplit = new RegulationFileSplitBytType();
regSplit.setVerbose(verbose);
regSplit.splitFile(regulationFileIterator, config.getDirDataGenomeVersion());
// Create database for each individual GFF file
for (String regFileName : regSplit.getRegFileNames()) {
// Open the regulation file and create a consensus
regulationFileIterator = new RegulationGffFileIterator(regFileName);
RegulationFileConsensus regulationGffConsensus = new RegulationFileConsensus();
regulationGffConsensus.setVerbose(verbose);
regulationGffConsensus.setOutputDir(config.getDirDataGenomeVersion());
regulationGffConsensus.readFile(regulationFileIterator); // Read info from file
regulationGffConsensus.save(); // Save database
}
if (verbose) Timer.showStdErr("Done.");
}
/**
* Read regulation motif files
*/
void readRegulationMotif() {
if (verbose) Timer.showStdErr("[Optional] Reading motifs: GFF");
String motifFileName = config.getBaseFileNameMotif() + ".gff";
String motifBinFileName = config.getBaseFileNameMotif() + ".bin";
String pwmsFileName = config.getDirDataGenomeVersion() + "/pwms.bin";
if (!Gpr.exists(pwmsFileName)) {
if (verbose) Timer.showStdErr("Warning: Cannot open PWMs file " + pwmsFileName + ". Nothing done");
return;
}
try {
// Load all PWMs
if (verbose) Timer.showStdErr("\tLoading PWMs from : " + pwmsFileName);
Jaspar jaspar = new Jaspar();
jaspar.load(pwmsFileName);
// Open the regulation file and create a consensus
if (verbose) Timer.showStdErr("\tLoading motifs from : " + motifFileName);
MotifFileIterator motifFileIterator = new MotifFileIterator(motifFileName, config.getGenome(), jaspar);
Markers motifs = new Markers();
for (Motif motif : motifFileIterator)
motifs.add(motif);
if (verbose) Timer.showStdErr("\tLoadded motifs: " + motifs.size());
if (verbose) Timer.showStdErr("\tSaving motifs to: " + motifBinFileName);
motifs.save(motifBinFileName);
} catch (Throwable t) {
// If file does not exists, no problem
if (verbose) Timer.showStdErr("Warning: Cannot read optional motif file '" + motifFileName + "', nothing done.");
if (debug) t.printStackTrace();
}
}
/**
* Build database
*/
@Override
public boolean run() {
if (verbose) Timer.showStdErr("Building database for '" + genomeVer + "'");
loadConfig(); // Read configuration file
// Create SnpEffectPredictor
if (!onlyRegulation) {
SnpEffPredictorFactory snpEffectPredictorFactory = createSnpEffPredictorFactory();
SnpEffectPredictor snpEffectPredictor = snpEffectPredictorFactory.create();
config.setSnpEffectPredictor(snpEffectPredictor);
// Characterize exons (if possible)
ExonSpliceCharacterizer exonSpliceCharacterizer = new ExonSpliceCharacterizer(snpEffectPredictor.getGenome());
exonSpliceCharacterizer.setVerbose(verbose);
exonSpliceCharacterizer.characterize();
// Add read rare codons annotations, if possible
rareAa(snpEffectPredictor);
// Check database
checkDb(snpEffectPredictorFactory);
// Save database
if (verbose) Timer.showStdErr("Saving database");
snpEffectPredictor.save(config);
}
// Read regulation elements
if (cellType == null) readRegulationGff(); // CellType specific is meant for BED files.
readRegulationBed();
readRegulationMotif();
if (verbose) Timer.showStdErr("Done");
return true;
}
public void setCheckNumOk(boolean checkNumOk) {
this.checkNumOk = checkNumOk;
}
public void setStoreAlignments(boolean storeAlignments) {
this.storeAlignments = storeAlignments;
}
/**
* Show 'usage;' message and exit with an error code '-1'
*/
@Override
public void usage(String message) {
if (message != null) System.err.println("Error: " + message + "\n");
System.err.println("snpEff version " + VERSION);
System.err.println("Usage: snpEff build [options] genome_version");
System.err.println("\nBuild DB options:");
System.err.println("\nDatabase format option (default: Auto detect):");
System.err.println("\t-embl : Use Embl format.");
System.err.println("\t-genbank : Use GenBank format.");
System.err.println("\t-gff2 : Use GFF2 format (obsolete).");
System.err.println("\t-gff3 : Use GFF3 format.");
System.err.println("\t-gtf22 : Use GTF 2.2 format.");
System.err.println("\t-knowngenes : Use KnownGenes table from UCSC.");
System.err.println("\t-refseq : Use RefSeq table from UCSC.");
System.err.println("\nDatabase build options:");
System.err.println("\t-cellType : Only build regulation tracks for cellType .");
System.err.println("\t-noStoreSeqs : Do not store sequence in binary files. Default: " + !storeSequences);
System.err.println("\t-onlyReg : Only build regulation tracks.");
System.err.println("\t-regSortedByType : The 'regulation.gff' file is sorted by 'regulation type' instead of sorted by chromosome:pos. Default: " + regSortedByType);
System.err.println("\t-storeSeqs : Store sequence in binary files. Default: " + storeSequences);
usageGeneric();
System.exit(-1);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy