
org.snpeff.snpEffect.commandLine.SnpEffCmdGsa Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package org.snpeff.snpEffect.commandLine;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import org.snpeff.SnpEff;
import org.snpeff.collections.AutoHashMap;
import org.snpeff.fileIterator.BedFileIterator;
import org.snpeff.fileIterator.LineFileIterator;
import org.snpeff.fileIterator.VcfFileIterator;
import org.snpeff.geneSets.GeneSets;
import org.snpeff.geneSets.GeneSetsRanked;
import org.snpeff.geneSets.GeneStats;
import org.snpeff.geneSets.algorithm.EnrichmentAlgorithm;
import org.snpeff.geneSets.algorithm.EnrichmentAlgorithmGreedyVariableSize;
import org.snpeff.geneSets.algorithm.FisherPValueAlgorithm;
import org.snpeff.geneSets.algorithm.FisherPValueGreedyAlgorithm;
import org.snpeff.geneSets.algorithm.LeadingEdgeFractionAlgorithm;
import org.snpeff.geneSets.algorithm.NoneAlgorithm;
import org.snpeff.geneSets.algorithm.RankSumPValueAlgorithm;
import org.snpeff.geneSets.algorithm.RankSumPValueGreedyAlgorithm;
import org.snpeff.geneSets.algorithm.EnrichmentAlgorithm.EnrichmentAlgorithmType;
import org.snpeff.gsa.ChrPosScoreList;
import org.snpeff.gsa.PvaluesList;
import org.snpeff.gsa.ScoreList;
import org.snpeff.gsa.ScoreList.ScoreSummary;
import org.snpeff.interval.Chromosome;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Genome;
import org.snpeff.interval.Marker;
import org.snpeff.interval.Markers;
import org.snpeff.interval.Variant;
import org.snpeff.interval.VariantWithScore;
import org.snpeff.snpEffect.SnpEffectPredictor;
import org.snpeff.util.Gpr;
import org.snpeff.util.Timer;
import org.snpeff.vcf.VcfEntry;
/**
* Command line: Gene-Sets Analysis
*
* Perform gene set analysys
*
* @author pcingola
*/
public class SnpEffCmdGsa extends SnpEff {
public static int READ_INPUT_SHOW_EVERY = 1000;
public static int MAX_WARNS = 20;
InputFormat inputFormat = InputFormat.VCF;
boolean useClosestGene = false; // Map to 'any' closest gene?
boolean useGeneId = false; // Use geneId instead of geneName
boolean usePvalues = true;
boolean removeUnusedSets = false;
boolean orderDescending = false; // If 'true', high scores are better (sort descending and get the first values)
int upDownStreamLength = SnpEffectPredictor.DEFAULT_UP_DOWN_LENGTH;
int minGeneSetSize = 0;
int maxGeneSetSize = Integer.MAX_VALUE;
int numberofGeneSetsToSelect = 20;
int initGeneSetSize = 100;
int randIterations = 0;
double maxPvalueAdjusted = 0.05;
double maxPvalue = Double.NaN;
double interestingPerc = 0.05;
String inputFile = "";
String infoName = "";
String msigdb = "";
String geneScoreFile = "";
String geneScoreFileSave = null;
String commandsFile = null;
String geneInterestingFile = "";
String saveFile = null;
String correctionCmd = null;
ScoreSummary scoreSummary = ScoreSummary.MIN;
SnpEffectPredictor snpEffectPredictor;
Genome genome;
GeneSets geneSets;
ChrPosScoreList chrPosScoreList; // List of
AutoHashMap geneScores; // A map of geneId -> List[scores]
HashMap geneScore; // A map
HashSet genesInteresting; // A set of interesting genes
EnrichmentAlgorithmType enrichmentAlgorithmType = EnrichmentAlgorithmType.RANKSUM_GREEDY;
public SnpEffCmdGsa() {
super();
}
/**
* Read config file, load & build database
*/
protected void config() {
loadConfig();
// Read database (if gene level scores are provided, we don't need to map p_values to genes (we can skip this step)
if (geneScoreFile.isEmpty() && geneInterestingFile.isEmpty()) {
loadDb();
// Set upstream-downstream interval length
SnpEffectPredictor snpEffectPredictor = config.getSnpEffectPredictor();
snpEffectPredictor.setUpDownStreamLength(upDownStreamLength);
// Build tree
if (verbose) Timer.showStdErr("Building interval forest");
snpEffectPredictor.buildForest();
if (verbose) Timer.showStdErr("done.");
}
}
/**
* Correct scores (e.g. using covariates)
*/
void correctScores() {
// Assign input and output file names (scores and residues files)
String scoresFile = geneScoreFileSave;
String residuesFile;
try {
if (geneScoreFileSave == null) {
// No predefined name
scoresFile = File.createTempFile("geneScoreFile_in_", ".txt").getCanonicalPath();
residuesFile = File.createTempFile("geneScoreFile_out_", ".txt").getCanonicalPath();
// Delete tmp files on exit
new File(scoresFile).deleteOnExit();
new File(residuesFile).deleteOnExit();
} else {
// Predefined name, use residues file accordingly
residuesFile = Gpr.dirName(geneScoreFileSave) + "/" + Gpr.baseName(geneScoreFileSave) + ".corrected.txt";
}
} catch (IOException e) {
throw new RuntimeException(e);
}
// No correction method? No scores?
// Nothing to do
if (correctionCmd == null || geneScores.isEmpty()) {
if (geneScoreFileSave != null) {
createScoresFile(geneScoreFileSave, false); // may be we were asked to write these files anyway...
createScoresFile(residuesFile, true); // Only scores in this files (no additional information). Since we don't perform correction, residues and scores are the same
}
return;
}
// Create scores files
createScoresFile(scoresFile, false);
//---
// Invoke method
//---
String commandLine = correctionCmd + " " + scoresFile + " " + residuesFile;
try {
if (verbose) Timer.showStdErr("Correction: Invoking command " + commandLine);
Process process = Runtime.getRuntime().exec(commandLine);
process.waitFor();
if (process.exitValue() > 0) throw new RuntimeException("Process execution error, exit value '" + process.exitValue() + "'\n\tCommand line:\t" + commandLine);
} catch (Exception e) {
throw new RuntimeException("Error executing command: " + commandLine, e);
}
//---
// Read output
//---
if (verbose) Timer.showStdErr("Correction: Reading results from file '" + residuesFile + "'");
if (!Gpr.canRead(residuesFile)) throw new RuntimeException("Cannot read correction's results from file '" + residuesFile + "'");
geneScore = new HashMap();
String lines[] = Gpr.readFile(residuesFile).split("\n");
for (String line : lines) {
// Parse line
String recs[] = line.split("\t");
String geneId = recs[0];
double score = Gpr.parseDoubleSafe(recs[1]);
// Add to map
geneScore.put(geneId, score);
}
if (verbose) Timer.showStdErr("Correction: Done, " + lines.length + " values added.");
}
/**
* Create interesting genes
*/
void createInterestingGenes() {
if (!geneInterestingFile.isEmpty()) createInterestingGenesFile();
else createInterestingGenesScores();
}
/**
* Create interesting genes
*/
void createInterestingGenesFile() {
int hasGene = 0;
// Interesting genes from file
for (String geneId : genesInteresting) {
if (geneSets.hasGene(geneId)) hasGene++;
geneSets.addInteresting(geneId);
}
if (verbose) {
Timer.showStdErr("Intereting genes from file" //
+ "\n\tIntereting genes in file : " + genesInteresting.size() //
+ "\n\tFound genes : " + hasGene //
);
}
}
/**
* Create interesting genes
*/
void createInterestingGenesScores() {
// Get
ScoreList scores = new ScoreList();
for (double pval : geneScore.values())
scores.add(pval);
// Get p-value threshold
double quantile = interestingPerc;
if (orderDescending) quantile = 1 - interestingPerc;
double scoreThreshold = scores.quantile(quantile);
// Mark all scores lower than that as 'interesting'
int count = 0, countAdded = 0;
geneSets.setDoNotAddIfNotInGeneSet(true);
for (String geneId : geneScore.keySet()) {
if ((orderDescending && (geneScore.get(geneId) >= scoreThreshold)) //
|| (!orderDescending && (geneScore.get(geneId) <= scoreThreshold)) //
) {
if (geneSets.addInteresting(geneId)) countAdded++; // Count added genes
count++;
}
}
// Show info
if (verbose) {
double realPerc = (100.0 * count) / geneScore.size();
double realPercAdded = (100.0 * countAdded) / geneScore.size();
Timer.showStdErr(String.format("Score threshold:"//
+ "\n\tRange : [ %f , %f ]"//
+ "\n\tQuantile : %.2f%%"//
+ "\n\tThreshold : %f"//
+ "\n\tInteresting genes : %d (%.2f%%)" //
+ "\n\tInteresting genes added : %d (%.2f%%)" //
, scores.min(), scores.max(), 100.0 * interestingPerc, scoreThreshold, count, realPerc, countAdded, realPercAdded));
}
}
/**
* Creates a file with scores and several gene values.
* Users can create regression algorithms (e.g. in R) and return the residuals
*
* @param fileName
*/
void createScoresFile(String fileName, boolean scoresOnly) {
StringBuilder scores = new StringBuilder();
if (geneScoreFileSave != null) {
scores.append("geneId\tscore");
if (!scoresOnly) scores.append("\tscoreCount\t" + (new GeneStats()).title() + "\n");
scores.append("\n");
}
// Calculate statistics on each gene
AutoHashMap genesStats = new AutoHashMap(new GeneStats());
for (Gene gene : genome.getGenes()) {
String geneName = useGeneId ? gene.getId() : gene.getGeneName();
genesStats.getOrCreate(geneName).add(gene, useGeneId);
}
// Add all genes to output buffer
for (String geneId : geneScores.keySet()) {
// Calculate aggregated score
ScoreList gpl = geneScores.get(geneId);
double score = gpl.score(scoreSummary);
// Add to output
scores.append(geneId + "\t" + score);
if (!scoresOnly) scores.append("\t" + gpl.size() + "\t" + genesStats.getOrCreate(geneId));
scores.append("\n");
}
// Save to file
if (verbose) Timer.showStdErr("Saving gene scores to file: '" + fileName + "'");
Gpr.toFile(fileName, scores);
}
/**
* Perform enrichment analysis
*/
void enrichmentAnalysis() {
GeneSetsRanked geneSetsRanked = null;
// Initialize gene set values
if (geneScore != null) {
for (String geneId : geneScore.keySet())
geneSets.setValue(geneId, geneScore.get(geneId));
// Do we need to rank? Rank them by ascending p-value
geneSets.setVerbose(verbose);
if (enrichmentAlgorithmType.isRank()) {
geneSetsRanked = new GeneSetsRanked(geneSets);
geneSetsRanked.rankByValue(!orderDescending);
if (removeUnusedSets) geneSetsRanked.removeUnusedSets(); // Remove unused gene sets
} else {
if (removeUnusedSets) geneSets.removeUnusedSets(); // Remove unused gene sets
}
}
//---
// Run enrichment algorithm
//---
EnrichmentAlgorithm algorithm = null;
switch (enrichmentAlgorithmType) {
case NONE:
algorithm = new NoneAlgorithm(geneSets);
break;
case RANKSUM_GREEDY:
algorithm = new RankSumPValueGreedyAlgorithm(geneSetsRanked, numberofGeneSetsToSelect);
break;
case RANKSUM:
algorithm = new RankSumPValueAlgorithm(geneSetsRanked, numberofGeneSetsToSelect);
break;
case FISHER_GREEDY:
algorithm = new FisherPValueGreedyAlgorithm(geneSets, numberofGeneSetsToSelect);
break;
case FISHER:
algorithm = new FisherPValueAlgorithm(geneSets, numberofGeneSetsToSelect);
break;
case LEADING_EDGE_FRACTION:
algorithm = new LeadingEdgeFractionAlgorithm(geneSets, numberofGeneSetsToSelect, orderDescending);
break;
default:
throw new RuntimeException("Unimplemented algorithm!");
}
// Create 'interesting' genes
if (enrichmentAlgorithmType.isBinary()) createInterestingGenes();
// Initialize algorithm parameters
algorithm.setMaxGeneSetSize(maxGeneSetSize);
algorithm.setMinGeneSetSize(minGeneSetSize);
algorithm.setMaxPValue(maxPvalue);
algorithm.setMaxPvalueAdjusted(maxPvalueAdjusted);
algorithm.setVerbose(verbose);
algorithm.setDebug(debug);
// if (enrichmentAlgorithmType.isRank() && enrichmentAlgorithmType.isGreedy()) {
if (enrichmentAlgorithmType.isGreedy()) {
if (debug) Gpr.debug("Setting initGeneSetSize:" + initGeneSetSize);
((EnrichmentAlgorithmGreedyVariableSize) algorithm).setInitialSize(initGeneSetSize);
}
// Run algorithm
algorithm.select();
if (saveFile != null) {
if (verbose) Timer.showStdErr("Saving results to '" + saveFile + "'");
Gpr.toFile(saveFile, algorithm.getOutput());
}
}
/**
* Initialize: read config, database, etc.
*/
void initialize() {
// Read config file
if (config == null) config();
// Read database (if gene level scores are provided, we don't neet to map p_values to genes (we can skip this step)
if (geneScoreFile.isEmpty() && geneInterestingFile.isEmpty()) {
snpEffectPredictor = config.getSnpEffectPredictor();
genome = config.getGenome();
}
// Read gene set database
if (verbose) Timer.showStdErr("Reading MSigDb from file: '" + msigdb + "'");
geneSets = enrichmentAlgorithmType.isRank() ? new GeneSetsRanked(msigdb) : new GeneSets(msigdb);
if (verbose) Timer.showStdErr("Done." //
+ "\n\t\tGene sets added : " + geneSets.getGeneSetCount() //
+ "\n\t\tGenes added : " + geneSets.getGeneCount() //
);
}
/**
* Map to gene
*/
void mapToGenes() {
if (verbose) Timer.showStdErr("Mapping scores to genes.");
// Create an auto-hash
if (usePvalues) geneScores = new AutoHashMap(new PvaluesList());
else geneScores = new AutoHashMap(new ScoreList());
//---
// Map every chr:pos
//---
int unmapped = 0, mappedMultiple = 0;
for (int i = 0; i < chrPosScoreList.size(); i++) {
List geneIds = mapToGenes(chrPosScoreList.getChromosomeName(i), chrPosScoreList.getStart(i), chrPosScoreList.getEnd(i));
// Update counters
if (geneIds == null || geneIds.isEmpty()) {
unmapped++;
continue; // Nothing to do...
} else if (geneIds.size() > 1) mappedMultiple++;
// Add score to every geneId
double score = chrPosScoreList.getScore(i);
for (String geneId : geneIds) {
ScoreList gpl = geneScores.getOrCreate(geneId);
gpl.setGeneId(geneId);
gpl.add(score);
}
}
//---
// Show a summary
//---
if (verbose) Timer.showStdErr("Done:" //
+ "\n\tNumber of scores : " + chrPosScoreList.size() //
+ "\n\tUnmapped : " + unmapped //
+ "\n\tMapped to multiple genes : " + mappedMultiple //
);
if (debug) {
System.err.println("Mapping Gene to Score:");
ArrayList geneIds = new ArrayList(geneScores.keySet());
Collections.sort(geneIds);
for (String geneId : geneIds)
System.err.println("\t" + geneScores.get(geneId));
}
}
/**
* Map a position to a geneId
* @param chr
* @param start
* @return
*/
List mapToGenes(String chr, int start, int end) {
LinkedList geneIds = new LinkedList();
// Query
Marker m = new Marker(genome.getChromosome(chr), start, end, false, "");
// Map only to closest gene?
if (useClosestGene) {
Gene gene = snpEffectPredictor.queryClosestGene(m);
if (gene != null) geneIds.add(useGeneId ? gene.getId() : gene.getGeneName());
return geneIds;
}
// Add all genes to list
Markers hits = snpEffectPredictor.query(m);
for (Marker mm : hits) {
if (mm instanceof Gene) {
Gene gene = (Gene) mm;
geneIds.add(useGeneId ? gene.getId() : gene.getGeneName());
}
}
return geneIds;
}
/**
* Parse command line arguments
*/
@Override
public void parseArgs(String[] args) {
if (args.length == 0) usage(null);
// Parse comamnd line
for (int i = 0; i < args.length; i++) {
String arg = args[i];
// Is it an option?
if (isOpt(arg)) {
if (arg.equals("-i")) {
// Input format
if ((i + 1) < args.length) inputFormat = InputFormat.valueOf(args[++i].toUpperCase());
else usage("Missing input format in command line option '-i'");
} else if (arg.equals("-info")) {
// INFO field name
if ((i + 1) < args.length) infoName = args[++i];
else usage("Missing value in command line option '-info'");
} else if (arg.equals("-ud") || arg.equalsIgnoreCase("-upDownStreamLen")) {
// Up-downstream length
if ((i + 1) < args.length) upDownStreamLength = Gpr.parseIntSafe(args[++i]);
else usage("Missing value in command line option '-ud'");
} else if (arg.equals("-geneScore")) {
// Method for p-value scoring (gene level)
if ((i + 1) < args.length) {
String method = args[++i].toUpperCase();
scoreSummary = ScoreSummary.valueOf(method);
} else usage("Missing value in command line option '-geneScore'");
} else if (arg.equals("-algo")) {
// Algorithm to use
if ((i + 1) < args.length) {
String algo = args[++i].toUpperCase();
enrichmentAlgorithmType = EnrichmentAlgorithmType.valueOf(algo);
} else usage("Missing value in command line option '-algo'");
} else if (arg.equals("-geneScoreFile")) {
// Read gene scores from file
if ((i + 1) < args.length) geneScoreFile = args[++i];
else usage("Missing value in command line option '-geneScoreFile'");
} else if (arg.equals("-saveGeneScoreFile")) {
// Save gene scores to file
if ((i + 1) < args.length) geneScoreFileSave = args[++i];
else usage("Missing value in command line option '-saveGeneScoreFile'");
} else if (arg.equals("-commands")) {
// Load multiple commands from file
if ((i + 1) < args.length) commandsFile = args[++i];
else usage("Missing value in command line option '-commands'");
} else if (arg.equals("-save")) {
// Save results to file
if ((i + 1) < args.length) saveFile = args[++i];
else usage("Missing value in command line option '-save'");
} else if (arg.equals("-correction")) {
// Save results to file
if ((i + 1) < args.length) correctionCmd = args[++i];
else usage("Missing value in command line option '-correction'");
} else if (arg.equals("-maxPvalue")) {
// Save results to file
if ((i + 1) < args.length) maxPvalue = Gpr.parseDoubleSafe(args[++i]);
else usage("Missing value in command line option '-maxPvalue'");
} else if (arg.equals("-maxPvalueAdj")) {
// Save results to file
if ((i + 1) < args.length) maxPvalueAdjusted = Gpr.parseDoubleSafe(args[++i]);
else usage("Missing value in command line option '-maxPvalueAdj'");
} else if (arg.equals("-geneInterestingFile")) {
// Algorithm to use
if ((i + 1) < args.length) geneInterestingFile = args[++i];
else usage("Missing value in command line option '-geneScoreFile'");
} else if (arg.equals("-minSetSize")) minGeneSetSize = Gpr.parseIntSafe(args[++i]);
else if (arg.equals("-maxSetSize")) maxGeneSetSize = Gpr.parseIntSafe(args[++i]);
else if (arg.equals("-initSetSize")) initGeneSetSize = Gpr.parseIntSafe(args[++i]);
else if (arg.equals("-rand")) randIterations = Gpr.parseIntSafe(args[++i]);
else if (arg.equals("-interesting")) interestingPerc = Gpr.parseDoubleSafe(args[++i]);
else if (arg.equals("-mapClosestGene")) useClosestGene = true;
else if (arg.equals("-geneId")) useGeneId = true;
else if (arg.equals("-score")) usePvalues = false;
else if (arg.equals("-desc")) orderDescending = true; // Sort descending: High scores are better
else usage("Unknown option '" + arg + "'");
} else if (genomeVer.isEmpty()) genomeVer = arg;
else if (msigdb.isEmpty()) msigdb = arg;
else if (inputFile.isEmpty()) inputFile = arg;
}
//---
// Sanity checks
//---
if (genomeVer.isEmpty() && geneScoreFile.isEmpty() && geneInterestingFile.isEmpty()) usage("Missing genome version.");
if (commandsFile == null) {
// All these check are only performed when "commands" is not set
if ((inputFormat == InputFormat.VCF) && infoName.isEmpty() && geneScoreFile.isEmpty() && geneInterestingFile.isEmpty()) usage("Missing '-info' comamnd line option.");
if (inputFile.isEmpty()) inputFile = "-"; // Default is STDIN
if (!Gpr.canRead(inputFile)) fatalError("Cannot read input file '" + inputFile + "'");
if (msigdb.isEmpty()) fatalError("Missing Gene-Sets file");
if (!Gpr.canRead(msigdb)) fatalError("Cannot read Gene-Sets file '" + msigdb + "'");
if (maxGeneSetSize <= 0) usage("MaxSetSize must be a positive number.");
if (minGeneSetSize >= maxGeneSetSize) usage("MaxSetSize (" + maxGeneSetSize + ") must larger than MinSetSize (" + minGeneSetSize + ").");
if ((interestingPerc < 0) || (interestingPerc > 1)) usage("Interesting percentile must be in the [0 , 1.0] range.");
if (!geneInterestingFile.isEmpty() && !enrichmentAlgorithmType.isBinary()) usage("Cannot specify '-geneInterestingFile' using algorithm '" + enrichmentAlgorithmType + "'");
} else {
if (!Gpr.canRead(commandsFile)) fatalError("Cannot read commands file '" + commandsFile + "'");
}
}
/**
* Read interesting genes from file
* @param geneScoreFile
*/
void readGeneInteresting(String geneScoreFile) {
if (verbose) Timer.showStdErr("Reading interesting genes file '" + geneInterestingFile + "'");
String lines[] = Gpr.readFile(geneScoreFile).split("\n");
genesInteresting = new HashSet();
for (String g : lines)
genesInteresting.add(g.trim());
if (verbose) Timer.showStdErr("Done. Added: " + genesInteresting.size());
}
/**
* Read gene-Score file
* Format: "geneId \t p_value \n"
*
* @param geneScoreFile
*/
void readGeneScores(String geneScoreFile) {
if (verbose) Timer.showStdErr("Reading gene scores file '" + geneScoreFile + "'");
geneScore = new HashMap();
// Read the whole file
String lines[] = Gpr.readFile(geneScoreFile).split("\n");
// Parse each line
double minp = Double.POSITIVE_INFINITY, maxp = Double.NEGATIVE_INFINITY;
for (String line : lines) {
String rec[] = line.split("\\s");
String geneId = rec[0].trim();
double score = Gpr.parseDoubleSafe(rec[1]);
if ((score > 0) && (score <= 1.0)) { // Assume that a p-value of zero is a parsing error
geneScore.put(geneId, score);
minp = Math.min(minp, score);
maxp = Math.max(maxp, score);
} else if (verbose) System.err.println("\tWarning: Ignoring entry (zero p-value):\t'" + line + "'");
}
if (verbose) Timer.showStdErr("Done."//
+ "\n\tScores added : " + geneScore.size() //
+ "\n\tMin score (p-value) : " + minp //
+ "\n\tMax score (p-value) : " + maxp //
);
}
/**
* Read input file and populate 'chrPosScoreList'
*/
void readInput() {
if (verbose) Timer.showStdErr("Reading input file '" + inputFile + "' (format '" + inputFormat + "')");
switch (inputFormat) {
case VCF:
chrPosScoreList = readInputVcf();
break;
case BED:
chrPosScoreList = readInputBed();
break;
// case TXT:
// chrPosScoreList = readInputTxt();
// break;
default:
fatalError("Input format '" + inputFormat + "' not supported!");
}
if (verbose) {
System.err.println("");
Timer.showStdErr("Done.");
}
if (debug) {
// Show data
System.err.println("scores:\n\tchr\tstart\tend\tp_value");
for (int i = 0; i < chrPosScoreList.size(); i++)
System.err.println("\t" + chrPosScoreList.getChromosomeName(i) + "\t" + chrPosScoreList.getStart(i) + "\t" + chrPosScoreList.getEnd(i) + "\t" + chrPosScoreList.getScore(i));
}
}
/**
* Read input in BED format
*
* Format: "chr \t start \t end \t id \t Score \n"
* start : zero-based
* end : zero-based open
*
*/
ChrPosScoreList readInputBed() {
ChrPosScoreList cppList = new ChrPosScoreList();
int num = 1;
BedFileIterator bfi = new BedFileIterator(inputFile);
for (Variant sc : bfi) {
cppList.add(sc.getChromosome(), sc.getStart(), sc.getEnd(), ((VariantWithScore) sc).getScore());
if (verbose) Gpr.showMark(num++, READ_INPUT_SHOW_EVERY);
}
return cppList;
}
/**
* Read input in TXT format
*
* Format: "chr \t pos \t score \n"
*
* Note: BED format + score (0-based open close interval)
*/
ChrPosScoreList readInputTxt() {
ChrPosScoreList cppList = new ChrPosScoreList();
Genome genome = config.getGenome();
int num = 1;
LineFileIterator lfi = new LineFileIterator(inputFile);
for (String line : lfi) {
if (line.startsWith("#")) continue; // Ignore lines that start with '#'
String fields[] = line.split("\t");
// Sanity check
if (fields.length < 3) {
System.err.println("Warning: Ignoring line number " + lfi.getLineNum() + "." //
+ " Exepcting format 'chr \t pos \t score \n'.\n" //
+ "\tLine:\t'" + line + "'" //
);
continue;
}
// Parse fields
String chr = fields[0];
int start = Gpr.parseIntSafe(fields[1]); // Input format is 0-based
double score = Gpr.parseDoubleSafe(fields[2]);
// Add data to list
Chromosome chromo = genome.getOrCreateChromosome(chr);
cppList.add(chromo, start, start, score);
if (verbose) Gpr.showMark(num++, READ_INPUT_SHOW_EVERY);
}
return cppList;
}
/**
* Read input in VCF format
*/
ChrPosScoreList readInputVcf() {
ChrPosScoreList cppList = new ChrPosScoreList();
int num = 1, warns = 0;
VcfFileIterator vcf = new VcfFileIterator(inputFile);
vcf.setDebug(debug);
for (VcfEntry ve : vcf) {
double score = ve.getInfoFloat(infoName);
if (Double.isNaN(score)) {
// Error
if (warns <= MAX_WARNS) {
System.err.println("Warning: Cannot find INFO field '" + infoName + "'. Ignoring VCF entry " + vcf.getLineNum() + "\t" + ve);
if (warns == MAX_WARNS) System.err.println("Too many warnings! No more warnings shown.");
}
warns++;
} else {
// Add to list
cppList.add(ve.getChromosome(), ve.getStart(), ve.getEnd(), score);
}
if (verbose) Gpr.showMark(num++, READ_INPUT_SHOW_EVERY);
}
return cppList;
}
/**
* Run command
*/
@Override
public boolean run() {
// Normal usage: Just run analysis
if (commandsFile == null) return runAnalisis();
// Run several commands (no need to reload genomic database each time)
return runCommands();
}
/**
* Run command
*/
protected boolean runAnalisis() {
initialize();
if (geneScoreFile.isEmpty() && geneInterestingFile.isEmpty()) {
// Perform 'normal' procedure
readInput(); // Read input files (scores)
mapToGenes(); // Map to gene
scoreGenes(); // Get one score (Score) per gene
correctScores(); // Correct gene scores
} else if (!geneScoreFile.isEmpty()) {
// Scores already mapped to genes, provided in a file
readGeneScores(geneScoreFile);
correctScores(); // Correct gene scores
} else if (!geneInterestingFile.isEmpty()) {
// Interesting genes from file (not calculated)
readGeneInteresting(geneInterestingFile);
}
enrichmentAnalysis(); // Perform enrichment analysis
if (randIterations > 0) runAnalisisRand(); // Perform random iterations
if (verbose) Timer.showStdErr("Done.");
return true;
}
/**
* Run enrichment analysis using random scores
*/
protected boolean runAnalisisRand() {
HashMap geneScoreOri = geneScore; // Save original scores
for (int iter = 1; iter <= randIterations; iter++) {
Timer.showStdErr("Random scores. Iteration " + iter);
// Create random Scores based on input
geneScore = new HashMap();
for (String gene : geneScoreOri.keySet())
geneScore.put(gene, Math.random());
// Perform enrichment analysis
enrichmentAnalysis();
}
geneScore = geneScoreOri; // Restore original values
if (verbose) Timer.showStdErr("Done.");
return true;
}
/**
* Read "command" lines from file.
* Loads database only once to save time
* @return
*/
protected boolean runCommands() {
boolean ok = true;
// Read config file, load & build database
config();
// Parse commands from file
for (String commnadLine : Gpr.readFile(commandsFile).split("\n")) {
if (verbose) {
Timer.showStdErr("COMMAND: " + commnadLine);
System.out.println("COMMAND: " + commnadLine);
}
// Parse command line (tab-separated)
String args[] = commnadLine.split("\t");
// Create new 'SnpEffCmdGsa' and set database
SnpEffCmdGsa snpEffCmdGsa = new SnpEffCmdGsa();
// Set config (and database)
snpEffCmdGsa.setConfig(config);
// Run
StringBuilder err = new StringBuilder();
ok &= run(snpEffCmdGsa, args, err);
}
if (verbose) Timer.showStdErr("Done!");
return ok;
}
/**
* Get one score (Score) per gene
*/
void scoreGenes() {
if (verbose) Timer.showStdErr("Aggregating scores by gene (scoring genes)");
// Create one Score per gene
double scoreMin = Double.MAX_VALUE, scoreMax = Double.MIN_VALUE;
geneScore = new HashMap();
for (String geneId : geneScores.keySet()) {
// Calculate aggregated score
ScoreList gpl = geneScores.get(geneId);
double score = gpl.score(scoreSummary);
scoreMax = Math.max(score, scoreMax);
scoreMin = Math.min(score, scoreMin);
// Add to map
geneScore.put(geneId, score);
}
if (verbose) Timer.showStdErr("Done. Score range: [ " + scoreMin + " , " + scoreMax + " ]");
}
/**
* Show usage and exit
*/
@Override
public void usage(String message) {
if (message != null) System.err.println("Error: " + message + "\n");
System.err.println("snpEff version " + SnpEff.VERSION);
System.err.println("Usage: snpEff gsa [options] genome_version geneSets.gmt [input_file]");
System.err.println("\n\tInput data options:");
System.err.println("\t-commands : Read commands from file (allows multiple analysis loading the database only once).");
System.err.println("\t-geneId : Use geneID instead of gene names. Default: " + useGeneId);
System.err.println("\t-i : Input format {vcf, bed, txt}. Default: " + inputFormat);
System.err.println("\t-info : INFO tag used for scores (in VCF input format).");
System.err.println("\t-desc : Sort scores in descending order (high score are better then low scores). Default " + orderDescending);
System.err.println("\t-save : Save results to file.");
System.err.println("\t-score : Treat input data as scores instead of p-values.");
System.err.println("\n\tAlgorithm options:");
System.err.println("\t-algo : Gene set enrichment algorithm {FISHER_GREEDY, RANKSUM_GREEDY, FISHER, RANKSUM, LEADING_EDGE_FRACTION, NONE}. Default: " + enrichmentAlgorithmType);
System.err.println("\t-correction : Correction of scores using command 'cmd' (e.g. an R script).");
System.err.println("\t-geneScore : Method to summarize gene scores {MIN, MAX, AVG, AVG_MIN_10, AVG_MAX_10, FISHER_CHI_SQUARE, Z_SCORES, SIMES}. Default: " + scoreSummary);
System.err.println("\t-geneScoreFile : Read gene score from file instead of calculating them. Format: 'geneId \\t score'");
System.err.println("\t-mapClosestGene : Map to closest gene. Default: " + useClosestGene);
System.err.println("\t-maxPvalue : Maximum un-adjusted p-value to show result. Default: None");
System.err.println("\t-maxPvalueAdj : Maximum adjusted p-value to show result. Default: " + maxPvalueAdjusted);
System.err.println("\t-saveGeneScoreFile : Save gene scores to file.");
System.err.println("\t-rand : Perform 'num' iterations using random scores. Default: " + randIterations);
System.err.println("\n\tAlgorithm specific options: FISHER and FISHER_GREEDY");
System.err.println("\t-interesting : Consider a gene 'interesting' if the score is in the 'num' percentile. Default: " + interestingPerc);
System.err.println("\t-geneInterestingFile : Use 'interesting' genes from file instead of calculating them.");
System.err.println("\n\tGene Set options:");
System.err.println("\t-minSetSize : Minimum number of genes in a gene set. Default: " + minGeneSetSize);
System.err.println("\t-maxSetSize : Maximum number of genes in a gene set. Default: " + maxGeneSetSize);
System.err.println("\t-initSetSize : Initial number of genes in a gene set (size range algorithm). Default: " + initGeneSetSize);
System.exit(-1);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy