All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.snpEffect.commandLine.SnpEffCmdGenes2Bed Maven / Gradle / Ivy

The newest version!
package org.snpeff.snpEffect.commandLine;

import java.util.HashSet;

import org.snpeff.SnpEff;
import org.snpeff.interval.Cds;
import org.snpeff.interval.Exon;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Genome;
import org.snpeff.interval.Transcript;
import org.snpeff.util.Gpr;
import org.snpeff.util.Timer;

/**
 * Simple test program
 * @author pcingola
 */
public class SnpEffCmdGenes2Bed extends SnpEff {

	HashSet geneIds;
	String fileName = null;
	boolean onlyProteinCoding;
	boolean showCds;
	boolean showExons;
	boolean showTranscripts;
	int expandUpstreamDownstream = 0;

	public static void main(String[] args) {
		SnpEffCmdGenes2Bed conf2down = new SnpEffCmdGenes2Bed();
		conf2down.parseArgs(args);
		conf2down.run();
	}

	public SnpEffCmdGenes2Bed() {
		super();
		geneIds = new HashSet<>();
	}

	void loadGenes() {
		// Parse file
		if (fileName != null) {
			if (verbose) Timer.showStdErr("Loading genes list from file '" + fileName + "'");

			String lines[] = Gpr.readFile(fileName).split("\n");
			if (lines.length <= 0) throw new RuntimeException("Cannot read file '" + fileName + "'");

			for (String line : lines) {
				String id = line.trim();
				if (!id.isEmpty()) geneIds.add(id);
			}
		}
	}

	@Override
	public void parseArgs(String[] args) {
		if (args.length < 1) usage(null);

		// Parse command line arguments
		for (int i = 0; i < args.length; i++) {
			String arg = args[i];
			if (isOpt(arg)) {
				switch (arg.toLowerCase()) {
				case "-e":
					// Show exons for all transcripts
					showExons = true;
					break;

				case "-cds":
					// Show exons in CDS
					showCds = true;
					break;

				case "-f":
					// List in a file?
					if ((i + 1) < args.length) fileName = args[++i];
					else usage("Option '-f' without file argument");
					break;

				case "-pc":
					// Use only protein coding genes
					onlyProteinCoding = true;
					break;

				case "-tr":
					// Show transcripts
					showTranscripts = true;
					break;

				case "-ud":
					// Expand upstream & downstream
					if ((i + 1) < args.length) expandUpstreamDownstream = Gpr.parseIntSafe(args[++i]);
					else usage("Option '-ud' without file argument");
					break;

				default:
					usage("Unknown commnand line option '" + arg + "'");
				}
			} else if ((genomeVer == null) || genomeVer.isEmpty()) {
				// Genome version
				genomeVer = args[i];
			} else geneIds.add(args[i]);
		}

		int countMutex = 0;
		if (showCds) countMutex++;
		if (showExons) countMutex++;
		if (showTranscripts) countMutex++;
		if (countMutex > 1) usage("Options '-e', '-cds' and '-tr' are mutually exclusive");
	}

	@Override
	public boolean run() {
		loadGenes();
		if (verbose) {
			Timer.showStdErr("Number of gene IDs to look up: " + geneIds.size());
			if (geneIds.isEmpty()) Timer.showStdErr("Empty list of IDs. Using all genes.");
		}

		// Load config & database
		loadConfig();
		loadDb();

		// Show genes
		showGenes();

		return true;
	}

	/**
	 * Show either a gene or all exons for all transcripts within a gene
	 */
	void show(Gene g) {
		if (showCds) showCds(g);
		else if (showExons) showExons(g);
		else if (showTranscripts) showTr(g);
		else showGene(g);
	}

	/**
	 * Show CDS coordinates
	 */
	void showCds(Gene g) {
		for (Transcript tr : g) {
			for (Exon ex : tr) {
				Cds cds = tr.findCds(ex); // Find corresponding CDS

				if (cds != null) {
					int start = cds.getStart() - expandUpstreamDownstream;
					int end = cds.getEnd() + 1 + expandUpstreamDownstream;

					System.out.println(cds.getChromosomeName() //
							+ "\t" + start //
							+ "\t" + end //
							+ "\t" + g.getGeneName() //
							+ ";" + g.getId() //
							+ ";" + tr.getId() //
							+ ";" + ex.getRank() //
							+ ";" + (ex.isStrandPlus() ? "+" : "-") //
					);
				}
			}
		}
	}

	/**
	 * Show exons coordinates
	 */
	void showExons(Gene g) {
		// Show exon for each transcript
		for (Transcript tr : g) {
			for (Exon ex : tr) {
				int start = ex.getStart() - expandUpstreamDownstream;
				int end = ex.getEnd() + 1 + expandUpstreamDownstream;

				System.out.println(ex.getChromosomeName() //
						+ "\t" + start //
						+ "\t" + end //
						+ "\t" + g.getGeneName() //
						+ ";" + g.getId() //
						+ ";" + tr.getId() //
						+ ";" + ex.getRank() //
						+ ";" + (ex.isStrandPlus() ? "+" : "-") //
				);
			}
		}
	}

	/**
	 * Show a gene coordiantes
	 */
	void showGene(Gene g) {
		// Show gene
		int start = g.getStart() - expandUpstreamDownstream;
		int end = g.getEnd() + 1 + expandUpstreamDownstream;

		System.out.println(g.getChromosomeName() //
				+ "\t" + start //
				+ "\t" + end //
				+ "\t" + g.getGeneName() //
				+ ";" + g.getId() //
				+ ";" + (g.isStrandPlus() ? "+" : "-") //
		);
	}

	/**
	 * Show genes in BED format
	 */
	public void showGenes() {
		// Show title
		if (showCds) System.out.println("#chr\tstart\tend\tgeneName;geneId;transcriptId;exonRank;strand");
		else if (showExons) System.out.println("#chr\tstart\tend\tgeneName;geneId;transcriptId;exonRank;stramd");
		else if (showTranscripts) System.out.println("#chr\tstart\tend\tgeneName;geneId;transcriptId;strand");
		else System.out.println("#chr\tstart\tend\tgeneName;geneId;strand");

		// Find genes
		Genome genome = config.getGenome();
		if (verbose) Timer.showStdErr("Finding genes.");
		int found = 0, filtered = 0;
		for (Gene g : genome.getGenesSortedPos()) {
			// Is gene.id or gene.name in geneSet? => Show it
			if (geneIds.isEmpty() // Empty set means 'use all genes'
					|| geneIds.contains(g.getId()) // Is the geneId in the list?
					|| geneIds.contains(g.getGeneName()) // Id the geneName in the list?
			) {
				found++;

				// Show or filter?
				if (!onlyProteinCoding || g.isProteinCoding()) show(g);
				else filtered++;
			}
		}

		if (verbose) {
			Timer.showStdErr("Done\n\tFound      : " + found + " / " + geneIds.size() //
					+ (filtered > 0 ? "\n\tFiltered out : " + filtered + " / " + found : "") //
			);
		}
	}

	/**
	 * Show transcript coordinates
	 */
	void showTr(Gene g) {
		for (Transcript tr : g) {
			int start = tr.getStart() - expandUpstreamDownstream;
			int end = tr.getEnd() + 1 + expandUpstreamDownstream;

			System.out.println(tr.getChromosomeName() //
					+ "\t" + start //
					+ "\t" + end //
					+ "\t" + g.getGeneName() //
					+ ";" + g.getId() //
					+ ";" + tr.getId() //
					+ ";" + (tr.isStrandPlus() ? "+" : "-") //
			);
		}
	}

	@Override
	public void usage(String message) {
		if (message != null) System.err.println("Error: " + message + "\n");
		System.err.println("Usage: " + SnpEffCmdGenes2Bed.class.getSimpleName() + " genomeVer [-f genes.txt | geneList]}");
		System.err.println("Options: ");
		System.err.println("\t-cds           : Show coding exons (no UTRs).");
		System.err.println("\t-e             : Show exons.");
		System.err.println("\t-f   : A TXT file having one gene ID (or name) per line.");
		System.err.println("\t-pc            : Use only protein coding genes.");
		System.err.println("\t-tr            : Show transcript coordinates.");
		System.err.println("\t-ud       : Expand gene interval upstream and downstream by 'num' bases.");
		System.err.println("\tgeneList       : A list of gene IDs or names. One per command line argument: geneId_1 geneId_2 geneId_3 ... geneId_N");
		System.exit(-1);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy