All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.PromoterSequences Maven / Gradle / Ivy

The newest version!
package org.snpeff;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

import org.snpeff.fileIterator.FastaFileIterator;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Genome;
import org.snpeff.snpEffect.Config;
import org.snpeff.snpEffect.SnpEffectPredictor;
import org.snpeff.util.GprSeq;
import org.snpeff.util.Timer;

/**
 * Get promoter sequences from genes
 * 
 * @author pablocingolani
 */
public class PromoterSequences {

	public static String HOME = System.getProperty("user.home");

	public static int LEN_UPSTREAM = 3000;
	public static int LEN_AFTER_TSS = 200;

	SnpEffectPredictor snpEffectPredictor;
	Config config;
	Genome genome;
	HashSet geneIds;
	HashSet genes;
	String fastaFile;

	public static void main(String[] args) {

		//---
		//  Parse command line argument
		//---
		if (args.length < 2) {
			System.err.println("Usage: " + PromoterSequences.class.getSimpleName() + " genomeName fastaFile.fa geneId_1 geneId_2 ... geneId_N");
			System.exit(1);
		}

		String genomeName = args[0];
		String fastaFile = args[1];
		ArrayList geneIds = new ArrayList();
		for (int i = 2; i < args.length; i++)
			geneIds.add(args[i]);

		//---
		// Run
		//---
		PromoterSequences promoterSequences = new PromoterSequences(genomeName, fastaFile, geneIds);
		promoterSequences.run();
	}

	public PromoterSequences(String genomeName, String fastaFile, List geneIds) {
		this.fastaFile = fastaFile;

		this.geneIds = new HashSet();
		this.geneIds.addAll(geneIds);

		genes = new HashSet();

		Timer.showStdErr("Loading database");
		String configFile = HOME + "/snpEff/snpEff.config";
		config = new Config(genomeName, configFile);
		config.loadSnpEffectPredictor();
		snpEffectPredictor = config.getSnpEffectPredictor();
		genome = snpEffectPredictor.getGenome();

		//		Timer.showStdErr("Building forest");
		//		snpEffectPredictor.buildForest();
	}

	public void run() {
		Timer.showStdErr("Finding genes ");
		for (Gene gene : genome.getGenes()) {
			if (geneIds.contains(gene.getId())) {
				System.err.println("\t" + gene.getId());
				genes.add(gene);
				geneIds.remove(gene.getId());
			}
		}
		if (!geneIds.isEmpty()) Timer.showStdErr("Not found: " + geneIds);

		//---
		// Read fasta file
		//---
		Timer.showStdErr("Reading fasta file: " + fastaFile);
		HashSet done = new HashSet();
		FastaFileIterator ffi = new FastaFileIterator(fastaFile);
		for (String seq : ffi) {
			String chrName = ffi.getName();
			Timer.showStdErr("Read: " + chrName);

			for (Gene gene : genes) {
				if (gene.getChromosomeName().equals(chrName)) {
					// Extract sequence
					int start = 0, end = 0;

					if (gene.isStrandPlus()) {
						start = gene.getStart() - LEN_UPSTREAM;
						end = gene.getStart() + LEN_AFTER_TSS;
					} else {
						start = gene.getEnd() - LEN_AFTER_TSS;
						end = gene.getEnd() + LEN_UPSTREAM;
					}

					// Show subsequence
					String subSeq = seq.substring(start, end + 1);
					System.out.println(GprSeq.string2fasta(gene.getId(), subSeq));

					// Remove gene
					done.add(gene);
				}

				// No more genes to find?
				if (done.containsAll(genes)) break;
			}
		}
		ffi.close();

		Timer.showStdErr("Done");
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy