All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.interval.Genome Maven / Gradle / Ivy

The newest version!
package org.snpeff.interval;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;

import org.snpeff.binseq.GenomicSequences;
import org.snpeff.fileIterator.FastaFileIterator;
import org.snpeff.serializer.MarkerSerializer;
import org.snpeff.snpEffect.EffectType;
import org.snpeff.util.Gpr;

/**
 *
 * This is just used for the Interval class.
 * It is NOT a representation of an entire genome.
 *
 * @author pcingola
 */
public class Genome extends Marker implements Serializable, Iterable {

	private static final long serialVersionUID = -330362012383572257L;

	private static int genomeIdCounter = 0;

	int genomeId;
	long length = -1;
	String species;
	String version;
	String fastaDir;
	List chromosomeNamesSorted = null;
	String chromoFastaFiles[];
	HashMap chromosomes;
	Genes genes; // All genes, transcripts, exons, UTRs, CDS, etc.
	Boolean codingInfo = null; // Do we have coding info from genes?
	Boolean transcriptSupportLevelInfo = null; // Do we have 'TranscriptSupportLevel' info in transcripts?
	GenomicSequences genomicSequences; // Store all genomic sequences (of interest) here
	CytoBands cytoBands;

	/**
	 * Create a genome from a faidx file.
	 * See "samtools faidx" command (reference http://samtools.sourceforge.net/samtools.shtml)
	 *
	 * @param genomeName : Genome's name (version)
	 * @param faidxFile : FAI file used to create all chromosomes
	 * @return
	 */
	public static Genome createFromFaidx(String genomeName, String faidxFile) {
		Genome genome = new Genome(genomeName);

		// Read the whole file
		String lines[] = Gpr.readFile(faidxFile).split("\n");
		for (String line : lines) {
			String vals[] = line.split("\t");
			String chrName = vals[0];
			int len = Gpr.parseIntSafe(vals[1]);

			// Create chromo
			Chromosome chromosome = new Chromosome(genome, 0, len, chrName);
			genome.add(chromosome);
		}
		return genome;
	}

	public Genome() {
		super();
		id = version = "";
		type = EffectType.GENOME;
		// chromosomeNames = new ArrayList();
		chromosomes = new HashMap<>();
		genes = new Genes(this);
		genomicSequences = new GenomicSequences(this);
		genomicSequences.build();
		setGenomeId();
	}

	public Genome(String version) {
		super(null, 0, Integer.MAX_VALUE, false, version);
		this.version = version;
		type = EffectType.GENOME;
		chromosomes = new HashMap<>();
		genes = new Genes(this);
		genomicSequences = new GenomicSequences(this);
		genomicSequences.build();
		setGenomeId();
	}

	public Genome(String version, Properties properties) {
		super(null, 0, Integer.MAX_VALUE, false, version);
		this.version = version;
		type = EffectType.GENOME;
		genes = new Genes(this);
		genomicSequences = new GenomicSequences(this);
		genomicSequences.build();

		species = properties.getProperty(version + ".genome");
		if (species == null) throw new RuntimeException("Property: '" + version + ".genome' not found");
		species = species.trim();

		//		chromosomeNames = new ArrayList();
		String[] chromosomeNames = propertyToStringArray(properties, version + ".chromosomes");

		// Fasta file & dir (optional)
		if (properties.getProperty(version + ".fasta_dir") != null) fastaDir = properties.getProperty(version + ".fasta_dir").trim();
		else fastaDir = "";
		if (properties.getProperty(version + ".chromo_fasta_files") != null) chromoFastaFiles = propertyToStringArray(properties, version + ".chromo_fasta_files");
		else chromoFastaFiles = new String[0];

		chromosomes = new HashMap<>();
		for (String chName : chromosomeNames)
			add(new Chromosome(this, 0, 0, chName));

		setGenomeId();
	}

	/**
	 * Add a chromosome
	 */
	public synchronized void add(Chromosome chromo) {
		//		chromosomeNames.add(chromo.getId());
		chromosomes.put(chromo.getId(), chromo);
		chromo.setParent(this);
	}

	/**
	 * Get a sorted list of chromosomes
	 */
	public List chromosomeNamesSorted() {
		if (chromosomeNamesSorted != null) return chromosomeNamesSorted; // Already done? => return previous result

		// Sort chromosomes by name
		ArrayList chromosArr = new ArrayList<>(chromosomes.size());
		chromosArr.addAll(chromosomes.values());
		Collections.sort(chromosArr);

		// Create a list and add all names to list
		chromosomeNamesSorted = new ArrayList<>();
		for (int i = 0; i < chromosArr.size(); i++)
			chromosomeNamesSorted.add(chromosArr.get(i).getId());

		return chromosomeNamesSorted;
	}

	/**
	 * Create a chromosome named 'chromoName'
	 */
	synchronized Chromosome createChromosome(String chromoName) {
		Chromosome chr = getChromosome(chromoName);
		if (chr != null) return chr; // Already created => Nothing done (some race condition might get you here)

		String ch = Chromosome.simpleName(chromoName);
		chr = new Chromosome(this, 0, 0, ch);
		add(chr);
		return chr;
	}

	public String[] getChromoFastaFiles() {
		return chromoFastaFiles;
	}

	/**
	 * Find chromosome 'chromoName'
	 */
	public Chromosome getChromosome(String chromoName) {
		String ch = Chromosome.simpleName(chromoName);
		return chromosomes.get(ch);
	}

	public int getChromosomeCount() {
		return chromosomes.size();
	}

	public Collection getChromosomes() {
		return chromosomes.values();
	}

	/**
	 * Return chromosomes sorted by size (largest chromosomes first)
	 */
	public List getChromosomesSortedSize() {
		ArrayList chrs = new ArrayList<>();
		chrs.addAll(chromosomes.values());

		// Sort by size (and then by name)
		Collections.sort(chrs, new Comparator() {

			@Override
			public int compare(Chromosome chr1, Chromosome chr2) {
				int cmp = chr2.size() - chr1.size();
				if (cmp != 0) return cmp;
				return chr1.getId().compareTo(chr2.getId());
			}
		});

		return chrs;
	}

	public synchronized CytoBands getCytoBands() {
		if (cytoBands == null) cytoBands = new CytoBands(this);
		return cytoBands;
	}

	public String getFastaDir() {
		return fastaDir;
	}

	public Genes getGenes() {
		return genes;
	}

	/**
	 * Create a sorted list of genes (sorted by gene Id)
	 */
	public List getGenesSorted() {
		ArrayList genesSorted = new ArrayList<>();
		genesSorted.addAll(genes.values());
		Collections.sort(genesSorted, new Comparator() {

			@Override
			public int compare(Gene gene1, Gene gene2) {
				return gene1.getId().compareTo(gene2.getId());
			}
		});

		return genesSorted;
	}

	/**
	 * Create a sorted list of genes (sorted by genomic position)
	 */
	public List getGenesSortedPos() {
		ArrayList genesSorted = new ArrayList<>();
		genesSorted.addAll(genes.values());
		Collections.sort(genesSorted);
		return genesSorted;
	}

	public String getGenomeId() {
		return id + "[" + genomeId + "]";
	}

	public GenomicSequences getGenomicSequences() {
		return genomicSequences;
	}

	/**
	 * Get or create a chromosome
	 */
	public Chromosome getOrCreateChromosome(String chromoName) {
		Chromosome chr = getChromosome(chromoName);
		if (chr == null) chr = createChromosome(chromoName);
		return chr;
	}

	public String getSpecies() {
		return species;
	}

	public String getVersion() {
		return version;
	}

	/**
	 * Is this chromosome in this genome?
	 */
	public boolean hasChromosome(String chromo) {
		return chromosomes.containsKey(chromo);
	}

	public boolean hasCodingInfo() {
		// Is this already calculated?
		if (codingInfo == null) {
			int countCoding = 0;

			for (Gene gene : genes)
				if (gene.isProteinCoding()) countCoding++;

			codingInfo = (countCoding != 0);
		}

		return codingInfo;
	}

	/**
	 * Do we have coding info from genes?
	 */
	public boolean hasTranscriptSupportLevelInfo() {
		// Is this already calculated?
		if (transcriptSupportLevelInfo == null) {
			int countTsl = 0;

			for (Gene gene : genes)
				for (Transcript tr : gene)
					if (tr.hasTranscriptSupportLevelInfo()) countTsl++;

			transcriptSupportLevelInfo = (countTsl != 0);
		}

		return codingInfo;
	}

	/**
	 * Do most exons have sequence?
	 * This is an indicator that something went really bad building the database.
	 *
	 * @return Check if most exons have sequence assigned.
	 */
	public boolean isMostExonsHaveSequence() {
		int exonSeq = 0, exonNoSeq = 0;

		// For each gene, transcript and exon, count the ones having sequences
		for (Gene g : getGenes())
			for (Transcript tr : g)
				for (Exon e : tr)
					if (e.getSequence().isEmpty()) exonNoSeq++;
					else exonSeq++;

		return exonNoSeq < exonSeq;
	}

	@Override
	public Iterator iterator() {
		return chromosomes.values().iterator();
	}

	/**
	 * Total genome length: add all chromosomes
	 */
	public long length() {
		if (length <= 0) {
			length = 0;
			for (Chromosome chr : chromosomes.values())
				length += chr.getEnd() - chr.getStart() + 1;
		}

		return length;
	}

	/**
	 * Parse a comma separated property as a string array
	 */
	String[] propertyToStringArray(Properties properties, String attr) {
		String value = properties.getProperty(attr);
		if (value == null) return new String[0];

		String values[] = value.split("[\\s+,]");
		LinkedList list = new LinkedList<>();
		for (String val : values)
			if (val.length() > 0) list.add(val);

		return list.toArray(new String[0]);
	}

	/**
	 * Read the whole genome sequence into memory
	 * @param fastaFile : Path to a Fasta file
	 * @return true if it was successful
	 */
	public boolean readGenomeSequence(String fastaFile) {
		// Read fasta sequence
		FastaFileIterator ffi = new FastaFileIterator(fastaFile);
		for (String seq : ffi) {
			String chrName = ffi.getName();
			Chromosome chromosome = getChromosome(chrName);
			if (chromosome != null) {
				chromosome.setSequence(seq);
			} else {
				// Chromosome not found, create a new one
				chromosome = new Chromosome(this, 0, seq.length(), chrName);
				chromosome.setSequence(seq);
				add(chromosome);
			}
		}

		return true;
	}

	/**
	 * Remove a chromosome
	 * WARINIG: Doesn't check any dependencies!
	 */
	public void remove(Chromosome chromo) {
		chromosomes.remove(chromo.getId());
	}

	/**
	 * Save genome to file
	 */
	public void save(String fileName) {
		// Create a list of 'markers' to save
		Markers markers = new Markers();
		markers.add(this);

		for (Chromosome chr : this)
			markers.add(chr);

		for (Gene g : this.getGenes())
			markers.add(g);

		// Save markers to file
		markers.save(fileName);
	}

	/**
	 * Parse a line from a serialized file
	 */
	@Override
	public void serializeParse(MarkerSerializer markerSerializer) {
		super.serializeParse(markerSerializer);
		version = markerSerializer.getNextField();
		species = markerSerializer.getNextField();

		for (Marker m : markerSerializer.getNextFieldMarkers())
			add((Chromosome) m);
	}

	/**
	 * Create a string to serialize to a file
	 */
	@SuppressWarnings({ "unchecked", "rawtypes" })
	@Override
	public String serializeSave(MarkerSerializer markerSerializer) {
		return super.serializeSave(markerSerializer) //
				+ "\t" + version //
				+ "\t" + species //
				+ "\t" + markerSerializer.save((Iterable) chromosomes.values()) //
		;
	}

	private void setGenomeId() {
		genomeId = genomeIdCounter++;
	}

	/**
	 * Show number of genes, transcripts & exons
	 */
	@Override
	public String toString() {
		return toString(null);
	}

	/**
	 * Show number of genes, transcripts & exons
	 * Arr all errors to buffer
	 */
	public String toString(StringBuilder errors) {
		StringBuilder sb = new StringBuilder();

		// Initialize counters
		int exonSeq = 0, exonNoSeq = 0;
		int countGenes = 0, countGenesProteinCoding = 0;
		int countTranscripts = 0, countTranscriptsProteinCoding = 0;
		int countTsl = 0;
		int countExons = 0, countCds = 0;
		int countCheckAa = 0, countCheckDna = 0;
		int countUtrs = 0;
		int errorProteinLength = 0;
		int errorProteinStopCodons = 0;
		int warningStopCodon = 0;
		int errorStartCodon = 0;
		int errorTr = 0;
		Genes genes = getGenes();

		// For each gene
		for (Gene g : genes) {
			countGenes++;
			if (g.isProteinCoding()) countGenesProteinCoding++;

			for (Transcript tr : g) {
				if (tr.isProteinCoding()) countTranscriptsProteinCoding++;
				if (tr.isAaCheck()) countCheckAa++;
				if (tr.isDnaCheck()) countCheckDna++;
				if (tr.hasTranscriptSupportLevelInfo()) countTsl++;

				int numCds = tr.getCds().size();
				int numExons = tr.subIntervals().size();

				countTranscripts++;
				countExons += numExons;
				countCds += numCds;

				for (Exon e : tr) {
					if (e.getSequence().isEmpty()) exonNoSeq++;
					else exonSeq++;
				}

				//---
				// Transcript sanity check: Check if there are any common errors in protein coding transcript
				//---
				if (tr.isProteinCoding()) {
					boolean hasError = false, hasWarning = false;

					if (!tr.getUtrs().isEmpty()) countUtrs++;

					if (tr.isErrorProteinLength()) {
						hasError = true;
						errorProteinLength++; // Protein length error
						if (errors != null) errors.append("ERROR: Protein coding transcript '" + tr.getId() + "' has length " + tr.cds().length() + " (not mutiple of 3).\n");
					} else if (tr.isWarningStopCodon()) {
						// This is considered a warning, not an error (sometimes
						// the annotations exclude STOP codon on pourpose, although GTF
						// say they should not)
						// Note: If there are length errors, we should not check
						//       this, since it will pop up almost surely.
						warningStopCodon++; // Protein does not end with STOP codon
						hasWarning = true;
						if (errors != null) errors.append("WARNING: Protein coding transcript '" + tr.getId() + "' last codon is not a STOP codon\n");
					}

					if (tr.isErrorStopCodonsInCds()) {
						hasError = true;
						errorProteinStopCodons++; // Protein has STOP codons in CDS
						if (errors != null) errors.append("ERROR: Protein coding transcript '" + tr.getId() + "' has STOP codons in non-last position\n");
					}

					if (tr.isErrorStartCodon()) {
						hasError = true;
						errorStartCodon++; // Protein does not start with START codon
						if (errors != null) errors.append("ERROR: Protein coding transcript '" + tr.getId() + "' first codon is not a START codon\n");
					}

					if (hasError) errorTr++;

					if (errors != null && (hasError || hasWarning)) {
						errors.append(tr + "\n");
					}
				}
			}
		}

		// Show summary
		double avgTrPerGene = countTranscripts / ((double) countGenes);
		double avgExonPerTr = countExons / ((double) countTranscripts);

		// Genome & Genes
		sb.append("#-----------------------------------------------\n");
		sb.append("# Genome name                : '" + species + "'" + "\n");
		sb.append("# Genome version             : '" + version + "'\n");
		sb.append("# Genome ID                  : '" + getGenomeId() + "'" + "\n");
		sb.append("# Has protein coding info    : " + hasCodingInfo() + "\n");
		sb.append("# Has Tr. Support Level info : " + hasTranscriptSupportLevelInfo() + "\n");
		sb.append("# Genes                      : " + countGenes + "\n");
		sb.append("# Protein coding genes       : " + countGenesProteinCoding + "\n");

		// Transcripts
		sb.append("#-----------------------------------------------\n");
		sb.append("# Transcripts                : " + countTranscripts + "\n");
		sb.append(String.format("# Avg. transcripts per gene  : %.2f", avgTrPerGene) + "\n");
		if (hasTranscriptSupportLevelInfo()) {
			sb.append("# TSL transcripts            : " + countTsl + "\n");
		}

		// Checked transcripts
		sb.append("#-----------------------------------------------\n");
		sb.append("# Checked transcripts        : \n");
		if (countTranscriptsProteinCoding > 0) sb.append(String.format("#               AA sequences : %6d ( %.2f%% )\n", countCheckAa, (100.0 * countCheckAa / countTranscriptsProteinCoding)));
		if (countTranscripts > 0) sb.append(String.format("#              DNA sequences : %6d ( %.2f%% )\n", countCheckDna, (100.0 * countCheckDna / countTranscripts)));

		// Coding transcripts
		sb.append("#-----------------------------------------------\n");
		sb.append("# Protein coding transcripts : " + countTranscriptsProteinCoding + "\n");
		if (countTranscriptsProteinCoding > 0) {
			sb.append(String.format("#              Length errors : %6d ( %.2f%% )\n", errorProteinLength, (100.0 * errorProteinLength / countTranscriptsProteinCoding)));
			sb.append(String.format("#  STOP codons in CDS errors : %6d ( %.2f%% )\n", errorProteinStopCodons, (100.0 * errorProteinStopCodons / countTranscriptsProteinCoding)));
			sb.append(String.format("#         START codon errors : %6d ( %.2f%% )\n", errorStartCodon, (100.0 * errorStartCodon / countTranscriptsProteinCoding)));
			sb.append(String.format("#        STOP codon warnings : %6d ( %.2f%% )\n", warningStopCodon, (100.0 * warningStopCodon / countTranscriptsProteinCoding)));
			sb.append(String.format("#              UTR sequences : %6d ( %.2f%% )\n", countUtrs, (100.0 * countUtrs / countTranscripts)));
			sb.append(String.format("#               Total Errors : %6d ( %.2f%% )\n", errorTr, (100.0 * errorTr / countTranscriptsProteinCoding)));
			if (countUtrs <= 0) sb.append("# WARNING                    : No protein coding transcript has UTR\n");
		}

		// Exons & CDS
		sb.append("#-----------------------------------------------\n");
		sb.append("# Cds                        : " + countCds + "\n");
		sb.append("# Exons                      : " + countExons + "\n");
		sb.append("# Exons with sequence        : " + exonSeq + "\n");
		sb.append("# Exons without sequence     : " + exonNoSeq + "\n");
		sb.append(String.format("# Avg. exons per transcript  : %.2f", avgExonPerTr) + "\n");

		// MT check: Only check if number of chromosomes in the genome is more than one
		// Check that MT chromosome has a proper codon table
		ArrayList mtChrs = new ArrayList<>();
		for (Chromosome chr : this)
			if (chr.isMt()) mtChrs.add(chr);

		// It there a mi
		if ((getChromosomes().size() > 1) && (mtChrs.size() <= 0)) {
			sb.append("# WARNING                    : No mitochondrion chromosome found\n");
		} else {
			// Check if mitochondrion chromosome has a mitochondrion codon table
			for (Chromosome chr : mtChrs) {
				String mtCodonTable = chr.codonTable().getName();
				if (mtCodonTable.toUpperCase().indexOf("MITO") < 0) sb.append("# WARNING!                   : Mitochondrion chromosome '" + chr.getId() + "' does not have a mitochondrion codon table (codon table = '" + mtCodonTable + "'). You should update the config file.\n");
			}
		}

		// Chromosomes
		sb.append("#-----------------------------------------------\n");
		sb.append("# Number of chromosomes      : " + getChromosomes().size() + "\n");
		sb.append("# Chromosomes                : Format 'chromo_name size codon_table'\n");
		for (Chromosome chr : getChromosomesSortedSize())
			sb.append("#\t\t'" + chr.getId() + "'\t" + chr.size() + "\t" + chr.getCodonTable().getName() + "\n");

		if (countTranscriptsProteinCoding <= 0) sb.append("\n# WARNING! : No protein coding transcripts found.\n");

		// Done
		sb.append("#-----------------------------------------------\n");

		return sb.toString();
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy