All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.codons.FindRareAaIntervals Maven / Gradle / Ivy

The newest version!
package org.snpeff.codons;

import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;

import org.snpeff.fileIterator.FastaFileIterator;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Genome;
import org.snpeff.interval.RareAminoAcid;
import org.snpeff.interval.Transcript;
import org.snpeff.util.GprSeq;

/**
 * Find intervals where rare amino acids occur
 * 
 * @author pablocingolani
 */
public class FindRareAaIntervals {

	public static final double RARE_THRESHOLD = 1e-5;

	boolean verbose = false;
	double rareThreshold = RARE_THRESHOLD;
	Genome genome;
	CodonTable codonTable;
	int count[];
	int countTotal;
	boolean isInTable[];
	HashMap trById;
	HashMap rareAaByPos;

	public FindRareAaIntervals(Genome genome) {
		this.genome = genome;
		codonTable = genome.codonTable();
		count = new int['Z']; // It's all upper case
		isInTable = new boolean['Z']; // It's all upper case
		countTotal = 0;
		rareAaByPos = new HashMap();
	}

	/**
	 * Add a rare amino acid (if not already added)
	 * @param tr
	 * @param start
	 * @param end
	 */
	void addRareAa(Transcript tr, int start, int end) {
		int s = Math.min(start, end);
		int e = Math.max(start, end);

		String key = tr.getChromosomeName() + ":" + s + "-" + e;
		if( rareAaByPos.containsKey(key) ) return; // Nothing to add

		RareAminoAcid raa = new RareAminoAcid(tr, s, e, "");
		rareAaByPos.put(key, raa);
	}

	/**
	 * Find all sites having rare amino acids
	 * @param proteinFastaFile
	 */
	public Collection findRareAa(String proteinFastaFile) {
		// Find all amino acids in table
		isInTable();

		// Find rare amino acids from protein sequences
		proteingFileStats(proteinFastaFile);

		// Find rare amino acids
		String rare = findRareNames();
		if( rare.isEmpty() ) return new LinkedList(); // Nothing to do

		// Find all sites were rare amino acids are
		findRareAaSites(proteinFastaFile, rare);

		return rareAaByPos.values();
	}

	/**
	 * Find interval AA index in a transcript
	 * @param trId
	 * @param aaIdx
	 */
	void findRareAaSites(String trId, int aaIdx) {
		// Create index?
		if( trById == null ) {
			trById = new HashMap();
			for( Gene gene : genome.getGenome().getGenes() )
				for( Transcript tr : gene )
					trById.put(tr.getId(), tr);
		}

		// Find transcript
		Transcript tr = trById.get(trId);
		if( tr == null ) {
			if( verbose ) System.err.println("WARNING: Cannot find transcript '" + trId + "'");
			return;
		}

		// Create markers. There might be more than one interval since an intron can be in the middle of the codon.
		int cds2pos[] = tr.baseNumberCds2Pos();
		int pos = 0, posPrev = 0, start = -1;
		int step = tr.isStrandPlus() ? 1 : -1;
		for( int cds = aaIdx * 3; cds < (aaIdx + 1) * 3; cds++ ) {
			pos = cds2pos[cds];
			if( start < 0 ) start = pos;
			else if( pos != posPrev + step ) {
				// Non-contiguous: Create a new marker and add it to the list
				addRareAa(tr, start, pos);
				start = -1;
			}

			posPrev = pos;
		}

		if( start >= 0 ) addRareAa(tr, start, pos);
	}

	/**
	 * Calculate AA frequency
	 */
	void findRareAaSites(String proteinFastaFile, String rareAa) {
		char rareAaChr[] = rareAa.toCharArray();
		FastaFileIterator ffi = new FastaFileIterator(proteinFastaFile);

		// For every protein sequence
		for( String seq : ffi ) {
			// For every rare AA
			for( char aa : rareAaChr ) {
				int aaIdx = seq.indexOf(aa);
				if( aaIdx >= 0 ) { // Has this protein sequence any rare AA?
					String trId = ffi.getTranscriptId();
					findRareAaSites(trId, aaIdx);
				}
			}
		}
	}

	/**
	 * A string containing all rare amino acids
	 * @return
	 */
	String findRareNames() {
		StringBuilder rare = new StringBuilder();
		for( int i = 0; i < count.length; i++ ) {
			double p = ((double) count[i]) / countTotal;
			if( (count[i] > 0) && ((p < rareThreshold) || !isInTable[i]) ) rare.append((char) i);
		}
		return rare.toString();
	}

	public double getRareThreshold() {
		return rareThreshold;
	}

	/**
	 * Calculate all AA in a codon table
	 */
	void isInTable() {
		for( char c1 : GprSeq.BASES )
			for( char c2 : GprSeq.BASES )
				for( char c3 : GprSeq.BASES ) {
					String codon = "" + c1 + c2 + c3;
					String aa = codonTable.aa(codon);
					isInTable[aa.toUpperCase().charAt(0)] = true;
				}
	}

	/**
	 * Calculate AA frequency
	 * @param proteinFastaFile
	 */
	void proteingFileStats(String proteinFastaFile) {
		FastaFileIterator ffi = new FastaFileIterator(proteinFastaFile);
		for( String seq : ffi ) {
			// For each protein sequence, count amino acids
			for( char c : seq.toUpperCase().toCharArray() ) {
				// Ignore non-letters (e.g. '*' = stop codon)
				// Ignore unknown amino acids
				if( Character.isLetter(c) && (c != 'X') ) {
					count[c]++;
					countTotal++;
				}
			}
		}
	}

	public void setRareThreshold(double rareThreshold) {
		this.rareThreshold = rareThreshold;
	}

	public void setVerbose(boolean verbose) {
		this.verbose = verbose;
	}

	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		for( int i = 0; i < count.length; i++ ) {
			double p = ((double) count[i]) / countTotal;
			if( count[i] > 0 ) sb.append(String.format("\t%s\t%d\t%.2e\t%b\n", (char) i, count[i], p, isInTable[i]));
		}

		return sb.toString();
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy