All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.stats.ReadsOnMarkersModel Maven / Gradle / Ivy

The newest version!
package org.snpeff.stats;

import java.util.HashSet;

import org.snpeff.coverage.MarkerTypes;
import org.snpeff.interval.Chromosome;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Marker;
import org.snpeff.interval.Markers;
import org.snpeff.probablility.RandMarker;
import org.snpeff.snpEffect.SnpEffectPredictor;
import org.snpeff.util.Gpr;

/**
 * Calculate the maximum interval length by type, for all markers in a genome
 * Create a probability model based on binomial ditribution.
 * 
 * @author pcingola
 */
public class ReadsOnMarkersModel {

	boolean verbose;
	int readLength, numIterations, numReads;
	CountByType countBases; // Number of bases covered by each marker type
	CountByType countMarkers; // Number of markers (for each marker type)
	CountByType rawCountMarkers; // Number of markers (before join or overlap)
	CountByType rawCountBases; // Number of bases covered by each marker type (befoew join or overlap)
	CountByType prob; // Binomial probability (for each marker type)
	SnpEffectPredictor snpEffectPredictor;
	MarkerTypes markerTypes;

	public ReadsOnMarkersModel(SnpEffectPredictor snpEffectPredictor) {
		super();
		this.snpEffectPredictor = snpEffectPredictor;
		countBases = new CountByType();
		countMarkers = new CountByType();
		rawCountMarkers = new CountByType();
		rawCountBases = new CountByType();
		prob = null;
		markerTypes = new MarkerTypes();
	}

	/**
	 * Count bases covered for each marker type
	 */
	public void countBases() {
		//---
		// Add all markers
		//---
		Markers markers = new Markers();
		markers.add(snpEffectPredictor.getMarkers());
		for (Gene gene : snpEffectPredictor.getGenome().getGenes()) {
			markers.add(gene);
			markers.add(gene.markers());
		}

		for (Chromosome chr : snpEffectPredictor.getGenome())
			markers.add(chr);

		//---
		// Calculate raw counts
		//---
		for (Marker m : markers) {
			String mtype = markerTypes.getType(m);
			String msubtype = markerTypes.getSubType(m);

			rawCountMarkers.inc(mtype);
			rawCountBases.inc(mtype, m.size());

			// Count sub-types (if any)
			if (msubtype != null) {
				rawCountMarkers.inc(msubtype);
				rawCountBases.inc(msubtype, m.size());
			}
		}

		//---
		// Count number of bases for each marker type (overlap and join)
		//---
		for (String mtype : rawCountMarkers.keysSorted()) {
			if (mtype.equals(Chromosome.class.getSimpleName())) continue; // We calculate chromosomes later (it's faster) 

			if (verbose) System.err.print(mtype + ":");

			if (countMarkers.get(mtype) == 0) {
				for (Chromosome chr : snpEffectPredictor.getGenome())
					countBases(mtype, chr, markers);
			}

			if (verbose) System.err.println("");
		}

		// Show chromosomes length
		String mtype = Chromosome.class.getSimpleName();
		for (Chromosome chr : snpEffectPredictor.getGenome()) {
			countBases.inc(mtype, chr.size());
			countMarkers.inc(mtype);
		}
	}

	/**
	 * Count number of bases, for a given chromosome and marker type
	 * @param mtype
	 * @param chr
	 * @param markers
	 * @return
	 */
	void countBases(String mtype, Chromosome chr, Markers markers) {
		String chrName = chr.getChromosomeName();
		if (verbose) System.err.print(" " + chrName);

		// Initialize
		byte busy[] = new byte[chr.size()];
		for (int i = 0; i < busy.length; i++)
			busy[i] = 0;

		for (Marker m : markers) {
			// Same marker type & same chromo? Count bases
			if (m.getChromosomeName().equals(chrName) && markerTypes.isType(m, mtype)) {
				for (int i = m.getStart(); i <= m.getEnd(); i++)
					busy[i] = 1;
			}
		}

		int latest = 0;
		for (int i = 0; i < busy.length; i++) {
			// Transition? Count another marker
			if ((i > 0) && (busy[i] != 0) && (busy[i - 1] == 0)) {
				if ((i - latest) <= readLength) countBases.inc(mtype, i - latest); // Intervals are less than one read away? Unify them
				else countMarkers.inc(mtype);
			}

			// Base busy? Count another base
			if (busy[i] != 0) {
				countBases.inc(mtype);
				latest = i;
			}
		}
	}

	public CountByType getCountBases() {
		return countBases;
	}

	public CountByType getCountMarkers() {
		return countMarkers;
	}

	public CountByType getProb() {
		return prob;
	}

	public CountByType getRawCountBases() {
		return rawCountBases;
	}

	public CountByType getRawCountMarkers() {
		return rawCountMarkers;
	}

	/**
	 * Load data from a file
	 * @param fileName
	 */
	public void load(String fileName) {

		boolean header = true;
		for (String line : Gpr.readFile(fileName).split("\n")) {
			if (header) {
				header = false;
				continue;
			}

			// Split line and parse data 
			String recs[] = line.split("\t");

			String mtype = recs[0];
			countBases.inc(mtype, Gpr.parseIntSafe(recs[1]));
			countMarkers.inc(mtype, Gpr.parseIntSafe(recs[2]));
		}
	}

	/**
	 * Calculate probabilities
	 */
	void probabilities() {
		// Already done, nothing to do
		if (prob != null) return;

		// Get total length and count for chromosomes (chromosome size is total genome length)
		String chrType = Chromosome.class.getSimpleName();
		long chrSize = countBases.get(chrType);
		long chrCount = countMarkers.get(chrType);
		if (chrCount <= 0) return; // Zero length genome? Forgot to count bases?

		// Correct readLength 
		int readLength = this.readLength;
		if (readLength < 1) readLength = 1;

		// Probabilities for each marker
		prob = new CountByType();
		for (String mtype : countMarkers.keysSorted()) {
			long size = countBases.get(mtype);
			long count = countMarkers.get(mtype);

			// Calculate and cap probability value
			double p = ((double) (size + (readLength - 1) * count)) / ((double) (chrSize - (readLength - 1) * chrCount));
			p = Math.min(1.0, p);
			p = Math.max(0.0, p);

			prob.setScore(mtype, p);
		}
	}

	/**
	 * Sample and calculate the probability of hitting each type 
	 * of marker (marker.class). Creates 'numReads' reads of 
	 * size 'readLen' and count how many of them hit each marker 
	 * type.
	 */
	CountByType randomSampling(int readLen, int numReads) {
		CountByType countReads = new CountByType();
		RandMarker randMarker = new RandMarker(snpEffectPredictor.getGenome());

		for (int i = 0; i < numReads; i++) {
			// Random read
			Marker read = randMarker.rand(readLen);

			// Where does it hit?
			Markers regions = snpEffectPredictor.queryDeep(read);
			HashSet doneRegion = new HashSet();
			for (Marker m : regions) {
				String mtype = markerTypes.getType(m);
				String msubtype = markerTypes.getSubType(m);

				if (!doneRegion.contains(mtype)) {
					countReads.inc(mtype); // Count reads
					doneRegion.add(mtype); // Do not count twice
				}

				if ((msubtype != null) && !doneRegion.contains(msubtype)) {
					countReads.inc(msubtype); // Count reads
					doneRegion.add(msubtype); // Do not count twice
				}

			}
		}

		return countReads;
	}

	/**
	 * Sample and calculate the probability of hitting each type 
	 * of marker (marker.class). Creates 'numReads' reads of 
	 * size 'readLen' and count how many of them hit each marker 
	 * type. Iterate 'iterations' times to obtain a distribution.
	 */
	public void randomSampling(int iterations, int readLen, int numReads) {
		System.out.print("Iteration");
		for (String type : rawCountMarkers.keysSorted())
			System.out.print("\t" + type);
		System.out.println("");

		for (int it = 0; it < iterations; it++) {
			CountByType count = randomSampling(readLen, numReads);
			System.out.print(it);
			for (String type : rawCountMarkers.keysSorted())
				System.out.print("\t" + count.get(type));
			System.out.println("");
		}
	}

	/**
	 * Run
	 * @return 
	 */
	public boolean run() {
		countBases(); // Count 
		probabilities(); // Calculate probabilities
		return true;
	}

	/**
	 * Save model to file
	 * @param fileName
	 */
	public void save(String fileName) {
		StringBuilder sb = new StringBuilder();
		sb.append("marker_type\tsize\tcount\tbinomial_p\n");

		probabilities();
		for (String mtype : markerTypes.markerTypesClass())
			sb.append(mtype + "\t" + countBases.get(mtype) + "\t" + countMarkers.get(mtype) + "\n");

		Gpr.toFile(fileName, sb.toString());
	}

	public void setMarkerTypes(MarkerTypes markerTypes) {
		this.markerTypes = markerTypes;
	}

	public void setNumIterations(int numIterations) {
		this.numIterations = numIterations;
	}

	public void setNumReads(int numReads) {
		this.numReads = numReads;
	}

	public void setReadLength(int readLength) {
		this.readLength = readLength;
	}

	public void setSnpEffectPredictor(SnpEffectPredictor snpEffectPredictor) {
		this.snpEffectPredictor = snpEffectPredictor;
	}

	public void setVerbose(boolean verbose) {
		this.verbose = verbose;
	}

	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		sb.append("marker_type\tsize\tcount\traw_size\traw_count\tbinomial_p\n");

		probabilities();
		for (String mtype : countMarkers.keysSorted())
			sb.append(mtype + "\t" + countBases.get(mtype) + "\t" + countMarkers.get(mtype) + "\t" + rawCountBases.get(mtype) + "\t" + rawCountMarkers.get(mtype) + "\t" + prob.getScore(mtype) + "\n");

		return sb.toString();
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy