All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.coverage.CountReadsOnMarkers Maven / Gradle / Ivy

The newest version!
package org.snpeff.coverage;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;

import org.snpeff.interval.Chromosome;
import org.snpeff.interval.Genome;
import org.snpeff.interval.Marker;
import org.snpeff.probablility.Binomial;
import org.snpeff.snpEffect.SnpEffectPredictor;
import org.snpeff.stats.CountByType;
import org.snpeff.stats.CoverageByType;
import org.snpeff.stats.plot.GoogleBarChart;
import org.snpeff.stats.plot.GoogleGeneRegionChart;
import org.snpeff.stats.plot.GoogleGeneRegionNumExonsChart;
import org.snpeff.util.Gpr;
import org.snpeff.util.Timer;

/**
 * Count how many reads map (from many SAM/BAM files) onto markers
 * @author pcingola
 */
public class CountReadsOnMarkers {

	public static final int SHOW_EVERY = 10000;
	public static final int MAX_EXONS_CHART = 10;
	public static boolean debug = true;

	boolean verbose = false; // Be verbose
	List fileNames;
	List names;
	Genome genome;
	SnpEffectPredictor snpEffectPredictor;
	ArrayList countReadsByFile;
	MarkerTypes markerTypes;
	CountByType readsByFile;

	public CountReadsOnMarkers(SnpEffectPredictor snpEffectPredictor) {
		init(snpEffectPredictor);
	}

	/**
	 * Add a SAM/BAM file to be processed
	 * @param samFileName
	 */
	public void addFile(String samFileName) {
		fileNames.add(samFileName);
		names.add(Gpr.removeExt(Gpr.baseName(samFileName)));
	}

	public void addMarkerType(Marker marker, String type) {
		markerTypes.addType(marker, type);
	}

	/**
	 * Create a collection of all markers
	 * @return
	 */
	List allMarkers() {
		// Retrieve all possible keys, sort them
		HashSet keys = new HashSet();
		for (CountReads cr : countReadsByFile)
			keys.addAll(cr.allMarkers());

		ArrayList keysSorted = new ArrayList(keys.size());
		keysSorted.addAll(keys);
		Collections.sort(keysSorted);
		return keysSorted;
	}

	/**
	 * Count markers from all files
	 */
	public void count() {
		genome = snpEffectPredictor.getGenome();

		// Iterate over all BAM/SAM files
		for (String fileName : fileNames) {
			CountReads countReads = new CountReads(fileName, snpEffectPredictor);
			countReads.setMarkerTypes(markerTypes);
			countReads.setVerbose(verbose);
			countReads.count();

			countReadsByFile.add(countReads); // Add count to list
		}

		if (verbose) Timer.showStdErr("Done.");
	}

	/**
	 * Count how many of each marker type are there
	 * @return
	 */
	CountByType countMarkerTypes(Collection markersToCount) {
		CountByType countByMarkerType = new CountByType();
		for (Marker marker : markersToCount) {
			String type = markerTypes.getType(marker);
			String subtype = markerTypes.getSubType(marker);
			countByMarkerType.inc(type);
			if (subtype != null) countByMarkerType.inc(subtype);
		}
		return countByMarkerType;
	}

	public MarkerTypes getMarkerTypes() {
		return markerTypes;
	}

	/**
	 * Average read length
	 * @return
	 */
	public int getReadLengthAvg() {
		long readLengthCount = 0;
		long readLengthSum = 0;

		for (CountReads cr : countReadsByFile) {
			readLengthSum += cr.getReadLengthSum();
			readLengthCount += cr.getReadLengthCount();
		}

		if (readLengthCount <= 0) return 0;
		double rl = ((double) readLengthSum) / readLengthCount;
		return (int) Math.round(rl);
	}

	/**
	 * Show charts in html 
	 * @return
	 */
	public String html() {
		StringBuilder sbHead = new StringBuilder();
		StringBuilder sbBody = new StringBuilder();

		//---
		// Barchart: By Marker types (all files toghether)
		//---
		ArrayList countTypesByFile = new ArrayList();
		for (CountReads cr : countReadsByFile)
			countTypesByFile.add(cr.getCountTypes());

		// Create 3 charts: One for all intervals, one for Exons and one for Introns.
		HashSet keySetAll = new HashSet();
		HashSet keySetExon = new HashSet();
		HashSet keySetIntron = new HashSet();
		for (CountByType ct : countTypesByFile)
			for (String key : ct.keySet())
				if (key.startsWith("Exon:")) keySetExon.add(key);
				else if (key.startsWith("Intron:")) keySetIntron.add(key);
				else keySetAll.add(key);

		keySetAll.remove("Chromosome"); // We don't want this number in the chart (usually it's too big)
		HashMap> keySets = new HashMap>();
		keySets.put("", keySetAll);
		keySets.put("Exons", keySetExon);
		keySets.put("Introns", keySetIntron);

		// Sort names
		ArrayList keySetNames = new ArrayList();
		keySetNames.addAll(keySets.keySet());
		Collections.sort(keySetNames);

		// Create one barchart for each keySet
		for (String ksname : keySetNames) {
			HashSet keySet = keySets.get(ksname);
			// Sort keys
			ArrayList keys = new ArrayList();
			keys.addAll(keySet);
			Collections.sort(keys);

			// Do not create empty charts
			if (keys.size() <= 0) continue;

			// Add all columns
			GoogleBarChart barchart = new GoogleBarChart("Count by file " + ksname);
			barchart.setxLables(keys);

			GoogleBarChart barchartPercent = new GoogleBarChart("Count by file " + ksname + " [Percent]");
			barchartPercent.setxLables(keys);

			// Add all files
			for (int i = 0; i < names.size(); i++) {
				String name = names.get(i);
				CountByType ct = countTypesByFile.get(i);

				// Add all values
				ArrayList columnValues = new ArrayList();
				for (String key : keys)
					if (keys.contains(key)) columnValues.add(ct.get(key) + "");

				// Add column to chart
				barchart.addColumn(name, columnValues);
				barchartPercent.addColumn(name, columnValues);
			}

			// Add header and body
			sbHead.append(barchart.toStringHtmlHeader());
			sbBody.append(barchart.toStringHtmlBody());

			barchartPercent.percentColumns();
			sbHead.append(barchartPercent.toStringHtmlHeader());
			sbBody.append(barchartPercent.toStringHtmlBody());

		}

		//---
		// Barchart: By Marker types (one by file)
		//---

		// Add all files
		for (int i = 0; i < names.size(); i++) {
			String name = names.get(i);

			// Create one barchart for each keySet
			for (String ksname : keySetNames) {
				// Sort keys
				HashSet keySet = keySets.get(ksname);
				ArrayList keys = new ArrayList();
				keys.addAll(keySet);
				Collections.sort(keys);

				// Do not create empty charts
				if (keys.size() <= 0) continue;

				GoogleBarChart barchart = new GoogleBarChart("Count by file " + name + " " + ksname);
				barchart.setxLables(keys);

				// Add all columns
				CountByType ct = countTypesByFile.get(i);

				// Add all values
				ArrayList columnValues = new ArrayList();
				for (String key : keys)
					if (keys.contains(key)) columnValues.add(ct.get(key) + "");

				// Create chart
				barchart.addColumn(name, columnValues);
				sbHead.append(barchart.toStringHtmlHeader());
				sbBody.append(barchart.toStringHtmlBody());
			}
		}

		//---
		// Genomic region charts
		//---
		ArrayList genRegCharts = new ArrayList();
		for (int i = 0; i < names.size(); i++) {
			String name = names.get(i);
			CoverageByType cvt = countReadsByFile.get(i).getCoverageByType();

			GoogleGeneRegionChart grc = new GoogleGeneRegionChart(cvt, name);
			genRegCharts.add(grc);

			// Show for each transcript length
			ArrayList coverageByExons = countReadsByFile.get(i).getCoverageByExons();
			for (int exons = 1; exons < coverageByExons.size(); exons++) {
				CoverageByType cbt = coverageByExons.get(exons);

				if (!cbt.isEmpty() && (exons <= MAX_EXONS_CHART)) {
					GoogleGeneRegionNumExonsChart grcCbt = new GoogleGeneRegionNumExonsChart(cbt, name + " [ " + exons + " exons ]", exons);
					genRegCharts.add(grcCbt);
				}
			}
		}

		// Add all headers
		for (GoogleGeneRegionChart grc : genRegCharts)
			sbHead.append(grc.toStringHtmlHeader());

		// Add all bodies
		for (GoogleGeneRegionChart grc : genRegCharts)
			sbBody.append(grc.toStringHtmlBody());

		// Return all html code
		return "\n" + sbHead.toString() + "\n\n" + sbBody.toString();
	}

	/**
	 * Initialize
	 * @param snpEffectPredictor
	 */
	void init(SnpEffectPredictor snpEffectPredictor) {
		fileNames = new ArrayList();
		names = new ArrayList();
		countReadsByFile = new ArrayList();
		markerTypes = new MarkerTypes();
		readsByFile = new CountByType();

		if (snpEffectPredictor != null) this.snpEffectPredictor = snpEffectPredictor;
		else this.snpEffectPredictor = new SnpEffectPredictor(new Genome());
	}

	/**
	 * Show probabilities
	 * 
	 * @param prob : Probabilities for each 
	 * 
	 * @return A string showing a tab delimited table
	 */
	public String probabilityTable(CountByType prob) {
		StringBuilder sb = new StringBuilder();

		// Create title line
		sb.append("type\tp.binomial"); // Show 'type' information in first columns
		for (int j = 0; j < countReadsByFile.size(); j++)
			sb.append("\treads." + names.get(j) + "\texpected." + names.get(j) + "\tpvalue." + names.get(j));
		sb.append("\n");

		String chrType = Chromosome.class.getSimpleName();

		// Show counts by type
		CountByType countByType = countMarkerTypes(allMarkers());
		for (String type : countByType.keysSorted()) {
			sb.append(type); // Show 'type' information in first columns

			// Binomial probability model
			double p = 0;
			if ((prob != null) && prob.contains(type)) {
				p = prob.getScore(type);
				sb.append("\t" + p);
			} else sb.append("\t\t");

			// Show counts for each file
			for (int idx = 0; idx < countReadsByFile.size(); idx++) {
				CountByType countTypesFile = countReadsByFile.get(idx).getCountTypes();

				// Stats
				int n = (int) countTypesFile.get(chrType); // Number of reads in the file
				int k = (int) countTypesFile.get(type); // Number of reads hitting this marker type

				long expected = Math.round(countTypesFile.get(chrType) * p);
				double pvalue = Binomial.get().cdfUpEq(p, k, n);

				long countType = countTypesFile.get(type);
				if ((prob != null) && prob.contains(type)) sb.append("\t" + countType + "\t" + expected + "\t" + pvalue);
				else sb.append("\t" + countType + "\t\t");
			}
			sb.append("\n");
		}

		return sb.toString();
	}

	public void setVerbose(boolean verbose) {
		this.verbose = verbose;
	}

	/**
	 * Print table to STDOUT
	 */
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();

		// Show title
		sb.append("chr\tstart\tend\ttype\tIDs");
		for (int j = 0; j < countReadsByFile.size(); j++)
			sb.append("\tReads:" + names.get(j) + "\tBases:" + names.get(j));
		sb.append("\n");

		//---
		// Show counts by marker
		//---
		// Show counts for each marker
		for (Marker key : allMarkers()) {
			// Show 'key' information in first columns
			sb.append(key.getChromosomeName() //
					+ "\t" + (key.getStart() + 1) //
					+ "\t" + (key.getEnd() + 1) //
					+ "\t" + key.idChain() //
			);

			// Show counts for each file
			for (int idx = 0; idx < countReadsByFile.size(); idx++)
				sb.append("\t" + countReadsByFile.get(idx).getCountReads().get(key) //
						+ "\t" + countReadsByFile.get(idx).getCountBases().get(key) //
				);
			sb.append("\n");
		}
		sb.append("\n");

		return sb.toString();
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy