All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.samtools.tabix.TabixIndex Maven / Gradle / Ivy

The newest version!
package net.sf.samtools.tabix;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.snpeff.util.Gpr;

/**
 * Tabix Index (i.e. the structure stored in *.tbi file)
 *
 * From the paper:
 *
 * Binning index:
 *      In Tabix, each bin k, 0 <= k <= 37449, represents a half-close-half-open interval
 *
 *          [ (k-ol) sl , (k-ol+1) sl )
 *
 *      , where
 *      	'l' is the level of the bin             l = floor[ log2(7k + 1) / 3 ]
 *      	'sl' is the size of the bin at level l  sl = 2^(29 - 3 l)
 *			'ol': is the offset at l.               ol = (2^(3 l) - 1)/7
 *
 *      In this scheme, bins span different sizes depending on their levels:
 *      	Level	Bins		 Size (sl)
 *      	------------------------------
 *      	0		0			512Mb	2^29
 *      	1		1-8			 64Mb	2^26
 *      	2		9-72		  8Mb	2^23
 *      	3		73-584		  1Mb	2^20
 *      	4		585-4680	128kb	2^17
 *      	5		4681-37449	 16kb	2^14
 *
 * Linear index: In the linear index, we keep for each tiling 16kb window
 * 		the virtual file offset of the leftmost record (i.e. having the
 * 		smallest start coordinate) that overlaps the window. When we search
 * 		for records overlapping a query interval, we will know from the
 * 		index the leftmost record that possibly overlaps the query interval.
 * 		Records having smaller coordinates than this leftmost record can be
 * 		skipped and unsuccessful seek calls can be saved.
 */
public class TabixIndex {

	/** Shift converting a coordinate into its linear index window number (windows are 2^14 wide). */
	public static final int TAD_LIDX_SHIFT = 14; // Minimum bin size is 2^TAD_LIDX_SHIFT = 2^14 = 16KB

	boolean debug;
	private final Map<Integer, TPair64[]> binningIndex; // Binning index: bin number -> chunks of that bin
	private long[] linearIndex = new long[0]; // Linear index: one file offset per 16KB window (never null)

	/**
	 * Describe a bin: its level, size, level offset and the half-open
	 * coordinate interval it covers.
	 *
	 * The arithmetic is done in 'int', so the result is only meaningful for
	 * bin numbers that belong to the binning scheme (levels 0 to 5).
	 *
	 * @param binNumber Bin number
	 * @return A one-line, human readable description of the bin
	 */
	public static String binInfo(int binNumber) {
		int binLevel = levelOf(binNumber);
		int binSize = 1 << (29 - 3 * binLevel);
		int offsetLevel = ((1 << (3 * binLevel)) - 1) / 7;

		int start = (binNumber - offsetLevel) * binSize;
		int end = (binNumber + 1 - offsetLevel) * binSize;

		return "bin: " + binNumber //
				+ ", level: " + binLevel //
				+ ", size: " + binSize //
				+ ", offset: " + offsetLevel //
				+ ", interval: [ " + start + " , " + end + " )" //
				;
	}

	/**
	 * Level of a bin, i.e. floor[ log2(7k + 1) / 3 ], computed with integer
	 * arithmetic so the result does not depend on floating point rounding
	 * when 7k + 1 is an exact power of 8 (the first bin of each level).
	 *
	 * @param binNumber Bin number (values <= 0 yield level 0)
	 * @return The level of the bin
	 */
	private static int levelOf(int binNumber) {
		int level = 0;
		// The first bin of level l is (8^l - 1) / 7, hence the first bin of
		// the next level is 8 * first + 1. A 'long' is used so the bound
		// eventually exceeds any 'int' bin number and the loop terminates.
		for (long firstBinOfNextLevel = 1; binNumber >= firstBinOfNextLevel; firstBinOfNextLevel = 8 * firstBinOfNextLevel + 1)
			level++;
		return level;
	}

	/**
	 * Read and validate an element count from the stream.
	 *
	 * @param is   Stream to read from
	 * @param what Description of the count, used in the error message
	 * @return The count (never negative)
	 * @throws IOException If reading fails or the count is negative
	 */
	private static int readCount(InputStream is, String what) throws IOException {
		int count = TabixReader.readInt(is);
		if (count < 0) throw new IOException("Invalid tabix index: negative " + what + " (" + count + ")");
		return count;
	}

	/**
	 * Create an empty index: no bins and an empty linear index.
	 */
	public TabixIndex() {
		binningIndex = new HashMap<>();
	}

	/**
	 * Get the chunks stored for a bin.
	 *
	 * @param binNum Bin number
	 * @return The chunks stored for the bin, or null if the bin is not in the index
	 */
	public TPair64[] get(int binNum) {
		return binningIndex.get(binNum);
	}

	/**
	 * Minimum offset within the file for records starting at or after 'beg'.
	 *
	 * The linear index has the offset of the smallest start coordinate that
	 * overlaps each 16KB window (i.e. all possible lowest level bins).
	 *
	 * @param beg Start coordinate of the query (negative values are treated as zero)
	 * @return The linear index entry for the window containing 'beg'; the last
	 *         entry if 'beg' lies beyond the linear index; zero if the linear
	 *         index is empty
	 */
	public long minOffset(int beg) {
		if (linearIndex.length == 0) return 0;

		// A negative coordinate would yield a negative window number: clamp to the first window
		int begTad = Math.max(beg, 0) >> TAD_LIDX_SHIFT;

		// Beyond the end of the linear index: pick the last position
		return linearIndex[Math.min(begTad, linearIndex.length - 1)];
	}

	/**
	 * Store the chunks for a bin, replacing any previous entry for that bin.
	 *
	 * @param bin    Bin number
	 * @param chunks Chunks of the bin
	 */
	public void put(int bin, TPair64[] chunks) {
		binningIndex.put(bin, chunks);
	}

	/**
	 * Load this index from a stream: first the binning index (number of
	 * bins, then for each bin its number, its number of chunks and the
	 * chunks themselves), then the linear index (length followed by one
	 * offset per entry).
	 *
	 * @param is Stream positioned at the beginning of the index data
	 * @throws IOException If reading fails or a negative element count is found
	 */
	public void readIndex(InputStream is) throws IOException {
		int numBins = readCount(is, "number of bins");
		if (debug) Gpr.debug("Number of bins: " + numBins);

		// Load each bin
		for (int j = 0; j < numBins; ++j) {
			int binNumber = TabixReader.readInt(is); // Bin number
			int numChunks = readCount(is, "number of chunks"); // How many 'chunks' in this bin?

			TPair64[] chunks = new TPair64[numChunks];
			if (debug) Gpr.debug("\t" + binInfo(binNumber) + "\tnumChunks: " + numChunks);

			for (int chunNum = 0; chunNum < chunks.length; ++chunNum) {
				TPair64 tp = new TPair64();
				tp.readIndex(is);
				chunks[chunNum] = tp;
				if (debug) Gpr.debug("\t\tchunk[" + chunNum + "]: " + chunks[chunNum]);
			}

			put(binNumber, chunks);
		}

		// Load linear index
		int linearIndexLen = readCount(is, "linear index length");
		long[] linearIndex = new long[linearIndexLen];
		for (int tid = 0; tid < linearIndex.length; ++tid) {
			linearIndex[tid] = TabixReader.readLong(is);
			if (debug) Gpr.debug("\tlinearIndex[" + tid + "] :" + linearIndex[tid]);
		}
		setLinearIndex(linearIndex);
	}

	/**
	 * Enable or disable debug messages while reading the index.
	 *
	 * @param debug True to show debug messages
	 */
	public void setDebug(boolean debug) {
		this.debug = debug;
	}

	/**
	 * Set the linear index. The array is stored as is (it is not copied).
	 *
	 * @param linearIndex One file offset per 16KB window; null is treated as an empty index
	 */
	public void setLinearIndex(long[] linearIndex) {
		this.linearIndex = (linearIndex != null) ? linearIndex : new long[0];
	}

	/**
	 * Multi-line dump of the whole index: every bin (sorted by bin number)
	 * with its chunks, followed by every entry of the linear index.
	 */
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();

		// Show bins sorted by bin number
		List<Integer> keys = new ArrayList<>(binningIndex.keySet());
		Collections.sort(keys);

		sb.append("Binning index size:" + binningIndex.size() + "\n");
		for (Integer binNum : keys) {
			TPair64[] chunks = binningIndex.get(binNum);
			sb.append("\t" + binInfo(binNum) + "\n\tNumber of chunks:" + chunks.length + "\n");

			for (int i = 0; i < chunks.length; i++)
				sb.append("\t\tchunk " + i + "\t" + chunks[i] + "\n");
		}

		sb.append("Linear index size: " + linearIndex.length + "\n");
		for (int i = 0; i < linearIndex.length; i++)
			sb.append("\t" + i + "\t" + linearIndex[i] + "\n");

		return sb.toString();
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy