net.sf.samtools.tabix.TabixReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package net.sf.samtools.tabix;

/* The MIT License

   Copyright (c) 2010 Broad Institute.

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
 */

/* Contact: Heng Li  */

/* Minor changes by Pablo Cingolani (Jan-2014) */

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.HashMap;

import org.snpeff.interval.Chromosome;
import org.snpeff.interval.Variant;
import org.snpeff.util.Gpr;

import htsjdk.samtools.util.BlockCompressedInputStream;

public class TabixReader implements Iterable {

	private static int MAX_BIN = 37450; // Maximum possible number of bins

	boolean debug = false;
	private String fileName;
	private BlockCompressedInputStream fileInputStream;
	private int mPreset;
	private int mSc;
	private int mBc;
	private int mEc;
	private int mMeta;
	private int mSkip;
	private String[] sequenceNames;
	private HashMap sequenceName2tid;
	private TabixIndex[] tabixIndexes;
	private TabixIterator tabixIterator;
	protected boolean showHeader;

	public static void main(String[] args) {
		if (args.length < 1) {
			System.out.println("Usage: java -cp .:sam.jar TabixReader  [region]");
			System.exit(1);
		}
		try {
			TabixReader tr = new TabixReader(args[0]);
			String s;
			if (args.length == 1) { // no region is specified; print the whole file
				while ((s = tr.readLine()) != null)
					System.out.println(s);
			} else { // a region is specified; random access
				TabixIterator iter = tr.query(args[1]); // get the iterator
				while (iter != null && (s = iter.next()) != null)
					System.out.println(s);
			}
		} catch (IOException e) {
		}
	}

	public static int readInt(final InputStream is) throws IOException {
		byte[] buf = new byte[4];
		is.read(buf);
		return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt();
	}

	public static long readLong(final InputStream is) throws IOException {
		byte[] buf = new byte[8];
		is.read(buf);
		return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong();
	}

	/**
	 * Bins span different sizes depending on their levels: Bins Size (sl)
	 * ------------------------ 0 512Mb 2^29 1-8 64Mb 2^26 9-72 8Mb 2^23 73-584 1Mb
	 * 2^20 585-4680 128kb 2^17 4681-37449 16kb 2^14
	 */
	private static int reg2bins(final int beg, final int _end, final int[] list) {
		int i = 0, k, end = _end;
		if (beg >= end)
			return 0;

		// Max size = 2^29
		if (end >= 1 << 29)
			end = 1 << 29;
		--end;
		list[i++] = 0;

		// Bins 1
		for (k = 1 + (beg >> 26); k <= 1 + (end >> 26); ++k)
			list[i++] = k;

		for (k = 9 + (beg >> 23); k <= 9 + (end >> 23); ++k)
			list[i++] = k;

		for (k = 73 + (beg >> 20); k <= 73 + (end >> 20); ++k)
			list[i++] = k;

		for (k = 585 + (beg >> 17); k <= 585 + (end >> 17); ++k)
			list[i++] = k;

		for (k = 4681 + (beg >> 14); k <= 4681 + (end >> 14); ++k)
			list[i++] = k;

		return i;
	}

	public TabixReader(String fileName) throws IOException {
		this(fileName, false);
	}

	public TabixReader(String fileName, boolean debug) throws IOException {
		this.fileName = fileName;
		this.debug = debug;
		fileInputStream = new BlockCompressedInputStream(new File(fileName));
		readIndex();
	}

	protected int chr2tid(String chr) {
		if (sequenceName2tid.containsKey(chr))
			return sequenceName2tid.get(chr);

		// Try simple chromosome names (no 'chr')
		chr = Chromosome.simpleName(chr);
		if (sequenceName2tid.containsKey(chr))
			return sequenceName2tid.get(chr);

		else
			return -1;
	}

	public void close() {
		if (fileInputStream != null) {
			try {
				fileInputStream.close();
			} catch (IOException e) {
				throw new RuntimeException(e);
			}
			fileInputStream = null;
		}
	}

	public long getFilePointer() {
		return fileInputStream.getFilePointer();
	}

	public int getmBc() {
		return mBc;
	}

	public int getmEc() {
		return mEc;
	}

	public int getmMeta() {
		return mMeta;
	}

	public int getmPreset() {
		return mPreset;
	}

	public int getmSc() {
		return mSc;
	}

	@Override
	public TabixIterator iterator() {
		if (tabixIterator != null)
			return tabixIterator;

		// No query performed yet: create one
		TPair64 off[] = new TPair64[1];
		off[0] = new TPair64(0, Long.MAX_VALUE);
		tabixIterator = new TabixIterator(this, -1, 0, Integer.MAX_VALUE, off);
		tabixIterator.setShowHeader(showHeader); // Do we want header lines?

		return tabixIterator;
	}

	/**
	 * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000"
	 *
	 * @param reg
	 *            Region string
	 * @return An array where the three elements are [ sequence_id, region_begin,
	 *         region_end] On failure, sequence_id==-1.
	 */
	public int[] parseReg(final String reg) {
		String chr;
		int colon, hyphen;
		int[] ret = new int[3];
		colon = reg.indexOf(':');
		hyphen = reg.indexOf('-');
		chr = colon >= 0 ? reg.substring(0, colon) : reg;
		ret[1] = colon >= 0 ? Integer.parseInt(reg.substring(colon + 1, hyphen >= 0 ? hyphen : reg.length())) - 1 : 0;
		ret[2] = hyphen >= 0 ? Integer.parseInt(reg.substring(hyphen + 1)) : 0x7fffffff;
		ret[0] = chr2tid(chr);
		return ret;
	};

	/**
	 * Query for a given interval:
	 * 
	 * @param tid
	 *            : Chromosome number
	 * @param beg
	 *            : Interval start
	 * @param end
	 *            : Interval end
	 * @return
	 */
	private TabixIterator query(final int tid, final int beg, final int end) {
		TPair64[] chunksOff, chunks;
		long minFileOffset;

		if (tid < 0)
			return null;

		// Get index for chromosome 'tid'
		TabixIndex idx = tabixIndexes[tid];
		int[] bins = new int[MAX_BIN];
		int i, l, numChunks;

		// Fill up 'bins' with the bin numbers that are intersected by [beg, end)
		// interval.
		int numBins = reg2bins(beg, end, bins);

		// Minimum offset within file
		minFileOffset = idx.minOffset(beg);
		if (debug)
			Gpr.debug("minFileOffset: " + minFileOffset);

		// Add chunk lengths for all blocks within the interval
		for (i = numChunks = 0; i < numBins; ++i) {
			chunks = idx.get(bins[i]);
			if (chunks != null) {
				numChunks += chunks.length;
				if (debug) {
					StringBuilder sb = new StringBuilder();
					for (TPair64 tp : chunks)
						sb.append("\t\t" + tp + "\n");
					Gpr.debug("\tnumChunks: " + numChunks + "\n" + sb);
				}
			}
		}

		// Zero chunks? Then there is nothing in the index matching the interval
		if (numChunks == 0)
			return null;

		// Collect all chunks having their end coordinated after min_off
		chunksOff = new TPair64[numChunks];
		for (i = numChunks = 0; i < numBins; ++i) {
			chunks = idx.get(bins[i]);
			if (chunks != null) {
				for (int j = 0; j < chunks.length; ++j) {
					// Chunk ends after minFileOffset? => Create a copy and add it to the array
					// If chunk is before minFileOffset, so it is filtered out
					if (TPair64.less64(minFileOffset, chunks[j].v)) {
						chunksOff[numChunks++] = new TPair64(chunks[j]);
					}
				}
			}
		}

		// No chunks left? Nothing to do (index doesn't match query)
		if (numChunks == 0)
			return null;

		// Sort chunks (note, after 'numChunks' the array is null)
		Arrays.sort(chunksOff, 0, numChunks);

		// Resolve completely contained adjacent blocks
		for (i = 1, l = 0; i < numChunks; ++i) {
			if (TPair64.less64(chunksOff[l].v, chunksOff[i].v)) {
				++l;
				chunksOff[l].u = chunksOff[i].u;
				chunksOff[l].v = chunksOff[i].v;
			}
		}
		numChunks = l + 1;

		// Resolve overlaps between adjacent blocks; this may happen due to the merge in
		// indexing
		for (i = 1; i < numChunks; ++i)
			if (!TPair64.less64(chunksOff[i - 1].v, chunksOff[i].u))
				chunksOff[i - 1].v = chunksOff[i].u;

		// Merge adjacent blocks
		for (i = 1, l = 0; i < numChunks; ++i) {
			if (chunksOff[l].v >> 16 == chunksOff[i].u >> 16)
				chunksOff[l].v = chunksOff[i].v;
			else {
				++l;
				chunksOff[l].u = chunksOff[i].u;
				chunksOff[l].v = chunksOff[i].v;
			}
		}
		numChunks = l + 1;

		// Create new array and move merged chunks there
		TPair64[] mergedChunks = new TPair64[numChunks];
		for (i = 0; i < numChunks; ++i)
			mergedChunks[i] = new TPair64(chunksOff[i].u, chunksOff[i].v);

		return new TabixIterator(this, tid, beg, end, mergedChunks);
	}

	/**
	 * Return an iterator for the interval in this query Format: "chr:star-end"
	 */
	public TabixIterator query(String reg) {
		// Parse to an array: [tid, begin, end]. Note 'tid' means chromosome ID
		int[] x = parseReg(reg);
		tabixIterator = query(x[0], x[1], x[2]);
		return tabixIterator;
	}

	public TabixIterator query(Variant variant) {
		int tid = chr2tid(variant.getChromosomeName());

		// Tabix uses zero-based, half-open coordinates whereas
		// marker has zero-based, closed coordinates.
		int start = variant.getStart();
		int end = variant.getEnd() + 1;
		if (variant.isIns())
			start--;

		TabixIterator tabixIterator = query(tid, start, end);
		if (tabixIterator != null)
			tabixIterator.setDebug(debug);
		if (debug)
			Gpr.debug("Query: " + variant + "\ttabixIterator: " + tabixIterator);
		return tabixIterator;
	}

	/**
	 * Read the Tabix index from the default file.
	 */
	public void readIndex() throws IOException {
		readIndex(new File(fileName + ".tbi"));
	}

	/**
	 * Read the Tabix index from a file
	 *
	 * @param fp
	 *            File pointer
	 */
	public void readIndex(File fp) throws IOException {
		if (fp == null)
			return;
		BlockCompressedInputStream is = new BlockCompressedInputStream(fp);
		byte[] buf = new byte[4];

		is.read(buf, 0, 4); // read "TBI\1"
		int numSeqs = readInt(is);

		sequenceNames = new String[numSeqs]; // # sequences
		sequenceName2tid = new HashMap();
		mPreset = readInt(is);
		mSc = readInt(is);
		mBc = readInt(is);
		mEc = readInt(is);
		mMeta = readInt(is);
		mSkip = readInt(is);

		if (debug)
			Gpr.debug("Tabix index:" //
					+ "\n\tNumber of sequences: " + numSeqs //
					+ "\n\tmPreset: " + mPreset //
					+ "\n\tmSc    : " + mSc //
					+ "\n\tmBc    : " + mBc //
					+ "\n\tmEc    : " + mEc //
					+ "\n\tmMeta  : " + mMeta //
					+ "\n\tmSkip  : " + mSkip //
			);

		// Read sequence dictionary
		int sequencesLength = readInt(is);
		buf = new byte[sequencesLength];
		is.read(buf);

		// Chromosome names (sequence name) are stored as '\0' delimited
		// string (C style). Each chromosome name is assigned a 'tid'
		// (an integer number) which is implied by the order in which
		// they are stored
		// The mappings 'tid -> sequenceName' and 'sequenceName -> tid' are
		// created here
		for (int i = 0, j = 0, tid = 0; i < buf.length; ++i) {
			if (buf[i] == 0) {
				byte[] b = new byte[i - j];
				System.arraycopy(buf, j, b, 0, b.length);
				String chrName = new String(b);

				// Add chromosome mapping
				if (debug)
					Gpr.debug("sequenceNames[" + tid + "] = s: '" + chrName + "'");
				sequenceName2tid.put(chrName, tid);
				sequenceName2tid.put(Chromosome.simpleName(chrName), tid); // Add simple name as well

				sequenceNames[tid++] = chrName;
				j = i + 1;
			}
		}

		// Read the index for each chromosome (sequenceName)
		// Note: Not all bins are stored
		tabixIndexes = new TabixIndex[sequenceNames.length];
		for (int seqNum = 0; seqNum < sequenceNames.length; ++seqNum) {
			TabixIndex tabixIndex = new TabixIndex();
			tabixIndex.setDebug(debug);
			tabixIndex.readIndex(is);
			tabixIndexes[seqNum] = tabixIndex;

			if (debug) {
				String txtfile = fileName + ".tabixIndex_" + sequenceNames[seqNum] + ".txt";
				Gpr.debug("Writing to file " + txtfile);
				Gpr.toFile(txtfile, tabixIndexes[seqNum]);
			}
		}

		// Close
		is.close();
	}

	/**
	 * Read one line from the data file.
	 */
	public String readLine() throws IOException {
		StringBuffer buf = new StringBuffer();
		int c;
		while ((c = fileInputStream.read()) >= 0 && c != '\n')
			buf.append((char) c);
		if (c < 0)
			return null;
		return buf.toString();
	}

	void seek(long pos) throws IOException {
		if (debug)
			Gpr.debug("seek(" + pos + ")");
		fileInputStream.seek(pos);
	}

	public void setDebug(boolean debug) {
		this.debug = debug;
	}

	public void setShowHeader(boolean showHeader) {
		this.showHeader = showHeader;
	}

	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < tabixIndexes.length; i++)
			sb.append("TabixIndex[" + i + "]:\t" + tabixIndexes[i] + "\n");
		return sb.toString();
	}
}