All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.binseq.coder.DnaCoder Maven / Gradle / Ivy

The newest version!
package org.snpeff.binseq.coder;

import org.snpeff.nmer.Nmer;
import org.snpeff.util.Gpr;

/**
 * Class used to encode & decode sequences into binary and vice-versa
 *
 * Note:This is a singleton class.
 *
 * It stores DNA bases into 2 bits {a,c,g,t} <-> {0,1,2,3}
 *
 * @author pcingola
 *
 */
public class DnaCoder extends Coder {

	/**
	 *
	 */
	private static final long serialVersionUID = 1L;
	public static boolean debug = false;
	private static DnaCoder dnaCoder = new DnaCoder();

	protected static final int BITS_PER_BASE = 2;
	protected static final long MASK_FIRST_BASE = 3L;
	protected static final int BASES_PER_LONGWORD = BITS_PER_LONGWORD / BITS_PER_BASE;
	protected static final int LAST_BASE_IN_LONGWORD = BASES_PER_LONGWORD - 1;
	public static final long MASK_ALL_WORD = 0xffffffffffffffffL;
	public static final char[] TO_BASE = { 'a', 'c', 'g', 't' };

	public long MASK_BASE[], MASK_LOW[], MASK_HIGH[];
	public int COUNT_DIFFS[];

	public static DnaCoder get() {
		return dnaCoder;
	}

	/**
	 * Static class initialization
	 */
	DnaCoder() {
		// Initialize mask
		MASK_BASE = new long[DnaCoder.BASES_PER_LONGWORD];
		MASK_LOW = new long[Coder.BITS_PER_LONGWORD + 1];
		MASK_HIGH = new long[Coder.BITS_PER_LONGWORD + 1];

		// Initialize masks
		for (int i = 0; i < MASK_BASE.length; i++)
			MASK_BASE[i] = MASK_FIRST_BASE << (BITS_PER_BASE * i);

		long low = 1L, high = 1L << 63;
		MASK_LOW[0] = MASK_HIGH[0] = 0;
		for (int i = 1; i < MASK_LOW.length; i++) {
			int j = i - 1;
			MASK_LOW[i] = low = (1L << j) | low;
			MASK_HIGH[i] = high >> j; // Note: signed rotation
		}

		// Count diffs
		int countDiffLen = 16; // 16 bits
		COUNT_DIFFS = new int[1 << (countDiffLen - 1) + 1];
		for (int i = 1; i < COUNT_DIFFS.length; i++) {
			int count = 0;
			for (int j = 0; j <= countDiffLen; j += 2) {
				if (((i >> j) & 0x3) != 0) count++;
			}
			COUNT_DIFFS[i] = count;
		}
	}

	@Override
	public int basesPerWord() {
		return BASES_PER_LONGWORD;
	}

	/**
	 * Encode a base using 2 bits
	 * @param c
	 * @return
	 */
	@Override
	public int baseToBits(char c) {
		return baseToBits(c, false);
	}

	public int baseToBits(char c, boolean ignoreErrors) {
		switch (c) {
		case 'a':
		case 'A':
			return 0;
		case 'c':
		case 'C':
			return 1;
		case 'g':
		case 'G':
			return 2;
		case 't':
		case 'T':
		case 'u':
		case 'U':
			return 3;
		default:
			if (ignoreErrors) return 0;
			throw new RuntimeException("Unknown base '" + c + "'");
		}
	}

	@Override
	public int bitsPerBase() {
		return BITS_PER_BASE;
	}

	/**
	 * Copy 'length' bases from 'src' (starting from 'srcStart') to 'dst' (starting from 'dstStart')
	 * @param src
	 * @param srcStart
	 * @param dst
	 * @param length
	 */
	public void copyBases(long[] src, int srcStart, long[] dst, int dstStart, int length) {
		// Src parameters
		int startWord = srcStart / BASES_PER_LONGWORD;
		int startBase = srcStart % BASES_PER_LONGWORD;
		long startMask = MASK_LOW[BITS_PER_LONGWORD - startBase * BITS_PER_BASE];

		// Dst parameters
		int startWordDst = dstStart / BASES_PER_LONGWORD;
		int startBaseDst = dstStart % BASES_PER_LONGWORD;
		int endWordDst = (dstStart + length - 1) / BASES_PER_LONGWORD;
		int endBaseDst = (startBaseDst + length) % BASES_PER_LONGWORD;
		long startMaskDst = MASK_LOW[BITS_PER_LONGWORD - startBaseDst * BITS_PER_BASE];
		long endMaskDst = endBaseDst != 0 ? MASK_HIGH[endBaseDst * BITS_PER_BASE] : MASK_ALL_WORD;;

		// Rotation values
		int rolLsrc = BITS_PER_BASE * startBase;
		int rorHsrc = BITS_PER_LONGWORD - rolLsrc;
		int rorLdst = startBaseDst * BITS_PER_BASE;
		int rorHdst = BITS_PER_LONGWORD - rorLdst;

		// Copy words from src[] to dst[]
		long srcL = 0, srcH = 0, s, mask;
		for (int i = startWord, j = startWordDst, k = 0; k < length; i++, j++, k += 32) {
			//---
			// Get a 'long' from src[] (64 bits)
			//---
			srcL = (src[i] & startMask) << rolLsrc; // Get some bits from this word
			if ((startBase != 0) && ((i + 1) < src.length)) srcH = src[i + 1] >>> rorHsrc; // Do I need more bits from the next word? Is there a next word?
			else srcH = 0;
			s = srcH | srcL; // Store the 64 bits in one long

			//---
			// Set bits in dst[]
			//---
			if (j < dst.length) {
				// Mask: Which bits of 'dst' are kept
				if (j == endWordDst) mask = ~startMaskDst | ~endMaskDst;
				else mask = ~startMaskDst;

				// Set lower bits in dst[j]
				dst[j] = (dst[j] & mask) | ((s >>> rorLdst) & ~mask);

				// Do we need to set higher bits in dst[j+1] ?
				if ((j + 1) <= endWordDst) {
					if ((j + 1) == endWordDst) mask = startMaskDst | ~endMaskDst;
					else mask = startMaskDst;

					dst[j + 1] = (dst[j + 1] & mask) | ((s << rorHdst) & ~mask); // Set higher bits in dst[j+1]
				}
			}
		}
	}

	/**
	 * Copy 'length' bases from 'src' to 'dst' (starting from 'start')
	 * @param src
	 * @param start
	 * @param dst
	 * @param length
	 */
	public void copyBases(long[] src, long[] dst, int start, int length) {
		int startWord = start / BASES_PER_LONGWORD;
		int startBase = start % BASES_PER_LONGWORD;

		int endWord = (start + length) / BASES_PER_LONGWORD;
		int endBase = (startBase + length) % BASES_PER_LONGWORD;

		int len = (startBase + length) / BASES_PER_LONGWORD;

		long startMask = MASK_LOW[BITS_PER_LONGWORD - startBase * BITS_PER_BASE];
		long endMask = MASK_HIGH[endBase * BITS_PER_BASE];

		if (len == 0) {
			// Only one word to copy (start=end)
			long dstMaskedOut = dst[startWord] & (~(startMask & endMask)); // We have to keep the rest of the sequence intact
			dst[startWord] = dstMaskedOut | (src[startWord] & startMask & endMask);
		} else if (len == 1) {
			// Only two words to copy (start and end)
			long dstMaskedOut = dst[startWord] & (~startMask);// We have to keep the rest of the sequence intact
			dst[startWord] = dstMaskedOut | (src[startWord] & startMask);

			dstMaskedOut = dst[endWord] & (~endMask);// We have to keep the rest of the sequence intact
			dst[endWord] = dstMaskedOut | (src[endWord] & endMask);
		} else {
			// Several words to copy (start, end, the rest)
			long dstMaskedOut = dst[startWord] & (~startMask);// We have to keep the rest of the sequence intact
			dst[startWord] = dstMaskedOut | (src[startWord] & startMask);

			dstMaskedOut = dst[endWord] & (~endMask);// We have to keep the rest of the sequence intact
			dst[endWord] = dstMaskedOut | (src[endWord] & endMask);

			System.arraycopy(src, startWord + 1, dst, startWord + 1, len - 1); // Use a fast copy method
		}
	}

	/**
	 * Count number of different bases after an XOR comparison (see score method)
	 * This method is optimized using 16-bits pre-computed counts (COUNT_DIFFS)
	 * @param compare
	 * @return
	 */
	int countDiffBases(long compare) {
		int comp = COUNT_DIFFS[(int) (compare & 0xffffL)];
		comp += COUNT_DIFFS[(int) ((compare >> 16) & 0xffffL)];
		comp += COUNT_DIFFS[(int) ((compare >> 32) & 0xffffL)];
		comp += COUNT_DIFFS[(int) ((compare >> 48) & 0xffffL)];
		return comp;
	}

	void d(String m, long l) {
		Nmer n = new Nmer(32);
		n.setNmer(l);
		Gpr.debug(String.format("%10s\t%s\t%s", m, Gpr.bin64(l), n.toString()));
	}

	/**
	 * Decode bits from a given position
	 * @param word
	 * @param pos
	 * @return
	 */
	@Override
	public int decodeWord(long word, int pos) {
		int c = (int) ((word & MASK_BASE[pos]) >>> (pos << 1));
		return c;
	}

	/**
	 * Encode a base to a given position in a word
	 * @param base
	 * @param pos
	 * @return
	 */
	public long encodeWord(char base, int pos) {
		return ((long) baseToBits(base)) << (pos << 1);
	}

	@Override
	public int lastBaseinWord() {
		return LAST_BASE_IN_LONGWORD;
	}

	/**
	 * Calculate the coded length of a sequence in 'words' (depends on coder)
	 * @param len
	 * @return
	 */
	public int length2words(int len) {
		return ((len % basesPerWord()) != 0 ? len / basesPerWord() + 1 : len / basesPerWord());
	}

	@Override
	public long mask(int baseIndexInWord) {
		return MASK_BASE[baseIndexInWord];
	}

	/**
	 * Decode bits from a given position
	 * @param code
	 * @param pos
	 * @return
	 */
	public long replaceBase(long code, int pos, char newBase) {
		return (code & (~MASK_BASE[pos])) | encodeWord(newBase, pos);
	}

	/**
	 * Reverse all bases in 'code'
	 * @param linearIndex
	 * @return
	 */
	public long reverseBases(long code) {
		long reversedCode = 0;
		for (int pos = 0; pos < basesPerWord(); pos++) {
			int c = decodeWord(code, pos);
			reversedCode = reversedCode << BITS_PER_BASE | c;
		}
		return reversedCode;
	}

	/**
	 * Calculate a 'score' for a sequence (dst) and a sub-sequence (src).
	 * The score is the number of equal bases (or zero if they differ)
	 *
	 * @param dst : Destination sequence codes[]
	 * @param src : Source sequence codes[]
	 * @param srcStart : Source sub-sequence start
	 * @param length : Number of bases to compare
	 * @param threshold: Number of bases allowed to differ
	 *
	 * @return
	 */
	public int score(long dst[], long src[], int srcStart, int length, int threshold) {
		// Src parameters
		int startWord = srcStart / BASES_PER_LONGWORD;
		int startBase = srcStart % BASES_PER_LONGWORD;
		long startMask = MASK_LOW[BITS_PER_LONGWORD - startBase * BITS_PER_BASE];

		// Dst parameters
		int endWordDst = (length - 1) / BASES_PER_LONGWORD;
		int endBaseDst = length % BASES_PER_LONGWORD;
		long endMaskDst = endBaseDst != 0 ? MASK_HIGH[endBaseDst * BITS_PER_BASE] : MASK_ALL_WORD;;

		// Rotation values
		int rolLsrc = BITS_PER_BASE * startBase;
		int rorHsrc = BITS_PER_LONGWORD - rolLsrc;

		// Copy words from src[] to dst[]
		long srcL = 0, srcH = 0, s, mask;
		int diffBases = 0;
		for (int i = startWord, j = 0, k = 0; k < length; i++, j++, k += 32) {
			//---
			// Get a 'long' from src[] (64 bits)
			//---
			srcL = (src[i] & startMask) << rolLsrc; // Get some bits from this word
			if ((startBase != 0) && ((i + 1) < src.length)) srcH = src[i + 1] >>> rorHsrc; // Do I need more bits from the next word? Is there a next word?
			else srcH = 0;
			s = srcH | srcL; // Store the 64 bits in one long

			//---
			// Compare bits in dst[]
			//---
			if (j < dst.length) {
				// Mask: Which bits of 'dst' are not compared
				if (j == endWordDst) mask = endMaskDst;
				else mask = MASK_ALL_WORD;

				// Compare lower bits in dst[j]
				long compare = (dst[j] ^ s) & mask;
				if (compare != 0) { // Any differences?
					// Should we continue?
					if (threshold == 0) return 0;
					else {
						diffBases += countDiffBases(compare);
						if (diffBases > threshold) return 0;
					}
				}
			}
		}

		return length - diffBases;
	}

	/**
	 * Decode a base using 2 bits
	 */
	@Override
	public char toBase(int code) {
		return TO_BASE[code];
	}

	@Override
	public char toBase(long word, int pos) {
		int c = (int) ((word & MASK_BASE[pos]) >>> (pos << 1));
		return toBase(c);
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy