All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.binseq.indexer.SuffixIndexerNmer Maven / Gradle / Ivy

The newest version!
package org.snpeff.binseq.indexer;

import java.util.Iterator;

import org.snpeff.binseq.BinarySequence;
import org.snpeff.binseq.comparator.SequenceReference;
import org.snpeff.binseq.comparator.SubsequenceComparator;
import org.snpeff.collections.HashLongLongArray;
import org.snpeff.nmer.Nmer;
import org.snpeff.util.Gpr;

/**
 * Index all suffixes of all the sequences (it indexes using Nmers).
 * 
 * Note: Under the current structure, only exact overlap matches are allowed
 * 
 * @author pcingola
 *
 * @param 
 */

public class SuffixIndexerNmer extends SequenceIndexer {

	int nmerSize;
	Nmer nmer;
	HashLongLongArray hash;
	OverlapFilter overlapFilter = null;

	public SuffixIndexerNmer(SubsequenceComparator subsequenceComparator, int nmerSize) {
		super(subsequenceComparator);

		// Create Nmer
		this.nmerSize = nmerSize;
		nmer = new Nmer(nmerSize);

		// Create and initialize hash
		hash = new HashLongLongArray();

		// Add a null sequence as the first sequence. This is done because 
		// TroveCollections represent 0 as 'EMPTY'. This means that we cannot 
		// index sequence number zero (the first sequence).
		sequences.add(null);
	}

	/**
	 * Add a sequence to this index
	 * @param sequence
	 * @return Index to this sequence (a number that can be used to retrieve this sequence)
	 */
	@Override
	public int add(T sequence) {
		sequences.add(sequence);
		int idx = sequences.size() - 1;
		indexAllSuffix(sequence, idx);
		return idx;
	}

	/**
	 * Find best overlap for 'sequence'
	 * @param sequence
	 * @return An overlap result
	 */
	@SuppressWarnings("unchecked")
	public OverlapRessult findBestOverlap(T sequence) {
		// Find an overlapping sequence 
		OverlapRessult result = new OverlapRessult();
		findOverlap(sequence, result);

		if (result.bestScore < sequence.length()) { // We calculate the reverseWC score (unless we already have an optimal score)
			// Find an overlapping sequence for reverseWC
			T sequenceRwc = (T) sequence.reverseWc();
			OverlapRessult resultRwc = new OverlapRessult();
			resultRwc.reverseWC = true;
			findOverlap(sequenceRwc, resultRwc);

			// If reverseWc has a better match => use reverseWc 
			if ((result.bestSequence == null) || (result.bestScore < resultRwc.bestScore)) {
				result = resultRwc; // Swap results
				sequence = sequenceRwc; // Swap sequences
			}
		}

		return result;
	}

	/**
	 * Finds the best overlaps for a sequence
	 * @param sequence
	 * @return
	 */
	boolean findOverlap(T sequence, OverlapRessult result) {
		int max = sequence.length() - nmerSize;

		// Initialize nmer
		nmer.setNmer(0); // Reset
		for (int i = 0; i < nmerSize; i++)
			nmer.rol(sequence.getBase(i));

		// Compare all suffixes of this sequence
		for (int i = 0; true; i++) {
			// Any references that match this Nmer?
			long bucket[] = hash.getBucket(nmer.getNmer());

			// If the sequence is shorter than the best score, there is no point comparing any more (the score will be lower)
			if (result.bestScore >= (sequence.length() - i)) break;

			// Found something: Compare all buckets
			if (bucket != null) {
				int len = hash.getBucketLength(nmer.getNmer());

				// Analyze all references in this bucket
				for (int r = 0; r < len; r++) {
					long ref = bucket[r];
					int seqIdx = SequenceReference.getSeqIdx(ref);
					int start = SequenceReference.getStart(ref);
					T seq = get(seqIdx);

					if ((overlapFilter == null) || overlapFilter.considerOverlap(sequence, seq)) {
						// We only want perfect sequence overlaps (either sequence has to have a zero index)
						if ((i != 0) && (start != 0)) {
							// Skip this sub-sequence comparison
						} else if ((i == 0) && (result.bestScore >= (seq.length() - start))) { // If the overlap is shorter than the bestScore, there is no point comparing them (the score will be lower)
							// Skip this sub-sequence comparison
						} else if ((start == 0) && (result.bestScore >= (sequence.length() - i))) { // If the overlap is shorter than the bestScore, there is no point comparing them (the score will be lower)
							// Skip this sub-sequence comparison
						} else {
							// Compare subsequences
							int score = subsequenceComparator.score(sequence, i, seq, start);
							if (score > result.bestScore) {
								result.bestScore = score;
								result.bestSequence = seq;
								result.bestReference = ref;
								result.bestId = seqIdx;
								result.start = start - i;
							}

							if (score == sequence.length()) return true;// Already found best possible score => Don't look any more
						}
					}
				}
			}

			if (i >= max) break; // Are we done?
			nmer.rol(sequence.getBase(i + nmerSize)); // Prepare for next iteration
		}

		return result.bestSequence != null;
	}

	public OverlapFilter getOverlapFilter() {
		return overlapFilter;
	}

	/**
	 * Index all possible suffix of 'sequence'
	 * @param sequence
	 */
	void indexAllSuffix(T sequence, int seqIds) {
		int max = sequence.length() - nmerSize;

		// Initialize nmer
		nmer.setNmer(0); // Reset
		for (int i = 0; i < nmerSize; i++)
			nmer.rol(sequence.getBase(i));

		// Create all references to suffixes of this sequence
		for (int start = 0; true; start++) {
			long ref = SequenceReference.getReference(seqIds, start);
			hash.put(nmer.getNmer(), ref);
			if (start >= max) break;
			// Prepare for next iteration
			nmer.rol(sequence.getBase(start + nmerSize));
		}
	}

	@Override
	public Iterator iterator() {
		Iterator it = sequences.iterator();
		it.next(); // The first sequence is 'null', skip it
		return it;
	}

	/**
	 * Find the best possible overlap and join the sequences or just add add the sequence to the index
	 * @param sequence
	 * @return true if an overlap was found and false if no overlap was found
	 */
	@SuppressWarnings("unchecked")
	public boolean overlap(T sequence) {
		// Find best overlapping sequence 
		OverlapRessult result = findBestOverlap(sequence);
		if (result.bestSequence == null) return false; // Nothing found? => return

		// If sequence is fully included in "result.bestSequence", then the overlap is already done (nothing to do)
		if ((result.start >= 0) && ((result.start + sequence.length()) <= result.bestSequence.length())) return true;

		// Overlap sequence
		T overlapSeq = (T) result.bestSequence.overlap(sequence, result.start);

		// Replace old sequence with new sequence
		replaceSequenceOverlap(result.bestSequence, overlapSeq, result.bestId, result.start);
		return true;
	}

	public void printSequences() {
		for (T seq : this)
			System.out.println(seq.getSequence());
	}

	/**
	 * Update all suffixes from an overlapping sequence
	 * Create all references to suffixes for sequenceNew (replace suffixes from sequenceOri if possible)
	 */
	void replaceSequenceOverlap(T sequenceOri, T sequenceNew, int seqIdx, int start) {
		int max = Math.max(sequenceNew.length(), sequenceOri.length()) - nmerSize;

		if (start >= 0) {
			int startIdx = sequenceOri.length() - nmerSize;

			// Initialzie Nmer
			for (int i = 0, j = startIdx; i < nmerSize; i++, j++)
				nmer.rol(sequenceNew.getBase(j));

			// This is a 'new' part of sequenceNew => Create new index entries
			for (int idx = startIdx; idx < max; idx++) {
				long ref = SequenceReference.getReference(seqIdx, idx);
				hash.put(nmer.getNmer(), ref);

				nmer.rol(sequenceNew.getBase(idx + nmerSize)); // Update nmer
			}
		} else {
			// Initialzie Nmer
			for (int i = 0; i < nmerSize; i++)
				nmer.rol(sequenceNew.getBase(i));

			// Start < 0
			int idxOri = 0;
			for (int idx = 0; idx < max; idx++) {
				long ref = SequenceReference.getReference(seqIdx, idx);
				if ((0 <= idx) && (idx < (-start))) {
					hash.put(nmer.getNmer(), ref); // This is a 'new' part of sequenceNew => Create new index entries
				} else {
					// This is an 'old' part of sequenceNew => Update index entries (if they exist)
					long refOld = SequenceReference.getReference(seqIdx, idxOri);
					hash.replace(nmer.getNmer(), refOld, ref);
					idxOri++;
				}

				// Update nmer
				nmer.rol(sequenceNew.getBase(idx + nmerSize));
			}
		}

		// Replace sequence in 'sequences'
		sequences.set(seqIdx, sequenceNew);
	}

	/**
	 * Perform consistency checks
	 */
	public void sanityCheck() {
		//---
		// Check that all indexes are there (it's like re-indexing all the sequences)
		//---
		// System.out.println("Sanity check (Sequence -> Nmers): ");

		// For each sequence Id....
		int k = 1;
		for (int seqIdx = 1; seqIdx < sequences.size(); seqIdx++) {
			T sequence = sequences.get(seqIdx);

			int max = sequence.length() - nmerSize;

			// Initialize nmer
			nmer.setNmer(0); // Reset
			for (int i = 0; i < nmerSize; i++)
				nmer.rol(sequence.getBase(i));

			// For each Nmer
			for (int start = 0; start < max; start++) {
				long ref = SequenceReference.getReference(seqIdx, start);
				if (!hash.contains(nmer.getNmer(), ref)) throw new RuntimeException("ERROR: Cannot find reference:\n\tReference: " + ref + "\tsequence.id: " + seqIdx + "\tindex: " + start + "\n\tNmer: " + nmer + "(" + nmer.getNmer() + ")\n\tSequence: " + sequence.getSequence());

				nmer.rol(sequence.getBase(start + nmerSize));
				Gpr.showMarkStderr(k++, 10000);
			}
		}

		//---
		// Check that all reference have the right sequenceId:index
		//---
		//System.out.println("\nSanity check (Nmers -> Sequence): ");
		k = 1;
		for (long nmerLong : hash.keys()) {
			long[] bucket = hash.getBucket(nmerLong);
			if (bucket == null) {
				if (nmerLong != 0) throw new RuntimeException("ERROR: Nmer does not have any bucket!\n\tNmer: " + nmerLong);
			} else {
				int len = hash.getLatestBucketLength();

				// For each reference in this bucket...
				for (int i = 0; i < len; i++) {
					long ref = bucket[i];
					int seqIdx = SequenceReference.getSeqIdx(ref);
					int start = SequenceReference.getStart(ref);
					T sequence = get(seqIdx);

					// Compare nmer
					String nmerStr = sequence.getSequence().substring(start, start + nmer.length());
					nmer.setNmer(nmerLong);
					if (!nmer.toString().equalsIgnoreCase(nmerStr)) throw new RuntimeException("ERROR: Reference does not match Nmer:\n\tNnmer: " + nmerLong + "\tReference: " + ref + "\tsequence.id: " + seqIdx + "\tindex: " + start + "\n\tNmer: " + nmer + "\n\tSequence: " + sequence.getSequence());

					Gpr.showMarkStderr(k++, 10000);
				}
			}
		}
	}

	public void setOverlapFilter(OverlapFilter overlapFilter) {
		this.overlapFilter = overlapFilter;
	}

	@Override
	public String toString() {
		long tot = 0;
		int max = 0;
		StringBuilder sb = new StringBuilder();
		for (T seq : this) {
			tot += seq.length();
			max = Math.max(max, seq.length());
			// sb.append(seq.getSequence() + "\n");
		}

		if (sequences.size() > 0) sb.append("Max sequence length: " + max + "\tAvg sequence length: " + (tot / sequences.size()));
		sb.append("\tHash stats: " + hash.toString());
		return sb.toString();
	}

	public String toStringSequences() {
		StringBuilder sb = new StringBuilder();
		for (T seq : this)
			sb.append(seq.getSequence() + "\n");
		return sb.toString();
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy