All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.spark.mappers.PairwiseSequenceComparison Maven / Gradle / Ivy

package org.biojava.spark.mappers;

/**
 * Created by ap3 on 29/04/2016.
 */

import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.biojava.nbio.alignment.Alignments;
import org.biojava.nbio.alignment.SimpleGapPenalty;
import org.biojava.nbio.alignment.template.GapPenalty;
import org.biojava.nbio.alignment.template.PairwiseSequenceAligner;
import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
import org.biojava.nbio.core.alignment.template.SequencePair;
import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import scala.Tuple2;
import scala.Tuple4;
import scala.Tuple5;

import java.io.Serializable;
import java.util.List;


/**
 * Performs a pairwise alignment and returns a Tuple5.
 * These are the returned elements:
 * - name1
 * - name2
 * - overlap1
 * - overlap2
 * - percentage identical residues in the alignment
 */
public class PairwiseSequenceComparison implements Function,Tuple2 >, Tuple5> {

	/**
	 * The serial id for this version of the class.
	 */
	private static final long serialVersionUID = 8962410797026956531L;

	List> sequences;

	float minOverlap;
	float minPercid;

	public PairwiseSequenceComparison( List> sequences, float minOverlap, float minPercid){
		this.sequences = sequences;
		this.minOverlap = minOverlap;
		this.minPercid = minPercid;
	}
	private static final boolean debug = false;


	@Override
	public Tuple5 call(Tuple2,Tuple2> tuple) throws Exception {

		Tuple2 p1 = tuple._1();
		Tuple2 p2 = tuple._2();

		SubstitutionMatrix matrix = SubstitutionMatrixHelper.getBlosum65();
		GapPenalty penalty = new SimpleGapPenalty();
		penalty.setOpenPenalty(8);
		penalty.setExtensionPenalty(1);

		ProteinSequence prot1 = new ProteinSequence(p1._2());
		ProteinSequence prot2 = new ProteinSequence(p2._2());

		try {
			PairwiseSequenceAligner smithWaterman = Alignments.getPairwiseAligner(prot1,
					prot2,
					Alignments.PairwiseSequenceAlignerType.LOCAL, penalty, matrix);

			SequencePair alignment = smithWaterman.getPair();

			if ( debug )
				System.out.println(alignment.toString(60));



			int numIdenticals = alignment.getNumIdenticals();

			int aligLength = alignment.getLength();

			float percentIdenticals = (numIdenticals / (float) aligLength);


			// test overlaps

			int l1 = prot1.getLength();
			int l2 = prot2.getLength();

			int size = alignment.getLength();

			float overlap1 = l1 / (float) size;
			float overlap2 = l2 / (float) size;

			if ( debug )
				System.out.println(p1._1() + " " + p2._1() + " size:" + size + " l1: " + l1 + " l2: " + l2 + " overlap1 " + overlap1 + " overlap2 " + overlap2 + " %id: " + percentIdenticals);


			return new Tuple5(p1._1(),p2._1(),overlap1,overlap2,percentIdenticals);
		} catch (Exception e) {
			e.printStackTrace();
		}


		return new  Tuple5(p1._1(),p2._1(),0f,0f,0f);
	}


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy