All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.alignment.KMerSimilarity Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.alignment;

import java.util.ArrayList;
import java.util.List;

import com.hfg.bio.phylogeny.DistanceMatrix;
import com.hfg.bio.seq.BioSequence;

//------------------------------------------------------------------------------
/**
 * A basic similarity assessment that uses k-mer analysis.
 *
 * @author J. Alex Taylor, hairyfatguy.com
 */
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------

public class KMerSimilarity
{

   //---------------------------------------------------------------------------
   /**
    * Builds a distance matrix for the specified sequences based on how many
    * k-mers each pair of sequences has in common.
    * <p>
    * For each pair, the similarity is the common k-mer count reported by
    * {@link KMerBitSet#getCommonKMerCount} divided by the number of k-mer
    * windows in the shorter sequence (<code>min(length1, length2) - k + 1</code>).
    * The value stored in the matrix is <code>1 - similarity</code>.
    * </p>
    * <p>
    * Each unordered pair of sequences is evaluated once and no self-distances
    * are stored. If the shorter sequence of a pair has no complete k-mer window
    * (its length is less than the k-mer size), the pair is given a similarity
    * of 0 (distance 1) instead of dividing by a non-positive window count.
    * </p>
    *
    * @param inSequences the sequences to compare. Must not be null.
    * @param inKMerSize  the k-mer length (k) to use for the comparison
    * @return a distance matrix keyed by sequence ID containing one entry per
    *         pair of input sequences; empty if fewer than two sequences are given
    */
   public DistanceMatrix generateKMerDistanceMatrix(List<? extends BioSequence> inSequences, int inKMerSize)
   {
      // Index the k-mer content of every sequence once, up front.
      List<KMerBitSet> kMerData = new ArrayList<>(inSequences.size());
      for (BioSequence sequence : inSequences)
      {
         KMerBitSet kMerBitSet = new KMerBitSet(sequence.getType(), inKMerSize);
         kMerBitSet.fill(sequence);

         kMerData.add(kMerBitSet);
      }

      // Calculate the k-mer "distance" between ea. sequence pair
      DistanceMatrix distanceMatrix = new DistanceMatrix();

      for (int i = 0; i < kMerData.size() - 1; i++)
      {
         BioSequence seq1 = inSequences.get(i);
         KMerBitSet seq1KMerData = kMerData.get(i);

         // Start at i + 1 so that each pair is evaluated only once and a sequence
         // is never compared with itself.
         // NOTE(review): assumes DistanceMatrix.setDistance() records the value for
         // both orderings of the ID pair - confirm against DistanceMatrix.
         for (int j = i + 1; j < kMerData.size(); j++)
         {
            BioSequence seq2 = inSequences.get(j);
            KMerBitSet seq2KMerData = kMerData.get(j);

            float sum = seq1KMerData.getCommonKMerCount(seq2KMerData);

            // Number of k-mer windows in the shorter of the two sequences.
            float maxKMers = Math.min(seq1.length(), seq2.length()) - inKMerSize + 1;

            // A sequence shorter than the k-mer size has no windows; treat the pair
            // as sharing nothing rather than producing NaN / Infinity.
            float similarity = (maxKMers > 0 ? sum / maxKMers : 0f);

            distanceMatrix.setDistance(seq1.getID(), seq2.getID(), 1 - similarity);
         }
      }


      return distanceMatrix;
   }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy