All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.processAssemblyGenomes.SyntenicAnchors Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.processAssemblyGenomes;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.log4j.Logger;

/**
 * Class originally written by Baoxing Song for post-processing
 * minimap2 alignments.  It takes an input a file created by aligning with minimap2
 * and this command:
 * 
 * /home/lcj34/minimap2/minimap2 -A 1 -B 1 -O 1 -E 1 -a -t 80 /workdir/lcj34/assemblies_by_chrom/p39/p39chr7.fa /workdir/lcj34/assemblies_by_chrom/b73/b73chr7anchors.fa > b37anchorsMap.sam
 * 
 * The resulting .sam file (b37anchorsMap.sam in above query) is split to get 3 fields via grep:
 *  - refStart
 *  - asmStart
 *  - ms (DP score of the max scoring segment in the minimap alignment)
 * 
 *  longest path algorithm from here:
 *    https://www.geeksforgeeks.org/find-longest-path-directed-acyclic-graph/
 *    
 *  The code currently has methods to run with mummer 4 coords file entries.  The "score" from
 *  mummer alignments is (%id * alignment len)/100.
 *  
 *  Method getPairedSimilarFragmentsFromMummer() is written so this code may be applied to
 *  mummer4 alignment results.  Similar methods may be added to create lists of PairedSimilarFragments
 *  from other aligners.
 *    
 * @author  Baoxing Song, lcj34
 *
 */

public class SyntenicAnchors {
    private static final Logger myLogger = Logger.getLogger(SyntenicAnchors.class);   

    // This method changes the input List to one sorted
    // based on assembly/query start position.
    public static void myPairedSimilarFragmentSort( List pairedSimilarFragments, int score, int penalty, int scoreThreshold ){
        Collections.sort(pairedSimilarFragments); // initially sort by refStart position

        int startIndex=0;
        int endIndex=0;
        int maxScore=0;
        int currentScore=0;
        for( int idx=0;  idx pairedSimilarFragments.get(jdx).getAsmStartPos() ){
                                currentScore+=score*pairedSimilarFragments.get(jdx).getScore();
                            }else if( pairedSimilarFragments.get(jdx-1).getRefStartPos() == pairedSimilarFragments.get(jdx).getRefStartPos() ){ // tandem duplication
                                currentScore+=score*pairedSimilarFragments.get(jdx).getScore();
                            }else{
                                currentScore+=penalty*pairedSimilarFragments.get(jdx).getScore();
                            }
                        }

                    }else{
                        currentScore+=penalty*pairedSimilarFragments.get(jdx).getScore(); // penalty because are now forward strand
                    }
                    if (maxScore < currentScore) {
                        maxScore = currentScore;
                        endIndex=jdx; // keeps track of where to stop the reverse strand grouping
                    }
                    // If score is negative, stop the loop.  This will happen as we find more
                    // forward vs reverse strands.  If there was just 1 reverse strand followed 
                    // by a forward strand, currentScore goes from 3 to -1 (3 plus -4)
                    if( currentScore<0 ){ 
                        break;
                    }
                }
                // if maxScore is larger than scoreThreshold, it means there were multiple reverse
                // strand entries in this group, or a single reverse alignment of significant length.
                // If we have several reverse alignments, we think it is real.  If just 1, may be false 
                // alignment.  If the maxScore is greater than the scoreThreshold, we treat it as real.
                // Flip all elements in the range so assembly coordinates are in increasing order.
                if( maxScore>scoreThreshold ){
                    maxScore = 0 ;
                    currentScore = 0;
                    // loop to find start index of elements we want to flip
                    for( int jdx=endIndex; jdx>=idx; --jdx ){
                        if( 1 == pairedSimilarFragments.get(jdx).getStrand() ) {
                            if( jdx>idx ){
                                // Verify the reverse alignments are in order to each other.  If not,
                                // apply penalty.  This doesn't prevent overlaps, which will be dealt with later.
                                if ( pairedSimilarFragments.get(jdx-1).getAsmStartPos() > pairedSimilarFragments.get(jdx).getAsmStartPos() ){
                                    currentScore+=score*pairedSimilarFragments.get(jdx).getScore();
                                }else if( pairedSimilarFragments.get(jdx-1).getRefStartPos() == pairedSimilarFragments.get(jdx).getRefStartPos() ){ // tandem duplication
                                    currentScore+=score*pairedSimilarFragments.get(jdx).getScore();
                                }else{
                                    currentScore+=penalty*pairedSimilarFragments.get(jdx).getScore();
                                }
                            }else{
                                currentScore+=score*pairedSimilarFragments.get(jdx).getScore();
                            }
                        }else{
                            currentScore+=penalty*pairedSimilarFragments.get(jdx).getScore();
                        }
                        if (maxScore < currentScore) {
                            maxScore = currentScore;
                            startIndex=jdx;
                        }
                        if( currentScore<0 ){ 
                            break;
                        }
                    }

                    int length = (endIndex-startIndex+1)/2;
                    if(length > 0) {
//                        myLogger.info("maxScore: " + maxScore + " startIndex " + startIndex  + " endIndex " + endIndex + " position: " + pairedSimilarFragments.get(startIndex).getRefStartPos());
                        for (int j = 0; j < length; ++j) {
                            PairedSimilarFragment temp = pairedSimilarFragments.get(startIndex + j);
                            pairedSimilarFragments.set(startIndex + j, pairedSimilarFragments.get(endIndex - j));
                            pairedSimilarFragments.set(endIndex - j, temp);
                        }
                       // Flip the elements in the list so the asm reverse alignments
                        // all have increasing asm values. (ie the last element of the reverse
                        // grouping is now the first, and the first is the last.
                        boolean thereAreReverseAlignments = true;
                        while (thereAreReverseAlignments) {
                            thereAreReverseAlignments = false;
                            for (int j = 1; j < length; ++j) {
                                // If a single reference position has multiple assembly alignments mapping to it,
                                // swap the order until the assembly positions are all increasing.
                                if (pairedSimilarFragments.get(startIndex + j - 1).getRefStartPos() == pairedSimilarFragments.get(startIndex + j).getRefStartPos() &&
                                        pairedSimilarFragments.get(startIndex + j - 1).getAsmStartPos() > pairedSimilarFragments.get(startIndex + j).getAsmStartPos()) {
                                    thereAreReverseAlignments = true;
                                    PairedSimilarFragment temp = pairedSimilarFragments.get(startIndex + j);
                                    pairedSimilarFragments.set(startIndex + j, pairedSimilarFragments.get(startIndex + j - 1));
                                    pairedSimilarFragments.set(startIndex + j - 1, temp);
                                }
                            }
                        }
                    }
                    // Flip the elements in the list so the asm reverse ailgnments
                    // all have increasing asm values. (ie the last element of the reverse
                    // grouping is now the first, and the first is the last.

                    idx=endIndex;
                }
                maxScore=0;
                currentScore=0;
            }
        }
    }


    /**
     * Grab the ref and asm start/end positions, determine asm strand direction,
     * calculate a score, and store all to a list of PairedSimilarFragments
     * 
     * The score is calculated as: (%id * asmLen)/100 casted to int
     * 
     * @param mummerAlignments List of String from a mummer coordinates file
     * 
     * @return List created from mummer coordinates file
     */
    public static List getPairedSimilarFragmentsFromMummer(List mummerAlignments) {
        List psfArray = new ArrayList();
        
        try {
            //  coords file: S1 E1 S2 E2 len1 len2 %id tag1 tag2
            for (String line : mummerAlignments) {
            
                int tabIndex1 = line.indexOf("\t");
                int tabIndex2 = line.indexOf("\t",tabIndex1+1);
                int tabIndex3 = line.indexOf("\t",tabIndex2+1);
                int tabIndex4 = line.indexOf("\t",tabIndex3+1);
                int tabIndex5 = line.indexOf("\t",tabIndex4+1);
                int tabIndex6 = line.indexOf("\t",tabIndex5+1);
                int tabIndex7 = line.indexOf("\t",tabIndex6+1);
                
                int refStart = Integer.parseInt(line.substring(0,tabIndex1));
                int refEnd = Integer.parseInt(line.substring(tabIndex1+1,tabIndex2));
                int asmStart = Integer.parseInt(line.substring(tabIndex2+1,tabIndex3));
                int asmEnd = Integer.parseInt(line.substring(tabIndex3+1,tabIndex4));
                
                int asmLen = Integer.parseInt(line.substring(tabIndex5+1,tabIndex6));
                double id = Double.parseDouble(line.substring(tabIndex6+1,tabIndex7));
                
                double score = (asmLen * id)/100;

                int strand = 0;
                if (asmStart > asmEnd) {
                    strand = 1;
                }
                PairedSimilarFragment pairedSimilarFragment = new PairedSimilarFragment(refStart, refEnd, asmStart,asmEnd, score, strand);
                psfArray.add(pairedSimilarFragment);
            }
            return psfArray;
        } catch (Exception exc) {
            myLogger.debug(exc.getMessage(),exc); // print stack trace
            throw new IllegalArgumentException("AssemblyProcessingUtils:getPairedSimilarFragmentsFromMummer - error processing coords file: " + exc.getMessage());
        }
        
    }

    public static  List longestIncreasingSubsequenceLAGAN (List pairedSimilarFragments){
        double maxSore = 0;
        int bestEnd = 0;
        double[] scoreArray = new double[pairedSimilarFragments.size()]; // arrays of scores
        int[] prev = new int[pairedSimilarFragments.size()];  // index of previous node in longest path
        scoreArray[0] = pairedSimilarFragments.get(0).getScore();
        prev[0] = -1;
        for (int idx = 1; idx < pairedSimilarFragments.size(); ++idx) {
            scoreArray[idx] = pairedSimilarFragments.get(idx).getScore();
            prev[idx] = -1;
            for (int jdx = idx - 1; jdx >= 0; --jdx) // checking all previous nodes
                // Because we swapped asm/query start position so that inversions were all increasing,
                // we should always be on the diagonal.  If not, then we filter it.
                // This gets rid of the noise, while preserving the inversions on
                // the diagonal
                // Are only looking at positions previous to our current "idx" position
                if (scoreArray[jdx] + pairedSimilarFragments.get(idx).getScore() > scoreArray[idx] && 
                    pairedSimilarFragments.get(jdx).getAsmStartPos() < pairedSimilarFragments.get(idx).getAsmStartPos()){
                    scoreArray[idx] = scoreArray[jdx] + pairedSimilarFragments.get(idx).getScore();
                    prev[idx] = jdx;
                }
            if (scoreArray[idx] > maxSore){
                bestEnd = idx;
                maxSore = scoreArray[idx];
            }
        }
        List sortedPairedSimilarFragments = new ArrayList();
        int idx=bestEnd; // bestEnd is where to stop the longest path
        sortedPairedSimilarFragments.add(pairedSimilarFragments.get(idx));
        int jdx = prev[idx]; // prev[] is index on the longest path
        while( jdx>=0 ){
            sortedPairedSimilarFragments.add(pairedSimilarFragments.get(jdx));
            jdx=prev[jdx];
        }

        // Reversing the order
        Collections.reverse(sortedPairedSimilarFragments);
        
        return sortedPairedSimilarFragments;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy