net.maizegenetics.pangenome.processAssemblyGenomes.SyntenicAnchors Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.processAssemblyGenomes;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.log4j.Logger;
/**
* Class originally written by Baoxing Song for post-processing
* minimap2 alignments. It takes an input a file created by aligning with minimap2
* and this command:
*
* /home/lcj34/minimap2/minimap2 -A 1 -B 1 -O 1 -E 1 -a -t 80 /workdir/lcj34/assemblies_by_chrom/p39/p39chr7.fa /workdir/lcj34/assemblies_by_chrom/b73/b73chr7anchors.fa > b37anchorsMap.sam
*
* The resulting .sam file (b37anchorsMap.sam in above query) is split to get 3 fields via grep:
* - refStart
* - asmStart
* - ms (DP score of the max scoring segment in the minimap alignment)
*
* longest path algorithm from here:
* https://www.geeksforgeeks.org/find-longest-path-directed-acyclic-graph/
*
* The code currently has methods to run with mummer 4 coords file entries. The "score" from
* mummer alignments is (%id * alignment len)/100.
*
* Method getPairedSimilarFragmentsFromMummer() is written so this code may be applied to
* mummer4 alignment results. Similar methods may be added to create lists of PairedSimilarFragments
* from other aligners.
*
* @author Baoxing Song, lcj34
*
*/
public class SyntenicAnchors {
private static final Logger myLogger = Logger.getLogger(SyntenicAnchors.class);
// This method changes the input List to one sorted
// based on assembly/query start position.
public static void myPairedSimilarFragmentSort( List pairedSimilarFragments, int score, int penalty, int scoreThreshold ){
Collections.sort(pairedSimilarFragments); // initially sort by refStart position
int startIndex=0;
int endIndex=0;
int maxScore=0;
int currentScore=0;
for( int idx=0; idx pairedSimilarFragments.get(jdx).getAsmStartPos() ){
currentScore+=score*pairedSimilarFragments.get(jdx).getScore();
}else if( pairedSimilarFragments.get(jdx-1).getRefStartPos() == pairedSimilarFragments.get(jdx).getRefStartPos() ){ // tandem duplication
currentScore+=score*pairedSimilarFragments.get(jdx).getScore();
}else{
currentScore+=penalty*pairedSimilarFragments.get(jdx).getScore();
}
}
}else{
currentScore+=penalty*pairedSimilarFragments.get(jdx).getScore(); // penalty because are now forward strand
}
if (maxScore < currentScore) {
maxScore = currentScore;
endIndex=jdx; // keeps track of where to stop the reverse strand grouping
}
// If score is negative, stop the loop. This will happen as we find more
// forward vs reverse strands. If there was just 1 reverse strand followed
// by a forward strand, currentScore goes from 3 to -1 (3 plus -4)
if( currentScore<0 ){
break;
}
}
// if maxScore is larger than scoreThreshold, it means there were multiple reverse
// strand entries in this group, or a single reverse alignment of significant length.
// If we have several reverse alignments, we think it is real. If just 1, may be false
// alignment. If the maxScore is greater than the scoreThreshold, we treat it as real.
// Flip all elements in the range so assembly coordinates are in increasing order.
if( maxScore>scoreThreshold ){
maxScore = 0 ;
currentScore = 0;
// loop to find start index of elements we want to flip
for( int jdx=endIndex; jdx>=idx; --jdx ){
if( 1 == pairedSimilarFragments.get(jdx).getStrand() ) {
if( jdx>idx ){
// Verify the reverse alignments are in order to each other. If not,
// apply penalty. This doesn't prevent overlaps, which will be dealt with later.
if ( pairedSimilarFragments.get(jdx-1).getAsmStartPos() > pairedSimilarFragments.get(jdx).getAsmStartPos() ){
currentScore+=score*pairedSimilarFragments.get(jdx).getScore();
}else if( pairedSimilarFragments.get(jdx-1).getRefStartPos() == pairedSimilarFragments.get(jdx).getRefStartPos() ){ // tandem duplication
currentScore+=score*pairedSimilarFragments.get(jdx).getScore();
}else{
currentScore+=penalty*pairedSimilarFragments.get(jdx).getScore();
}
}else{
currentScore+=score*pairedSimilarFragments.get(jdx).getScore();
}
}else{
currentScore+=penalty*pairedSimilarFragments.get(jdx).getScore();
}
if (maxScore < currentScore) {
maxScore = currentScore;
startIndex=jdx;
}
if( currentScore<0 ){
break;
}
}
int length = (endIndex-startIndex+1)/2;
if(length > 0) {
// myLogger.info("maxScore: " + maxScore + " startIndex " + startIndex + " endIndex " + endIndex + " position: " + pairedSimilarFragments.get(startIndex).getRefStartPos());
for (int j = 0; j < length; ++j) {
PairedSimilarFragment temp = pairedSimilarFragments.get(startIndex + j);
pairedSimilarFragments.set(startIndex + j, pairedSimilarFragments.get(endIndex - j));
pairedSimilarFragments.set(endIndex - j, temp);
}
// Flip the elements in the list so the asm reverse alignments
// all have increasing asm values. (ie the last element of the reverse
// grouping is now the first, and the first is the last.
boolean thereAreReverseAlignments = true;
while (thereAreReverseAlignments) {
thereAreReverseAlignments = false;
for (int j = 1; j < length; ++j) {
// If a single reference position has multiple assembly alignments mapping to it,
// swap the order until the assembly positions are all increasing.
if (pairedSimilarFragments.get(startIndex + j - 1).getRefStartPos() == pairedSimilarFragments.get(startIndex + j).getRefStartPos() &&
pairedSimilarFragments.get(startIndex + j - 1).getAsmStartPos() > pairedSimilarFragments.get(startIndex + j).getAsmStartPos()) {
thereAreReverseAlignments = true;
PairedSimilarFragment temp = pairedSimilarFragments.get(startIndex + j);
pairedSimilarFragments.set(startIndex + j, pairedSimilarFragments.get(startIndex + j - 1));
pairedSimilarFragments.set(startIndex + j - 1, temp);
}
}
}
}
// Flip the elements in the list so the asm reverse ailgnments
// all have increasing asm values. (ie the last element of the reverse
// grouping is now the first, and the first is the last.
idx=endIndex;
}
maxScore=0;
currentScore=0;
}
}
}
/**
* Grab the ref and asm start/end positions, determine asm strand direction,
* calculate a score, and store all to a list of PairedSimilarFragments
*
* The score is calculated as: (%id * asmLen)/100 casted to int
*
* @param mummerAlignments List of String from a mummer coordinates file
*
* @return List created from mummer coordinates file
*/
public static List getPairedSimilarFragmentsFromMummer(List mummerAlignments) {
List psfArray = new ArrayList();
try {
// coords file: S1 E1 S2 E2 len1 len2 %id tag1 tag2
for (String line : mummerAlignments) {
int tabIndex1 = line.indexOf("\t");
int tabIndex2 = line.indexOf("\t",tabIndex1+1);
int tabIndex3 = line.indexOf("\t",tabIndex2+1);
int tabIndex4 = line.indexOf("\t",tabIndex3+1);
int tabIndex5 = line.indexOf("\t",tabIndex4+1);
int tabIndex6 = line.indexOf("\t",tabIndex5+1);
int tabIndex7 = line.indexOf("\t",tabIndex6+1);
int refStart = Integer.parseInt(line.substring(0,tabIndex1));
int refEnd = Integer.parseInt(line.substring(tabIndex1+1,tabIndex2));
int asmStart = Integer.parseInt(line.substring(tabIndex2+1,tabIndex3));
int asmEnd = Integer.parseInt(line.substring(tabIndex3+1,tabIndex4));
int asmLen = Integer.parseInt(line.substring(tabIndex5+1,tabIndex6));
double id = Double.parseDouble(line.substring(tabIndex6+1,tabIndex7));
double score = (asmLen * id)/100;
int strand = 0;
if (asmStart > asmEnd) {
strand = 1;
}
PairedSimilarFragment pairedSimilarFragment = new PairedSimilarFragment(refStart, refEnd, asmStart,asmEnd, score, strand);
psfArray.add(pairedSimilarFragment);
}
return psfArray;
} catch (Exception exc) {
myLogger.debug(exc.getMessage(),exc); // print stack trace
throw new IllegalArgumentException("AssemblyProcessingUtils:getPairedSimilarFragmentsFromMummer - error processing coords file: " + exc.getMessage());
}
}
public static List longestIncreasingSubsequenceLAGAN (List pairedSimilarFragments){
double maxSore = 0;
int bestEnd = 0;
double[] scoreArray = new double[pairedSimilarFragments.size()]; // arrays of scores
int[] prev = new int[pairedSimilarFragments.size()]; // index of previous node in longest path
scoreArray[0] = pairedSimilarFragments.get(0).getScore();
prev[0] = -1;
for (int idx = 1; idx < pairedSimilarFragments.size(); ++idx) {
scoreArray[idx] = pairedSimilarFragments.get(idx).getScore();
prev[idx] = -1;
for (int jdx = idx - 1; jdx >= 0; --jdx) // checking all previous nodes
// Because we swapped asm/query start position so that inversions were all increasing,
// we should always be on the diagonal. If not, then we filter it.
// This gets rid of the noise, while preserving the inversions on
// the diagonal
// Are only looking at positions previous to our current "idx" position
if (scoreArray[jdx] + pairedSimilarFragments.get(idx).getScore() > scoreArray[idx] &&
pairedSimilarFragments.get(jdx).getAsmStartPos() < pairedSimilarFragments.get(idx).getAsmStartPos()){
scoreArray[idx] = scoreArray[jdx] + pairedSimilarFragments.get(idx).getScore();
prev[idx] = jdx;
}
if (scoreArray[idx] > maxSore){
bestEnd = idx;
maxSore = scoreArray[idx];
}
}
List sortedPairedSimilarFragments = new ArrayList();
int idx=bestEnd; // bestEnd is where to stop the longest path
sortedPairedSimilarFragments.add(pairedSimilarFragments.get(idx));
int jdx = prev[idx]; // prev[] is index on the longest path
while( jdx>=0 ){
sortedPairedSimilarFragments.add(pairedSimilarFragments.get(jdx));
jdx=prev[jdx];
}
// Reversing the order
Collections.reverse(sortedPairedSimilarFragments);
return sortedPairedSimilarFragments;
}
}