All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.processAssemblyGenomes.MummerScriptProcessing Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
/**
 * 
 */
package net.maizegenetics.pangenome.processAssemblyGenomes;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.log4j.Logger;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.SetMultimap;

import net.maizegenetics.dna.map.Position;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

/**
 * This class contains methods that run mummer4 scripts, e.g. nucmer, delta-filter, show-coords and show-snps.
 * In addition, there are methods that process the output from these scripts. 
 * 
 * NOTE:  processing here is relative to the needs of AssemblyHaplotypesPlugin.
 * @author lcj34
 *
 */
public class MummerScriptProcessing {
    
    private static final Logger myLogger = Logger.getLogger(MummerScriptProcessing.class);

    private enum DirectionCompare  {
       BOTH_ASCENDING(0), 
       PREVASC_CURDESC(1), 
       PREVDESC_CURASC(2),
       BOTH_DESCENDING(3);
       int value;
       private DirectionCompare(int typeValue) {
           value = typeValue;
       }
   }
    /**
     * Call mummer nucmer program to align the sequences
     * Parameters are:
     *   -c 250:  Set the minimum cluster length to 250
     *   --mum: Use anchor matches that are unique in both the reference and query 
     * 
     * @param refFasta
     * @param asmFasta
     * @param outputDeltaFilePrefix
     * @param outputDir
     */
    public static void alignWithNucmer(String refFasta, String asmFasta,  String outputDeltaFilePrefix, String outputDir, 
            String mummer4Path, int clusterSize) {
        try {      
            // -c:  Sets the minimum length of a cluster of matches
            // --mum: Use anchor matches that are unique in both the reference and query
            Long time = System.nanoTime();

            // nucmer doesn't use threading when there is just 1 sequence in the "query" (assembly) file 
                       
            // nucmer writes where ever the -p prefix tells it.  Redirecting the output appears to have
            // no effect.  So the deltaFilePrefix must include the output directory.  Otherwise, the delta
            // file is written to the directory in which the program is executed.
            String nucmer = mummer4Path + "/nucmer";
            String clusterParam = Integer.toString(clusterSize);
            ProcessBuilder builder = new ProcessBuilder(
                    nucmer, "-p", outputDeltaFilePrefix, "-c", clusterParam, "--mum", refFasta, asmFasta);
 
            String redirectError = outputDir + "errNucmer.log";
            builder.redirectError(new File(redirectError));
            Process process;
            // there is no output redirection - nucmer will always create delta file using prefix to determine directory
            String runCommand = builder.command().stream().collect(Collectors.joining(" "));
            myLogger.info(runCommand);
            process = builder.start();
            int error = process.waitFor();
            if (error != 0) {
                myLogger.error("Error creating delta file from  nucmer: Error: " + error);
                throw new IllegalStateException("Error creating delta File from nucmer ");
            }  
            myLogger.info("Finished Mummer4 alignWithNucmer in " + (System.nanoTime()-time)/1e9 + " seconds");
            
        } catch (Exception exc) {
            exc.printStackTrace();
            throw new IllegalStateException("Error running alignWithNucmer with ref: " + refFasta + " and assembly: " + asmFasta + " error: " + exc.getMessage());
        }
    }
    
    /**
     * Call mummer4 delta-filter method with parameter:
     *   -g 1-to-1 global alignment not allowing rearrangements
     *   
     * NOTE:  the -g option filters out many alignments, including inversions. Some of these
     * will be added back when the "refilterCoords" method is run later.
     * @param deltaFilePrefix
     * @param outputDir
     */
    public static void runDeltaFilter(String deltaFilePrefix, String outputDir, String mummer4Path) {
        try {  
            // Possible delta-filter options:  -g determined to fit our needs
            // -g:  1-to-1 global alignment not allowing rearrangements, vs
            // -1:  1-to-1 alignment allowing for rearrangements (intersection of -r and -q alignments)
           
            Long time = System.nanoTime();
                       
            String deltaFilter = mummer4Path + "/delta-filter";
            String deltaFile = deltaFilePrefix + ".delta";
            ProcessBuilder builder = new ProcessBuilder(
                    deltaFilter, "-1", deltaFile);
            String redirectError = outputDir + "errDeltaFilter.log";
            builder.redirectError(new File(redirectError));
            Process process;
            String outputFiltered = deltaFilePrefix + ".delta_filtered";
            File outputFile = new File(outputFiltered);
            builder.redirectOutput(outputFile);
            String runCommand = builder.command().stream().collect(Collectors.joining(" "));
            myLogger.info(runCommand);
            process = builder.start();
            int error = process.waitFor();
            if (error != 0) {
                myLogger.error("Error creating delta filtered file from delta file " + deltaFilePrefix + ".delta Error: " + error);
                throw new IllegalStateException("Error creating delta filtered File via show-coords ");
            }  
            myLogger.info("Finished runDeltaFiltered in " + (System.nanoTime()-time)/1e9 + " seconds");
            
        } catch (Exception exc) {
            exc.printStackTrace();
            throw new IllegalStateException("Error running runShowCoords against delta file: " + deltaFilePrefix + ".delta, error: " + exc.getMessage());
        }
    }

    /**
     * call mummer4 show-coords method
     * @param deltaFilePrefix
     * @param outputDir
     */
    public static void runShowCoords(String deltaFilePrefix, String outputDir, String mummer4Path) {
        // Run show-coords both against the delta, and the delta_filtered file.
        try {      
            // -T:  print results to tab-delimited file
            // -H:  do not print header information
            // -r:  Sort output lines by reference IDs and coordinates            
            Long time = System.nanoTime();
                   
            myLogger.info("Run show-coords on original delta file");
            String deltaFile = deltaFilePrefix + ".delta";
            String showCoords = mummer4Path + "/show-coords";
            ProcessBuilder builder = new ProcessBuilder(
                    showCoords, "-T", "-r", "-H", deltaFile); 
            String redirectError = outputDir + "errCoords_orig.log";
            builder.redirectError(new File(redirectError));
            Process process;
            String outputCoordsOrig = deltaFilePrefix + ".coords_orig";
            File outputFile = new File(outputCoordsOrig);
            builder.redirectOutput(outputFile);
            String runCommand = builder.command().stream().collect(Collectors.joining(" "));
            myLogger.info(runCommand);
            process = builder.start();
            int error = process.waitFor();
            if (error != 0) {
                myLogger.error("Error creating coords file from original delta file " + deltaFilePrefix + ".delta Error: " + error);
                throw new IllegalStateException("Error creating coords File via show-coords ");
            }  
            
            deltaFile = deltaFilePrefix + ".delta_filtered";
            builder = new ProcessBuilder(
                    showCoords, "-T", "-r", "-H", deltaFile);
            redirectError = outputDir + "errCoords_filtered.log";
            builder.redirectError(new File(redirectError));
            String outputCoordsFiltered = deltaFilePrefix + ".coords_filtered";
            outputFile = new File(outputCoordsFiltered);
            builder.redirectOutput(outputFile);
            runCommand = builder.command().stream().collect(Collectors.joining(" "));
            myLogger.info(runCommand);
            process = builder.start();
            error = process.waitFor();
            if (error != 0) {
                myLogger.error("Error creating coords file from filtered delta file " + deltaFilePrefix + ".delta Error: " + error);
                throw new IllegalStateException("Error creating coords File via show-coords ");
            }  
            
            myLogger.info("Finished runShowCoords in " + (System.nanoTime()-time)/1e9 + " seconds");
            
        } catch (Exception exc) {
            exc.printStackTrace();
            throw new IllegalStateException("Error running runShowCoords against delta file: " + deltaFilePrefix + ".delta, error: " + exc.getMessage());
        }
    }
       
    /**
     * This method post-processes the filtered and original coords file when the mummer coords 
     * file came from a delta filtered with the -G option.  
     * It will
     *   1.  create a list of entries to add back based on groups of ascending/descending
     *       entries of at least 3 adjacent alignments whose distance from each other is
     *       less than a specified amount
     *   2.  adds the entries above (in a sorted manner) to the filtered coords list
     *   
     * @param coordsDelta
     * @param coordsDeltaG
     * @param coordsGNoEmbedded
     */
    public static void refilterCoordsFileMinusG(String outputDeltaFilePrefix, String coordsDelta, String coordsDeltaG,
                                              String coordsGNoEmbedded, String chrom) {
        List deltaLines = new ArrayList();
        List gLines = new ArrayList();
        List sortedRefCoordsList = new ArrayList();
        
        try (BufferedReader deltaBR = Utils.getBufferedReader(coordsDelta);
             BufferedReader deltaGbr = Utils.getBufferedReader(coordsDeltaG);
             BufferedWriter bw1 = Utils.getBufferedWriter(coordsGNoEmbedded)){            
            
            String line = null;
            while ((line = deltaBR.readLine()) != null) {
                deltaLines.add(line);
            }            

            while ((line = deltaGbr.readLine()) != null) {                
                gLines.add(line); // add to list that gets merged
            }          
            
            List removedLines = new ArrayList();
            for (String entry : deltaLines) {
                if (!gLines.contains(entry) ) {
                    removedLines.add(entry);
                }
            }
            
            List returnedLines = new ArrayList ();           
            if (removedLines.size() > 0) {
                returnedLines = findAlignmentsToReturnFromAfterMinusGFilter(removedLines, gLines);
            }
            myLogger.info("refilterCoordsFile: number returned from findAlignmentsToReturn " + returnedLines.size());
            myLogger.info("refilterCoordsFile: after findAlignmentsToReturn, now create sortedRefCoordsList");
            
            
            // This processing keeps the ref ordering (vs appending all to the end)
            // It is more code than using "contains" but runs in less than a 10th of the time.  
            Map,String> refStartToEntryMap = new HashMap<>();
            for (String entry : gLines) {
                int firstTab = entry.indexOf("\t");
                int refStart = Integer.parseInt(entry.substring(0, firstTab));
                String entryMinusRefStart = entry.substring(firstTab+1);
                int refEnd = Integer.parseInt(entryMinusRefStart.substring(0,entryMinusRefStart.indexOf("\t")));
                refStartToEntryMap.put(new Tuple(refStart,refEnd), entry);
            }
            for (String entry : returnedLines) {
                int firstTab = entry.indexOf("\t");
                int refStart = Integer.parseInt(entry.substring(0, firstTab));
                String entryMinusRefStart = entry.substring(firstTab+1);
                int refEnd = Integer.parseInt(entryMinusRefStart.substring(0,entryMinusRefStart.indexOf("\t")));
                refStartToEntryMap.put(new Tuple(refStart,refEnd), entry);
            }
            List> refStartList = new ArrayList>(refStartToEntryMap.keySet());
            Collections.sort(refStartList);
            
            refStartList.stream().forEach(item -> {
                sortedRefCoordsList.add(refStartToEntryMap.get(item));
            });
            
            // remove the embedded lines. 
            myLogger.info("refilterCoordsFile: call checkForEmbedded, orig list size: " + sortedRefCoordsList.size());
            List coordsNoRefEmbedded = checkForEmbedded(sortedRefCoordsList, true); // remove ref embedded entries
            List coordsNoRefOrAssemblyEmbedded = checkForEmbedded(coordsNoRefEmbedded, false); // remove assembly embedded entries
            for (String entry : coordsNoRefOrAssemblyEmbedded) {
                // Mummer:show-snps will use the original delta file (not filtered) and the
                // coords file created here to call SNPs.  For show-snps we want embedded entries removed, but
                // the coordinates of remaining entries cannot change or show-snps barfs.
                bw1.write(entry);
                bw1.write("\n");
            }
            myLogger.info("refilterCoordsFile: finshed removing embedded,  list size= " + coordsNoRefOrAssemblyEmbedded.size());            
  
        } catch (Exception exc) { 
            myLogger.debug(exc.getMessage(), exc);
            throw new IllegalStateException("refilterCoordsFile failed with error: " + exc.getMessage());
        }
        
        return ;
    }

    /**
     * Takes a mummer delta file filtered via the -1 option, determines which entries to keep.
     * From the remaining, it removes embedded entries.
     *
     * @param outputDeltaFilePrefix
     * @param coordsDelta
     * @param coordsDelta1
     * @param coordsGNoEmbedded
     * @param chrom
     */
    public static void refilterCoordsFile( String outputDeltaFilePrefix, String coordsDelta, String coordsDelta1,
                                           String coordsGNoEmbedded, String chrom, int scoreThreshold) {
        List deltaLines = new ArrayList();
        List gLines = new ArrayList();

        Map, String> refToEntryMap = new HashMap<>();
        try (BufferedReader deltaBR = Utils.getBufferedReader(coordsDelta1);
             BufferedWriter bw1 = Utils.getBufferedWriter(coordsGNoEmbedded)){

            String line = null;
            while ((line = deltaBR.readLine()) != null) {
                deltaLines.add(line);
                int tabIndex1 = line.indexOf("\t");
                int tabIndex2 = line.indexOf("\t",tabIndex1+1);
                int refStart = Integer.parseInt(line.substring(0,tabIndex1));
                int refEnd = Integer.parseInt(line.substring(tabIndex1+1,tabIndex2));
                refToEntryMap.put(new Tuple(refStart,refEnd), line);
            }

            long time = System.nanoTime();
            List psfList = SyntenicAnchors.getPairedSimilarFragmentsFromMummer(deltaLines);
            myLogger.info("After getPairedSimilarFragmentsFromMummer - size of list = " + psfList.size());

            int score = 3; 
            int penalty = -4; 
            SyntenicAnchors.myPairedSimilarFragmentSort( psfList, score, penalty, scoreThreshold );
            
            List psfLongestPathList = SyntenicAnchors.longestIncreasingSubsequenceLAGAN(psfList);

            // longestIncreasingSubsequenceLAGAN resorts entries based on ascending assembly positions.
            // Sort again based on reference position.  Assumptions in overlap processing are that
            // the list is sorted by ascending reference start.
            Collections.sort(psfLongestPathList); 
            
            // Create list of coords entries to keep from the longestPath entries
            List keptLines = new ArrayList();
            for (PairedSimilarFragment psf : psfLongestPathList) {
                String entry = refToEntryMap.get(new Tuple(psf.getRefStartPos(),psf.getRefEndPos()));
                if (entry != null) {
                    keptLines.add(entry);
                }
            }

            myLogger.info("refilterCoordsFile: number KEPT from processing SyntenicAnchors " + keptLines.size() + ", time to process: " + (System.nanoTime() - time)/1e9);

            // remove the embedded lines.
            List coordsNoRefEmbedded = checkForEmbedded(keptLines, true); // remove ref embedded entries
            List coordsNoRefOrAssemblyEmbedded = checkForEmbedded(coordsNoRefEmbedded, false); // remove assembly embedded entries
            for (String entry : coordsNoRefOrAssemblyEmbedded) {
                // Mummer:show-snps will use the original delta file (not filtered) and the
                // coords file created here to call SNPs.  For show-snps we want embedded entries removed, but
                // the coordinates of remaining entries cannot change or show-snps barfs.
                bw1.write(entry);
                bw1.write("\n");
            }
            myLogger.info("refilterCoordsFile: finshed removing embedded,  list size= " + coordsNoRefOrAssemblyEmbedded.size());

        } catch (Exception exc) {
            myLogger.debug(exc.getMessage(), exc);
            throw new IllegalStateException("refilterCoordsFile failed with error: " + exc.getMessage());
        }

    }
    /**
     * Takes a mummer coords file and searches for overlaps.  All of the overlap goes to the
     * first entry.  If splitting in this manner results in a split in the middle or an assembly
     * deletion, then split is adjusted so the deletion is contained in the first entry of the
     * overlapped pair.  The snpFile is used to determine indel positions.
     * 
     * For now, assembly insertions may be split.  This has not been a problem for variant context 
     * processing.
     * 
     * @param coordsNoEmbedded
     * @param snpFile  SnpFIle used to determine assembly indels
     * @param coordsFinal
     */
    public static void filterCoordsOverlaps(String coordsNoEmbedded, String snpFile, String coordsFinal) {      
        
        try (BufferedWriter bw = Utils.getBufferedWriter(coordsFinal);
             BufferedReader coordsBR = Utils.getBufferedReader(coordsNoEmbedded);
             BufferedReader snpBR = Utils.getBufferedReader(snpFile)) {
            List coordsNoRefOrAssemblyEmbedded = new ArrayList();
            String line;
            myLogger.info("filterCoordsOverlaps: Reading coordinates file " + coordsNoEmbedded);
            while ((line = coordsBR.readLine()) != null) {
                coordsNoRefOrAssemblyEmbedded.add(line);
            }
            
            myLogger.info("filterCoordsOverlaps: Reading snp file " + snpFile);
            List snpList = new ArrayList();
            while ((line = snpBR.readLine()) != null) {
                snpList.add(line);
            }
            
            myLogger.info("filterCoordsOverlaps: calling splitOverlappingCoordsEntries for ref");
            List overlapRefSplitList = splitOverlappingCoordsEntries(coordsNoRefOrAssemblyEmbedded, snpList,true).getX(); // process ref overlaps

            // Assemblies can have overlapping start positions, and start positions
            // out of order.  It takes recursively going through to adjust entries to remove
            // all overlaps, as in some cases we pitch previous entries.  This is generally
            // only repeated once.
            boolean rerun = true;
            int rerunCount = 0;
            List overlapSplitList = new ArrayList();
            overlapSplitList.addAll(overlapRefSplitList);
            while (rerun) {
                rerunCount++;
                myLogger.info("filterCoordsOverlaps: calling splitOverlappingCoordsEntries for assembly, count=" + rerunCount);
                Tuple,Boolean> overlapAllSplitListTemp = splitOverlappingCoordsEntries(overlapSplitList, snpList, false);
                rerun = overlapAllSplitListTemp.getY();
                overlapSplitList.clear();
                overlapSplitList.addAll(overlapAllSplitListTemp.getX()); // save new list
            }
            
            // write lines from final filtered file to *.coords_final file
            for (String entry : overlapSplitList) {
                // This file is used in AssemblyProcessingUtils.parseCoordinateRegions
                bw.write(entry);
                bw.write("\n");
            }
        } catch (Exception exc) {
            throw new IllegalStateException("MummerScriptProcessing:filterOverlaps: error reading or writing file: " + exc.getMessage());
        }

    }
    /**
     * Splits overlapping entries, 
     * The mummer4 coords file entries will have these tab-delimited columns:
     *    S1 E1 S2 E2 Len1 Len2 %ID refID asmID
     * The files processed were sorted by ref-coordinates via the show-coords -r param, so S1/E1 is ref coords
     * and S2/E2 are the assembly coordinates.
     * 
     * When 2 entries are found to overlap, the first entry keeps its coordinates.  The second will be truncated
     * by the amount of the overlap.  It is understood this may not be completely accurate as the position of
     * indels is not considered.
     * 
     * @param sortedList File to be filtered
     * @return
     */
    public static Tuple,Boolean> splitOverlappingCoordsEntries(List sortedList, List snpList, boolean splitByRef) {
        
        System.out.println("splitOverlappingCoordsEntries: begin, size of sortedList " + sortedList.size());
        String prevEntry = sortedList.get(0);
        String currentEntry = sortedList.get(0);
        Tuple prevStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(prevEntry,splitByRef);
               
        boolean rerun = false;
        // Create map of SNP entries for quicker processing
        Multimap snpByRefMap =   HashMultimap.create();
        Multimap snpByAsmMap = HashMultimap.create();
        try {
            for (String entry : snpList) {
 
                int tabIndex1 = entry.indexOf("\t");
                int tabIndex2 = entry.indexOf("\t",tabIndex1+1);
                int tabIndex3 = entry.indexOf("\t",tabIndex2+1);
                int tabIndex4 = entry.indexOf("\t",tabIndex3+1);
                int refPos = Integer.parseInt(entry.substring(0,tabIndex1));
                int asmPos = Integer.parseInt(entry.substring(tabIndex3+1, tabIndex4));
 
                snpByRefMap.put(refPos, entry);
                snpByAsmMap.put(asmPos, entry);
            }
        } catch (Exception exc) {
            throw new IllegalStateException("splitOverlappingCoordsEntries: error processsing SNP file " + exc.getMessage());
        }
 
        List splitOverlapList = new ArrayList();
        // find overlapping ref alignments
        int splitCount = 0;
        int overlapCount = 0;
        for (int idx=1; idx < sortedList.size(); idx++) {
            currentEntry = sortedList.get(idx);

            Tuple curStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(currentEntry,splitByRef); // true = ref

            // overlap entry contains 
            // The sortedList is sorted by ref, so 1st always begins before second
            // If second is embedded in first, Tuple.y is true - in which
            // case we ditch the embedded entry, and entry previous remains as is.
            // otherwise, alter both previous and current entries
            Tuple  overlapData = checkForOverlap(prevStartEnd,curStartEnd);

            boolean isEmbedded = overlapData.getY();
            boolean isReverseEmbedded = false;
            if (overlapData.x > 0) { // if number of overlaps > 0, process
                overlapCount++;

                if (isEmbedded == false) { // 2nd not embedded in first
                    // check if first is not embedded in the second
                    Tuple revPrevStartEnd = curStartEnd;
                    Tuple revCurStartEnd = prevStartEnd;
                    Tuple overlapDataReverse = checkForOverlap(revPrevStartEnd,revCurStartEnd);
                    isReverseEmbedded = overlapDataReverse.getY();

                    if (isReverseEmbedded == false) { // neither entry embedded in the other, but there is an overlap
                        // prev entry stays the same, change ref start, asm start, and len in current entry
                        Tuple newEntries = adjustEntryForOverlap(prevEntry,currentEntry,overlapData.x, snpByRefMap, snpByAsmMap, splitByRef);
                        if (newEntries.getY().equals("NOLEN")) {
                            // The number of overlaps exceeded the size of the next entry.
                            // The "prevEntry" was adjusted.  We drop the next entry as its ref length is now 0 or negative
                            rerun = true;
                            prevEntry = newEntries.getX();
                            prevStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(prevEntry,splitByRef);

                        } else if (newEntries.getX().equals("NOLEN")) {
                            // The "curEntry" starts before the "prevEntry" .  In this case, the code
                            // chose whichever entry had the greatest length.  If it was the second one
                            // ie the "curEntry", then we drop the previous one, hence the "NOLEN" value.
                            // Which could create another overlap , so we'll need to rerun overlap processing
                            rerun = true;
                            prevEntry = newEntries.getY();
                            prevStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(prevEntry,splitByRef);
                        } else {
                            Tuple newCurStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(newEntries.getY(),splitByRef);

                            splitOverlapList.add(newEntries.getX());
                            prevEntry = newEntries.getY();
                            prevStartEnd = newCurStartEnd;
                        }
                        splitCount++; // for debug info
                    } else { // first was embedded in the second
                        // drop the first by setting it to currentEntry, but DON'T put it on the list
                        prevEntry = currentEntry;
                        prevStartEnd = curStartEnd;
                    }
                }
                // if currentEntry is embedded in the previous, previous stays the same,
                // we drop current by ignoring it here.
            } else { // these 2 entries don't overlap
                splitOverlapList.add(prevEntry);
                prevEntry = currentEntry;
                prevStartEnd = curStartEnd;
            }

        }
        // Add last entry.  It has already been adjusted if it overlapped the previous
        splitOverlapList.add(prevEntry);
        myLogger.info("splitOverlappingCoordsEntries: finalListSize " + splitOverlapList.size() + ", should match original list size " + sortedList.size());
        myLogger.info("splitOverlappingCoordsEntries: overlapCount " + overlapCount + ", splitCount " + splitCount);
        return new Tuple,Boolean>(splitOverlapList,rerun);


    }
    

    /**
     * Check entries in a list of mummer4 coords file entries and removed those
     * that are embedded
     * @param sortedList List of sorted mummer coords file entries.  Should be sorted by ref start
     * @param splitByRef Boolean - if true, check ref embedded. Otherwise check if assembly coordinates are embedded 
     * @return
     */
    public static List checkForEmbedded(List sortedList, boolean splitByRef){
 
       List finalSplitList = new ArrayList();
        
       if (sortedList.isEmpty()) return finalSplitList; // this shouldn't happen!
       
        String prevEntry = sortedList.get(0);
        String currentEntry = sortedList.get(0);
        Tuple prevStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(prevEntry,splitByRef);
        
        // find embedded  alignments
        for (int idx=1; idx < sortedList.size(); idx++) {
            currentEntry = sortedList.get(idx);
            Tuple curStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(currentEntry,splitByRef); // true = ref
   
            // overlap entry contains 
            // The sortedList is sorted by ref, so 1st always begins before second
            // If second is embedded in first, Tuple.y is true - in which
            // case we ditch the embedded entry, and entry previous remains as is.
 
            Tuple  overlapData = checkForOverlap(prevStartEnd,curStartEnd);
                       
            boolean isEmbedded = overlapData.getY();
            boolean isReverseEmbedded = false;
            if (isEmbedded == false) {
                // check if reverse is embedded.
                // Reverse pair and recheck. THis catches the following type overlapping pair:
                //  75585382        75582853
                //  75587301        75581001
                Tuple revPrevStartEnd = curStartEnd;
                Tuple revCurStartEnd = prevStartEnd;
                Tuple overlapDataReverse = checkForOverlap(revPrevStartEnd,revCurStartEnd);
                isReverseEmbedded = overlapDataReverse.y;

                if (isReverseEmbedded == false) { // neither entry embedded in the other
                    finalSplitList.add(prevEntry); // keep this entry
                } 
                // If the first was embedded in the second, it is dropped by not putting it on the list
                // Current entry becomes prevEntry if no overlap, or if first entry overlapped the second.
                prevEntry = currentEntry;  
                prevStartEnd = curStartEnd;
            }
            
        }
        finalSplitList.add(prevEntry);  // add last entry
        return finalSplitList;
    }
    
    // Adjust coordinates on overlapping mummer4 coords file entries
    // When 2 coordinate entries overlap, the first one gets the sequence.
    // The second one's start coordinates (both ref and assembly) are adjusted upward 
    // ( or downward if the values are descending) past the length of the overlap.  The
    // alignment lengths of both ref and assembly are also adjusted appropriately.  If the split
    // occurs in the middle of an assembly deletion relative to the reference, that sequence
    // is added to the first of the 2 entries.  We realize this
    // may not be strictly accurate as there could be other deletion/insertion involved
    // that would effect the start.  
    public static Tuple adjustEntryForOverlap(String prevEntry, String currentEntry,int numOverlaps, 
            Multimap snpByRefMap, Multimap snpByAsmMap, boolean splitByRef) {
        // currentEntry is a tab-delimited string of:
        //  S1 E1 S2 E2 len1 len2 %id tag1 tag2
        int tabIndex1 = currentEntry.indexOf("\t");
        int tabIndex2 = currentEntry.indexOf("\t",tabIndex1+1);
        int tabIndex3 = currentEntry.indexOf("\t",tabIndex2+1);
        int tabIndex4 = currentEntry.indexOf("\t",tabIndex3+1);
        int tabIndex5 = currentEntry.indexOf("\t",tabIndex4+1);
        int tabIndex6 = currentEntry.indexOf("\t",tabIndex5+1);
        
        // Get indices for the prevEntry.
        int prevIndex1 = prevEntry.indexOf("\t");
        int prevIndex2 = prevEntry.indexOf("\t",prevIndex1+1);
        int prevIndex3 = prevEntry.indexOf("\t",prevIndex2+1);
        int prevIndex4 = prevEntry.indexOf("\t",prevIndex3+1);
        int prevIndex5 = prevEntry.indexOf("\t",prevIndex4+1);
        int prevIndex6 = prevEntry.indexOf("\t",prevIndex5+1);
        
        StringBuilder adjustedNewSB = new StringBuilder();

        // Ref will all be ascending positions.  Assembly (query)
        // could be either
        int refStart = Integer.parseInt(currentEntry.substring(0,tabIndex1));
        int refAdjust = numOverlaps;
        int asmAdjust = numOverlaps;
              
        int asmStart = Integer.parseInt(currentEntry.substring(tabIndex2+1,tabIndex3));
        int asmEnd = Integer.parseInt(currentEntry.substring(tabIndex3+1, tabIndex4));
        boolean currentAsmDir = asmStart <= asmEnd ? true : false; // true if is ascending coordinates
        
        int prevRefEnd = Integer.parseInt(prevEntry.substring(prevIndex1+1,prevIndex2));
        int prevAsmStart = Integer.parseInt(prevEntry.substring(prevIndex2+1,prevIndex3));
        int prevAsmEnd = Integer.parseInt(prevEntry.substring(prevIndex3+1,prevIndex4));
        boolean prevAsmDir = prevAsmStart <= prevAsmEnd ?  true : false; // true if is ascending
   
        DirectionCompare directionCompare = DirectionCompare.BOTH_ASCENDING; // both prev and current asm entries are ascending
        if (prevAsmDir == true) {
            if (currentAsmDir == false) directionCompare = DirectionCompare.PREVASC_CURDESC; // prev asm ascending, current is descending
        } else if (prevAsmDir == false) {
            if (currentAsmDir == true) directionCompare = DirectionCompare.PREVDESC_CURASC; // prev asm descending, current is ascending
            else directionCompare = DirectionCompare.BOTH_DESCENDING; // both prev and current asm alignments are descending
        }
        
        // if both entries have ascending assembly coordinates and the previous asm start is > current asm start,
        // OR both entries have descending assembly coordinates and the previous asm start is < current asm start,
        // the pick the entry with the greatest length.  It gets too complicated to get the overlap
        // correct in these cases.  Using greatest length and choosing to keep the second entry means there
        // could  be an overlap with the last "previous" entry, so the calling method will re-process the overlaps.
        if ((prevAsmStart > asmStart && directionCompare == DirectionCompare.BOTH_ASCENDING) ||
            (prevAsmStart < asmStart && directionCompare == DirectionCompare.BOTH_DESCENDING)) {
            // pick the alignment with the greatest length, pitch the other
            int prevAsmLen = Integer.parseInt(prevEntry.substring(prevIndex5+1, prevIndex6));
            int curAsmLen = Integer.parseInt(currentEntry.substring(tabIndex5+1, tabIndex6));
            System.out.println("adjustEntryForOverlap: asmStart < prevAsmStart, prevLen=" + prevAsmLen
                    + ", curAsmLen=" + curAsmLen + ", prevAsmStart " + prevAsmStart + ", asmStart " + asmStart);
            if (prevAsmLen >= curAsmLen) {
                // keep previous entry, drop current entry
                return new Tuple(prevEntry,"NOLEN");
            } else {
             // keep current entry, drop previous entry
                return new Tuple("NOLEN",currentEntry);
            }
        }

        // Get the asm SNP position for this ref SNP position
        int refSnpPos = refStart + refAdjust;
        Collection refSnpEntries = snpByRefMap.get(refSnpPos);
        int asmSnpPos = 0;

        Collection asmSnps = null;
        if (refSnpEntries.size() == 1) {
            // Ref has an entry.  Only 1 entry.  Find asm position
            // We purposefully don't process if Ref has multiple entries - those
            // are assembly insertions.
            String refEntry = refSnpEntries.iterator().next();
            int tab1 = refEntry.indexOf("\t");
            int tab2 = refEntry.indexOf("\t",tab1+1);
            int tab3 = refEntry.indexOf("\t",tab2+1);
            if (refEntry.substring(tab2+1, tab3).equals(".")) {
                // if no asm deletion, we don't need to process in loop below
                // Can't assume based on number of assembly SNPs at this position that we have an indel,
                // as sometimes the same position was aligned to different regions of the ref, so more than
                // 1 SNP, but not part of a deletion.
                asmSnpPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(refEntry, 4, 12));
                // Find all SNP file entries for the potential new  assembly position from the coordinates file
                asmSnps = snpByAsmMap.get(asmSnpPos);
            }
        }

        // Previous entry does not get adjusted unless there is a SNP indel at the
        // overlap spot.  All the overlap goes to the prev entry, so its end and length
        // don't change.  The cur entry has its start adjusted based on the overlap
        String adjustedPrevEntry = prevEntry;

        if (asmSnps != null && !asmSnps.isEmpty()) {
            // multiple entries for the ASM at this position, means the ref
            // has sequence lacking in the assembly.  Increase the reference
            // beyond this
            if (asmSnps.size() > 1) {
                Tuple indelStartAndLen = AssemblyProcessingUtils.findRefIndelStart(refSnpPos,asmSnps);
                int refIndelStart = indelStartAndLen.getX();
                int asmSnpLen = indelStartAndLen.getY();
                refAdjust += asmSnpLen;
                if (refIndelStart < prevRefEnd + numOverlaps ) {
                    // Adjust again in case the indel started before the overlapping region we moved past

                    // The new start means the previous entry's end point should include these indels
                    // This adjusts the PREVIOUS entry's end point
                    // Looking for the number of bp indels that occurred AFTER the previous refEnd point                  
                    int prevEndAdjust = (refIndelStart - prevRefEnd) + asmSnpLen -1;
                    adjustedPrevEntry = getAdjustedMummerCoordEntry(prevEntry, prevEndAdjust, prevAsmDir);
                }
                if (refIndelStart < refStart + numOverlaps) {
                    int skipOverlapEntries = (refStart+numOverlaps) - refIndelStart;
                    // This is the number for starting the "current" ref - skipping over the overlapped
                    // regions and the indels.
                    refAdjust -= skipOverlapEntries; // subtract number of indel bps occurring BEFORE prev entry ended
                }
                asmAdjust = numOverlaps + 1; // move this up by 1 to skip past the indel
            }
        }

        // All ref entries are ascending, so just add the adjustment to the start of the "current"
        // The adjustments below are for the current entry
        refStart += refAdjust;
        
        // For assemblies: it is also the "current" that gets adjusted, but which end gets adjusted
        // depends on the direction of both the current and previous entries.
        if (directionCompare == DirectionCompare.BOTH_ASCENDING ) { // both increasing
            // ex: 8340  8645
            //     8640  8700 - (0) increase this start by 6
            asmStart += asmAdjust;
        } else if (directionCompare == DirectionCompare.PREVASC_CURDESC ){ // first=increasing, second=decreasing            
            if (asmStart > prevAsmEnd ) {
                // ex: 8340  8645
                //     8700  8640  (1) case to increase end by 6
                asmEnd += asmAdjust;
            } else {
                // ex: 60710 61625
                //     61014 56944 (1) case to decrease start by 305
                asmStart -= asmAdjust;
            }            
        } else if (directionCompare == DirectionCompare.PREVDESC_CURASC){ // first=decreasing, second=increasing
            // ex: 8645  8325
            //     8640  8700   (2) increase this start  by 6
            asmStart += asmAdjust;
        } else if ( directionCompare == DirectionCompare.BOTH_DESCENDING){ //both decreasing
            // ex: 8724  8251
            //     8260  8000   (3) decrease this start by 10
            asmStart -= asmAdjust;
        }

        int refLen = Integer.parseInt(currentEntry.substring(tabIndex4+1,tabIndex5));
        refLen -= refAdjust;

        int asmLen = Integer.parseInt(currentEntry.substring(tabIndex5+1,tabIndex6)); // Len1
        asmLen -= asmAdjust;

        // Any bps from an overlapping entry were given to the first entry.  WHen adjusting,
        // the assembly for the overlaps, an insertion could move the reference end point past
        // the start of the next reference entry.  In this case, drop the entry

        if (refLen < 1 || asmLen < 1) {
            myLogger.info("adjustForOverlap: after adjustment dropping currentEntry, length of ref: " + refLen
                    + ", length of asm: " + asmLen);
            return new Tuple(adjustedPrevEntry,"NOLEN");
        }

        adjustedNewSB.append(refStart).append("\t"); // S1
        adjustedNewSB.append(currentEntry.substring(tabIndex1+1,tabIndex2)).append("\t"); // E1

        adjustedNewSB.append(asmStart).append("\t"); // S2
        adjustedNewSB.append(asmEnd).append("\t"); // E2

        adjustedNewSB.append(refLen).append("\t");

        adjustedNewSB.append(asmLen).append("\t"); // len2
        adjustedNewSB.append(currentEntry.substring(tabIndex6+1)); // rest of entry - no tab, no newline needed
        
        // returns a Tuple of adjusted values for 
        return new Tuple(adjustedPrevEntry,adjustedNewSB.toString());        
    }
    
    // Create a new entry by increasing the ref and asm coordinate end point
    // by the amount indicated in adjustLen.  Also ajust the ref and assembly len
    // fields.
    // Mummer Coords entry has the form:
    //     S1 E1 S2 E2 len1 len2 %id tag1 tag2
    public static String getAdjustedMummerCoordEntry(String entry, int adjustLen, boolean asmAscending){
        int tabIndex1 = entry.indexOf("\t");
        int tabIndex2 = entry.indexOf("\t",tabIndex1+1);
        int tabIndex3 = entry.indexOf("\t",tabIndex2+1);
        int tabIndex4 = entry.indexOf("\t",tabIndex3+1);
        int tabIndex5 = entry.indexOf("\t",tabIndex4+1);
        int tabIndex6 = entry.indexOf("\t",tabIndex5+1);
        
        
        StringBuilder adjustedNewSB = new StringBuilder();
        try {
            int oldRefEnd = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, 2, 12));
            int oldAsmEnd = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, 4, 12));
            int oldRefLen = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, 5, 12));
            int oldAsmLen = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, 6, 12));
            int newRefEnd = oldRefEnd + adjustLen;

            int newAsmEnd = asmAscending ? (oldAsmEnd + 1) : (oldAsmEnd - 1);

            int newRefLen = oldRefLen + adjustLen;
            int newAsmLen = oldAsmLen + adjustLen;
            adjustedNewSB.append(entry.substring(0,tabIndex1)).append("\t");
            adjustedNewSB.append(Integer.toString(newRefEnd)).append("\t");
            adjustedNewSB.append(entry.substring(tabIndex2+1,tabIndex3)).append("\t");
            adjustedNewSB.append(Integer.toString(newAsmEnd)).append("\t");
            adjustedNewSB.append(Integer.toString(newRefLen)).append("\t");
            adjustedNewSB.append(Integer.toString(newAsmLen)).append("\t");
            adjustedNewSB.append(entry.substring(tabIndex6+1));
            return adjustedNewSB.toString();
        } catch (Exception exc) {
            throw new IllegalStateException("MummerScriptProcessing:getAdjustedEntry error parsing entry, message: " + exc.getMessage());
        }           
    }

    /**
     * Returns a Tuple indicating 
     * There will be inversions, so no guarantee  prevStartEnd.x is <= curStartEnd.x
     * @param prevStartEnd
     * @param curStartEnd
     * @return Tuple indicating (x) number of overlaps between the 2, and (y) was one embedded in the other?
     */
    public static Tuple checkForOverlap(Tuple prevStartEnd, Tuple curStartEnd){
        int numOverlap = 0;
        int pStart = prevStartEnd.x;
        int pEnd = prevStartEnd.y;
        int cStart = curStartEnd.x;
        int cEnd = curStartEnd.y;
        
        // Adjust if coordinates are reversed
        if (prevStartEnd.x > prevStartEnd.y ) {
            pStart = prevStartEnd.y;
            pEnd = prevStartEnd.x;
        }
        if (curStartEnd.x > curStartEnd.y ) {
            cStart = curStartEnd.y;
            cEnd = curStartEnd.x;
        }
        
        // Using De Morgan's law to determine overlap
        if (pEnd >= cStart && cEnd >= pStart) {
            // there is overlap - determine the amount
            int maxStart = Math.max(pStart,cStart);
            int minEnd = Math.min(pEnd, cEnd);
            numOverlap = (minEnd-maxStart) + 1;
        } 
        // check for embedded
        if ((cStart >= pStart) && (cEnd <= pEnd)) {
            return new Tuple(numOverlap,true); //
        }
        
        // if no overlap, numOverlap will be 0
        return new Tuple(numOverlap,false);
    }

    /**
     * Takes a list of mummer4 coordinates removed during delta-filter with -g and determines
     * which coordinates should be returned. Those returned have reference range coordinates that
     * fall between 2 existing kept alignment reference range coordinates, and whose assembly coordinates
     * fall between the assembly coordinates of the 2 ranges.  Most often, these will be 
     * alignments where the assembly aligned on the reverse strand.
     * Nothing is added back that falls before the start of the coordinates created with delta-filter
     * -g, or after the end of the delta-filter -g list of coordinates.
     * 
     * If a removed entry falls before the first of the pair of gList entries, and extends into
     * the second gList entry, it is not added.
     *
     * The entries returned are limited to alignments where the ref length is at least 1000 bps.
     * 
     * Any entries added that overlap each other will be handled in later processing 
     * when all overlaps are handled.
     * 
     * The mummer4 delta-filter with -g performed 1-1 global alignment with no rearrangements.
     * This purpose of findAlignmentsToReturn() is to return inversions that appear on the diagonal.
     * 
     * NOTE:  This method is obsolete, but kept to facilitate user desire to process with mummer -G filtering
     * @param removedList
     * @param gList
     * @return
     */
    public static List findAlignmentsToReturnFromAfterMinusGFilter(List removedList, List gList) {
        List returnItemList = new ArrayList();
        
        int glistIdx = 1; // current of the second entry into the filtered coordinates list

        glistIdx = findNonOverlappingGlistPair( gList,  glistIdx);
        if (glistIdx == gList.size()) return returnItemList; // no overlapping positions
        String gListLowString = gList.get(glistIdx-1);
        String gListHighString = gList.get(glistIdx);
        
        Tuple gLowRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,true);
        Tuple gHighRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,true);              
        
        Tuple gLowAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,false);
        Tuple gHighAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,false);
        
        Range gRefRange = Range.closed(gLowRefStartEnd.getY(), gHighRefStartEnd.getX());
        
        // The mummer4 -g filtered coordinates processing does not allow inversions/rearrangements, so ASM start
        // should always be less than ASM end (Not true with the removed list from the original coords processing)
        Range gAsmRange = Range.closed(gLowAsmStartEnd.getY(), gHighAsmStartEnd.getX());
        
        // Walk through all the removed entries.
        // Add back those where the removed ref coordinates fall between the gList line's ref coordinates and the removed
        // assembly coordinates are within the diagonal, ie between gList line's assembly coordinates.
        // The removedList coordinates that fall within will probably be reversed.
        
        for (int ridx = 0; ridx < removedList.size();) {
            String removedEntry = removedList.get(ridx);
            Tuple remRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(removedEntry,true);
            Tuple remAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(removedEntry,false);
            
            int refLen = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(removedEntry, 5, 9));
            if (refLen < 1000) {
                // only adding back if ref len is > 1000
                ridx++;
                continue;
            }
            Range removedAsmRange;
            // There will be inversions in the removed coordinates
            if (remAsmStartEnd.getX() < remAsmStartEnd.getY()) {
                removedAsmRange = Range.closed(remAsmStartEnd.getX(), remAsmStartEnd.getY());
            } else {
                removedAsmRange = Range.closed(remAsmStartEnd.getY(), remAsmStartEnd.getX());
            }
            
            // Ref range coordinates are always ascending
            Range removedRefRange = Range.closed(remRefStartEnd.getX(), remRefStartEnd.getY());
            if (gRefRange.encloses(removedRefRange)) {
                // ref coordinates are within range - check the assembly coordinates
                if (gAsmRange.encloses(removedAsmRange)) {
                    returnItemList.add(removedEntry);                    
                } // else the entry is dropped
                ridx++; // move to the next "removed" entry
            } else if (remRefStartEnd.getX() < gRefRange.lowerEndpoint()) {
                // not adding anything until the removed entry's ref coordinates have reached
                //  the glist ref coordinate start
                ridx++; 
            } else if (remRefStartEnd.getX() > gRefRange.upperEndpoint()) {
                // need to move to next gList entry until we have 2 entries 
                // whose ref coordinates surround the removed entry list coordinates
                glistIdx++;
                if (glistIdx == gList.size()) break; // no more entries to add
                glistIdx = findNonOverlappingGlistPair( gList,  glistIdx);
                if (glistIdx == gList.size()) {
                    // The coords files are ordered by ref coordinate.
                    // If the current "removed" entry starts beyond  the last
                    // gList entry, then all remaining are beyond it so none will
                    // be added back to the list
                    break; // no more entries will be added
                } else {                    
                    
                    gListLowString = gList.get(glistIdx-1);
                    gListHighString = gList.get(glistIdx);
                    
                    gLowRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,true);
                    gHighRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,true);              
                    
                    gLowAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,false);
                    gHighAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,false);
                    
                    gRefRange = Range.closed(gLowRefStartEnd.getY(), gHighRefStartEnd.getX());
                    
                    // The mummer4 -g filtered coordinates processing does not allow inversions/rearrangements, so ASM start
                    // should always be less than ASM end (Not true with the removed list from the original coords processing)
                    gAsmRange = Range.closed(gLowAsmStartEnd.getY(), gHighAsmStartEnd.getX());
                }
                // Not increasing ridx here.  We're still processing the same "removed" entry but against
                // a new gList range.
            } else {
                // Here is one we process.  Removed entry's ref range start falls within the gList ref range coordinates.
                // Need to check the assembly, and it if falls within the diagonal, it is kept.
                if (remRefStartEnd.getX() > gRefRange.lowerEndpoint() && remRefStartEnd.getX() < gRefRange.upperEndpoint()) {
                    if (gAsmRange.encloses(removedAsmRange)) {
                        returnItemList.add(removedEntry);                    
                    } // else the entry is dropped
                }
                ridx++; // move to the next "removed" entry
            }
        
        } 
       
        // these are all the inversions we'll add.
        return returnItemList;
    }



    /**
     * Traverse a list of mummer coords entries.  Create lists of alignments
     * that have a minimum of 3 in a row where assembly is in the same
     * direction (forward or reverse) and the distance between the end
     * of one entry and the beginning of the next is > 0.01 percent.
     * 
     * Return a concatenated list of all the entries kept from the original file.
     * 
     * This method is expected to be used with a mummer coords file created 
     * from a delta file filtered with the -1 option.
     * 
     * @param origList
     * @return
     */
    public static List findAlignmentsToRemove_fromMinus1Filter(List origList) {
        List returnItemList = new ArrayList();
        List tempList = new ArrayList();
        
        int numConsecutive = 4;

        int totalRefLen = 0;
        String prevEntry = origList.get(0);
        Tuple prevAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(prevEntry,false);
        boolean prevDirectionAscending = true;
        tempList.add(prevEntry);
        if (prevAsmStartEnd.x > prevAsmStartEnd.y) {
            prevDirectionAscending = false;
        }
                
        if (origList.size() == 1) {
            // This happens with phgSmallSeq testing.  Though unlikely, it could
            // happen in the real world.
            returnItemList.addAll(origList);
            myLogger.info("findAlignmentsToReturn: origList size is 1 - add to returnList and quit");
            return returnItemList;
        }
                
        // loop through origList to find entries that should be kept
        myLogger.info("findAlignmentsToReturn: origList size: " + origList.size());
        for (int idx=1; idx < origList.size(); idx++) {
            String currentEntry = origList.get(idx);

            boolean currentDirectionAscending = false;
            Tuple curAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(currentEntry,false);
            int refLen = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(currentEntry, 5, 9));

            if (curAsmStartEnd.x < curAsmStartEnd.y) {
                currentDirectionAscending = true;
            }
            if (currentDirectionAscending != prevDirectionAscending) {
                // direction has changed, stop processing the current list
                if (tempList.size() >= numConsecutive && totalRefLen >= 2000) {
                    returnItemList.addAll(tempList);
                }
                tempList.clear();
                totalRefLen = 0;
            } else { // both current and prev entries are moving in same direction
                // calculate the distance between the current and previous entries
                double distance = AssemblyProcessingUtils.calculateCoordDistance(prevAsmStartEnd,curAsmStartEnd);

                if (distance > 0.01 ) { // distance is too great, store/clear tempList
                    if (tempList.size() >= numConsecutive && totalRefLen >= 2000) {
                        returnItemList.addAll(tempList);
                    }
                    tempList.clear();
                    totalRefLen = 0;
                }
            }
            
            if ( refLen >= 250) { // alignment must be reasonable length
                tempList.add(currentEntry); 
                totalRefLen += refLen;
            }
            prevEntry = currentEntry;
            prevAsmStartEnd = curAsmStartEnd;
            prevDirectionAscending = currentDirectionAscending;
        }
        if (tempList.size() >= numConsecutive && totalRefLen >= 2000) {          
            returnItemList.addAll(tempList); // add final entries
        }

        return returnItemList;
    }

    /**
     * Given a list of mummer coords entries and an index, finds the entry where this entry
     * and the one before it have no overlaps in either the ref or assembly coordinates.
     * The index returned is for the second of the 2 entries.  This method is used when
     * searching for mummer delta file entries removed during -g delta filtering that we
     * would like to return.
     * 
     * @param gList
     * @param glistIdx
     * @return
     */
    public static int findNonOverlappingGlistPair(List gList, int glistIdx) {
        // glistIdx is the current index into the list of coordinates held in gList.
        // To get low, do one less, to get current high, use glistIdx.
        // increment glistIdx until you have one where neither ref nor assembly coordinates
        // are overlapping.
        String gListLowString = gList.get(glistIdx-1);
        String gListHighString = gList.get(glistIdx);
       
        Tuple gLowRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,true);
        Tuple gHighRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,true);
               
        Tuple gLowAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,false);
        Tuple gHighAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,false);
        
        while (glistIdx+1 < gList.size()) {
            boolean keep = true;
            if (gLowRefStartEnd.getY() > gHighRefStartEnd.getX() || gLowAsmStartEnd.getY() > gHighAsmStartEnd.getX()) {
                keep = false;
            }
            if (keep) return glistIdx;
            else {
                glistIdx++;
                gListLowString = gListHighString;
                gListHighString = gList.get(glistIdx);
                
                gLowRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,true);
                gHighRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,true);
                gLowAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,false);
                gHighAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,false);
            }
        }
        
        if ((gLowRefStartEnd.getY() <= gHighRefStartEnd.getX()) && (gLowAsmStartEnd.getY() <= gHighAsmStartEnd.getX() )) {
            // we found a good range - return the index into the filtered coords file
            return glistIdx;
        }
        return gList.size(); // indicates we've exhausted the list
    }
    
    /**
     * Fun the mummer4 show-snps entry against a delta file, using a coords file as
     * additional input.  From the command line, this would look like:
     *   cat coords_file | show-snps -T -r -H -S deltaFile
     * @param deltaFilePrefix
     * @param coordsForShowSnps
     * @param outputDir
     */
    public static void runShowSNPsWithCat(String deltaFilePrefix, String coordsForShowSnps, String outputDir, String mummer4Path, String chrom) {
        try {
            
            // Start a shell process which will cat the contents of the coords file
            Process process1 = new ProcessBuilder("/bin/bash").start();
 
            String deltaFile = deltaFilePrefix + ".delta";
            
            myLogger.info("runShowSNPs: Using delta file " + deltaFile);
            myLogger.info("runSHowSNPs: Using coordsFile " + coordsForShowSnps);
            
            // Create process2.  This will run the mummer4 show-snps command.  IT is run here
            // with the -S option.  This option expects a coords file from stdin.  Once show-snps
            // is started, it waits for the coords file input.
            String showSnps = mummer4Path + "/show-snps";
            ProcessBuilder builder = new ProcessBuilder(
                    showSnps, "-T", "-r", "-H", "-S", deltaFile);
            String runCommand = builder.command().stream().collect(Collectors.joining(" "));
            System.out.println(runCommand);
            String redirectError = outputDir + "/_errSNPs.log";
            builder.redirectError(new File(redirectError));
            
            String outputSNPS = deltaFilePrefix + ".snps_prefiltered";
            File outputFile = new File(outputSNPS);
            builder.redirectOutput(outputFile);
            Process process2 = builder.start();
          
            // Send output of cat'ing the coords file to the mummer4:show_snps script
            // The constructor calls "start" when begins piping the output of process1
            // to the input of process2
            RedirectStreams redirectStreams = new RedirectStreams(process1,process2);
                        
            // Write the "cat coordsfile" command to the outputstream of process1
            BufferedWriter bufferedWriter = new BufferedWriter(new OutputStreamWriter(process1.getOutputStream()));
            String command = "cat " + coordsForShowSnps;
            System.out.println("RunShowSNPs Input process1: " + command);
            bufferedWriter.write(command + '\n');
            bufferedWriter.close();
            
            int error = process2.waitFor();
            if (error != 0) {
                System.out.println("Error creating snps file from delta file " + deltaFile + " Error: " + error);
                throw new IllegalStateException("Error creating SNPS File via show-snps ");
            }
 
            
            // Read the created SNP file into a list
            BufferedReader origBR = Utils.getBufferedReader(outputSNPS);
            List origSnpsList = new ArrayList(); 
            String line;
            while ((line = origBR.readLine()) != null) {
                origSnpsList.add(line);
            }
            origBR.close(); 
            
            // verifySnpEntries is now called after final coordinate processing
            
        } catch (Exception exc) {
            throw new IllegalStateException("MummerScriptProcessing:runShowSNPsWithCat Error processing initial SNP file " + exc.getMessage());
        }
        
    }

    public static void finalSnpFiltering(String outputSNPS, String deltaFilePrefix, String chrom) {
        // Read the created SNP file into a list
        String snpFile = deltaFilePrefix + ".snps_final";
        try (BufferedReader origBR = Utils.getBufferedReader(outputSNPS);
             BufferedWriter snpBW = Utils.getBufferedWriter(snpFile)) {
            List origSnpsList = new ArrayList(); 
            String line;
            while ((line = origBR.readLine()) != null) {
                origSnpsList.add(line);
            }                     
            
            // Find which of these entries have positions represented in the mummer coords file after filtering
            String coordsFile = deltaFilePrefix + ".coords_final";
            List finalSnps = verifySNPEntries(origSnpsList, coordsFile, chrom);
            
            // write final snp file          
            for (String entry: finalSnps) {
                snpBW.write(entry);
                snpBW.write("\n");
            }
            
        } catch (Exception exc) {
            throw new IllegalStateException("Error processing initial final SNP file " + exc.getMessage());
        }       
 
    }
 
    /**
     * This method calls show-snps using only a delta file as input
     * @param deltaFilePrefix
     * @param outputDir
     */
    public static void runShowSNPs(String deltaFilePrefix, String outputDir, String mummer4Path, String chrom) {
        try {      
            // -T:  print results to tab-delimited file
            // -H:  do not print header information
            // -r:  Sort output lines by reference IDs and coordinates            
            Long time = System.nanoTime();
            
            // Run show-snps with original delta 
            String deltaFile = deltaFilePrefix + ".delta";
            String showSnps = mummer4Path + "/show-snps";
            ProcessBuilder builder = new ProcessBuilder(
                    showSnps, "-T", "-r", "-H", deltaFile);
            String redirectError = outputDir + "errSNPsOrig.log";
            builder.redirectError(new File(redirectError));
            String outputSNPS = deltaFilePrefix + ".snps_orig";
            File outputFile = new File(outputSNPS);
            builder.redirectOutput(outputFile);
            String runCommand = builder.command().stream().collect(Collectors.joining(" "));
            myLogger.info(runCommand);
            Process process = builder.start();
            int error = process.waitFor();
            if (error != 0) {
                myLogger.error("Error creating snps file from delta file " + deltaFile + " Error: " + error);
                throw new IllegalStateException("Error creating SNPS File via show-snps ");
            }  
            
            // Read the created SNP file into a list
            BufferedReader origBR = Utils.getBufferedReader(outputSNPS);
            List origSnps = new ArrayList(); 
            String line;
            while ((line = origBR.readLine()) != null) {
                origSnps.add(line);
            }
            origBR.close();           
            
            // Find which of these entries have positions represented in the mummer coords file after filtering
            String coordsFile = deltaFilePrefix + ".coords_final";
            List finalSnps = verifySNPEntries(origSnps, coordsFile, chrom);
            
            // write final snp file
            String snpFile = deltaFilePrefix + ".snps_final";
            BufferedWriter snpBW = Utils.getBufferedWriter(snpFile);
            for (String entry: finalSnps) {
                snpBW.write(entry);
                snpBW.write("\n");
            }
            snpBW.close();
            
            myLogger.info("Finished runShowSNPs in " + (System.nanoTime()-time)/1e9 + " seconds");
            
        } catch (Exception exc) {
            exc.printStackTrace();
            throw new IllegalStateException("Error running runShowSNPs against delta file: " + deltaFilePrefix + ".delta, error: " + exc.getMessage());
        }
    }

    /**
     * This method takes a list of Mummer SNP file entries and verifies the SNP 
     * positions are represented  in the filtered/overlap-merged coords
     * file. 
     * 
     *  Additional filtering is performed to remove SNPs that occurred in overlapped
     *  coordinates entries, resulting in duplicate SNPs with differing assembly
     *  positions for the same reference position.
     * 
     * A list of "represented" SNPs is returned.
     * 
     * NOTE:  Snp entries must be added in order.
     * 
     * @param deltaSNPs - list of SNPs from a Mummer filtered delta file
     * @param coordsFile - coordinates file to use when checking for positions.
     * @parameter chrom - the reference chrom, used to create a Position object
     * @return
     */
    
    public static List verifySNPEntries(List deltaSNPs, String coordsFile, String chrom){
        List finalSNPList = new ArrayList();

        myLogger.info("mergeVerifySNPEntries: begin");

        // Create coords List from coordsFile        
        List finalCoordsList = new ArrayList();
        try (BufferedReader coordsBR = Utils.getBufferedReader(coordsFile)){
            String coordsLine;
            while ((coordsLine = coordsBR.readLine()) != null) {
                finalCoordsList.add(coordsLine);
            }           
        } catch (IOException ioe) {
            throw new IllegalStateException("mergeVerifySNPEntries: error readingfilteredCoordsFile " 
                    + coordsFile + " message: " + ioe.getMessage());
        }

        // The SNP file entry is a tab-delimited line comprise of columns:
        // RefPos  RefSNP   AsmSNP   AsmPos BUFF DIST  R  Q  LenR LenQ FRM1 FRM2 RefTag  AsmTag
        int refPosCol = 1;
        int asmPosCol = 4;
        int refSNPcol = 2;
        int asmSNPcol = 3;
        int totalColumns = 12;

        RangeMap> finalCoordsRangeMap = AssemblyProcessingUtils.getCoordsRangeMap(finalCoordsList, chrom);

        HashSet refHashSet = new HashSet(); // to ensure we don't keep multiple SNPs for same position
        HashSet asmHashSet = new HashSet();

        int snpIdx = 0;
        
        // Save the entry on a map of referencePos to Tuple
        Map> snpRefToAsmPosMap = new HashMap>();

        try {
            System.out.println("verifySNPEntries - begin, snpIdx = 0);, deltaSNPs.size = " + deltaSNPs.size());
            
            while (snpIdx < deltaSNPs.size() ) {
                String entry = deltaSNPs.get(snpIdx); 

                boolean snpInRange = AssemblyProcessingUtils.checkSnpEntryInRange( entry, finalCoordsRangeMap, chrom);

                // SNP can be in range here, but can be a duplicate because show-snps was run on
                // the coords file BEFORE overlapped were merged.  Different assembly alignments mapping
                // to the same reference coordinates could show different snps.
                if (snpInRange) {  

                    int refPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refPosCol, totalColumns));
                    int asmPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmPosCol, totalColumns));
                    String curRefSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refSNPcol, totalColumns);
                    String curAsmSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmSNPcol, totalColumns);
  
                    if (!curRefSNPVal.equals(".") && !curAsmSNPVal.equals(".")) {
                        // If neither the ref nor the asm position have been seen, add them to the "keep" lists
                        if (!refHashSet.contains(refPos) || !asmHashSet.contains(asmPos)) {
                            refHashSet.add(refPos);
                            asmHashSet.add(asmPos);
                            snpRefToAsmPosMap.put(refPos,Arrays.asList(entry));               
                        }
                        snpIdx++; // process next SNP entry
                        continue;
                    } else if (curRefSNPVal.equals(".")) { // assembly insertion
                        // create the indel
                        List tmpSnpRefToAsmPosList = new ArrayList();

                        List asmPosList = new ArrayList();
                        asmPosList.add(asmPos);

                        tmpSnpRefToAsmPosList.add(entry);

                        int savedRefPos = refPos;
                        int prevAsmPos = asmPos;

                        snpIdx++;
                        // This block could be inside the while below except for needing to know
                        // the direction of the assembly.  If the assembly values are increasing,
                        // prevAsmPos would default to asmPos-1.  If they are decreasing, it would
                        // default to asmPos+1.  THe (Mat.abs(asmPos - prevAsmPos) == 1 will fail
                        // if this is wrong, hence this additional else clause.
                        if (snpIdx < deltaSNPs.size()) {
                            entry = deltaSNPs.get(snpIdx);
                            
                            refPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refPosCol, totalColumns));
                            asmPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmPosCol, totalColumns));
                            curRefSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refSNPcol, totalColumns);
                            curAsmSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmSNPcol, totalColumns);
                        }
 
                        while (snpIdx < deltaSNPs.size() && refPos == savedRefPos && curRefSNPVal.equals(".") && (Math.abs(asmPos - prevAsmPos) == 1)) {

                            // next entry is good, add to list
                            asmPosList.add(asmPos);
                            tmpSnpRefToAsmPosList.add(entry);
                            
                            prevAsmPos = asmPos;  
                            snpIdx++;
 
                            if (snpIdx < deltaSNPs.size() ){
                                entry = deltaSNPs.get(snpIdx);
                               
                                refPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refPosCol, totalColumns));
                                asmPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmPosCol, totalColumns));
                                curRefSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refSNPcol, totalColumns);
                                curAsmSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmSNPcol, totalColumns);
                            }
                                                                                  
                        }
                        // we have processed the whole indel
                        if (!refHashSet.contains(savedRefPos) && checkNoListEntriesInRange(asmHashSet, asmPosList)) {
                            refHashSet.add(savedRefPos);
                            int lastEntry = asmPosList.size() - 1;
                            int asmStart = asmPosList.get(0) < asmPosList.get(lastEntry) ? asmPosList.get(0) : asmPosList.get(lastEntry);
                            int asmEnd = asmPosList.get(0) < asmPosList.get(lastEntry) ? asmPosList.get(lastEntry) : asmPosList.get(0);
                            for (int idx = asmStart; idx <= asmEnd; idx ++) {
                                asmHashSet.add(idx);
                            }
                            
                            snpRefToAsmPosMap.put(savedRefPos,tmpSnpRefToAsmPosList);
                        }

                    } else { // assembly deletion related to ref 
                        // To this map add a set of SNP positions consistent with just 1 alignment
                        Map> tmpSnpRefToAsmPosMap = new HashMap>();

                        // Stores the ref to SNP entry mapping for this set of assembly deletions
                        SetMultimap refToIndelMap = HashMultimap.create();
                        // Stores the asm to SNP entry mappings for this set of assembly deletions
                        SetMultimap asmToIndelMap = HashMultimap.create();
                        
                        refToIndelMap.put(refPos, entry);
                        asmToIndelMap.put(asmPos, entry);
                        
                        List refPosList = new ArrayList();

                        int savedAsmPos = asmPos;
                        int prevRefPos = refPos;

                        // Get the next entry to see if we are still processing a deletion
                        snpIdx++;
                        if (snpIdx < deltaSNPs.size() ){ 
                            entry = deltaSNPs.get(snpIdx);
                            
                            refPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refPosCol, totalColumns));
                            asmPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmPosCol, totalColumns));
                            curRefSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refSNPcol, totalColumns);
                            curAsmSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmSNPcol, totalColumns);
                        }
       
                        // because of alignment overlaps (PHG-324) asmPos will not always be the same.
                        // Instead we check that refPos either equals prevRefPos OR it increases by 1 AND assembly val = "."
                        // When refPos = prevRefPos, it means there was an overlapping alignment with the same SNP
                        while (snpIdx < deltaSNPs.size() && curAsmSNPVal.equals(".") && ( refPos == prevRefPos || (Math.abs(refPos - prevRefPos) == 1)) ) {

                            // next entry is good, add to list
                            refPosList.add(refPos);
                            
                            // There should never be multiple entries where both ref and asm positions are the same
                            // This implies duplicate due to previous mummer overlapping entries - don't add it.
                            if (!(refPos == prevRefPos && asmPos == savedAsmPos)) {
                                refToIndelMap.put(refPos, entry);
                                asmToIndelMap.put(asmPos, entry);
                            } 
                            
                            prevRefPos = refPos;
                            snpIdx++;
                            if (snpIdx < deltaSNPs.size() ){
                                entry = deltaSNPs.get(snpIdx);
                                
                                refPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refPosCol, totalColumns));
                                asmPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmPosCol, totalColumns));
                                curRefSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refSNPcol, totalColumns);
                                curAsmSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmSNPcol, totalColumns);
                            }                            
                                                        
                        }
                        // we have processed the whole indel - choose a set of SNP deletion entries
                        if (asmToIndelMap.keySet().size() == 1) {
                            // no overlapping alignments, no duplicate SNP values - just add them
                            List refPositionList = new ArrayList<>(refToIndelMap.keySet());
                            Collections.sort(refPositionList);
                            for (int pos : refPositionList) { // should only be 1 entry                                                            
                                tmpSnpRefToAsmPosMap.put(pos,(new ArrayList(refToIndelMap.get(pos))));
                                refPosList.add(pos); // to be checked for duplicates later
                            }
                        } else {
                            // We have overlaps - choose.  Looking at the SNP files, sometimes choosing the
                            // lowest asm coordinate fell in a seeming chronological order, and sometimes
                            // choosing the highest asm coordinate mapped better chronologically.  In general,
                            // while the ref positions are ALWAYS in order, assembly positions are not.
                            // A third alternative is to choose the position that had the most consecutive
                            // deletions in this series.  Often, though, they were equal.
                            // 
                            // This code uses the lowest asm coordinate as it more often (but not always) fell within
                            // a chronological order.
                            
                            List asmPositionList = new ArrayList<>(asmToIndelMap.keySet());
                            Collections.sort(asmPositionList); // get lowest asm position that maps at this deletion series                        
                            
                            Collection entries = asmToIndelMap.get(asmPositionList.get(0));
                            List entriesAsList = new ArrayList<>(entries);
                            Collections.sort(entriesAsList);
                            
                            for (String snpEntry : entriesAsList) {
                                int refSNPposition = Integer.parseInt(snpEntry.substring(0, snpEntry.indexOf("\t")));
                                refPosList.add(refSNPposition);
                                tmpSnpRefToAsmPosMap.put(refSNPposition,Arrays.asList(snpEntry));
                            }
                            // reset savedAsmPos to used for duplicate check below
                            savedAsmPos = asmPositionList.get(0);
                            
                        }
                        
                        // check for duplicate positions, add final
                        if (!asmHashSet.contains(savedAsmPos) && checkNoListEntriesInRange(refHashSet, refPosList)) {
                            asmHashSet.add(savedAsmPos);
                            int lastEntry = refPosList.size() - 1;
                            int refStart = refPosList.get(0) < refPosList.get(lastEntry) ? refPosList.get(0) : refPosList.get(lastEntry);
                            int refEnd = refPosList.get(0) < refPosList.get(lastEntry) ? refPosList.get(lastEntry) : refPosList.get(0);
                            for (int idx = refStart; idx <= refEnd; idx++) {
                                refHashSet.add(idx);
                            }
                            
                            snpRefToAsmPosMap.putAll(tmpSnpRefToAsmPosMap);
                        }
                    }

                } else {
                    snpIdx++;
                }                
            }
        } catch (Exception exc) {
            System.out.println("verifySNPEntries exception message: " + exc.getMessage());
            exc.printStackTrace();
        }

        // Create the final list
        
        List refPosMap = new ArrayList(snpRefToAsmPosMap.keySet());
        Collections.sort(refPosMap);
        for (int refSNPpos : refPosMap) {
            List snpTupleEntry = snpRefToAsmPosMap.get(refSNPpos);
            snpTupleEntry.stream().forEach(item -> {
                finalSNPList.add(item); // add show-snps entry to final list
            });
        }        

        myLogger.info("mergeVerifySNPEntries: total original SNPS " + deltaSNPs.size() 
        + ", number after filtering by coords_final coordinates " + finalSNPList.size());
        return finalSNPList;

    }
    
    /**
     * Method takes a RangeSet of Integers and a list of integers.
     * It returns "false" if any value from the list is present in the RangeSet
     * It returns "true" if no value from the list is present in the RangeSet
     * @param hashSet
     * @param testList
     * @return
     */
    public static boolean checkNoListEntriesInRange(HashSet hashSet, List testList) {
        for (int entry : testList) {
            if (hashSet.contains(entry)) return false;
        }
        return true;
    }
    
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy