net.maizegenetics.pangenome.processAssemblyGenomes.MummerScriptProcessing Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.processAssemblyGenomes;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.log4j.Logger;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.SetMultimap;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
/**
* This class contains methods that run mummer4 scripts, e.g. nucmer, delta-filter, show-coords and show-snps.
* In addition, there are methods that process the output from these scripts.
*
* NOTE: processing here is relative to the needs of AssemblyHaplotypesPlugin.
* @author lcj34
*
*/
public class MummerScriptProcessing {
private static final Logger myLogger = Logger.getLogger(MummerScriptProcessing.class);
private enum DirectionCompare {
BOTH_ASCENDING(0),
PREVASC_CURDESC(1),
PREVDESC_CURASC(2),
BOTH_DESCENDING(3);
int value;
private DirectionCompare(int typeValue) {
value = typeValue;
}
}
/**
* Call mummer nucmer program to align the sequences
* Parameters are:
* -c 250: Set the minimum cluster length to 250
* --mum: Use anchor matches that are unique in both the reference and query
*
* @param refFasta
* @param asmFasta
* @param outputDeltaFilePrefix
* @param outputDir
*/
public static void alignWithNucmer(String refFasta, String asmFasta, String outputDeltaFilePrefix, String outputDir,
String mummer4Path, int clusterSize) {
try {
// -c: Sets the minimum length of a cluster of matches
// --mum: Use anchor matches that are unique in both the reference and query
Long time = System.nanoTime();
// nucmer doesn't use threading when there is just 1 sequence in the "query" (assembly) file
// nucmer writes where ever the -p prefix tells it. Redirecting the output appears to have
// no effect. So the deltaFilePrefix must include the output directory. Otherwise, the delta
// file is written to the directory in which the program is executed.
String nucmer = mummer4Path + "/nucmer";
String clusterParam = Integer.toString(clusterSize);
ProcessBuilder builder = new ProcessBuilder(
nucmer, "-p", outputDeltaFilePrefix, "-c", clusterParam, "--mum", refFasta, asmFasta);
String redirectError = outputDir + "errNucmer.log";
builder.redirectError(new File(redirectError));
Process process;
// there is no output redirection - nucmer will always create delta file using prefix to determine directory
String runCommand = builder.command().stream().collect(Collectors.joining(" "));
myLogger.info(runCommand);
process = builder.start();
int error = process.waitFor();
if (error != 0) {
myLogger.error("Error creating delta file from nucmer: Error: " + error);
throw new IllegalStateException("Error creating delta File from nucmer ");
}
myLogger.info("Finished Mummer4 alignWithNucmer in " + (System.nanoTime()-time)/1e9 + " seconds");
} catch (Exception exc) {
exc.printStackTrace();
throw new IllegalStateException("Error running alignWithNucmer with ref: " + refFasta + " and assembly: " + asmFasta + " error: " + exc.getMessage());
}
}
/**
* Call mummer4 delta-filter method with parameter:
* -g 1-to-1 global alignment not allowing rearrangements
*
* NOTE: the -g option filters out many alignments, including inversions. Some of these
* will be added back when the "refilterCoords" method is run later.
* @param deltaFilePrefix
* @param outputDir
*/
public static void runDeltaFilter(String deltaFilePrefix, String outputDir, String mummer4Path) {
try {
// Possible delta-filter options: -g determined to fit our needs
// -g: 1-to-1 global alignment not allowing rearrangements, vs
// -1: 1-to-1 alignment allowing for rearrangements (intersection of -r and -q alignments)
Long time = System.nanoTime();
String deltaFilter = mummer4Path + "/delta-filter";
String deltaFile = deltaFilePrefix + ".delta";
ProcessBuilder builder = new ProcessBuilder(
deltaFilter, "-1", deltaFile);
String redirectError = outputDir + "errDeltaFilter.log";
builder.redirectError(new File(redirectError));
Process process;
String outputFiltered = deltaFilePrefix + ".delta_filtered";
File outputFile = new File(outputFiltered);
builder.redirectOutput(outputFile);
String runCommand = builder.command().stream().collect(Collectors.joining(" "));
myLogger.info(runCommand);
process = builder.start();
int error = process.waitFor();
if (error != 0) {
myLogger.error("Error creating delta filtered file from delta file " + deltaFilePrefix + ".delta Error: " + error);
throw new IllegalStateException("Error creating delta filtered File via show-coords ");
}
myLogger.info("Finished runDeltaFiltered in " + (System.nanoTime()-time)/1e9 + " seconds");
} catch (Exception exc) {
exc.printStackTrace();
throw new IllegalStateException("Error running runShowCoords against delta file: " + deltaFilePrefix + ".delta, error: " + exc.getMessage());
}
}
/**
* call mummer4 show-coords method
* @param deltaFilePrefix
* @param outputDir
*/
public static void runShowCoords(String deltaFilePrefix, String outputDir, String mummer4Path) {
// Run show-coords both against the delta, and the delta_filtered file.
try {
// -T: print results to tab-delimited file
// -H: do not print header information
// -r: Sort output lines by reference IDs and coordinates
Long time = System.nanoTime();
myLogger.info("Run show-coords on original delta file");
String deltaFile = deltaFilePrefix + ".delta";
String showCoords = mummer4Path + "/show-coords";
ProcessBuilder builder = new ProcessBuilder(
showCoords, "-T", "-r", "-H", deltaFile);
String redirectError = outputDir + "errCoords_orig.log";
builder.redirectError(new File(redirectError));
Process process;
String outputCoordsOrig = deltaFilePrefix + ".coords_orig";
File outputFile = new File(outputCoordsOrig);
builder.redirectOutput(outputFile);
String runCommand = builder.command().stream().collect(Collectors.joining(" "));
myLogger.info(runCommand);
process = builder.start();
int error = process.waitFor();
if (error != 0) {
myLogger.error("Error creating coords file from original delta file " + deltaFilePrefix + ".delta Error: " + error);
throw new IllegalStateException("Error creating coords File via show-coords ");
}
deltaFile = deltaFilePrefix + ".delta_filtered";
builder = new ProcessBuilder(
showCoords, "-T", "-r", "-H", deltaFile);
redirectError = outputDir + "errCoords_filtered.log";
builder.redirectError(new File(redirectError));
String outputCoordsFiltered = deltaFilePrefix + ".coords_filtered";
outputFile = new File(outputCoordsFiltered);
builder.redirectOutput(outputFile);
runCommand = builder.command().stream().collect(Collectors.joining(" "));
myLogger.info(runCommand);
process = builder.start();
error = process.waitFor();
if (error != 0) {
myLogger.error("Error creating coords file from filtered delta file " + deltaFilePrefix + ".delta Error: " + error);
throw new IllegalStateException("Error creating coords File via show-coords ");
}
myLogger.info("Finished runShowCoords in " + (System.nanoTime()-time)/1e9 + " seconds");
} catch (Exception exc) {
exc.printStackTrace();
throw new IllegalStateException("Error running runShowCoords against delta file: " + deltaFilePrefix + ".delta, error: " + exc.getMessage());
}
}
/**
* This method post-processes the filtered and original coords file when the mummer coords
* file came from a delta filtered with the -G option.
* It will
* 1. create a list of entries to add back based on groups of ascending/descending
* entries of at least 3 adjacent alignments whose distance from each other is
* less than a specified amount
* 2. adds the entries above (in a sorted manner) to the filtered coords list
*
* @param coordsDelta
* @param coordsDeltaG
* @param coordsGNoEmbedded
*/
public static void refilterCoordsFileMinusG(String outputDeltaFilePrefix, String coordsDelta, String coordsDeltaG,
String coordsGNoEmbedded, String chrom) {
List deltaLines = new ArrayList();
List gLines = new ArrayList();
List sortedRefCoordsList = new ArrayList();
try (BufferedReader deltaBR = Utils.getBufferedReader(coordsDelta);
BufferedReader deltaGbr = Utils.getBufferedReader(coordsDeltaG);
BufferedWriter bw1 = Utils.getBufferedWriter(coordsGNoEmbedded)){
String line = null;
while ((line = deltaBR.readLine()) != null) {
deltaLines.add(line);
}
while ((line = deltaGbr.readLine()) != null) {
gLines.add(line); // add to list that gets merged
}
List removedLines = new ArrayList();
for (String entry : deltaLines) {
if (!gLines.contains(entry) ) {
removedLines.add(entry);
}
}
List returnedLines = new ArrayList ();
if (removedLines.size() > 0) {
returnedLines = findAlignmentsToReturnFromAfterMinusGFilter(removedLines, gLines);
}
myLogger.info("refilterCoordsFile: number returned from findAlignmentsToReturn " + returnedLines.size());
myLogger.info("refilterCoordsFile: after findAlignmentsToReturn, now create sortedRefCoordsList");
// This processing keeps the ref ordering (vs appending all to the end)
// It is more code than using "contains" but runs in less than a 10th of the time.
Map,String> refStartToEntryMap = new HashMap<>();
for (String entry : gLines) {
int firstTab = entry.indexOf("\t");
int refStart = Integer.parseInt(entry.substring(0, firstTab));
String entryMinusRefStart = entry.substring(firstTab+1);
int refEnd = Integer.parseInt(entryMinusRefStart.substring(0,entryMinusRefStart.indexOf("\t")));
refStartToEntryMap.put(new Tuple(refStart,refEnd), entry);
}
for (String entry : returnedLines) {
int firstTab = entry.indexOf("\t");
int refStart = Integer.parseInt(entry.substring(0, firstTab));
String entryMinusRefStart = entry.substring(firstTab+1);
int refEnd = Integer.parseInt(entryMinusRefStart.substring(0,entryMinusRefStart.indexOf("\t")));
refStartToEntryMap.put(new Tuple(refStart,refEnd), entry);
}
List> refStartList = new ArrayList>(refStartToEntryMap.keySet());
Collections.sort(refStartList);
refStartList.stream().forEach(item -> {
sortedRefCoordsList.add(refStartToEntryMap.get(item));
});
// remove the embedded lines.
myLogger.info("refilterCoordsFile: call checkForEmbedded, orig list size: " + sortedRefCoordsList.size());
List coordsNoRefEmbedded = checkForEmbedded(sortedRefCoordsList, true); // remove ref embedded entries
List coordsNoRefOrAssemblyEmbedded = checkForEmbedded(coordsNoRefEmbedded, false); // remove assembly embedded entries
for (String entry : coordsNoRefOrAssemblyEmbedded) {
// Mummer:show-snps will use the original delta file (not filtered) and the
// coords file created here to call SNPs. For show-snps we want embedded entries removed, but
// the coordinates of remaining entries cannot change or show-snps barfs.
bw1.write(entry);
bw1.write("\n");
}
myLogger.info("refilterCoordsFile: finshed removing embedded, list size= " + coordsNoRefOrAssemblyEmbedded.size());
} catch (Exception exc) {
myLogger.debug(exc.getMessage(), exc);
throw new IllegalStateException("refilterCoordsFile failed with error: " + exc.getMessage());
}
return ;
}
/**
* Takes a mummer delta file filtered via the -1 option, determines which entries to keep.
* From the remaining, it removes embedded entries.
*
* @param outputDeltaFilePrefix
* @param coordsDelta
* @param coordsDelta1
* @param coordsGNoEmbedded
* @param chrom
*/
public static void refilterCoordsFile( String outputDeltaFilePrefix, String coordsDelta, String coordsDelta1,
String coordsGNoEmbedded, String chrom, int scoreThreshold) {
List deltaLines = new ArrayList();
List gLines = new ArrayList();
Map, String> refToEntryMap = new HashMap<>();
try (BufferedReader deltaBR = Utils.getBufferedReader(coordsDelta1);
BufferedWriter bw1 = Utils.getBufferedWriter(coordsGNoEmbedded)){
String line = null;
while ((line = deltaBR.readLine()) != null) {
deltaLines.add(line);
int tabIndex1 = line.indexOf("\t");
int tabIndex2 = line.indexOf("\t",tabIndex1+1);
int refStart = Integer.parseInt(line.substring(0,tabIndex1));
int refEnd = Integer.parseInt(line.substring(tabIndex1+1,tabIndex2));
refToEntryMap.put(new Tuple(refStart,refEnd), line);
}
long time = System.nanoTime();
List psfList = SyntenicAnchors.getPairedSimilarFragmentsFromMummer(deltaLines);
myLogger.info("After getPairedSimilarFragmentsFromMummer - size of list = " + psfList.size());
int score = 3;
int penalty = -4;
SyntenicAnchors.myPairedSimilarFragmentSort( psfList, score, penalty, scoreThreshold );
List psfLongestPathList = SyntenicAnchors.longestIncreasingSubsequenceLAGAN(psfList);
// longestIncreasingSubsequenceLAGAN resorts entries based on ascending assembly positions.
// Sort again based on reference position. Assumptions in overlap processing are that
// the list is sorted by ascending reference start.
Collections.sort(psfLongestPathList);
// Create list of coords entries to keep from the longestPath entries
List keptLines = new ArrayList();
for (PairedSimilarFragment psf : psfLongestPathList) {
String entry = refToEntryMap.get(new Tuple(psf.getRefStartPos(),psf.getRefEndPos()));
if (entry != null) {
keptLines.add(entry);
}
}
myLogger.info("refilterCoordsFile: number KEPT from processing SyntenicAnchors " + keptLines.size() + ", time to process: " + (System.nanoTime() - time)/1e9);
// remove the embedded lines.
List coordsNoRefEmbedded = checkForEmbedded(keptLines, true); // remove ref embedded entries
List coordsNoRefOrAssemblyEmbedded = checkForEmbedded(coordsNoRefEmbedded, false); // remove assembly embedded entries
for (String entry : coordsNoRefOrAssemblyEmbedded) {
// Mummer:show-snps will use the original delta file (not filtered) and the
// coords file created here to call SNPs. For show-snps we want embedded entries removed, but
// the coordinates of remaining entries cannot change or show-snps barfs.
bw1.write(entry);
bw1.write("\n");
}
myLogger.info("refilterCoordsFile: finshed removing embedded, list size= " + coordsNoRefOrAssemblyEmbedded.size());
} catch (Exception exc) {
myLogger.debug(exc.getMessage(), exc);
throw new IllegalStateException("refilterCoordsFile failed with error: " + exc.getMessage());
}
}
/**
* Takes a mummer coords file and searches for overlaps. All of the overlap goes to the
* first entry. If splitting in this manner results in a split in the middle or an assembly
* deletion, then split is adjusted so the deletion is contained in the first entry of the
* overlapped pair. The snpFile is used to determine indel positions.
*
* For now, assembly insertions may be split. This has not been a problem for variant context
* processing.
*
* @param coordsNoEmbedded
* @param snpFile SnpFIle used to determine assembly indels
* @param coordsFinal
*/
public static void filterCoordsOverlaps(String coordsNoEmbedded, String snpFile, String coordsFinal) {
try (BufferedWriter bw = Utils.getBufferedWriter(coordsFinal);
BufferedReader coordsBR = Utils.getBufferedReader(coordsNoEmbedded);
BufferedReader snpBR = Utils.getBufferedReader(snpFile)) {
List coordsNoRefOrAssemblyEmbedded = new ArrayList();
String line;
myLogger.info("filterCoordsOverlaps: Reading coordinates file " + coordsNoEmbedded);
while ((line = coordsBR.readLine()) != null) {
coordsNoRefOrAssemblyEmbedded.add(line);
}
myLogger.info("filterCoordsOverlaps: Reading snp file " + snpFile);
List snpList = new ArrayList();
while ((line = snpBR.readLine()) != null) {
snpList.add(line);
}
myLogger.info("filterCoordsOverlaps: calling splitOverlappingCoordsEntries for ref");
List overlapRefSplitList = splitOverlappingCoordsEntries(coordsNoRefOrAssemblyEmbedded, snpList,true).getX(); // process ref overlaps
// Assemblies can have overlapping start positions, and start positions
// out of order. It takes recursively going through to adjust entries to remove
// all overlaps, as in some cases we pitch previous entries. This is generally
// only repeated once.
boolean rerun = true;
int rerunCount = 0;
List overlapSplitList = new ArrayList();
overlapSplitList.addAll(overlapRefSplitList);
while (rerun) {
rerunCount++;
myLogger.info("filterCoordsOverlaps: calling splitOverlappingCoordsEntries for assembly, count=" + rerunCount);
Tuple,Boolean> overlapAllSplitListTemp = splitOverlappingCoordsEntries(overlapSplitList, snpList, false);
rerun = overlapAllSplitListTemp.getY();
overlapSplitList.clear();
overlapSplitList.addAll(overlapAllSplitListTemp.getX()); // save new list
}
// write lines from final filtered file to *.coords_final file
for (String entry : overlapSplitList) {
// This file is used in AssemblyProcessingUtils.parseCoordinateRegions
bw.write(entry);
bw.write("\n");
}
} catch (Exception exc) {
throw new IllegalStateException("MummerScriptProcessing:filterOverlaps: error reading or writing file: " + exc.getMessage());
}
}
/**
* Splits overlapping entries,
* The mummer4 coords file entries will have these tab-delimited columns:
* S1 E1 S2 E2 Len1 Len2 %ID refID asmID
* The files processed were sorted by ref-coordinates via the show-coords -r param, so S1/E1 is ref coords
* and S2/E2 are the assembly coordinates.
*
* When 2 entries are found to overlap, the first entry keeps its coordinates. The second will be truncated
* by the amount of the overlap. It is understood this may not be completely accurate as the position of
* indels is not considered.
*
* @param sortedList File to be filtered
* @return
*/
public static Tuple,Boolean> splitOverlappingCoordsEntries(List sortedList, List snpList, boolean splitByRef) {
System.out.println("splitOverlappingCoordsEntries: begin, size of sortedList " + sortedList.size());
String prevEntry = sortedList.get(0);
String currentEntry = sortedList.get(0);
Tuple prevStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(prevEntry,splitByRef);
boolean rerun = false;
// Create map of SNP entries for quicker processing
Multimap snpByRefMap = HashMultimap.create();
Multimap snpByAsmMap = HashMultimap.create();
try {
for (String entry : snpList) {
int tabIndex1 = entry.indexOf("\t");
int tabIndex2 = entry.indexOf("\t",tabIndex1+1);
int tabIndex3 = entry.indexOf("\t",tabIndex2+1);
int tabIndex4 = entry.indexOf("\t",tabIndex3+1);
int refPos = Integer.parseInt(entry.substring(0,tabIndex1));
int asmPos = Integer.parseInt(entry.substring(tabIndex3+1, tabIndex4));
snpByRefMap.put(refPos, entry);
snpByAsmMap.put(asmPos, entry);
}
} catch (Exception exc) {
throw new IllegalStateException("splitOverlappingCoordsEntries: error processsing SNP file " + exc.getMessage());
}
List splitOverlapList = new ArrayList();
// find overlapping ref alignments
int splitCount = 0;
int overlapCount = 0;
for (int idx=1; idx < sortedList.size(); idx++) {
currentEntry = sortedList.get(idx);
Tuple curStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(currentEntry,splitByRef); // true = ref
// overlap entry contains
// The sortedList is sorted by ref, so 1st always begins before second
// If second is embedded in first, Tuple.y is true - in which
// case we ditch the embedded entry, and entry previous remains as is.
// otherwise, alter both previous and current entries
Tuple overlapData = checkForOverlap(prevStartEnd,curStartEnd);
boolean isEmbedded = overlapData.getY();
boolean isReverseEmbedded = false;
if (overlapData.x > 0) { // if number of overlaps > 0, process
overlapCount++;
if (isEmbedded == false) { // 2nd not embedded in first
// check if first is not embedded in the second
Tuple revPrevStartEnd = curStartEnd;
Tuple revCurStartEnd = prevStartEnd;
Tuple overlapDataReverse = checkForOverlap(revPrevStartEnd,revCurStartEnd);
isReverseEmbedded = overlapDataReverse.getY();
if (isReverseEmbedded == false) { // neither entry embedded in the other, but there is an overlap
// prev entry stays the same, change ref start, asm start, and len in current entry
Tuple newEntries = adjustEntryForOverlap(prevEntry,currentEntry,overlapData.x, snpByRefMap, snpByAsmMap, splitByRef);
if (newEntries.getY().equals("NOLEN")) {
// The number of overlaps exceeded the size of the next entry.
// The "prevEntry" was adjusted. We drop the next entry as its ref length is now 0 or negative
rerun = true;
prevEntry = newEntries.getX();
prevStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(prevEntry,splitByRef);
} else if (newEntries.getX().equals("NOLEN")) {
// The "curEntry" starts before the "prevEntry" . In this case, the code
// chose whichever entry had the greatest length. If it was the second one
// ie the "curEntry", then we drop the previous one, hence the "NOLEN" value.
// Which could create another overlap , so we'll need to rerun overlap processing
rerun = true;
prevEntry = newEntries.getY();
prevStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(prevEntry,splitByRef);
} else {
Tuple newCurStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(newEntries.getY(),splitByRef);
splitOverlapList.add(newEntries.getX());
prevEntry = newEntries.getY();
prevStartEnd = newCurStartEnd;
}
splitCount++; // for debug info
} else { // first was embedded in the second
// drop the first by setting it to currentEntry, but DON'T put it on the list
prevEntry = currentEntry;
prevStartEnd = curStartEnd;
}
}
// if currentEntry is embedded in the previous, previous stays the same,
// we drop current by ignoring it here.
} else { // these 2 entries don't overlap
splitOverlapList.add(prevEntry);
prevEntry = currentEntry;
prevStartEnd = curStartEnd;
}
}
// Add last entry. It has already been adjusted if it overlapped the previous
splitOverlapList.add(prevEntry);
myLogger.info("splitOverlappingCoordsEntries: finalListSize " + splitOverlapList.size() + ", should match original list size " + sortedList.size());
myLogger.info("splitOverlappingCoordsEntries: overlapCount " + overlapCount + ", splitCount " + splitCount);
return new Tuple,Boolean>(splitOverlapList,rerun);
}
/**
* Check entries in a list of mummer4 coords file entries and removed those
* that are embedded
* @param sortedList List of sorted mummer coords file entries. Should be sorted by ref start
* @param splitByRef Boolean - if true, check ref embedded. Otherwise check if assembly coordinates are embedded
* @return
*/
public static List checkForEmbedded(List sortedList, boolean splitByRef){
List finalSplitList = new ArrayList();
if (sortedList.isEmpty()) return finalSplitList; // this shouldn't happen!
String prevEntry = sortedList.get(0);
String currentEntry = sortedList.get(0);
Tuple prevStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(prevEntry,splitByRef);
// find embedded alignments
for (int idx=1; idx < sortedList.size(); idx++) {
currentEntry = sortedList.get(idx);
Tuple curStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(currentEntry,splitByRef); // true = ref
// overlap entry contains
// The sortedList is sorted by ref, so 1st always begins before second
// If second is embedded in first, Tuple.y is true - in which
// case we ditch the embedded entry, and entry previous remains as is.
Tuple overlapData = checkForOverlap(prevStartEnd,curStartEnd);
boolean isEmbedded = overlapData.getY();
boolean isReverseEmbedded = false;
if (isEmbedded == false) {
// check if reverse is embedded.
// Reverse pair and recheck. THis catches the following type overlapping pair:
// 75585382 75582853
// 75587301 75581001
Tuple revPrevStartEnd = curStartEnd;
Tuple revCurStartEnd = prevStartEnd;
Tuple overlapDataReverse = checkForOverlap(revPrevStartEnd,revCurStartEnd);
isReverseEmbedded = overlapDataReverse.y;
if (isReverseEmbedded == false) { // neither entry embedded in the other
finalSplitList.add(prevEntry); // keep this entry
}
// If the first was embedded in the second, it is dropped by not putting it on the list
// Current entry becomes prevEntry if no overlap, or if first entry overlapped the second.
prevEntry = currentEntry;
prevStartEnd = curStartEnd;
}
}
finalSplitList.add(prevEntry); // add last entry
return finalSplitList;
}
// Adjust coordinates on overlapping mummer4 coords file entries
// When 2 coordinate entries overlap, the first one gets the sequence.
// The second one's start coordinates (both ref and assembly) are adjusted upward
// ( or downward if the values are descending) past the length of the overlap. The
// alignment lengths of both ref and assembly are also adjusted appropriately. If the split
// occurs in the middle of an assembly deletion relative to the reference, that sequence
// is added to the first of the 2 entries. We realize this
// may not be strictly accurate as there could be other deletion/insertion involved
// that would effect the start.
public static Tuple adjustEntryForOverlap(String prevEntry, String currentEntry,int numOverlaps,
Multimap snpByRefMap, Multimap snpByAsmMap, boolean splitByRef) {
// currentEntry is a tab-delimited string of:
// S1 E1 S2 E2 len1 len2 %id tag1 tag2
int tabIndex1 = currentEntry.indexOf("\t");
int tabIndex2 = currentEntry.indexOf("\t",tabIndex1+1);
int tabIndex3 = currentEntry.indexOf("\t",tabIndex2+1);
int tabIndex4 = currentEntry.indexOf("\t",tabIndex3+1);
int tabIndex5 = currentEntry.indexOf("\t",tabIndex4+1);
int tabIndex6 = currentEntry.indexOf("\t",tabIndex5+1);
// Get indices for the prevEntry.
int prevIndex1 = prevEntry.indexOf("\t");
int prevIndex2 = prevEntry.indexOf("\t",prevIndex1+1);
int prevIndex3 = prevEntry.indexOf("\t",prevIndex2+1);
int prevIndex4 = prevEntry.indexOf("\t",prevIndex3+1);
int prevIndex5 = prevEntry.indexOf("\t",prevIndex4+1);
int prevIndex6 = prevEntry.indexOf("\t",prevIndex5+1);
StringBuilder adjustedNewSB = new StringBuilder();
// Ref will all be ascending positions. Assembly (query)
// could be either
int refStart = Integer.parseInt(currentEntry.substring(0,tabIndex1));
int refAdjust = numOverlaps;
int asmAdjust = numOverlaps;
int asmStart = Integer.parseInt(currentEntry.substring(tabIndex2+1,tabIndex3));
int asmEnd = Integer.parseInt(currentEntry.substring(tabIndex3+1, tabIndex4));
boolean currentAsmDir = asmStart <= asmEnd ? true : false; // true if is ascending coordinates
int prevRefEnd = Integer.parseInt(prevEntry.substring(prevIndex1+1,prevIndex2));
int prevAsmStart = Integer.parseInt(prevEntry.substring(prevIndex2+1,prevIndex3));
int prevAsmEnd = Integer.parseInt(prevEntry.substring(prevIndex3+1,prevIndex4));
boolean prevAsmDir = prevAsmStart <= prevAsmEnd ? true : false; // true if is ascending
DirectionCompare directionCompare = DirectionCompare.BOTH_ASCENDING; // both prev and current asm entries are ascending
if (prevAsmDir == true) {
if (currentAsmDir == false) directionCompare = DirectionCompare.PREVASC_CURDESC; // prev asm ascending, current is descending
} else if (prevAsmDir == false) {
if (currentAsmDir == true) directionCompare = DirectionCompare.PREVDESC_CURASC; // prev asm descending, current is ascending
else directionCompare = DirectionCompare.BOTH_DESCENDING; // both prev and current asm alignments are descending
}
// if both entries have ascending assembly coordinates and the previous asm start is > current asm start,
// OR both entries have descending assembly coordinates and the previous asm start is < current asm start,
// the pick the entry with the greatest length. It gets too complicated to get the overlap
// correct in these cases. Using greatest length and choosing to keep the second entry means there
// could be an overlap with the last "previous" entry, so the calling method will re-process the overlaps.
if ((prevAsmStart > asmStart && directionCompare == DirectionCompare.BOTH_ASCENDING) ||
(prevAsmStart < asmStart && directionCompare == DirectionCompare.BOTH_DESCENDING)) {
// pick the alignment with the greatest length, pitch the other
int prevAsmLen = Integer.parseInt(prevEntry.substring(prevIndex5+1, prevIndex6));
int curAsmLen = Integer.parseInt(currentEntry.substring(tabIndex5+1, tabIndex6));
System.out.println("adjustEntryForOverlap: asmStart < prevAsmStart, prevLen=" + prevAsmLen
+ ", curAsmLen=" + curAsmLen + ", prevAsmStart " + prevAsmStart + ", asmStart " + asmStart);
if (prevAsmLen >= curAsmLen) {
// keep previous entry, drop current entry
return new Tuple(prevEntry,"NOLEN");
} else {
// keep current entry, drop previous entry
return new Tuple("NOLEN",currentEntry);
}
}
// Get the asm SNP position for this ref SNP position
int refSnpPos = refStart + refAdjust;
Collection refSnpEntries = snpByRefMap.get(refSnpPos);
int asmSnpPos = 0;
Collection asmSnps = null;
if (refSnpEntries.size() == 1) {
// Ref has an entry. Only 1 entry. Find asm position
// We purposefully don't process if Ref has multiple entries - those
// are assembly insertions.
String refEntry = refSnpEntries.iterator().next();
int tab1 = refEntry.indexOf("\t");
int tab2 = refEntry.indexOf("\t",tab1+1);
int tab3 = refEntry.indexOf("\t",tab2+1);
if (refEntry.substring(tab2+1, tab3).equals(".")) {
// if no asm deletion, we don't need to process in loop below
// Can't assume based on number of assembly SNPs at this position that we have an indel,
// as sometimes the same position was aligned to different regions of the ref, so more than
// 1 SNP, but not part of a deletion.
asmSnpPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(refEntry, 4, 12));
// Find all SNP file entries for the potential new assembly position from the coordinates file
asmSnps = snpByAsmMap.get(asmSnpPos);
}
}
// Previous entry does not get adjusted unless there is a SNP indel at the
// overlap spot. All the overlap goes to the prev entry, so its end and length
// don't change. The cur entry has its start adjusted based on the overlap
String adjustedPrevEntry = prevEntry;
if (asmSnps != null && !asmSnps.isEmpty()) {
// multiple entries for the ASM at this position, means the ref
// has sequence lacking in the assembly. Increase the reference
// beyond this
if (asmSnps.size() > 1) {
Tuple indelStartAndLen = AssemblyProcessingUtils.findRefIndelStart(refSnpPos,asmSnps);
int refIndelStart = indelStartAndLen.getX();
int asmSnpLen = indelStartAndLen.getY();
refAdjust += asmSnpLen;
if (refIndelStart < prevRefEnd + numOverlaps ) {
// Adjust again in case the indel started before the overlapping region we moved past
// The new start means the previous entry's end point should include these indels
// This adjusts the PREVIOUS entry's end point
// Looking for the number of bp indels that occurred AFTER the previous refEnd point
int prevEndAdjust = (refIndelStart - prevRefEnd) + asmSnpLen -1;
adjustedPrevEntry = getAdjustedMummerCoordEntry(prevEntry, prevEndAdjust, prevAsmDir);
}
if (refIndelStart < refStart + numOverlaps) {
int skipOverlapEntries = (refStart+numOverlaps) - refIndelStart;
// This is the number for starting the "current" ref - skipping over the overlapped
// regions and the indels.
refAdjust -= skipOverlapEntries; // subtract number of indel bps occurring BEFORE prev entry ended
}
asmAdjust = numOverlaps + 1; // move this up by 1 to skip past the indel
}
}
// All ref entries are ascending, so just add the adjustment to the start of the "current"
// The adjustments below are for the current entry
refStart += refAdjust;
// For assemblies: it is also the "current" that gets adjusted, but which end gets adjusted
// depends on the direction of both the current and previous entries.
if (directionCompare == DirectionCompare.BOTH_ASCENDING ) { // both increasing
// ex: 8340 8645
// 8640 8700 - (0) increase this start by 6
asmStart += asmAdjust;
} else if (directionCompare == DirectionCompare.PREVASC_CURDESC ){ // first=increasing, second=decreasing
if (asmStart > prevAsmEnd ) {
// ex: 8340 8645
// 8700 8640 (1) case to increase end by 6
asmEnd += asmAdjust;
} else {
// ex: 60710 61625
// 61014 56944 (1) case to decrease start by 305
asmStart -= asmAdjust;
}
} else if (directionCompare == DirectionCompare.PREVDESC_CURASC){ // first=decreasing, second=increasing
// ex: 8645 8325
// 8640 8700 (2) increase this start by 6
asmStart += asmAdjust;
} else if ( directionCompare == DirectionCompare.BOTH_DESCENDING){ //both decreasing
// ex: 8724 8251
// 8260 8000 (3) decrease this start by 10
asmStart -= asmAdjust;
}
int refLen = Integer.parseInt(currentEntry.substring(tabIndex4+1,tabIndex5));
refLen -= refAdjust;
int asmLen = Integer.parseInt(currentEntry.substring(tabIndex5+1,tabIndex6)); // Len1
asmLen -= asmAdjust;
// Any bps from an overlapping entry were given to the first entry. WHen adjusting,
// the assembly for the overlaps, an insertion could move the reference end point past
// the start of the next reference entry. In this case, drop the entry
if (refLen < 1 || asmLen < 1) {
myLogger.info("adjustForOverlap: after adjustment dropping currentEntry, length of ref: " + refLen
+ ", length of asm: " + asmLen);
return new Tuple(adjustedPrevEntry,"NOLEN");
}
adjustedNewSB.append(refStart).append("\t"); // S1
adjustedNewSB.append(currentEntry.substring(tabIndex1+1,tabIndex2)).append("\t"); // E1
adjustedNewSB.append(asmStart).append("\t"); // S2
adjustedNewSB.append(asmEnd).append("\t"); // E2
adjustedNewSB.append(refLen).append("\t");
adjustedNewSB.append(asmLen).append("\t"); // len2
adjustedNewSB.append(currentEntry.substring(tabIndex6+1)); // rest of entry - no tab, no newline needed
// returns a Tuple of adjusted values for
return new Tuple(adjustedPrevEntry,adjustedNewSB.toString());
}
// Create a new entry by increasing the ref and asm coordinate end point
// by the amount indicated in adjustLen. Also ajust the ref and assembly len
// fields.
// Mummer Coords entry has the form:
// S1 E1 S2 E2 len1 len2 %id tag1 tag2
public static String getAdjustedMummerCoordEntry(String entry, int adjustLen, boolean asmAscending){
int tabIndex1 = entry.indexOf("\t");
int tabIndex2 = entry.indexOf("\t",tabIndex1+1);
int tabIndex3 = entry.indexOf("\t",tabIndex2+1);
int tabIndex4 = entry.indexOf("\t",tabIndex3+1);
int tabIndex5 = entry.indexOf("\t",tabIndex4+1);
int tabIndex6 = entry.indexOf("\t",tabIndex5+1);
StringBuilder adjustedNewSB = new StringBuilder();
try {
int oldRefEnd = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, 2, 12));
int oldAsmEnd = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, 4, 12));
int oldRefLen = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, 5, 12));
int oldAsmLen = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, 6, 12));
int newRefEnd = oldRefEnd + adjustLen;
int newAsmEnd = asmAscending ? (oldAsmEnd + 1) : (oldAsmEnd - 1);
int newRefLen = oldRefLen + adjustLen;
int newAsmLen = oldAsmLen + adjustLen;
adjustedNewSB.append(entry.substring(0,tabIndex1)).append("\t");
adjustedNewSB.append(Integer.toString(newRefEnd)).append("\t");
adjustedNewSB.append(entry.substring(tabIndex2+1,tabIndex3)).append("\t");
adjustedNewSB.append(Integer.toString(newAsmEnd)).append("\t");
adjustedNewSB.append(Integer.toString(newRefLen)).append("\t");
adjustedNewSB.append(Integer.toString(newAsmLen)).append("\t");
adjustedNewSB.append(entry.substring(tabIndex6+1));
return adjustedNewSB.toString();
} catch (Exception exc) {
throw new IllegalStateException("MummerScriptProcessing:getAdjustedEntry error parsing entry, message: " + exc.getMessage());
}
}
/**
* Returns a Tuple indicating
* There will be inversions, so no guarantee prevStartEnd.x is <= curStartEnd.x
* @param prevStartEnd
* @param curStartEnd
* @return Tuple indicating (x) number of overlaps between the 2, and (y) was one embedded in the other?
*/
public static Tuple checkForOverlap(Tuple prevStartEnd, Tuple curStartEnd){
int numOverlap = 0;
int pStart = prevStartEnd.x;
int pEnd = prevStartEnd.y;
int cStart = curStartEnd.x;
int cEnd = curStartEnd.y;
// Adjust if coordinates are reversed
if (prevStartEnd.x > prevStartEnd.y ) {
pStart = prevStartEnd.y;
pEnd = prevStartEnd.x;
}
if (curStartEnd.x > curStartEnd.y ) {
cStart = curStartEnd.y;
cEnd = curStartEnd.x;
}
// Using De Morgan's law to determine overlap
if (pEnd >= cStart && cEnd >= pStart) {
// there is overlap - determine the amount
int maxStart = Math.max(pStart,cStart);
int minEnd = Math.min(pEnd, cEnd);
numOverlap = (minEnd-maxStart) + 1;
}
// check for embedded
if ((cStart >= pStart) && (cEnd <= pEnd)) {
return new Tuple(numOverlap,true); //
}
// if no overlap, numOverlap will be 0
return new Tuple(numOverlap,false);
}
/**
* Takes a list of mummer4 coordinates removed during delta-filter with -g and determines
* which coordinates should be returned. Those returned have reference range coordinates that
* fall between 2 existing kept alignment reference range coordinates, and whose assembly coordinates
* fall between the assembly coordinates of the 2 ranges. Most often, these will be
* alignments where the assembly aligned on the reverse strand.
* Nothing is added back that falls before the start of the coordinates created with delta-filter
* -g, or after the end of the delta-filter -g list of coordinates.
*
* If a removed entry falls before the first of the pair of gList entries, and extends into
* the second gList entry, it is not added.
*
* The entries returned are limited to alignments where the ref length is at least 1000 bps.
*
* Any entries added that overlap each other will be handled in later processing
* when all overlaps are handled.
*
* The mummer4 delta-filter with -g performed 1-1 global alignment with no rearrangements.
* This purpose of findAlignmentsToReturn() is to return inversions that appear on the diagonal.
*
* NOTE: This method is obsolete, but kept to facilitate user desire to process with mummer -G filtering
* @param removedList
* @param gList
* @return
*/
public static List findAlignmentsToReturnFromAfterMinusGFilter(List removedList, List gList) {
List returnItemList = new ArrayList();
int glistIdx = 1; // current of the second entry into the filtered coordinates list
glistIdx = findNonOverlappingGlistPair( gList, glistIdx);
if (glistIdx == gList.size()) return returnItemList; // no overlapping positions
String gListLowString = gList.get(glistIdx-1);
String gListHighString = gList.get(glistIdx);
Tuple gLowRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,true);
Tuple gHighRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,true);
Tuple gLowAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,false);
Tuple gHighAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,false);
Range gRefRange = Range.closed(gLowRefStartEnd.getY(), gHighRefStartEnd.getX());
// The mummer4 -g filtered coordinates processing does not allow inversions/rearrangements, so ASM start
// should always be less than ASM end (Not true with the removed list from the original coords processing)
Range gAsmRange = Range.closed(gLowAsmStartEnd.getY(), gHighAsmStartEnd.getX());
// Walk through all the removed entries.
// Add back those where the removed ref coordinates fall between the gList line's ref coordinates and the removed
// assembly coordinates are within the diagonal, ie between gList line's assembly coordinates.
// The removedList coordinates that fall within will probably be reversed.
for (int ridx = 0; ridx < removedList.size();) {
String removedEntry = removedList.get(ridx);
Tuple remRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(removedEntry,true);
Tuple remAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(removedEntry,false);
int refLen = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(removedEntry, 5, 9));
if (refLen < 1000) {
// only adding back if ref len is > 1000
ridx++;
continue;
}
Range removedAsmRange;
// There will be inversions in the removed coordinates
if (remAsmStartEnd.getX() < remAsmStartEnd.getY()) {
removedAsmRange = Range.closed(remAsmStartEnd.getX(), remAsmStartEnd.getY());
} else {
removedAsmRange = Range.closed(remAsmStartEnd.getY(), remAsmStartEnd.getX());
}
// Ref range coordinates are always ascending
Range removedRefRange = Range.closed(remRefStartEnd.getX(), remRefStartEnd.getY());
if (gRefRange.encloses(removedRefRange)) {
// ref coordinates are within range - check the assembly coordinates
if (gAsmRange.encloses(removedAsmRange)) {
returnItemList.add(removedEntry);
} // else the entry is dropped
ridx++; // move to the next "removed" entry
} else if (remRefStartEnd.getX() < gRefRange.lowerEndpoint()) {
// not adding anything until the removed entry's ref coordinates have reached
// the glist ref coordinate start
ridx++;
} else if (remRefStartEnd.getX() > gRefRange.upperEndpoint()) {
// need to move to next gList entry until we have 2 entries
// whose ref coordinates surround the removed entry list coordinates
glistIdx++;
if (glistIdx == gList.size()) break; // no more entries to add
glistIdx = findNonOverlappingGlistPair( gList, glistIdx);
if (glistIdx == gList.size()) {
// The coords files are ordered by ref coordinate.
// If the current "removed" entry starts beyond the last
// gList entry, then all remaining are beyond it so none will
// be added back to the list
break; // no more entries will be added
} else {
gListLowString = gList.get(glistIdx-1);
gListHighString = gList.get(glistIdx);
gLowRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,true);
gHighRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,true);
gLowAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,false);
gHighAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,false);
gRefRange = Range.closed(gLowRefStartEnd.getY(), gHighRefStartEnd.getX());
// The mummer4 -g filtered coordinates processing does not allow inversions/rearrangements, so ASM start
// should always be less than ASM end (Not true with the removed list from the original coords processing)
gAsmRange = Range.closed(gLowAsmStartEnd.getY(), gHighAsmStartEnd.getX());
}
// Not increasing ridx here. We're still processing the same "removed" entry but against
// a new gList range.
} else {
// Here is one we process. Removed entry's ref range start falls within the gList ref range coordinates.
// Need to check the assembly, and it if falls within the diagonal, it is kept.
if (remRefStartEnd.getX() > gRefRange.lowerEndpoint() && remRefStartEnd.getX() < gRefRange.upperEndpoint()) {
if (gAsmRange.encloses(removedAsmRange)) {
returnItemList.add(removedEntry);
} // else the entry is dropped
}
ridx++; // move to the next "removed" entry
}
}
// these are all the inversions we'll add.
return returnItemList;
}
/**
* Traverse a list of mummer coords entries. Create lists of alignments
* that have a minimum of 3 in a row where assembly is in the same
* direction (forward or reverse) and the distance between the end
* of one entry and the beginning of the next is > 0.01 percent.
*
* Return a concatenated list of all the entries kept from the original file.
*
* This method is expected to be used with a mummer coords file created
* from a delta file filtered with the -1 option.
*
* @param origList
* @return
*/
public static List findAlignmentsToRemove_fromMinus1Filter(List origList) {
List returnItemList = new ArrayList();
List tempList = new ArrayList();
int numConsecutive = 4;
int totalRefLen = 0;
String prevEntry = origList.get(0);
Tuple prevAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(prevEntry,false);
boolean prevDirectionAscending = true;
tempList.add(prevEntry);
if (prevAsmStartEnd.x > prevAsmStartEnd.y) {
prevDirectionAscending = false;
}
if (origList.size() == 1) {
// This happens with phgSmallSeq testing. Though unlikely, it could
// happen in the real world.
returnItemList.addAll(origList);
myLogger.info("findAlignmentsToReturn: origList size is 1 - add to returnList and quit");
return returnItemList;
}
// loop through origList to find entries that should be kept
myLogger.info("findAlignmentsToReturn: origList size: " + origList.size());
for (int idx=1; idx < origList.size(); idx++) {
String currentEntry = origList.get(idx);
boolean currentDirectionAscending = false;
Tuple curAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(currentEntry,false);
int refLen = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(currentEntry, 5, 9));
if (curAsmStartEnd.x < curAsmStartEnd.y) {
currentDirectionAscending = true;
}
if (currentDirectionAscending != prevDirectionAscending) {
// direction has changed, stop processing the current list
if (tempList.size() >= numConsecutive && totalRefLen >= 2000) {
returnItemList.addAll(tempList);
}
tempList.clear();
totalRefLen = 0;
} else { // both current and prev entries are moving in same direction
// calculate the distance between the current and previous entries
double distance = AssemblyProcessingUtils.calculateCoordDistance(prevAsmStartEnd,curAsmStartEnd);
if (distance > 0.01 ) { // distance is too great, store/clear tempList
if (tempList.size() >= numConsecutive && totalRefLen >= 2000) {
returnItemList.addAll(tempList);
}
tempList.clear();
totalRefLen = 0;
}
}
if ( refLen >= 250) { // alignment must be reasonable length
tempList.add(currentEntry);
totalRefLen += refLen;
}
prevEntry = currentEntry;
prevAsmStartEnd = curAsmStartEnd;
prevDirectionAscending = currentDirectionAscending;
}
if (tempList.size() >= numConsecutive && totalRefLen >= 2000) {
returnItemList.addAll(tempList); // add final entries
}
return returnItemList;
}
/**
* Given a list of mummer coords entries and an index, finds the entry where this entry
* and the one before it have no overlaps in either the ref or assembly coordinates.
* The index returned is for the second of the 2 entries. This method is used when
* searching for mummer delta file entries removed during -g delta filtering that we
* would like to return.
*
* @param gList
* @param glistIdx
* @return
*/
public static int findNonOverlappingGlistPair(List gList, int glistIdx) {
// glistIdx is the current index into the list of coordinates held in gList.
// To get low, do one less, to get current high, use glistIdx.
// increment glistIdx until you have one where neither ref nor assembly coordinates
// are overlapping.
String gListLowString = gList.get(glistIdx-1);
String gListHighString = gList.get(glistIdx);
Tuple gLowRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,true);
Tuple gHighRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,true);
Tuple gLowAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,false);
Tuple gHighAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,false);
while (glistIdx+1 < gList.size()) {
boolean keep = true;
if (gLowRefStartEnd.getY() > gHighRefStartEnd.getX() || gLowAsmStartEnd.getY() > gHighAsmStartEnd.getX()) {
keep = false;
}
if (keep) return glistIdx;
else {
glistIdx++;
gListLowString = gListHighString;
gListHighString = gList.get(glistIdx);
gLowRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,true);
gHighRefStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,true);
gLowAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListLowString,false);
gHighAsmStartEnd = AssemblyProcessingUtils.getStartEndCoordinates(gListHighString,false);
}
}
if ((gLowRefStartEnd.getY() <= gHighRefStartEnd.getX()) && (gLowAsmStartEnd.getY() <= gHighAsmStartEnd.getX() )) {
// we found a good range - return the index into the filtered coords file
return glistIdx;
}
return gList.size(); // indicates we've exhausted the list
}
/**
* Fun the mummer4 show-snps entry against a delta file, using a coords file as
* additional input. From the command line, this would look like:
* cat coords_file | show-snps -T -r -H -S deltaFile
* @param deltaFilePrefix
* @param coordsForShowSnps
* @param outputDir
*/
public static void runShowSNPsWithCat(String deltaFilePrefix, String coordsForShowSnps, String outputDir, String mummer4Path, String chrom) {
try {
// Start a shell process which will cat the contents of the coords file
Process process1 = new ProcessBuilder("/bin/bash").start();
String deltaFile = deltaFilePrefix + ".delta";
myLogger.info("runShowSNPs: Using delta file " + deltaFile);
myLogger.info("runSHowSNPs: Using coordsFile " + coordsForShowSnps);
// Create process2. This will run the mummer4 show-snps command. IT is run here
// with the -S option. This option expects a coords file from stdin. Once show-snps
// is started, it waits for the coords file input.
String showSnps = mummer4Path + "/show-snps";
ProcessBuilder builder = new ProcessBuilder(
showSnps, "-T", "-r", "-H", "-S", deltaFile);
String runCommand = builder.command().stream().collect(Collectors.joining(" "));
System.out.println(runCommand);
String redirectError = outputDir + "/_errSNPs.log";
builder.redirectError(new File(redirectError));
String outputSNPS = deltaFilePrefix + ".snps_prefiltered";
File outputFile = new File(outputSNPS);
builder.redirectOutput(outputFile);
Process process2 = builder.start();
// Send output of cat'ing the coords file to the mummer4:show_snps script
// The constructor calls "start" when begins piping the output of process1
// to the input of process2
RedirectStreams redirectStreams = new RedirectStreams(process1,process2);
// Write the "cat coordsfile" command to the outputstream of process1
BufferedWriter bufferedWriter = new BufferedWriter(new OutputStreamWriter(process1.getOutputStream()));
String command = "cat " + coordsForShowSnps;
System.out.println("RunShowSNPs Input process1: " + command);
bufferedWriter.write(command + '\n');
bufferedWriter.close();
int error = process2.waitFor();
if (error != 0) {
System.out.println("Error creating snps file from delta file " + deltaFile + " Error: " + error);
throw new IllegalStateException("Error creating SNPS File via show-snps ");
}
// Read the created SNP file into a list
BufferedReader origBR = Utils.getBufferedReader(outputSNPS);
List origSnpsList = new ArrayList();
String line;
while ((line = origBR.readLine()) != null) {
origSnpsList.add(line);
}
origBR.close();
// verifySnpEntries is now called after final coordinate processing
} catch (Exception exc) {
throw new IllegalStateException("MummerScriptProcessing:runShowSNPsWithCat Error processing initial SNP file " + exc.getMessage());
}
}
public static void finalSnpFiltering(String outputSNPS, String deltaFilePrefix, String chrom) {
// Read the created SNP file into a list
String snpFile = deltaFilePrefix + ".snps_final";
try (BufferedReader origBR = Utils.getBufferedReader(outputSNPS);
BufferedWriter snpBW = Utils.getBufferedWriter(snpFile)) {
List origSnpsList = new ArrayList();
String line;
while ((line = origBR.readLine()) != null) {
origSnpsList.add(line);
}
// Find which of these entries have positions represented in the mummer coords file after filtering
String coordsFile = deltaFilePrefix + ".coords_final";
List finalSnps = verifySNPEntries(origSnpsList, coordsFile, chrom);
// write final snp file
for (String entry: finalSnps) {
snpBW.write(entry);
snpBW.write("\n");
}
} catch (Exception exc) {
throw new IllegalStateException("Error processing initial final SNP file " + exc.getMessage());
}
}
/**
* This method calls show-snps using only a delta file as input
* @param deltaFilePrefix
* @param outputDir
*/
public static void runShowSNPs(String deltaFilePrefix, String outputDir, String mummer4Path, String chrom) {
try {
// -T: print results to tab-delimited file
// -H: do not print header information
// -r: Sort output lines by reference IDs and coordinates
Long time = System.nanoTime();
// Run show-snps with original delta
String deltaFile = deltaFilePrefix + ".delta";
String showSnps = mummer4Path + "/show-snps";
ProcessBuilder builder = new ProcessBuilder(
showSnps, "-T", "-r", "-H", deltaFile);
String redirectError = outputDir + "errSNPsOrig.log";
builder.redirectError(new File(redirectError));
String outputSNPS = deltaFilePrefix + ".snps_orig";
File outputFile = new File(outputSNPS);
builder.redirectOutput(outputFile);
String runCommand = builder.command().stream().collect(Collectors.joining(" "));
myLogger.info(runCommand);
Process process = builder.start();
int error = process.waitFor();
if (error != 0) {
myLogger.error("Error creating snps file from delta file " + deltaFile + " Error: " + error);
throw new IllegalStateException("Error creating SNPS File via show-snps ");
}
// Read the created SNP file into a list
BufferedReader origBR = Utils.getBufferedReader(outputSNPS);
List origSnps = new ArrayList();
String line;
while ((line = origBR.readLine()) != null) {
origSnps.add(line);
}
origBR.close();
// Find which of these entries have positions represented in the mummer coords file after filtering
String coordsFile = deltaFilePrefix + ".coords_final";
List finalSnps = verifySNPEntries(origSnps, coordsFile, chrom);
// write final snp file
String snpFile = deltaFilePrefix + ".snps_final";
BufferedWriter snpBW = Utils.getBufferedWriter(snpFile);
for (String entry: finalSnps) {
snpBW.write(entry);
snpBW.write("\n");
}
snpBW.close();
myLogger.info("Finished runShowSNPs in " + (System.nanoTime()-time)/1e9 + " seconds");
} catch (Exception exc) {
exc.printStackTrace();
throw new IllegalStateException("Error running runShowSNPs against delta file: " + deltaFilePrefix + ".delta, error: " + exc.getMessage());
}
}
/**
* This method takes a list of Mummer SNP file entries and verifies the SNP
* positions are represented in the filtered/overlap-merged coords
* file.
*
* Additional filtering is performed to remove SNPs that occurred in overlapped
* coordinates entries, resulting in duplicate SNPs with differing assembly
* positions for the same reference position.
*
* A list of "represented" SNPs is returned.
*
* NOTE: Snp entries must be added in order.
*
* @param deltaSNPs - list of SNPs from a Mummer filtered delta file
* @param coordsFile - coordinates file to use when checking for positions.
* @parameter chrom - the reference chrom, used to create a Position object
* @return
*/
public static List verifySNPEntries(List deltaSNPs, String coordsFile, String chrom){
List finalSNPList = new ArrayList();
myLogger.info("mergeVerifySNPEntries: begin");
// Create coords List from coordsFile
List finalCoordsList = new ArrayList();
try (BufferedReader coordsBR = Utils.getBufferedReader(coordsFile)){
String coordsLine;
while ((coordsLine = coordsBR.readLine()) != null) {
finalCoordsList.add(coordsLine);
}
} catch (IOException ioe) {
throw new IllegalStateException("mergeVerifySNPEntries: error readingfilteredCoordsFile "
+ coordsFile + " message: " + ioe.getMessage());
}
// The SNP file entry is a tab-delimited line comprise of columns:
// RefPos RefSNP AsmSNP AsmPos BUFF DIST R Q LenR LenQ FRM1 FRM2 RefTag AsmTag
int refPosCol = 1;
int asmPosCol = 4;
int refSNPcol = 2;
int asmSNPcol = 3;
int totalColumns = 12;
RangeMap> finalCoordsRangeMap = AssemblyProcessingUtils.getCoordsRangeMap(finalCoordsList, chrom);
HashSet refHashSet = new HashSet(); // to ensure we don't keep multiple SNPs for same position
HashSet asmHashSet = new HashSet();
int snpIdx = 0;
// Save the entry on a map of referencePos to Tuple
Map> snpRefToAsmPosMap = new HashMap>();
try {
System.out.println("verifySNPEntries - begin, snpIdx = 0);, deltaSNPs.size = " + deltaSNPs.size());
while (snpIdx < deltaSNPs.size() ) {
String entry = deltaSNPs.get(snpIdx);
boolean snpInRange = AssemblyProcessingUtils.checkSnpEntryInRange( entry, finalCoordsRangeMap, chrom);
// SNP can be in range here, but can be a duplicate because show-snps was run on
// the coords file BEFORE overlapped were merged. Different assembly alignments mapping
// to the same reference coordinates could show different snps.
if (snpInRange) {
int refPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refPosCol, totalColumns));
int asmPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmPosCol, totalColumns));
String curRefSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refSNPcol, totalColumns);
String curAsmSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmSNPcol, totalColumns);
if (!curRefSNPVal.equals(".") && !curAsmSNPVal.equals(".")) {
// If neither the ref nor the asm position have been seen, add them to the "keep" lists
if (!refHashSet.contains(refPos) || !asmHashSet.contains(asmPos)) {
refHashSet.add(refPos);
asmHashSet.add(asmPos);
snpRefToAsmPosMap.put(refPos,Arrays.asList(entry));
}
snpIdx++; // process next SNP entry
continue;
} else if (curRefSNPVal.equals(".")) { // assembly insertion
// create the indel
List tmpSnpRefToAsmPosList = new ArrayList();
List asmPosList = new ArrayList();
asmPosList.add(asmPos);
tmpSnpRefToAsmPosList.add(entry);
int savedRefPos = refPos;
int prevAsmPos = asmPos;
snpIdx++;
// This block could be inside the while below except for needing to know
// the direction of the assembly. If the assembly values are increasing,
// prevAsmPos would default to asmPos-1. If they are decreasing, it would
// default to asmPos+1. THe (Mat.abs(asmPos - prevAsmPos) == 1 will fail
// if this is wrong, hence this additional else clause.
if (snpIdx < deltaSNPs.size()) {
entry = deltaSNPs.get(snpIdx);
refPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refPosCol, totalColumns));
asmPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmPosCol, totalColumns));
curRefSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refSNPcol, totalColumns);
curAsmSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmSNPcol, totalColumns);
}
while (snpIdx < deltaSNPs.size() && refPos == savedRefPos && curRefSNPVal.equals(".") && (Math.abs(asmPos - prevAsmPos) == 1)) {
// next entry is good, add to list
asmPosList.add(asmPos);
tmpSnpRefToAsmPosList.add(entry);
prevAsmPos = asmPos;
snpIdx++;
if (snpIdx < deltaSNPs.size() ){
entry = deltaSNPs.get(snpIdx);
refPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refPosCol, totalColumns));
asmPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmPosCol, totalColumns));
curRefSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refSNPcol, totalColumns);
curAsmSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmSNPcol, totalColumns);
}
}
// we have processed the whole indel
if (!refHashSet.contains(savedRefPos) && checkNoListEntriesInRange(asmHashSet, asmPosList)) {
refHashSet.add(savedRefPos);
int lastEntry = asmPosList.size() - 1;
int asmStart = asmPosList.get(0) < asmPosList.get(lastEntry) ? asmPosList.get(0) : asmPosList.get(lastEntry);
int asmEnd = asmPosList.get(0) < asmPosList.get(lastEntry) ? asmPosList.get(lastEntry) : asmPosList.get(0);
for (int idx = asmStart; idx <= asmEnd; idx ++) {
asmHashSet.add(idx);
}
snpRefToAsmPosMap.put(savedRefPos,tmpSnpRefToAsmPosList);
}
} else { // assembly deletion related to ref
// To this map add a set of SNP positions consistent with just 1 alignment
Map> tmpSnpRefToAsmPosMap = new HashMap>();
// Stores the ref to SNP entry mapping for this set of assembly deletions
SetMultimap refToIndelMap = HashMultimap.create();
// Stores the asm to SNP entry mappings for this set of assembly deletions
SetMultimap asmToIndelMap = HashMultimap.create();
refToIndelMap.put(refPos, entry);
asmToIndelMap.put(asmPos, entry);
List refPosList = new ArrayList();
int savedAsmPos = asmPos;
int prevRefPos = refPos;
// Get the next entry to see if we are still processing a deletion
snpIdx++;
if (snpIdx < deltaSNPs.size() ){
entry = deltaSNPs.get(snpIdx);
refPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refPosCol, totalColumns));
asmPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmPosCol, totalColumns));
curRefSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refSNPcol, totalColumns);
curAsmSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmSNPcol, totalColumns);
}
// because of alignment overlaps (PHG-324) asmPos will not always be the same.
// Instead we check that refPos either equals prevRefPos OR it increases by 1 AND assembly val = "."
// When refPos = prevRefPos, it means there was an overlapping alignment with the same SNP
while (snpIdx < deltaSNPs.size() && curAsmSNPVal.equals(".") && ( refPos == prevRefPos || (Math.abs(refPos - prevRefPos) == 1)) ) {
// next entry is good, add to list
refPosList.add(refPos);
// There should never be multiple entries where both ref and asm positions are the same
// This implies duplicate due to previous mummer overlapping entries - don't add it.
if (!(refPos == prevRefPos && asmPos == savedAsmPos)) {
refToIndelMap.put(refPos, entry);
asmToIndelMap.put(asmPos, entry);
}
prevRefPos = refPos;
snpIdx++;
if (snpIdx < deltaSNPs.size() ){
entry = deltaSNPs.get(snpIdx);
refPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refPosCol, totalColumns));
asmPos = Integer.parseInt(AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmPosCol, totalColumns));
curRefSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, refSNPcol, totalColumns);
curAsmSNPVal = AssemblyProcessingUtils.getEntryFromTabDelimitedLine(entry, asmSNPcol, totalColumns);
}
}
// we have processed the whole indel - choose a set of SNP deletion entries
if (asmToIndelMap.keySet().size() == 1) {
// no overlapping alignments, no duplicate SNP values - just add them
List refPositionList = new ArrayList<>(refToIndelMap.keySet());
Collections.sort(refPositionList);
for (int pos : refPositionList) { // should only be 1 entry
tmpSnpRefToAsmPosMap.put(pos,(new ArrayList(refToIndelMap.get(pos))));
refPosList.add(pos); // to be checked for duplicates later
}
} else {
// We have overlaps - choose. Looking at the SNP files, sometimes choosing the
// lowest asm coordinate fell in a seeming chronological order, and sometimes
// choosing the highest asm coordinate mapped better chronologically. In general,
// while the ref positions are ALWAYS in order, assembly positions are not.
// A third alternative is to choose the position that had the most consecutive
// deletions in this series. Often, though, they were equal.
//
// This code uses the lowest asm coordinate as it more often (but not always) fell within
// a chronological order.
List asmPositionList = new ArrayList<>(asmToIndelMap.keySet());
Collections.sort(asmPositionList); // get lowest asm position that maps at this deletion series
Collection entries = asmToIndelMap.get(asmPositionList.get(0));
List entriesAsList = new ArrayList<>(entries);
Collections.sort(entriesAsList);
for (String snpEntry : entriesAsList) {
int refSNPposition = Integer.parseInt(snpEntry.substring(0, snpEntry.indexOf("\t")));
refPosList.add(refSNPposition);
tmpSnpRefToAsmPosMap.put(refSNPposition,Arrays.asList(snpEntry));
}
// reset savedAsmPos to used for duplicate check below
savedAsmPos = asmPositionList.get(0);
}
// check for duplicate positions, add final
if (!asmHashSet.contains(savedAsmPos) && checkNoListEntriesInRange(refHashSet, refPosList)) {
asmHashSet.add(savedAsmPos);
int lastEntry = refPosList.size() - 1;
int refStart = refPosList.get(0) < refPosList.get(lastEntry) ? refPosList.get(0) : refPosList.get(lastEntry);
int refEnd = refPosList.get(0) < refPosList.get(lastEntry) ? refPosList.get(lastEntry) : refPosList.get(0);
for (int idx = refStart; idx <= refEnd; idx++) {
refHashSet.add(idx);
}
snpRefToAsmPosMap.putAll(tmpSnpRefToAsmPosMap);
}
}
} else {
snpIdx++;
}
}
} catch (Exception exc) {
System.out.println("verifySNPEntries exception message: " + exc.getMessage());
exc.printStackTrace();
}
// Create the final list
List refPosMap = new ArrayList(snpRefToAsmPosMap.keySet());
Collections.sort(refPosMap);
for (int refSNPpos : refPosMap) {
List snpTupleEntry = snpRefToAsmPosMap.get(refSNPpos);
snpTupleEntry.stream().forEach(item -> {
finalSNPList.add(item); // add show-snps entry to final list
});
}
myLogger.info("mergeVerifySNPEntries: total original SNPS " + deltaSNPs.size()
+ ", number after filtering by coords_final coordinates " + finalSNPList.size());
return finalSNPList;
}
/**
* Method takes a RangeSet of Integers and a list of integers.
* It returns "false" if any value from the list is present in the RangeSet
* It returns "true" if no value from the list is present in the RangeSet
* @param hashSet
* @param testList
* @return
*/
public static boolean checkNoListEntriesInRange(HashSet hashSet, List testList) {
for (int entry : testList) {
if (hashSet.contains(entry)) return false;
}
return true;
}
}