net.maizegenetics.pangenome.processAssemblyGenomes.AssemblyProcessingUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.processAssemblyGenomes;
import java.io.*;
import java.sql.Connection;
import java.sql.ResultSet;
import java.util.*;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import com.google.common.collect.*;
import htsjdk.variant.variantcontext.*;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.CreateGraphUtils;
import net.maizegenetics.pangenome.db_loading.*;
import org.apache.log4j.Logger;
import net.maizegenetics.analysis.gbs.neobio.BasicScoringScheme;
import net.maizegenetics.analysis.gbs.neobio.IncompatibleScoringSchemeException;
import net.maizegenetics.analysis.gbs.neobio.InvalidSequenceException;
import net.maizegenetics.analysis.gbs.neobio.PairwiseAlignment;
import net.maizegenetics.analysis.gbs.neobio.PairwiseAlignmentAlgorithm;
import net.maizegenetics.analysis.gbs.neobio.ScoringScheme;
import net.maizegenetics.analysis.gbs.neobio.SmithWaterman;
import net.maizegenetics.analysis.gbs.repgen.AlignmentInfo;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
/**
* This class contains methods useful for processing assembly haplotypes.
*
* @author lcj34
*
*/
public class AssemblyProcessingUtils {
private static final Logger myLogger = Logger.getLogger(AssemblyProcessingUtils.class);
/**
* Create range map from list of entries from a Mummer coordinates file
*
* @param coordsList List containing tab-delimited strings from a Mummer coordinates file
* @param chrom String with chromosome name
* @return RangeMap: A rangemap of the coordinate values form the mummer coords file list
*/
public static RangeMap> getCoordsRangeMap(List coordsList, String chrom) {
// Create RangeSet
RangeMap> coordsRangeMap = TreeRangeMap.create();
Chromosome chr = Chromosome.instance(chrom);
for (String entry : coordsList) {
// S1 E1 S2 E2 len1 len2 %id tag1 tag2
int tabIndex1 = entry.indexOf("\t");
int tabIndex2 = entry.indexOf("\t",tabIndex1+1);
int tabIndex3 = entry.indexOf("\t",tabIndex2+1);
int tabIndex4 = entry.indexOf("\t",tabIndex3+1);
int refStart;
int refEnd;
refStart = Integer.parseInt(entry.substring(0,tabIndex1));
refEnd = Integer.parseInt(entry.substring(tabIndex1+1,tabIndex2));
Range refPositionRange = Range.closed(Position.of(chr,refStart), Position.of(chr, refEnd));
int asmStart = Integer.parseInt(entry.substring(tabIndex2+1,tabIndex3));
int asmEnd = Integer.parseInt(entry.substring(tabIndex3+1,tabIndex4));
if (asmStart > asmEnd) { // asm could be ascending or descending
int temp = asmEnd;
asmEnd = asmStart;
asmStart = temp;
}
coordsRangeMap.put(refPositionRange, Arrays.asList(Position.of(chr,asmStart),Position.of(chr,asmEnd)));
}
return coordsRangeMap;
}
/**
* Returns the value for a specific column in a tab-delimited string
* @param mline
* @param entryColumn - the column number (1-based) of the entry the caller wants
* @param totalColumns - total number of tab-delimited columns in the line
* @return String containing the column contents
*/
public static String getEntryFromTabDelimitedLine(String mline, int entryColumn, int totalColumns) {
int[] tabPos = new int[totalColumns-1];
int len = mline.length();
int tabIndex = 0;
for (int idx = 0; (tabIndex < totalColumns) && (idx < len); idx++) {
if (mline.charAt(idx) == '\t') {
tabPos[tabIndex++] = idx;
}
}
int begin;
if (entryColumn == 1) {
begin = 0;
} else {
begin = tabPos[entryColumn-2] + 1; // -2 for list is 0-based, and 1st tab is after first column
}
int end;
if (entryColumn == totalColumns) {
end = mline.length();
} else {
end = tabPos[entryColumn-1];
}
return mline.substring(begin,end);
}
/**
* Find the start/end coordinates from a tab-delmimited Mummer4 coords file entry
* @param entry: Line from a mummer coords file
* @param ref Boolean: if true, get ref start end. Otherwise, get assembly start/end
* @return
*/
public static Tuple getStartEndCoordinates(String entry, boolean ref) {
// Finds the start and end coordinate position from the input file.
// For ref, these occur in the 1st and 2nd fields of the tab-delimited string
// For assembly, these occur in the 3rd and 4th fields of the tab-delimited string.
int tabIndex1 = entry.indexOf("\t");
int tabIndex2 = entry.indexOf("\t",tabIndex1+1);
int tabIndex3 = entry.indexOf("\t",tabIndex2+1);
int tabIndex4 = entry.indexOf("\t",tabIndex3+1);
int start;
int end;
if (ref) {
start = Integer.parseInt(entry.substring(0,tabIndex1));
end = Integer.parseInt(entry.substring(tabIndex1+1,tabIndex2));
} else {
start = Integer.parseInt(entry.substring(tabIndex2+1,tabIndex3));
end = Integer.parseInt(entry.substring(tabIndex3+1,tabIndex4));
}
return new Tuple(start,end);
}
/**
* Calculates distance between 2 sets of mummer coords file entries
* This is called on coordinates that are both either ascending (start < end)
* or both descending (start > end) so "sign" of entries is not checked here.
*
* @param prev
* @param current
* @return
*/
public static double calculateCoordDistance(Tuple prev, Tuple current) {
int endStartDiff = Math.abs(prev.y-current.x);
double distance = (double)endStartDiff/prev.y;
return distance;
}
/**
* Verifies if the positions from a Mummer4 snp file fall within the range map
* of reference and assembly positions created from the Mummer4 coordinates files.
* @param mline
* @return
*/
public static boolean checkSnpEntryInRange(String mline, RangeMap> coordsRangeMap, String chrom) {
int totalSnpColumns = 12; // total number of columns in tab-delimited show-snps file WITHOUT -C option
int refPosCol = 1; // column containing reference position in mummer4 SNP file
int asmPosCol = 4; // column containing assembly position in mummer4 SNP file
// get snp positions for ref and assembly
int refPos = Integer.parseInt(getEntryFromTabDelimitedLine(mline, refPosCol, totalSnpColumns));
int asmPos = Integer.parseInt(getEntryFromTabDelimitedLine(mline, asmPosCol, totalSnpColumns));
Position refAsPos = Position.of(chrom, refPos);
RangeMap> rangeWithPosMap = coordsRangeMap.subRangeMap(Range.closed(refAsPos, refAsPos));
if ( rangeWithPosMap.asMapOfRanges().isEmpty()) return false; // reference position not included in map
List asmCoords = rangeWithPosMap.get(refAsPos);
if (asmCoords == null) return false; // ref position not in range map
// If asm position falls in range that mapped to this reference position, return true
if (asmPos >= asmCoords.get(0).getPosition() && asmPos <= asmCoords.get(1).getPosition() ){
return true;
}
return false; // asm coordinate does not map to reference coordinate alignments that are stored.
}
public static List findVCListForAnchor(RangeMap positionRangeToVariantContextMap, Position refStart, Position refEnd) {
List parsedVCList = new ArrayList();
RangeMap anchorSubRangeMap
= positionRangeToVariantContextMap.subRangeMap(Range.closed(refStart,refEnd));
anchorSubRangeMap.asMapOfRanges().values().stream().forEach(vc -> {
parsedVCList.add(vc);
});
return parsedVCList;
}
/**
* This method takes a RangeSet of integers, and a single range. It finds all
* the ranges in the set that intersect the targetRange.
* Calculate both the number of bases from the targetRange that are represented
* in the rangeSet, and the percentage of the bases represented. Return a Tuple with
* this information.
*
* @param rangeSet
* @param targetRange
* @return
*/
public static Tuple getRegionCoverage(RangeSet rangeSet, Range targetRange) {
int rangeSize = (targetRange.upperEndpoint() - targetRange.lowerEndpoint()) +1;
// This just gives overlaps.
// Not looking for the exact ranges in the set, but rather the number of positions
// from the targetRange that are included in the rangeSet
RangeSet numberSubRangeSet = rangeSet.subRangeSet(targetRange);
int numBases = 0;
for (Range range : numberSubRangeSet.asRanges()) {
numBases += ((range.upperEndpoint() - range.lowerEndpoint()) +1);
}
double percentRangeCovered = ((double)(numBases)/rangeSize) * 100 ;
return new Tuple(numBases,percentRangeCovered);
}
/**
* Method to parse the Mummer SNP file into a rangemap
* The first String in the tuple is for the reference call
* The second String is for the assembly call
* @param fileName
* @param chromosome
* @return
*/
public static RangeMap> parseMummerSNPFile(String fileName, String chromosome) {
RangeMap> snpMap = TreeRangeMap.create();
try(BufferedReader reader = Utils.getBufferedReader(fileName)){
String currentLine = reader.readLine();
int tabIndex1 = currentLine.indexOf("\t");
int tabIndex2 = currentLine.indexOf("\t",tabIndex1+1);
int tabIndex3 = currentLine.indexOf("\t",tabIndex2+1);
int tabIndex4 = currentLine.indexOf("\t",tabIndex3+1);
int prevRefPos = Integer.parseInt(currentLine.substring(0,tabIndex1));
int prevRefPosStart = prevRefPos;
int prevAsmPos = Integer.parseInt(currentLine.substring(tabIndex3+1,tabIndex4));
String currentRefCall = removeMissing(currentLine.substring(tabIndex1+1,tabIndex2));
String currentCall = removeMissing(currentLine.substring(tabIndex2+1,tabIndex3));
while((currentLine = reader.readLine()) != null) {
tabIndex1 = currentLine.indexOf("\t");
tabIndex2 = currentLine.indexOf("\t",tabIndex1+1);
tabIndex3 = currentLine.indexOf("\t",tabIndex2+1);
tabIndex4 = currentLine.indexOf("\t",tabIndex3+1);
int currRefPos = Integer.parseInt(currentLine.substring(0,tabIndex1));
int currAsmPos = Integer.parseInt(currentLine.substring(tabIndex3+1,tabIndex4));
String refAllele = removeMissing(currentLine.substring(tabIndex1+1,tabIndex2));
String altAllele = removeMissing(currentLine.substring(tabIndex2+1,tabIndex3));
//Need to check this as we can have overlapping
if((currRefPos == prevRefPos && refAllele.length() == 0) || (currAsmPos == prevAsmPos && altAllele.length() == 0)) {
currentRefCall += refAllele;
currentCall += altAllele;
}
else {
//Add the call to the snpMap
snpMap.put(Range.closed(Position.of(chromosome,prevRefPosStart),Position.of(chromosome,prevRefPos)),new Tuple(currentRefCall,currentCall));
prevRefPosStart = currRefPos;
currentRefCall = refAllele;
currentCall = altAllele;
}
prevRefPos = currRefPos;
prevAsmPos = currAsmPos;
}
snpMap.put(Range.closed(Position.of(chromosome,prevRefPosStart),Position.of(chromosome,prevRefPos)),new Tuple(currentRefCall,currentCall));
}
catch(Exception e) {
throw new IllegalStateException(e);
}
return snpMap;
}
private static String removeMissing(String call) {
if(call.equals(".")) {
return "";
}
else {
return call;
}
}
/**
* Method to parse out the reference coordinates into a map which along with the SNP data can then be used to create Variants.
* @param coordFile
* @param chromosome
* @return
*/
public static Map,List> parseCoordinateRegions(String coordFile, String chromosome) {
Map,List> mappedRegions = new LinkedHashMap<>();
try(BufferedReader reader = Utils.getBufferedReader(coordFile)) {
String currentLine = "";
while((currentLine = reader.readLine()) != null) {
int tabIndex1 = currentLine.indexOf("\t");
int tabIndex2 = currentLine.indexOf("\t",tabIndex1+1);
int tabIndex3 = currentLine.indexOf("\t",tabIndex2+1);
int tabIndex4 = currentLine.indexOf("\t",tabIndex3+1);
int refStart = Integer.parseInt(currentLine.substring(0,tabIndex1));
int refEnd = Integer.parseInt(currentLine.substring(tabIndex1+1,tabIndex2));
int asmStart = Integer.parseInt(currentLine.substring(tabIndex2+1,tabIndex3));
int asmEnd = Integer.parseInt(currentLine.substring(tabIndex3+1,tabIndex4));
mappedRegions.put(Range.closed(Position.of(chromosome, refStart),Position.of(chromosome, refEnd)),
Arrays.asList(Position.of(chromosome,asmStart),Position.of(chromosome,asmEnd)));
}
}
catch(Exception e) {
throw new IllegalStateException(e);
}
return mappedRegions;
}
/**
* Creates a RangeMap of asm positions from the given reference range map,
* using lower reference position as the value.
*
* @param refCoords
* @return
*/
public static RangeMap createAsmCoordinatesRangeMap (Map,List> refCoords) {
RangeMap asmMap = TreeRangeMap.create();
refCoords.entrySet().stream().forEach(entry -> {
List asmCoords = entry.getValue();
Position refPos = entry.getKey().lowerEndpoint();
Position asmCurrentStart = asmCoords.get(0);
Position asmCurrentEnd = asmCoords.get(1);
boolean currentDirection = (asmCurrentStart.compareTo(asmCurrentEnd) <= 0 );
Range asmRange = (currentDirection)? Range.closed(asmCurrentStart,asmCurrentEnd) : Range.closed(asmCurrentEnd,asmCurrentStart);
// Storing ref staring position as the value for the asm range entry
asmMap.put(asmRange, refPos);
});
return asmMap;
}
/**
* Given a map of ranges and a range, calculate the number
* of positions within the given range that are represented
* in the RangeMap.
*
* @param asmCoveredMap
* @param asmRange
* @return
*/
public static int calculateRegionCovered(RangeMap asmCoveredMap, Range asmRange) {
Map, Position> asMap = asmCoveredMap.asMapOfRanges();
int totalPositions = 0;
for (Map.Entry, Position> entry : asMap.entrySet()) {
int upperPos = entry.getKey().upperEndpoint().getPosition();
int lowerPos = entry.getKey().lowerEndpoint().getPosition();
int count = upperPos - lowerPos;
totalPositions += count;
}
return totalPositions;
}
/**
* Test method to try to merge overlapping coordinates.
* We thought Show-SNPs would recall the SNPs in the delta file based on this information, but this was not the case.
* @param coordFile
* @param chromosome
* @return
*/
@Deprecated
public static Map,List> mergeCoords(String coordFile, String chromosome) {
Map,List> mappedRegions = new LinkedHashMap<>();
RangeSet coveredRefRanges = TreeRangeSet.create();
try(BufferedReader reader = Utils.getBufferedReader(coordFile)) {
String currentLine = "";
while((currentLine = reader.readLine()) != null) {
int tabIndex1 = currentLine.indexOf("\t");
int tabIndex2 = currentLine.indexOf("\t",tabIndex1+1);
int tabIndex3 = currentLine.indexOf("\t",tabIndex2+1);
int tabIndex4 = currentLine.indexOf("\t",tabIndex3+1);
int refStart = Integer.parseInt(currentLine.substring(0,tabIndex1));
int refEnd = Integer.parseInt(currentLine.substring(tabIndex1+1,tabIndex2));
int asmStart = Integer.parseInt(currentLine.substring(tabIndex2+1,tabIndex3));
int asmEnd = Integer.parseInt(currentLine.substring(tabIndex3+1,tabIndex4));
boolean updatedRange = false;
for(int i = refStart; i <= refEnd; i++) {
if(coveredRefRanges.contains(Position.of(chromosome, i))) {
//We need to merge this one with the the range which overlaps
Range overlappingRefRange = coveredRefRanges.rangeContaining(Position.of(chromosome,i));
List asmCoordinates = mappedRegions.get(overlappingRefRange);
myLogger.debug("Merging:"+overlappingRefRange);
if((asmCoordinates.get(0).getPosition() <= asmCoordinates.get(1).getPosition() && asmStart <= asmEnd) ||
(asmCoordinates.get(0).getPosition() >= asmCoordinates.get(1).getPosition() && asmStart >= asmEnd) ) {
if(!overlappingRefRange.encloses(Range.closed(Position.of(chromosome, refStart),Position.of(chromosome, refEnd)))) {
Range newRefRange = Range.closed(overlappingRefRange.lowerEndpoint(), Position.of(chromosome, refEnd));
List newASMRange = Arrays.asList(asmCoordinates.get(0), Position.of(chromosome, asmEnd));
mappedRegions.remove(overlappingRefRange);
mappedRegions.put(newRefRange, newASMRange);
coveredRefRanges.add(newRefRange);
updatedRange = true;
}
}
else{
//ignore as the assembly coordinates do not agree on direction
}
break;
}
}
if(!updatedRange) {
coveredRefRanges.add(Range.closed(Position.of(chromosome, refStart),Position.of(chromosome, refEnd)));
mappedRegions.put(Range.closed(Position.of(chromosome, refStart),Position.of(chromosome, refEnd)),
Arrays.asList(Position.of(chromosome,asmStart),Position.of(chromosome,asmEnd)));
}
}
}
catch(Exception e) {
throw new IllegalStateException(e);
}
return mappedRegions;
}
/**
* Test method to resize the coordinate files so they are not overlapping
* @param coordFile
* @param chromosome
* @return
*/
public static Map,List> resizeCoords(String coordFile, String chromosome) {
Map,List> mappedRegions = new LinkedHashMap<>();
RangeSet coveredRefRanges = TreeRangeSet.create();
try(BufferedReader reader = Utils.getBufferedReader(coordFile)) {
String currentLine = "";
while((currentLine = reader.readLine()) != null) {
int tabIndex1 = currentLine.indexOf("\t");
int tabIndex2 = currentLine.indexOf("\t",tabIndex1+1);
int tabIndex3 = currentLine.indexOf("\t",tabIndex2+1);
int tabIndex4 = currentLine.indexOf("\t",tabIndex3+1);
int refStart = Integer.parseInt(currentLine.substring(0,tabIndex1));
int refEnd = Integer.parseInt(currentLine.substring(tabIndex1+1,tabIndex2));
int asmStart = Integer.parseInt(currentLine.substring(tabIndex2+1,tabIndex3));
int asmEnd = Integer.parseInt(currentLine.substring(tabIndex3+1,tabIndex4));
boolean updatedRange = false;
for(int i = refStart; i <= refEnd; i++) {
if(coveredRefRanges.contains(Position.of(chromosome, i))) {
//We have an overlap so we need to scale this referenceRange to the part which is not overlapping
//We need to resize this one based on the range which overlaps
Range overlappingRefRange = coveredRefRanges.rangeContaining(Position.of(chromosome,i));
List asmCoordinates = mappedRegions.get(overlappingRefRange);
myLogger.debug("Resizing:"+overlappingRefRange);
if(!overlappingRefRange.encloses(Range.closed(Position.of(chromosome, refStart),Position.of(chromosome, refEnd)))) {
int lengthOfOverlap = overlappingRefRange.upperEndpoint().getPosition() - refStart +1;
Range newRefRange = Range.closed(Position.of(chromosome, refStart + lengthOfOverlap), Position.of(chromosome, refEnd));
List newASMRange = Arrays.asList(Position.of(chromosome,asmStart + lengthOfOverlap), Position.of(chromosome, asmEnd));
mappedRegions.put(newRefRange, newASMRange);
coveredRefRanges.add(newRefRange);
updatedRange = true;
}
break;
}
}
if(!updatedRange) {
coveredRefRanges.add(Range.closed(Position.of(chromosome, refStart),Position.of(chromosome, refEnd)));
mappedRegions.put(Range.closed(Position.of(chromosome, refStart),Position.of(chromosome, refEnd)),
Arrays.asList(Position.of(chromosome,asmStart),Position.of(chromosome,asmEnd)));
}
}
}
catch(Exception e) {
throw new IllegalStateException(e);
}
return mappedRegions;
}
/**
* Utility method to export out the merged regions
* @param mergedCoords
* @param fileName
*/
@Deprecated
public static void exportMergedRegions(Map,List> mergedCoords, String fileName) {
try(BufferedWriter writer = Utils.getBufferedWriter(fileName)) {
Set> refPositionRanged = mergedCoords.keySet();
for(Range currentRefPositionRange : refPositionRanged) {
List asmCoordinateRange = mergedCoords.get(currentRefPositionRange);
writer.write(""+currentRefPositionRange.lowerEndpoint().getPosition());
writer.write("\t");
writer.write(""+currentRefPositionRange.upperEndpoint().getPosition());
writer.write("\t");
writer.write(""+asmCoordinateRange.get(0).getPosition());
writer.write("\t");
writer.write(""+asmCoordinateRange.get(1).getPosition());
writer.write("\t");
writer.write(""+(currentRefPositionRange.upperEndpoint().getPosition()-currentRefPositionRange.lowerEndpoint().getPosition()));
writer.write("\t");
writer.write(""+(asmCoordinateRange.get(1).getPosition()-asmCoordinateRange.get(0).getPosition()));
writer.write("\t9\tchr9\n");
}
}
catch(Exception e) {
throw new IllegalStateException(e);
}
}
/**
* Method to fill in the unmapped regions coming from nucmer. This method will create multi-bp indels between the mapped regions which can then be added to the SNP list for processing into Variants.
* @param coordinates
* @param refSequence
* @param asmSequence
* @return
*/
public static RangeMap> setupIndelVariants(Map,List> coordinates, GenomeSequence refSequence, GenomeSequence asmSequence) {
RangeMap> snpMap = TreeRangeMap.create();
List> referencePositionCoordinates = coordinates.keySet().stream().collect(Collectors.toList());
// Create RangeMap of Assembly positions for verifying which have already been covered
RangeMap asmRanges = createAsmCoordinatesRangeMap (coordinates);
Range previousRange = referencePositionCoordinates.get(0);
//Set the direction if increasing direction = true
boolean direction = (coordinates.get(previousRange).get(0).compareTo(coordinates.get(previousRange).get(1)) <= 0 );
int asmRangeCounter = 0;
int lengthOfOverlap = 0;
int numAlreadyCovered = 0;
int totalPossibleIndels = 0;
int refRangeOverlaps = 0;
int directionChanged = 0;
for(int rangeIndex = 1; rangeIndex < referencePositionCoordinates.size(); rangeIndex++) {
Range currentRange = referencePositionCoordinates.get(rangeIndex);
Position previousEnd = previousRange.upperEndpoint();
Position currentStart = currentRange.lowerEndpoint();
Position currentEnd = currentRange.upperEndpoint();
Position asmPreviousStart = coordinates.get(previousRange).get(0);
Position asmPreviousEnd = coordinates.get(previousRange).get(1);
Position asmCurrentStart = coordinates.get(currentRange).get(0);
Position asmCurrentEnd = coordinates.get(currentRange).get(1);
boolean currentDirection = (asmCurrentStart.compareTo(asmCurrentEnd) <= 0 );
totalPossibleIndels++;
if(currentDirection != direction) {
//this means we swapped direction
myLogger.debug("AssemblyProcessingUtils setupIndelVariants: Not creating indel between mapped regions, Assembly direction switching:"+createUserReadableStringPositionRanges(previousRange,currentRange));
direction = (coordinates.get(currentRange).get(0).compareTo(coordinates.get(currentRange).get(1)) <= 0 );
previousRange = currentRange;
directionChanged++;
continue;
}
if(currentEnd.getPosition() < previousEnd.getPosition()) {
//Skip these as the whole mappings reference positions are covered by the previous mapping
refRangeOverlaps++;
myLogger.debug("AssemblyProcessingUtils setupIndelVariants: Not creating indel between mapped regions, Reference ranges overlap:"+createUserReadableStringPositionRanges(previousRange,currentRange));
continue;
}
Range currentASMRange = (currentDirection)? Range.closed(asmCurrentStart,asmCurrentEnd) : Range.closed(asmCurrentEnd,asmCurrentStart);
if(currentASMRange.contains(asmPreviousStart) || currentASMRange.contains(asmPreviousEnd)) {
myLogger.debug("AssemblyProcessingUtils setupIndelVariants: Not creating indel between mapped regions, Assembly ranges overlap:"+createUserReadableStringPositionRanges(previousRange,currentRange));
asmRangeCounter++;
lengthOfOverlap+= Math.abs(asmCurrentStart.getPosition() - asmPreviousEnd.getPosition());
previousRange = currentRange;
continue;
}
// see if range is already represented in part of the map.
// This attempts to remove long indels.
Range asmRange = null;
Chromosome chrom = asmCurrentStart.getChromosome();
int prevEnd = asmPreviousEnd.getPosition();
int curStart = asmCurrentStart.getPosition();
if ((asmPreviousEnd.compareTo(asmCurrentStart) > 0)) {
if ((curStart+1) == prevEnd) {
asmRange = null; // there is no gap between the ranges
} else {
asmRange = Range.closed(Position.of(chrom, asmCurrentStart.getPosition()+1), Position.of(chrom, asmPreviousEnd.getPosition()-1));
}
} else if (asmPreviousEnd.compareTo(asmCurrentStart) < 0) {
if ((prevEnd + 1) == curStart) {
asmRange = null;
} else {
asmRange = Range.closed(Position.of(chrom, asmPreviousEnd.getPosition()+1), Position.of(chrom, asmCurrentStart.getPosition()-1));
}
}
double percentCovered = 0;
int totalPositionsCovered = 0;
if (asmRange != null) {
int asmRangeSize = asmRange.upperEndpoint().getPosition() - asmRange.lowerEndpoint().getPosition();
RangeMap asmCoveredMap = asmRanges.subRangeMap(asmRange);
totalPositionsCovered = calculateRegionCovered( asmCoveredMap, asmRange);
percentCovered = (double)totalPositionsCovered/asmRangeSize;
}
if (percentCovered >= 0.20) {
//this means assembly mapping was out of order. Don't create VC for the jump between mappings
myLogger.debug("AssemblyProcessingUtils setupIndelVariants: Not creating indel between mapped regions, Assembly mapped out of order, percent already covered:" + percentCovered +":"+createUserReadableStringPositionRanges(previousRange,currentRange));
previousRange = currentRange;
numAlreadyCovered++;
continue;
}
// Made it here, we are adding the range. Add these coordinates to the
// asmCoveredMap so they don't get added again if asm mappings in coords file are
// skipping around.
if (asmRange != null) {
asmRanges.put(asmRange,currentRange.lowerEndpoint());
}
String asmCalls = "";
//Assembly is out of order we need to reverse
if(asmPreviousEnd.compareTo(asmCurrentStart) > 0) {
asmCalls = asmSequence.genotypeAsString(asmCurrentStart.getChromosome(),asmCurrentStart.getPosition()+1,asmPreviousEnd.getPosition()-1);
}
else if(asmPreviousEnd.compareTo(asmCurrentStart) < 0) {
asmCalls = asmSequence.genotypeAsString(asmCurrentStart.getChromosome(),asmPreviousEnd.getPosition()+1,asmCurrentStart.getPosition()-1);
}
if (asmCalls.length() > 1000000) {
String endStart = asmPreviousEnd.getPosition() + "\t" + asmCurrentStart.getPosition();
myLogger.debug("setupIndels: TOO BIG: asmPreviousEnd: " + asmPreviousEnd.getPosition()
+ ", asmCurrentStart position: " + asmCurrentStart.getPosition()
+ ", total positions already covered " + totalPositionsCovered + ", percent covered: " + percentCovered);
}
Position newEndOfInterMapRegion = Position.of(currentStart.getChromosome(),currentStart.getPosition()-1);
Position newStartOfInterMapRegion = Position.of(previousEnd.getChromosome(),previousEnd.getPosition()+1);
String refCalls = "";
if(newStartOfInterMapRegion.compareTo(newEndOfInterMapRegion) < 0) {
refCalls = refSequence.genotypeAsString(previousEnd.getChromosome(),previousEnd.getPosition()+1,currentStart.getPosition()-1);
if(asmCalls.length() > refCalls.length()) {
//Insertion case as we have more assembly alleles than reference
//Need to add range [previousEndPosition - (currentStart-1)]
snpMap.put(Range.closed(previousEnd,newEndOfInterMapRegion),new Tuple(refCalls,asmCalls));
}
else if(asmCalls.length() < refCalls.length()) {
//Deletion case as we have more reference alleles than assembly
//Need to add range [previousEndPosition+1 - (currentStart-1)]
snpMap.put(Range.closed(newStartOfInterMapRegion,newEndOfInterMapRegion),new Tuple(refCalls,asmCalls));
}
else {
//Simple multi allele SNP case
//Need to add Range [previousEndPosition - (currentStart-1)] but add the first allele to the start
String starterAllele = refSequence.genotypeAsString(previousEnd.getChromosome(),previousEnd.getPosition(),previousEnd.getPosition());
snpMap.put(Range.closed(previousEnd,newEndOfInterMapRegion),new Tuple(starterAllele+refCalls,starterAllele+asmCalls));
}
}
else if(previousEnd.compareTo(currentStart) == 0 || newStartOfInterMapRegion.compareTo(newEndOfInterMapRegion) >= 0){
//Slide up both 1 from previous end so it matches SNP file
if(!(asmCalls.equals("") && refCalls.equals(""))) {
Position insertionPosition = Position.of(previousEnd.getChromosome(), previousEnd.getPosition());
snpMap.put(Range.closed(insertionPosition, insertionPosition), new Tuple(refCalls, asmCalls));
}
}
else {
//we have an overlapping range. we do not add anything in for this case.
}
previousRange = currentRange;
}
myLogger.debug("Number Of overlapping assemblies: "+ asmRangeCounter);
myLogger.debug("Avg length of overlap: "+((double)lengthOfOverlap/asmRangeCounter));
myLogger.debug("Total possible indels: " + totalPossibleIndels + " Number refOverlaps: " + refRangeOverlaps + ", num sequence already covered at 40 percent: " + numAlreadyCovered);
return snpMap;
}
public static RangeSet getIndelRanges(RangeMap> coordinates) {
List> referencePositionCoordinates = coordinates.asMapOfRanges().keySet().stream().collect(Collectors.toList());
RangeSet indelRegions = TreeRangeSet.create();
Range previousRange = referencePositionCoordinates.get(0);
for(int rangeIndex = 1; rangeIndex < referencePositionCoordinates.size(); rangeIndex++) {
Range currentRange = referencePositionCoordinates.get(rangeIndex);
Position previousEnd = previousRange.upperEndpoint();
Position currentStart = currentRange.lowerEndpoint();
Position asmPreviousEnd = coordinates.asMapOfRanges().get(previousRange).get(1);
Position asmCurrentStart = coordinates.asMapOfRanges().get(currentRange).get(0);
//Check to see if we have a gap in the assembly
if(asmPreviousEnd.compareTo(asmCurrentStart)<0) {
//Check for diagonal gap
if(previousEnd.compareTo(currentStart)<0) {
//Add one to the start and subtract one from the end as we do not have overlapping endpoints
indelRegions.add(Range.closed(Position.of(previousEnd.getChromosome(),previousEnd.getPosition()+1),
Position.of(currentStart.getChromosome(),currentStart.getPosition()-1)));
}
else if(previousEnd.compareTo(currentStart)==0) {
//Means a long insertion broke up the clusters. We need to return a range of one element
indelRegions.add(Range.closed(previousEnd,currentStart));
}
}
else if( asmPreviousEnd.compareTo(asmCurrentStart)==0)
{
//We have a deletion
//Make sure the reference Coordiantes are not equal as well, if so RangeSet will coalesce correctly
if(previousEnd.compareTo(currentStart)!=0) {
indelRegions.add(Range.closed(previousEnd,currentStart));
}
}
previousRange = currentRange;
}
return indelRegions;
}
/**
* Method to build list of VariantContexts as RefRangeVCs - used when the reference and
* assembly have identical chromosome data
* @param refSequence
* @param assemblyName
* @param anchors
* @return
*/
public static List createVCasRefBlock(GenomeSequence refSequence, String assemblyName, RangeSet anchors, Map,List> refMappings ) {
List rangeVCList = new ArrayList<>();
// This should only be called from AssemblyHapltoypesMUltiThreadPlugin or AssemblyHaplotypesPlugin
// when the refMappings contained a single element and there is no SNP data.
if (refMappings.keySet().size() > 1) {
// This is a developer directive -
throw new IllegalArgumentException("createVCasRefBlock only applies to singly mapped coordinates file. Alter code if you want it to apply otherwise");
}
Collection> coordsKeys = refMappings.keySet();
for (Range key : coordsKeys){
List asmPositions = refMappings.get(key);
int refStart = key.lowerEndpoint().getPosition();
int refEnd = key.upperEndpoint().getPosition();
int asmStart = asmPositions.get(0).getPosition();
int asmEnd = asmPositions.get(1).getPosition();
if (refStart == asmStart && refEnd == asmEnd) {
// Run a loop through all the anchors
anchors.asRanges().stream().forEach(range -> {
VariantContext vc = AssemblyProcessingUtils.createRefRangeVC(refSequence, assemblyName, range.lowerEndpoint(), range.upperEndpoint(), range.lowerEndpoint(), range.upperEndpoint());
rangeVCList.add(vc);
});
} else {
throw new IllegalArgumentException("createVCasRefBlock: ref and asm coords do not match - this should never happen here");
}
}
return rangeVCList;
}
/**
* Method to build the list of VariantContexts based on the mapped coordinates and the SNPs
* @param refSequence
* @param assemblyName
* @param anchors
* @param refMappings
* @param snps
* @return
*/
public static List extractAnchorVariantContextsFromAssemblyAlignments(GenomeSequence refSequence, String assemblyName, RangeSet anchors, Map,List> refMappings , RangeMap> snps) {
List anchorVariants = new ArrayList<>();
List> mappedCoordinateRanges = refMappings.keySet().stream().collect(Collectors.toList());
//TODO Add in the anchors for the first and last unmapped regions if they exist.
Range currentRange = mappedCoordinateRanges.get(0);
int currentRangeCounter = 1;
Position refRangeStartPosition = currentRange.lowerEndpoint();
List currentRangeASMPositions = refMappings.get(currentRange);
Position lastProcessedASMPosition = Position.of(currentRangeASMPositions.get(0).getChromosome(),currentRangeASMPositions.get(0).getPosition()-1);
boolean isASMCoordIncreasing = (currentRangeASMPositions.get(1).getPosition() >= lastProcessedASMPosition.getPosition());
try {
//Walk through SNP ranges
for(Range currentSNP : snps.asMapOfRanges().keySet()) {
Tuple refAndAssemblyCalls = snps.get(currentSNP.lowerEndpoint());
Range nextRange = (currentRangeCounter < mappedCoordinateRanges.size())?mappedCoordinateRanges.get(currentRangeCounter) : null;
if (refAndAssemblyCalls.getX().equals(refAndAssemblyCalls.getY())) {
// The ref and asm alleles are identical. This can happen with entries created
// in setupIndelVariants. These are either regions that Mummer4 did not align, or
// that Mummer4 did align, but were filtered out with PHG processing.
myLogger.info("extractAnchorVariantContextsFromAssemblyAlignments- refCall equals dup string at lower position " + currentSNP.lowerEndpoint().getPosition());
Tuple asmStartAndEnd = determineGVCFRefBlockASMStartAndEnd(currentRange,refRangeStartPosition,isASMCoordIncreasing,lastProcessedASMPosition);
anchorVariants.add(createRefRangeVC(refSequence, assemblyName, refRangeStartPosition,currentRange.upperEndpoint(),asmStartAndEnd.getX(),asmStartAndEnd.getY()));
continue;
}
//if snpPosition is greater than currentRange.upperEndpoint
if(currentSNP.lowerEndpoint().compareTo(currentRange.upperEndpoint())>0) {
if(nextRange != null && nextRange.contains(currentSNP.lowerEndpoint())) {
//Finish off the range variant context:
//create RefRange VCF between refRangeStartPosition and currentRange.upperEndpoint
Tuple asmStartAndEnd = determineGVCFRefBlockASMStartAndEnd(currentRange,refRangeStartPosition,isASMCoordIncreasing,lastProcessedASMPosition);
if (refRangeStartPosition.getPosition() > currentRange.upperEndpoint().getPosition()) {
myLogger.info("extractAnchorVariantContext: no refRange VCF between refRangeStartPosition and currentRange.upperEndpoint " +
refRangeStartPosition.getPosition() + "/" + currentRange.upperEndpoint().getPosition());
} else {
anchorVariants.add(createRefRangeVC(refSequence, assemblyName, refRangeStartPosition,currentRange.upperEndpoint(),asmStartAndEnd.getX(),asmStartAndEnd.getY()));
}
}
else if(currentRange.upperEndpoint().getPosition() + 1 == currentSNP.lowerEndpoint().getPosition()) { //We have a deletion covering the gap between the mapped ranges
//This means that we have a deletion as deletions are stored in the SNP file as occurring at the site of the gap
//We need to process this like a normal deletion snp then shift the current range up and continue on so we do not put the SNP in 2x
Position refRangeEndPosition = Position.of(currentSNP.lowerEndpoint().getChromosome(), currentSNP.lowerEndpoint().getPosition() - 2);
// Handle the case where the SNP is at refRangeEnd-1, e.g SNP at 90733989, refRangEnd = 90733990.
// In this case, subtracting 2 makes the end position LESS than the start position.
// Don't create a refRangeVC. It means we miss 1 BP of ref data, which instead is included
// in the SNPVC created below. The asmStartEnd do not change when we don't create the refRangeVC.
Tuple asmStartAndEnd = determineGVCFRefBlockASMStartAndEnd(refRangeStartPosition, refRangeEndPosition, isASMCoordIncreasing,lastProcessedASMPosition);
if (refRangeEndPosition.getPosition() >= refRangeStartPosition.getPosition()) {
anchorVariants.add(createRefRangeVC(refSequence, assemblyName, refRangeStartPosition, refRangeEndPosition ,asmStartAndEnd.getX(),asmStartAndEnd.getY()));
} else {
System.out.println("extractAnchorVariantContext - case refRangeStart > end: currentSNP.lowerEndPoint "
+ currentSNP.lowerEndpoint().getPosition() + ", upper SNP endpoint: " + currentSNP.upperEndpoint().getPosition());
}
//Update lastProcessedASMPosition so we know what the next refRange should start at
lastProcessedASMPosition = asmStartAndEnd.getY();
//Create indel vcf record
//Add in the previous bp to both ref and alt
String firstAllele = refSequence.genotypeAsString(currentSNP.lowerEndpoint().getChromosome(),currentSNP.lowerEndpoint().getPosition()-1);
Tuple indelCalls = new Tuple(firstAllele+refAndAssemblyCalls.getX(), firstAllele + refAndAssemblyCalls.getY());
asmStartAndEnd = determineSNPASMStartAndEnd(indelCalls,lastProcessedASMPosition,isASMCoordIncreasing);
anchorVariants.add(createSNPVC(assemblyName,Position.of(currentSNP.lowerEndpoint().getChromosome(),currentSNP.lowerEndpoint().getPosition()-1),
Position.of(currentSNP.lowerEndpoint().getChromosome(),currentSNP.upperEndpoint().getPosition()),indelCalls,asmStartAndEnd.getX(),asmStartAndEnd.getY()));
lastProcessedASMPosition = Position.of(asmStartAndEnd.getY().getChromosome(),asmStartAndEnd.getY().getPosition());
refRangeStartPosition = Position.of(currentSNP.upperEndpoint().getChromosome(),currentSNP.upperEndpoint().getPosition()+1);
}
else{
Tuple asmStartAndEnd = determineGVCFRefBlockASMStartAndEnd(currentRange,refRangeStartPosition,isASMCoordIncreasing,lastProcessedASMPosition);
if (refRangeStartPosition.getPosition() > currentRange.upperEndpoint().getPosition()) {
myLogger.info("extractAnchorVariantContext: no refRange VCF between refRangeStartPosition and currentRange.upperEndpoint " +
refRangeStartPosition.getPosition() + "/" + currentRange.upperEndpoint().getPosition());
} else {
anchorVariants.add(createRefRangeVC(refSequence, assemblyName, refRangeStartPosition,currentRange.upperEndpoint(),asmStartAndEnd.getX(),asmStartAndEnd.getY()));
}
}
if(currentRangeCounter < mappedCoordinateRanges.size()) {
while(currentSNP.lowerEndpoint().compareTo(currentRange.upperEndpoint())>=0) {
currentRange = mappedCoordinateRanges.get(currentRangeCounter);
currentRangeCounter++;
refRangeStartPosition = currentRange.lowerEndpoint();
//Update Assembly coordinate variables
currentRangeASMPositions = refMappings.get(currentRange);
lastProcessedASMPosition = Position.of(currentRangeASMPositions.get(0).getChromosome(),currentRangeASMPositions.get(0).getPosition()-1);
isASMCoordIncreasing = (currentRangeASMPositions.get(1).getPosition() >= lastProcessedASMPosition.getPosition());
}
}
else{
//It is outside of an anchor so we should stop processing
break;
}
}
//if snpPosition is above refRangeStartPosition(it likely is)
if(currentSNP.lowerEndpoint().compareTo(refRangeStartPosition) >= 0) {
//if variant is insertion
if(refAndAssemblyCalls.getX().length() < refAndAssemblyCalls.getY().length()) {
//In nucmer the last shared bp of the insertion is the one stored in the map,
//create RefRangeVCF between refRangeStartPosition and snpPosition-1
if(refRangeStartPosition.compareTo(Position.of(currentSNP.lowerEndpoint().getChromosome(),currentSNP.lowerEndpoint().getPosition()-1)) <= 0) {
Position refRangeEndPosition = Position.of(currentSNP.lowerEndpoint().getChromosome(), currentSNP.lowerEndpoint().getPosition() - 1);
Tuple asmStartAndEnd = determineGVCFRefBlockASMStartAndEnd(refRangeStartPosition,refRangeEndPosition,isASMCoordIncreasing,lastProcessedASMPosition);
anchorVariants.add(createRefRangeVC(refSequence, assemblyName, refRangeStartPosition, refRangeEndPosition, asmStartAndEnd.getX(),asmStartAndEnd.getY()));
//Update lastProcessedASMPosition so we know what the next refRange should start at
lastProcessedASMPosition = asmStartAndEnd.getY();
}
//Create indel vcf record
//Add in the previous bp to both ref and alt
String firstAllele = refSequence.genotypeAsString(currentSNP.lowerEndpoint().getChromosome(),currentSNP.lowerEndpoint().getPosition());
Tuple indelCalls = new Tuple(firstAllele+refAndAssemblyCalls.getX(), firstAllele + refAndAssemblyCalls.getY());
Tuple asmStartAndEnd = determineSNPASMStartAndEnd(indelCalls,lastProcessedASMPosition,isASMCoordIncreasing);
anchorVariants.add(createSNPVC(assemblyName,Position.of(currentSNP.lowerEndpoint().getChromosome(),currentSNP.lowerEndpoint().getPosition()),
Position.of(currentSNP.upperEndpoint().getChromosome(),currentSNP.upperEndpoint().getPosition()),indelCalls,asmStartAndEnd.getX(),asmStartAndEnd.getY()));
lastProcessedASMPosition = asmStartAndEnd.getY();
//For the insertion case we need to set refRangeStart to currentPosition+1 as its the next available position
refRangeStartPosition = Position.of(currentSNP.upperEndpoint().getChromosome(),currentSNP.upperEndpoint().getPosition()+1);
}
//If Variant is deletion
else if( refAndAssemblyCalls.getX().length() > refAndAssemblyCalls.getY().length()) {
//create RefRangeVCF between refRangeStartPosition and snpPosition-2
if(refRangeStartPosition.compareTo(Position.of(currentSNP.lowerEndpoint().getChromosome(),currentSNP.lowerEndpoint().getPosition()-2)) <= 0) {
Position refRangeEndPosition = Position.of(currentSNP.lowerEndpoint().getChromosome(), currentSNP.lowerEndpoint().getPosition() - 2);
Tuple asmStartAndEnd = determineGVCFRefBlockASMStartAndEnd(refRangeStartPosition, refRangeEndPosition, isASMCoordIncreasing,lastProcessedASMPosition);
anchorVariants.add(createRefRangeVC(refSequence, assemblyName, refRangeStartPosition, refRangeEndPosition ,asmStartAndEnd.getX(),asmStartAndEnd.getY()));
//Update lastProcessedASMPosition so we know what the next refRange should start at
lastProcessedASMPosition = asmStartAndEnd.getY();
}
//Create indel vcf record
//Add in the previous bp to both ref and alt
String firstAllele = refSequence.genotypeAsString(currentSNP.lowerEndpoint().getChromosome(),currentSNP.lowerEndpoint().getPosition()-1);
Tuple indelCalls = new Tuple(firstAllele+refAndAssemblyCalls.getX(), firstAllele + refAndAssemblyCalls.getY());
Tuple asmStartAndEnd = determineSNPASMStartAndEnd(indelCalls,lastProcessedASMPosition,isASMCoordIncreasing);
anchorVariants.add(createSNPVC(assemblyName,Position.of(currentSNP.lowerEndpoint().getChromosome(),currentSNP.lowerEndpoint().getPosition()-1),
Position.of(currentSNP.lowerEndpoint().getChromosome(),currentSNP.upperEndpoint().getPosition()),indelCalls,asmStartAndEnd.getX(),asmStartAndEnd.getY()));
lastProcessedASMPosition = asmStartAndEnd.getY();
refRangeStartPosition = Position.of(currentSNP.upperEndpoint().getChromosome(),currentSNP.upperEndpoint().getPosition()+1);
}
else {
//Its a normal SNP
//create RefRangeVCF between refRangeStartPosition and snpPosition-1
if(refRangeStartPosition.compareTo(Position.of(currentSNP.lowerEndpoint().getChromosome(),currentSNP.lowerEndpoint().getPosition()-1)) <= 0) {
Position refRangeEndPosition = Position.of(currentSNP.lowerEndpoint().getChromosome(), currentSNP.lowerEndpoint().getPosition() - 1);
Tuple asmStartAndEnd = determineGVCFRefBlockASMStartAndEnd(refRangeStartPosition, refRangeEndPosition, isASMCoordIncreasing,lastProcessedASMPosition);
anchorVariants.add(createRefRangeVC(refSequence, assemblyName, refRangeStartPosition, refRangeEndPosition, asmStartAndEnd.getX(), asmStartAndEnd.getY()));
//Update lastProcessedASMPosition so we know what the next refRange should start at
lastProcessedASMPosition = asmStartAndEnd.getY();
}
//Create SNP vcf record
//figure out the assembly coordinates
Tuple asmStartAndEnd = determineSNPASMStartAndEnd(refAndAssemblyCalls,lastProcessedASMPosition,isASMCoordIncreasing);
anchorVariants.add(createSNPVC(assemblyName,currentSNP.lowerEndpoint(),currentSNP.upperEndpoint(),refAndAssemblyCalls, asmStartAndEnd.getX(),asmStartAndEnd.getY()));
lastProcessedASMPosition = asmStartAndEnd.getY();
refRangeStartPosition = Position.of(currentSNP.upperEndpoint().getChromosome(),currentSNP.upperEndpoint().getPosition()+1);
}
//Need to check to see if the currentSNP was at the end of the mapped region
//if so we need to update currentRange
if(currentSNP.lowerEndpoint().compareTo(currentRange.upperEndpoint())==0 && currentRangeCounter < mappedCoordinateRanges.size()) {
while(currentSNP.lowerEndpoint().compareTo(currentRange.upperEndpoint())>=0) {
currentRange = mappedCoordinateRanges.get(currentRangeCounter);
currentRangeCounter++;
refRangeStartPosition = currentRange.lowerEndpoint();
}
}
if(currentRangeCounter >= mappedCoordinateRanges.size()){
//It is outside of an anchor so we should stop processing
break;
}
}
// Handle the odd case where the SNP position
if (currentSNP.lowerEndpoint().compareTo(currentRange.upperEndpoint())==0 &&
refRangeStartPosition.getPosition() == currentRange.upperEndpoint().getPosition()+1) {
myLogger.info("extractAnchorVariants: SNP at last bp of range: currentSNP lower "
+ currentSNP.lowerEndpoint().getPosition() + ", currentRange upper " + currentRange.upperEndpoint().getPosition()
+ ", nextRangeStart: " + refRangeStartPosition.getPosition());
// Just slide this up. This is an odd case where the SNP falls as the
// last bp of a range, and the next range starts at the next BP - no gap.
// WIthout skipping here, we end up attempting to create a refRangeVC with
// the start > end by 1.
//Need to check to see if the currentSNP was at the end of the mapped region
//if so we need to update currentRange
if(currentSNP.lowerEndpoint().compareTo(currentRange.upperEndpoint())==0 && currentRangeCounter < mappedCoordinateRanges.size()) {
while(currentSNP.lowerEndpoint().compareTo(currentRange.upperEndpoint())>=0) {
currentRange = mappedCoordinateRanges.get(currentRangeCounter);
currentRangeCounter++;
refRangeStartPosition = currentRange.lowerEndpoint();
}
}
if(currentRangeCounter >= mappedCoordinateRanges.size()){
//It is outside of an anchor so we should stop processing
break;
}
}
}
} catch (Exception exc) {
myLogger.debug("extractAnchorVariantContextsFromAssemblyAlignments failed ", exc);
throw exc;
}
//Get a list of ranges which are anchor regions
return anchorVariants;
}
/**
* Method to split up the reference range by anchor mappings. Basically this method will take the variant contexts and the anchor coordinates.
* If a variant context is a reference block which is spanning the start or end of the anchor(should happen frequently if anchor ends are truly conserved),
* we need to break up the variant context into two adjacent reference blocks with the end point being the start or end of one of the variants.
* This will allow for easy querying of the list of Variants when attempting to load into the db.
* @param variantContexts
* @param anchorMapping
* @param refSequence
* @return
*/
public static List splitRefRange(List variantContexts, Map anchorMapping, GenomeSequence refSequence) {
RangeMap positionRangeToVariantContextMap = TreeRangeMap.create();
for(VariantContext variantContext : variantContexts) {
positionRangeToVariantContextMap.put(Range.closed(Position.of(variantContext.getContig(),variantContext.getStart()),
Position.of(variantContext.getContig(),variantContext.getEnd())),
variantContext);
}
for(Integer anchorId : anchorMapping.keySet()) {
ReferenceRange referenceRange = anchorMapping.get(anchorId);
Position stPos = Position.of(referenceRange.chromosome(),referenceRange.start());
Position endPos = Position.of(referenceRange.chromosome(),referenceRange.end());
VariantContext overlappingStartVC = positionRangeToVariantContextMap.get(stPos);
// Check to make sure this is a reference GVCF block
if(overlappingStartVC != null && isRefBlock(overlappingStartVC) && overlappingStartVC.getStart() != stPos.getPosition()) {
List resizedVariants = resizeRefBlock(overlappingStartVC,refSequence, stPos, true);
//Figure out the offset for the reference
//Resize the RefRangeVC
//remove the overlappingStartVC from the list/map
Range overlappingRange = positionRangeToVariantContextMap.getEntry(stPos).getKey();
positionRangeToVariantContextMap.remove(overlappingRange);
//Add in the new 2 reference ranges
positionRangeToVariantContextMap.put(Range.closed(Position.of(resizedVariants.get(0).getContig(),resizedVariants.get(0).getStart()),
Position.of(resizedVariants.get(0).getContig(),resizedVariants.get(0).getEnd())),
resizedVariants.get(0));
positionRangeToVariantContextMap.put(Range.closed(Position.of(resizedVariants.get(1).getContig(),resizedVariants.get(1).getStart()),
Position.of(resizedVariants.get(1).getContig(),resizedVariants.get(1).getEnd())),
resizedVariants.get(1));
}
VariantContext overlappingEndVC = positionRangeToVariantContextMap.get(endPos);
if(overlappingEndVC != null && isRefBlock(overlappingEndVC) && overlappingEndVC.getEnd() != endPos.getPosition()) {
List resizedVariants = resizeRefBlock(overlappingEndVC,refSequence, endPos, false);
//Figure out the offset for the reference
//Resize the RefRangeVC
//remove the overlappingStartVC from the list/map
Range overlappingRange = positionRangeToVariantContextMap.getEntry(endPos).getKey();
positionRangeToVariantContextMap.remove(overlappingRange);
//Add in the new 2 reference ranges
positionRangeToVariantContextMap.put(Range.closed(Position.of(resizedVariants.get(0).getContig(),resizedVariants.get(0).getStart()),
Position.of(resizedVariants.get(0).getContig(),resizedVariants.get(0).getEnd())),
resizedVariants.get(0));
positionRangeToVariantContextMap.put(Range.closed(Position.of(resizedVariants.get(1).getContig(),resizedVariants.get(1).getStart()),
Position.of(resizedVariants.get(1).getContig(),resizedVariants.get(1).getEnd())),
resizedVariants.get(1));
}
}
Map,VariantContext> finalVariantMapping = positionRangeToVariantContextMap.asMapOfRanges();
return finalVariantMapping.keySet().stream().map(refRange -> finalVariantMapping.get(refRange)).collect(Collectors.toList());
}
/**
* Helper method to create a Reference Range VariantContext for assemblies. The DP value
* is defaulted to 0 for assemblies. If this is not set, -1 is used as default in
* GenotypeBuilder. That causes assembly problems down the line when storing the
* value as a byte in a long.
* @param refSequence
* @param assemblyTaxon
* @param refRangeStart
* @param refRangeEnd
* @param asmStart
* @param asmEnd
* @return
*/
public static VariantContext createRefRangeVC(GenomeSequence refSequence, String assemblyTaxon, Position refRangeStart, Position refRangeEnd, Position asmStart, Position asmEnd) {
Allele firstRefAllele = Allele.create(refSequence.genotypeAsString(refRangeStart.getChromosome(),refRangeStart.getPosition()),true);
Genotype gt = new GenotypeBuilder().name(assemblyTaxon).alleles(Arrays.asList(firstRefAllele)).DP(2).AD(new int[]{2,0}).make();
if (refRangeStart.getPosition() > refRangeEnd.getPosition()) {
throw new IllegalStateException("createRefRangeVC - start postion greater than end: start=" +
refRangeStart.getPosition() + " end=" + refRangeEnd.getPosition());
}
VariantContextBuilder vcb = new VariantContextBuilder()
.chr(refRangeStart.getChromosome().getName())
.start(refRangeStart.getPosition())
.stop(refRangeEnd.getPosition())
.attribute("END",refRangeEnd.getPosition())
.alleles(Arrays.asList(firstRefAllele, Allele.NON_REF_ALLELE))
.genotypes(gt);
if(asmStart != null && asmEnd != null) {
// Set the asm coordinates as VC record attributes
vcb = vcb.attribute("ASM_Start",asmStart.getPosition());
vcb = vcb.attribute("ASM_End",asmEnd.getPosition());
// if (asmStart.getPosition() > asmEnd.getPosition()) {
// vcb = vcb.attribute("ASM_Start",asmEnd.getPosition());
// vcb = vcb.attribute("ASM_End",asmStart.getPosition());
// } else {
// vcb = vcb.attribute("ASM_Start",asmStart.getPosition());
// vcb = vcb.attribute("ASM_End",asmEnd.getPosition());
// }
}
return vcb.make();
}
/**
* Helper method to create a SNP Variant context for assemblies. The DP value
* is defaulted to 0 for assemblies. If this is not set, -1 is used as default in
* GenotypeBuilder. That causes assembly problems down the line when storing the
* value as a byte in a long.
* @param assemblyTaxon
* @param startPosition
* @param endPosition
* @param calls
* @param asmStart
* @param asmEnd
* @return
*/
public static VariantContext createSNPVC(String assemblyTaxon, Position startPosition, Position endPosition, Tuple calls, Position asmStart, Position asmEnd) {
Allele refCall = Allele.create(calls.getX(),true);
Allele altCall = Allele.create(calls.getY(),false);
if (startPosition.getPosition() > endPosition.getPosition()) {
throw new IllegalStateException("createSNPVC - start postion greater than end: start=" +
startPosition.getPosition() + " end=" + endPosition.getPosition());
}
//Need to add AD for Alt >0 here so that the API will work correctly. Otherwise it is treated as missing as it thinks AD = 0,0.
// When coming from an assembly it should always use the ALT in a SNP pos
Genotype gt = new GenotypeBuilder().name(assemblyTaxon).alleles(Arrays.asList(altCall)).DP(2).AD(new int[]{0,2,0}).make();
VariantContextBuilder vcb = new VariantContextBuilder()
.chr(startPosition.getChromosome().getName())
.start(startPosition.getPosition())
.stop(endPosition.getPosition())
.alleles(Arrays.asList(refCall,altCall, Allele.NON_REF_ALLELE))
.genotypes(gt);
//Only add in the Assembly start and end if they exist
if(asmStart != null && asmEnd != null) {
// Set the asm coordinates as VC record attributes
vcb = vcb.attribute("ASM_Start",asmStart.getPosition());
vcb = vcb.attribute("ASM_End",asmEnd.getPosition());
// if (asmStart.getPosition() > asmEnd.getPosition()) {
// vcb = vcb.attribute("ASM_Start",asmEnd.getPosition());
// vcb = vcb.attribute("ASM_End",asmStart.getPosition());
// } else {
// vcb = vcb.attribute("ASM_Start",asmStart.getPosition());
// vcb = vcb.attribute("ASM_End",asmEnd.getPosition());
// }
}
return vcb.make();
}
/**
* Helper method to easily determine the assembly start and end for a GVCF reference block
* @param currentRange
* @param refRangeStartPosition
* @param isASMCoordIncreasing
* @param lastProcessedASMPosition
* @return
*/
private static Tuple determineGVCFRefBlockASMStartAndEnd(Range currentRange, Position refRangeStartPosition, boolean isASMCoordIncreasing, Position lastProcessedASMPosition) {
int lengthOfReferenceCovered = currentRange.upperEndpoint().getPosition() - refRangeStartPosition.getPosition();
int asmStartIndex = (isASMCoordIncreasing) ? lastProcessedASMPosition.getPosition()+1 : lastProcessedASMPosition.getPosition()-1;
int asmEndIndex = (isASMCoordIncreasing) ? asmStartIndex + lengthOfReferenceCovered : asmStartIndex - lengthOfReferenceCovered;
Position vcASMStart = Position.of(lastProcessedASMPosition.getChromosome(),asmStartIndex);
Position vcASMEnd = Position.of(vcASMStart.getChromosome(),asmEndIndex);
return new Tuple<>(vcASMStart,vcASMEnd);
}
/**
* Helper method to easily determine the assembly start and end for a GVCF reference block. This version allows you to just use a reference range Start position rather than the whole range.
* @param refRangeStartPosition
* @param refRangeEndPosition
* @param isASMCoordIncreasing
* @param lastProcessedASMPosition
* @return
*/
private static Tuple determineGVCFRefBlockASMStartAndEnd(Position refRangeStartPosition, Position refRangeEndPosition, boolean isASMCoordIncreasing, Position lastProcessedASMPosition) {
int lengthOfReferenceCovered = refRangeEndPosition.getPosition() - refRangeStartPosition.getPosition();
int asmStartIndex = (isASMCoordIncreasing) ? lastProcessedASMPosition.getPosition()+1 : lastProcessedASMPosition.getPosition()-1;
int asmEndIndex = (isASMCoordIncreasing) ? asmStartIndex + lengthOfReferenceCovered : asmStartIndex - lengthOfReferenceCovered;
Position vcASMStart = Position.of(lastProcessedASMPosition.getChromosome(),asmStartIndex);
Position vcASMEnd = Position.of(vcASMStart.getChromosome(),asmEndIndex);
return new Tuple<>(vcASMStart,vcASMEnd);
}
/**
* Helper method to easily determine the Assembly start and end for a SNP or INDEL
* @param refAndAssemblyCalls
* @param lastProcessedASMPosition
* @param isASMCoordIncreasing
* @return
*/
private static Tuple determineSNPASMStartAndEnd(Tuple refAndAssemblyCalls, Position lastProcessedASMPosition, boolean isASMCoordIncreasing) {
int lengthOfSNP = refAndAssemblyCalls.getY().length();
int asmStartIndex = (isASMCoordIncreasing) ? lastProcessedASMPosition.getPosition()+1 : lastProcessedASMPosition.getPosition()-1;
int asmEndIndex = (isASMCoordIncreasing) ? asmStartIndex + lengthOfSNP-1 : asmStartIndex - lengthOfSNP+1;
Position vcASMStart = Position.of(lastProcessedASMPosition.getChromosome(),asmStartIndex);
Position vcASMEnd = Position.of(vcASMStart.getChromosome(),asmEndIndex);
return new Tuple<>(vcASMStart,vcASMEnd);
}
/**
* Helper method to make a more readable debug statement for when a intermapped INDEL is not able to be created
* @param firstRange
* @param secondRange
* @return
*/
private static String createUserReadableStringPositionRanges(Range firstRange, Range secondRange) {
StringBuilder sb = new StringBuilder();
sb.append("\n\tFirstRange:[Chr: ");
sb.append(""+firstRange.lowerEndpoint().getChromosome().getName());
sb.append(" StPos: ");
sb.append(""+firstRange.lowerEndpoint().getPosition());
sb.append(" - Chr: ");
sb.append(""+firstRange.upperEndpoint().getChromosome().getName());
sb.append(" EndPos: ");
sb.append(""+firstRange.upperEndpoint().getPosition());
sb.append("\n\tSecondRange:[Chr: ");
sb.append(""+secondRange.lowerEndpoint().getChromosome().getName());
sb.append(" StPos: ");
sb.append(""+secondRange.lowerEndpoint().getPosition());
sb.append(" - Chr: ");
sb.append(""+secondRange.upperEndpoint().getChromosome().getName());
sb.append(" EndPos: ");
sb.append(""+secondRange.upperEndpoint().getPosition());
sb.append("\n");
return sb.toString();
}
/**
* Simple method to determine if the current variant context is a reference block or not.
* @param vc
* @return
*/
public static boolean isRefBlock(VariantContext vc) {
//if only 1 allele in reference but stop position is higher than start position
if(vc.getReference().getBaseString().length()==1 && vc.getEnd() - vc.getStart() >0) {
return true;
}
else {
return false;
}
}
/**
* Method which will take a variant Context which needs to be split and will output 2 new variants while updating ASM_* annotations.
*
* Depending on if the splitting position is a start or end or if the assembly is increasing or decreasing, it will have to handle things differently.
* @param vc
* @param refSequence
* @param positionToSplit
* @param isStart
* @return
*/
public static List resizeRefBlock(VariantContext vc, GenomeSequence refSequence, Position positionToSplit, boolean isStart) {
//Check to see if the ordering of the assembly is increasing or decreasing
int asmStart = vc.getAttributeAsInt("ASM_Start",-1);
int asmEnd = vc.getAttributeAsInt("ASM_End",-1);
boolean hasASM = (asmStart!=-1 && asmEnd!=-1);
List splitVariants = new ArrayList<>();
Position newEndPosition = (isStart)?Position.of(vc.getContig(),positionToSplit.getPosition()-1):Position.of(vc.getContig(),positionToSplit.getPosition());
Position newStartPosition = (isStart)?Position.of(vc.getContig(),positionToSplit.getPosition()):Position.of(vc.getContig(),positionToSplit.getPosition()+1);
int bpOffset = newEndPosition.getPosition()-vc.getStart();
if(hasASM) {
//Figure out if the assembly is increasing or decreasing. We need to check as we either need to add or subtract the number of bps we are splitting
if(asmEnd>=asmStart) {
Position newASMEndPosition = (isStart)?Position.of(vc.getContig(),asmStart + bpOffset):Position.of(vc.getContig(),asmStart+bpOffset);
Position newASMStartPosition = (isStart)?Position.of(vc.getContig(),asmStart + bpOffset+1):Position.of(vc.getContig(),asmStart+bpOffset+1);
splitVariants.add(createRefRangeVC(refSequence,vc.getSampleNamesOrderedByName().get(0),Position.of(vc.getContig(),vc.getStart()),newEndPosition,Position.of(vc.getContig(),asmStart),newASMEndPosition));
splitVariants.add(createRefRangeVC(refSequence,vc.getSampleNamesOrderedByName().get(0),newStartPosition,Position.of(vc.getContig(),vc.getEnd()),newASMStartPosition,Position.of(vc.getContig(),asmEnd)));
}
else {
Position newASMEndPosition = (isStart)?Position.of(vc.getContig(),asmStart - bpOffset + 1):Position.of(vc.getContig(),asmStart-bpOffset);
Position newASMStartPosition = (isStart)?Position.of(vc.getContig(),asmStart - bpOffset):Position.of(vc.getContig(),asmStart-bpOffset-1);
splitVariants.add(createRefRangeVC(refSequence,vc.getSampleNamesOrderedByName().get(0),Position.of(vc.getContig(),vc.getStart()),newEndPosition,Position.of(vc.getContig(),asmStart),newASMEndPosition));
splitVariants.add(createRefRangeVC(refSequence,vc.getSampleNamesOrderedByName().get(0),newStartPosition,Position.of(vc.getContig(),vc.getEnd()),newASMStartPosition,Position.of(vc.getContig(),asmEnd)));
}
}
else {
splitVariants.add(createRefRangeVC(refSequence,vc.getSampleNamesOrderedByName().get(0),Position.of(vc.getContig(),vc.getStart()),newEndPosition,null,null));
splitVariants.add(createRefRangeVC(refSequence,vc.getSampleNamesOrderedByName().get(0),newStartPosition,Position.of(vc.getContig(),vc.getEnd()),null,null));
}
return splitVariants;
}
/**
* Find the lowest reference start entry from a mummer snp file list of entries.
* There could be more than 1 string of indels for this asm snp. Find the start
* and end of the string of indels whose positions overlap the reference position
* for the SNP in question. Return the start position and the length of this string
* of indels
* @param refSnpPos
* @param snpEntries
* @return
*/
public static Tuple findRefIndelStart(int refSnpPos,Collection snpEntries){
int refStart = Integer.MAX_VALUE;
List refPosList = new ArrayList();
try {
for (String currentEntry : snpEntries) {
int tabIndex1 = currentEntry.indexOf("\t");
int refPos = Integer.parseInt(currentEntry.substring(0,tabIndex1));
refPosList.add(refPos);
}
} catch (Exception exc) {
throw new IllegalArgumentException("AssemblyProcessingUtils:findRefIndelStart: error finding ref start: " + exc.getMessage());
}
Collections.sort(refPosList);
// Walk through the sorted list, find the string of indels
// that contain the reference position where we have our SNP.
// Due to multiple alignmentsin the mummer coords file, the same assembly position
// can map as a deletion to different places on the reference.
int prevRefPos = refPosList.get(0);
int curRefPos = prevRefPos;
refStart = prevRefPos;
int len=1;
for (int refIdx = 1; refIdx < refPosList.size(); refIdx++) {
curRefPos = refPosList.get(refIdx);
if ((curRefPos-prevRefPos) != 1){ // a gap in positions means multiple strings of indels
if (curRefPos <= refSnpPos) {
refStart = curRefPos; // start counting from this gap
len = 0; // this is incremented to 1 below
} else { // reference position falls within the previous string of indels
return new Tuple(refStart,len);
}
}
len++;
prevRefPos = curRefPos;
}
// the full list has been processed, return the latest SNP start position
// and indel length
return new Tuple(refStart,len);
}
// Find the lowest assembly (query) start entry from a mummer snp file list of entries
public static int findAsmIndelStart(Collection snpEntries) {
int asmStart = Integer.MAX_VALUE;
try {
for (String currentEntry : snpEntries) {
int tabIndex1 = currentEntry.indexOf("\t");
int tabIndex2 = currentEntry.indexOf("\t",tabIndex1+1);
int tabIndex3 = currentEntry.indexOf("\t",tabIndex2+1);
int tabIndex4 = currentEntry.indexOf("\t",tabIndex3+1);
int tempAsm = Integer.parseInt(currentEntry.substring(tabIndex3+1, tabIndex4));
if (tempAsm < asmStart) asmStart = tempAsm;
}
} catch (Exception exc) {
throw new IllegalArgumentException("AssemblyProcessingUtils:findAsmIndelStart: error finding ref start: " + exc.getMessage());
}
return asmStart;
}
/**
* Create a RangeSet from a map of ranges
* @param anchorEntries
* @return
*/
public static RangeSet getAnchorRangeSet(Map anchorEntries) {
RangeSet anchors = TreeRangeSet.create();
// For each value on the map, create an entry in the anchors rangeSet.
// RangeSet coalesces entries, but there should be no overlaps in the anchors.
for (ReferenceRange refRange : anchorEntries.values()) {
anchors.add(Range.closed(Position.of(refRange.chromosome(),refRange.start()),Position.of(refRange.chromosome(),refRange.end())));
}
return anchors;
}
/**
* Find all reference ranges for a particular chromosome
* Query pulls all reference ranges for that chrom from the reference_ranges table.
* The assembly should be processed against all defined reference ranges.
*
* @param database
* @param chrom
* @return A map of all ReferenceRange data for a specific chromosome. The map is keyed by referenceRangeID.
*/
public static Map referenceRangeForChromMap(Connection database, String chrom) {
if (database == null) {
throw new IllegalArgumentException("AssemblyHaplotypesPlugin: referenceRangeForChromMap: Must specify database connection.");
}
long time = System.nanoTime();
// Create method name for querying initial ref region and inter-region ref_range_group method ids
String refLine = CreateGraphUtils.getRefLineName( database);
// We need a method_name for the ReferenceRange object. But it will not be used in
// the assembly processing code. Here, we want to grab all ranges.
// Ideally, the methodName would be the method stored in the *_load_data.txt file provided
// when the reference intervals were loaded. We don't have that data unless we add additional
// parameters. Since this information is only needed for the ReferenceRange object, and that
// data as used by the assembly code does not make use of the methodName field, we add a dummy value
String methodName = "referenceIntervals"; // this is a dummy name, will be not used in ReferenceRange in Assembly code
// Grab all the ranges stored in the reference_range table.
StringBuilder querySB = new StringBuilder();
querySB.append("select reference_ranges.ref_range_id, chrom, range_start, range_end from reference_ranges where chrom='");
querySB.append(chrom);
querySB.append("'");
String query = querySB.toString();
myLogger.info("referenceRangesForChromMap: query statement: " + query);
ImmutableMap.Builder builder = ImmutableMap.builder();
try (ResultSet rs = database.createStatement().executeQuery(query)) {
while (rs.next()) {
int id = rs.getInt("ref_range_id");
String chromosome = rs.getString("chrom");
int start = rs.getInt("range_start");
int end = rs.getInt("range_end");
builder.put(id, new ReferenceRange(refLine, Chromosome.instance(chromosome), start, end, id, methodName));
}
} catch (Exception exc) {
myLogger.debug(exc.getMessage(), exc);
throw new IllegalStateException("AssemblyHaplotypesPlugin: referenceRanges: Problem querying the database: " + exc.getMessage());
}
Map result = builder.build();
myLogger.info("referenceRangeForChromMap: number of reference ranges: " + result.size());
myLogger.info("referenceRangeForChromMap: time: " + ((System.nanoTime() - time) / 1e9) + " secs.");
return result;
}
/**
* load initial genotype and method data to the database
* @param assemblyName
* @param method
* @param dbConn
* @return
*/
public static Tuple loadInitialAssemblyData(String assemblyName, String method, int clusterSize, Connection dbConn,
Map pluginParams, List fastaInfo, boolean isTestMethod) {
int ploidy = 1;
boolean is_ref = false;
int hapNumber = 0;
String line_data = method + ": Assembly aligned via mummer4 with clusterSize " + clusterSize + " and --mum parameters";
boolean genesPhased = true;
boolean chromsPhased = true;
float conf = 1;
// Get PHGdbAccess connection
PHGDataWriter phg = new PHGdbAccess(dbConn);
// add the assembly genotype/haplotype info to DB
GenoHaploData ghd = new GenoHaploData(ploidy,is_ref,assemblyName,line_data, genesPhased, chromsPhased, hapNumber, conf);
phg.putGenoAndHaploTypeData(ghd);
// Get the assembly genoid created in call to putGenoAndHaplotypeData()
// Needed for adding assembly genome_file_data entry
int genoid = phg.getGenoidFromLine(assemblyName);
String asm_genome_path = fastaInfo.get(4);
String asm_file = fastaInfo.get(5);
// This id needed for the haplotypes table
int genomeDataFileID = phg.putGenomeFileData(asm_genome_path,asm_file,genoid,DBLoadingUtils.GenomeFileType.FASTA.getValue());
// Load the gamete_groups and gamete_haplotypes table
String nameWithHap = assemblyName + "_" + hapNumber; // ref line is always hap0
List gameteGroupList = new ArrayList();
gameteGroupList.add(nameWithHap);
phg.putGameteGroupAndHaplotypes(gameteGroupList);
// Put the method data - identifies for each assembly how the anchors were created
myLogger.info("loadInitialAssemblyData: calling putMethod with method name " + method);
pluginParams.put("notes",line_data);
DBLoadingUtils.MethodType methodType = DBLoadingUtils.MethodType.ASSEMBLY_HAPLOTYPES;
if (isTestMethod) {
methodType = DBLoadingUtils.MethodType.TEST_ASSEMBLY_HAPLOTYPES;
}
int methodId = phg.putMethod(method, methodType,pluginParams); // line_data and method_details are the same in this plugin
// Get gamete_grp_id to return. This is used when loading the assembly haploytpe data
int gamete_grp_id = phg.getGameteGroupIDFromTaxaList(gameteGroupList);
// do NOT close phg here - it will drop the connection needed for later processing
return new Tuple (gamete_grp_id,genomeDataFileID);
}
/**
* Load the assembly haplotype data to the database
* @param gamete_grp_id
* @param method
* @param dbConn
* @param anchorSequences
*/
public static void loadAssemblyDataToDB(int gamete_grp_id, String method, Connection dbConn, Map anchorSequences,
String chromosome, int genomeFileId) {
Long time = System.nanoTime();
// Get PHGdbAccess connection
PHGDataWriter phg = new PHGdbAccess(dbConn);
int method_id = phg.getMethodIdFromName(method);
// Load the haplotypes data to haplotypes table
// THis is called from the original AssemblyHaplotypesMultiThreadPlugin,
// which loads the genome_file_data table prior to loading haplotypes.
// Based on this, we send null for asmServerDir and asmFasta - no need to populate them here.
myLogger.info("loadAssemblyDataToDB: starting putHaplotypesData for gamete_grp_id " + gamete_grp_id + ", chr " + chromosome);
phg.putHaplotypesData(gamete_grp_id, method_id, anchorSequences, chromosome, genomeFileId, -1);
// phg connection is closed in createAndLoadAssemblyData()
myLogger.info(" loadAssemblyDataToDB time to load haplotypes for gamete_grp_id " + gamete_grp_id + ", chr " + chromosome + ": " + (System.nanoTime()-time)/1e9 + " seconds");
}
}