![JAR search and dependency download from the Maven repository](/logo.png)
net.maizegenetics.pangenome.hapcollapse.ConsensusProcessingUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.hapcollapse;
import java.sql.Connection;
import java.util.*;
import net.maizegenetics.pangenome.api.*;
import net.maizegenetics.taxa.distance.DistanceMatrix;
import net.maizegenetics.taxa.distance.DistanceMatrixBuilder;
import org.apache.log4j.Logger;
import com.google.common.collect.BiMap;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.pangenome.api.HaplotypeNode.VariantInfo;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Tuple;
/**
* This class holds methods for processing consensus using the variants and alleles tables
* or data derived from them.
*
* @author lcj34
*
*/
public class ConsensusProcessingUtils {
private static final Logger myLogger = Logger.getLogger(ConsensusProcessingUtils.class);
public static byte[] gtvalues = new byte[] {GenotypeTable.UNKNOWN_DIPLOID_ALLELE,
NucleotideAlignmentConstants.getNucleotideDiploidByte("AA"),
NucleotideAlignmentConstants.getNucleotideDiploidByte("CC"),
NucleotideAlignmentConstants.getNucleotideDiploidByte("GG"),
NucleotideAlignmentConstants.getNucleotideDiploidByte("TT"),
NucleotideAlignmentConstants.getNucleotideDiploidByte("++"),
NucleotideAlignmentConstants.getNucleotideDiploidByte("--"),
};
public static Map createGtvalueMap()
{
Map gtMap = new HashMap();
gtMap.put(GenotypeTable.UNKNOWN_DIPLOID_ALLELE, 0);
gtMap.put(NucleotideAlignmentConstants.getNucleotideDiploidByte("AA"),1);
gtMap.put(NucleotideAlignmentConstants.getNucleotideDiploidByte("CC"), 2);
gtMap.put(NucleotideAlignmentConstants.getNucleotideDiploidByte("GG"), 3);
gtMap.put(NucleotideAlignmentConstants.getNucleotideDiploidByte("TT"), 4);
gtMap.put(NucleotideAlignmentConstants.getNucleotideDiploidByte("++"), 5);
gtMap.put(NucleotideAlignmentConstants.getNucleotideDiploidByte("--"), 6);
return gtMap;
}
/**
* RefAltData comes from processing the Longs stored for each haplotype.
* It is used for consensus processing. Different data is available than is
* with the VariantMappingData used for processing single haplotypes.
*
* @param rad
* @return
*/
public static long getLongFromRefAltData(RefAltData rad) {
long vmLong = 0;
if ( rad.getVariantId() == -1) {
// create long for reference record
// format: 1bit=ref | 2 bytes 7 bits = refLength | 1 bytes=refDepth | 4 bytes=position on chrom
vmLong = rad.getLen(); // length gets upper 3 bytes
vmLong = (vmLong << 8) + rad.getRefDepth(); // refDepth get next byte
vmLong = (vmLong << 32) + rad.getPosition(); // position on chrom gets last 4 bytes
vmLong |= 1L << 63; // set upper bit to indicate this is a reference record
} else {
// create long for variant record
// format: 4 bytes= variant_mapping table id | 1 byte=refDepth | 1 byte=altDepth | 2 bytes=GC
vmLong = rad.getVariantId(); // id gets 4 upper bytes
vmLong = (vmLong << 8) + rad.getRefDepth(); // refDepth gets next byte
vmLong = (vmLong << 8) + rad.getAltDepth(); // altDepth gets a byte
byte isIndel = rad.getLen() > 1 ? (byte)1 : 0; // RefAltData now stores actual indel length
vmLong = (vmLong << 8) + isIndel; // isIndel gets a byte
vmLong = (vmLong << 8) + 0; // undefined other data as last byte
}
return vmLong;
}
// Determine if any taxon have a reference range at startPos. If yes, find longest stretch where most have refRange
// that occurs before the next SNP is seen (nextSnpPos)
// parameter "maxRefLen" is the maximum length for a refBlock. This is determined by where the next SNP falls -
// we stop the refBLock before the next SNP. If maxRefLen = -1, means no more snps so length isn't limited.
// Returns: RefAltData. A variant ID of -1 means is refblock. 0 means store N. BUt we don't have variantID for N block! ANd it needs to be N to Ref
public static RefAltData findTaxonRefCoverage(Position startPos, int maxRefLen, TaxaList tList, Map> taxonToPosDataMap, double maxError, double minTaxaCoverage) {
int refDepth = 0; // filled in for refBLock
int altDepth = 0; // filled in for refBlock (will be 0, altdepth not stored for refBlock)
int indelRefDepth = 0; // filled in when we have non-ref block
int indelAltDepth = 0; // filled in when we have non-ref block
int taxaRefCount = 0;
int middleOfVariantCount = 0;
int refLen = Integer.MAX_VALUE; // start high - we pick smallest Ref covered
// Split taxa names back to original so their Ref data can be grabbed
TaxaList splitTList = splitConsensusTaxa(tList);
// This map holds a list of variantIds and the number of times
// they were seen. The RefAltData for each variantId is also stored.
Map> varidCountRADMap = new HashMap<> ();
for (Taxon taxa : splitTList) {
RangeMap txRangeMap = taxonToPosDataMap.get(taxa);
if (txRangeMap == null) {
throw new IllegalArgumentException("findTaxonRefCoverage: no data for taxa " + taxa.getName());
}
Map.Entry, RefAltData> posRadEntry = txRangeMap.getEntry(startPos);
RefAltData rad = null;
if (posRadEntry != null ) {
rad = posRadEntry.getValue();
}
if (rad != null && rad.getVariantId() == -1) { // have data at this position and it is a ref block
taxaRefCount++;
refDepth += rad.getRefDepth();
altDepth += rad.getAltDepth();
// the range contains the startpos. Determine actual ref covered based on parameter startPos
// We want the minimum number covered. rad.getPosition() is the start of the range.
// ex: range starts at 3, is of length 9. The range was inclusive/inclusive so
// goes from 3-11. The current ref index (startPos) is at 7. From 7 until the
// end there are 5 position (7-11). (3 + 9) - 7 = 5
int taxaRefCovered = (rad.getPosition() + rad.getLen()) - startPos.getPosition();
if (taxaRefCovered < refLen) {
refLen = taxaRefCovered; // keeping the smallest ref length
}
} else if (rad != null){ // is indel or SNP
// The range start position must equal the current ref start position or it is skipped.
// To be compared, variants must start at the same positions
if (rad.getPosition() != startPos.getPosition()) {
middleOfVariantCount ++;
continue; // are in the middle of a variant
}
indelRefDepth += rad.getRefDepth();
indelAltDepth += rad.getAltDepth();
if (varidCountRADMap.containsKey(rad.getVariantId())) {
Tuple oldData = varidCountRADMap.get(rad.getVariantId());
Tuple newValues = new Tuple(oldData.getX()+1,oldData.getY());
varidCountRADMap.put(rad.getVariantId(), newValues);
} else {
varidCountRADMap.put(rad.getVariantId(), new Tuple(1,rad));
}
} // otherwise do nothing
}
int totalCounter = taxaRefCount + middleOfVariantCount;
int currentMax = taxaRefCount;
if (maxRefLen > 0 && refLen > maxRefLen) refLen = maxRefLen; // stop refStretch before next SNP.
RefAltData currentMaxRefAltData = null;
if(varidCountRADMap.size() > 0) {
for (Tuple data : varidCountRADMap.values()) {
if(data.getX() > currentMax) {
currentMax = data.getX();
currentMaxRefAltData = data.getY();
}
totalCounter += data.getX();
}
}
// Check the maxError - avoid dividing by 0
double errorRate = (totalCounter == 0 ) ? Double.MAX_VALUE : 1.0 - ((double)currentMax / (double)totalCounter);
// If coverage level is too low or the error rate is too high, return missing (variant ID = 0)
if((double)totalCounter/splitTList.size() < minTaxaCoverage || errorRate > maxError) {
return new RefAltData(0, startPos.getChromosome().getName(), startPos.getPosition(), 1, indelRefDepth, indelAltDepth);
}
else {
if(currentMaxRefAltData == null) { // return Ref call (variant ID = -1)
if (refLen <=0 ) { // this should not happen
myLogger.debug("findTaxonRefCoverage: currentMaxRefAltData is NULL, using bad refLen of " + refLen);
}
return new RefAltData(-1, startPos.getChromosome().getName(), startPos.getPosition(), refLen, refDepth, altDepth);
}
if (currentMaxRefAltData.getLen() <= 0 ) { // should not happen
myLogger.debug("findTaxonRefCoverage: using currentMaxRefAltData wih bad refLen of " + currentMaxRefAltData.getLen());
}
return currentMaxRefAltData; // return the variant info
}
}
// FindHaplotypeClusters merged the taxa into 1, changing the name
// to be the concatenation of the taxa names, with a "_0" added to each name.
// Names are separated by ":". Split these back up and make a taxa list to traverse
public static TaxaList splitConsensusTaxa(TaxaList tList){
TaxaListBuilder taxaBuilder = new TaxaListBuilder();
tList.stream().forEach(taxa ->{
String taxaName = taxa.getName();
StringTokenizer tokenizer = new StringTokenizer(taxaName, ":");
while (tokenizer.hasMoreElements()) {
String name = (String)tokenizer.nextElement();
int nameEnd = name.lastIndexOf("_");
if (nameEnd < 0) {
throw new IllegalArgumentException("ConsensusProcessingUtils:splitConsensusTaxa: no underscore found in name " + name);
}
name = name.substring(0,nameEnd);
taxaBuilder.add(name);
}
});
TaxaList taxaList = taxaBuilder.build();
return taxaList;
}
// need ordered listof taxon INCLUDING the _ for sending to
// putConsensusData for loading to DB
private static List getOrderedConsensusTaxon(TaxaList tList){
List orderedTaxonList = new ArrayList();
tList.stream().forEach(taxa ->{
String taxaName = taxa.getName();
StringTokenizer tokenizer = new StringTokenizer(taxaName, ":");
while (tokenizer.hasMoreElements()) {
orderedTaxonList.add((String)tokenizer.nextElement());
}
});
Collections.sort(orderedTaxonList);
return orderedTaxonList;
}
// THis method chooses the most frequent value found at each position.
// Currently, it does not check counts, N's, etc. And breaking of ties
// is arbitrary.
public static Map chooseVarIdForSNPPositionFromGenotypeTable(GenotypeTable genotypeTable, Map> posCallVarIdMap ) {
Map positionToVariantIDMap = new HashMap();
// genotypes defined in ConsensusProcessinUtils.gtvalues. alleleRedirect map has index into this array
// Get the reverse map
Map genotypeToIndex = createGtvalueMap();
for (int idx = 0; idx < genotypeTable.numberOfSites(); idx ++) {
// There is only 1 taxon in the genotypeTable as this is consensus and
// the genotype at each position was already called.
byte majorGenotype = genotypeTable.genotype(0, idx);;
// store call for this position to the map.
Position pos = genotypeTable.positions().get(idx);
BiMap varidtoCallBiMap = posCallVarIdMap.get(idx); // positions index, not physical position, was stored
Integer varId;
Integer gtMapIndex = genotypeToIndex.get(majorGenotype);
if (gtMapIndex == null) {
throw new IllegalStateException("chooseVarIdForSNPPositionFromGenotypeTable: genotype " + majorGenotype + " not found in map!!");
}
if (gtMapIndex == 0) {
varId = -2; // is missing
} else if (gtMapIndex == 1) {
varId = -1; // is ref
} else {
varId = varidtoCallBiMap.inverse().get(gtMapIndex); // translate the call to the variantID
if (varId == null) {
throw new IllegalArgumentException("VariantProcessingUtils:getPositionForSNPsMap: no entry for chrom "
+ pos.getChromosome().getName() + " position " + pos.getPosition());
}
}
positionToVariantIDMap.put(pos, varId);
}
return positionToVariantIDMap;
}
/**
* Returns a map of variantID to RefAltData for each variantId/data lookup.
* variantID of -1 (ref) is handled in calling method.
* @param taxonToPosDataMap
* @return
*/
public static Map createVarIDtoRefAltData(Map> taxonToPosDataMap) {
Map varIdToDataMap = new HashMap();
for (RangeMap mapData : taxonToPosDataMap.values()) {
for (RefAltData rad : mapData.asMapOfRanges().values()) {
varIdToDataMap.put(rad.getVariantId(),rad);
}
}
return varIdToDataMap;
}
public static String convertVariantsToSequence(List variants, ReferenceRange refRange, GenomeSequence refSequence) {
//make sure variants are in order by start and have unique start positions, resolve any overlapping variants
//rules for resolving overlaps
//1. Overwrite any multisite ref calls
//2. Do not overwrite alt calls (this will almost always be a deletion because SNPS only cover one ref position)
//variants will already be in order by start and the starts will be non-overlapping.
//So, just need to compare each variant to the previous variant.
//Once the previous variant range has been determined its sequence can be added to the string
Chromosome chr = refRange.chromosome();
if (!variantsInOrder(variants)) throw new IllegalArgumentException("Variants not sorted by start position or variant start not unique");
VariantInfo previousVariant = null;
StringBuilder mySequence = new StringBuilder();
for (VariantInfo info : variants) {
if (previousVariant != null) {
//does this variant overlap the previous variant?
if (info.start() > previousVariant.end()) {
//no overlap
addVariantToSequence(mySequence, previousVariant, chr, refSequence);
//if there is a gap, pad with N's after previous sequence
for (int pos = previousVariant.end() + 1; pos < info.start(); pos++) mySequence.append("N");
previousVariant = info;
} else {
//apply rules one and two to determine range for these variants
//then add sequence from previousVariant to mySequence
boolean prevAltCall = previousVariant.isVariant() && previousVariant.genotypeString().equals(previousVariant.altAlleleString());
if (prevAltCall) {
//do not overwrite the previous call (leave its range as is)
addVariantToSequence(mySequence, previousVariant, chr, refSequence);
int numberOfCharactersToTrim = previousVariant.end() - info.start() + 1;
//if this info is a variant trim necessary characters of the beginning of the genotype string
if (info.isVariant()) {
String trimmedGenotype;
if (info.genotypeString().length() <= numberOfCharactersToTrim) trimmedGenotype = "";
else trimmedGenotype = info.genotypeString().substring(numberOfCharactersToTrim);
previousVariant = new VariantInfo(info.chromosome(), info.start(), info.end(), trimmedGenotype,
info.refAlleleString(), info.altAlleleString(), info.isVariant(), info.depth());
} else {
//if info is a ref block adjust its range to start after the previous one ends
previousVariant = new VariantInfo(info.chromosome(), previousVariant.end() + 1, info.end(), info.genotypeString(),
info.refAlleleString(), info.altAlleleString(), info.isVariant(), info.depth());
}
} else if (previousVariant.isVariant()) {
//trim the end off the previous genotype so that it ends before this one
int numberOfCharactersInGenotype = info.start() - previousVariant.start();
String trimmedGenotype = previousVariant.genotypeString().substring(0,numberOfCharactersInGenotype);
previousVariant = new VariantInfo(previousVariant.chromosome(), previousVariant.start(), previousVariant.end(), trimmedGenotype,
previousVariant.refAlleleString(), previousVariant.altAlleleString(), previousVariant.isVariant(), previousVariant.depth());
addVariantToSequence(mySequence, previousVariant, chr, refSequence);
previousVariant = info;
} else {
//reduce the previous range so that it ends before this one
previousVariant = new VariantInfo(previousVariant.chromosome(), previousVariant.start(), info.start() - 1, previousVariant.genotypeString(),
previousVariant.refAlleleString(), previousVariant.altAlleleString(), previousVariant.isVariant(), previousVariant.depth());
addVariantToSequence(mySequence, previousVariant, chr, refSequence);
previousVariant = info;
}
}
} else {
previousVariant = info;
}
}
//add the final variant
addVariantToSequence(mySequence, previousVariant, chr, refSequence);
//Convert range map to String
//Add N's for any reference positions not covered by the range map
return mySequence.toString();
}
private static void addVariantToSequence(StringBuilder sequence, VariantInfo variant, Chromosome chr, GenomeSequence refseq) {
if (variant.end() < variant.start()) {
StringBuilder sb = new StringBuilder("Variant end less than variant start for chr ");
sb.append(variant.chromosome()).append(" ").append(variant.start());
sb.append(" to ").append(variant.end()).append(", geno=");
sb.append(variant.genotypeString()).append(" ref=").append(variant.refAlleleString()).append(" alt=").append(variant.altAlleleString());
myLogger.debug(sb.toString());
throw new IllegalArgumentException("variant end less than variant start");
}
if (variant.isVariant()) {
//add the genotypeString
sequence.append(variant.genotypeString());
} else {
//add ref for the variant range
sequence.append(refseq.genotypeAsString(chr, variant.start(), variant.end()));
}
}
public static boolean variantsInOrder(List variants) {
Iterator variantIterator = variants.iterator();
VariantInfo previousVariant;
if (variantIterator.hasNext()) previousVariant = variantIterator.next();
else return true;
while (variantIterator.hasNext()) {
VariantInfo currentVariant = variantIterator.next();
if (currentVariant.chromosome().equals(previousVariant.chromosome())) {
if (currentVariant.start() < previousVariant.start()) return false;
}
}
return true;
}
/**
* Encode a reference block to a Long
* @param refLength length of the reference block
* @param refDepth readDepth in the reference block. Max depth is 127 because it is encoded by a single byte.
* @param pos the start position of the block on a chromosome
* @return the long encoding this information
*
* */
public static Long encodeRefBlockToLong(int refLength, int refDepth, int pos ) {
//ref: 1bit=ref | 2 bytes 7 bits = refLength | 1 bytes=refDepth | 4 bytes=position on chrom
int MAX_REF_LENGTH = 8388607;
int MAX_REF_DEPTH = 127;
int reflen = Math.min(MAX_REF_LENGTH, refLength);
int refdep = Math.min(MAX_REF_DEPTH, refDepth);
long encodedLong = reflen;
encodedLong <<= 8;
encodedLong += refdep;
encodedLong <<= 32;
encodedLong += pos;
encodedLong |= 1L << 63; // set upper bit to indicate this is a reference record
return encodedLong;
}
/**
* Encode variant information as a Long
* @param variantId
* @param refDepth read depth of the reference allele. Max = 127
* @param altDepth read depth of the alt allele. Max = 127
* @param isIndel true if the variant is an indel, false otherwise.
* @return the Long encoding this information
* */
public static Long encodeVariantToLong(int variantId, int refDepth, int altDepth, boolean isIndel) {
//Variant: 4 bytes= variant_mapping table id | 1 byte=refDepth | 1 byte=altDepth | 1 isIndel | 1 byte=unused
int MAX_DEPTH = 127;
int refdep = Math.min(MAX_DEPTH, refDepth);
int altdep = Math.min(MAX_DEPTH, altDepth);
long encodedLong = variantId;
encodedLong <<= 8;
encodedLong += refdep;
encodedLong <<= 8;
encodedLong += altdep;
encodedLong <<= 8;
encodedLong += isIndel ? 1 : 0;
encodedLong <<= 8;
return encodedLong;
}
/**
* Method to verify that the taxa in the graph are in the ranking file.
* We can have more taxon in the ranking file than in the graph, but not the other way around.
* @param graph
* @param rankingMap
* @return
*/
public static boolean areGraphTaxaInRankingMap(HaplotypeGraph graph, Map rankingMap) {
//Get out the taxa list for the graph
TaxaList taxaList = graph.taxaInGraph();
long numGraphTaxonNotInRank = taxaList.stream().map(taxon -> taxon.getName())
.filter(taxonName -> !rankingMap.keySet().contains(taxonName))
.count();
//If the number of graph taxon not in the ranking file is above 0, we are not able to pick the correct ranking.
if(numGraphTaxonNotInRank>0) {
for (Taxon taxon : taxaList) {
if (rankingMap.get(taxon.getName()) == null) {
myLogger.warn("areGraphTaxaInRankingMap: missing from rankingFile:" + taxon.getName());
}
}
return false;
}
else {
return true;
}
}
/**
* Method to verify that rankings are unique. This is used to throw a Warning message in the Assembly consensus.
* @param rankingMap
* @return
*/
public static boolean areRankingsUnique(Map rankingMap) {
Set rankingSet = new HashSet<>();
for(String taxonName : rankingMap.keySet()) {
if(rankingSet.contains(rankingMap.get(taxonName))) {
return false;
}
else {
rankingSet.add(rankingMap.get(taxonName));
}
}
return true;
}
/**
* Function to create a disntance matrix given a set of variants for a single reference range.
*
* This will ignore Ns and indels
*
* Distance is #SNPs/(#SNPs + #RefPos)
*
* @param ntaxa
* @param chr
* @param taxaWithInfo
* @param taxonToVariantInfoMap
* @return
*/
public static DistanceMatrix createDistanceMatrix(int ntaxa, Chromosome chr, ReferenceRange currentRefRange, TaxaList taxaWithInfo, Map> taxonToVariantInfoMap ) {
DistanceMatrixBuilder dmBuilder = DistanceMatrixBuilder.getInstance(taxaWithInfo);
myLogger.debug(taxaWithInfo.size() + " taxa used to build distance matrix in createDistanceMatrix (line 549)");
//set diagonal to 0
for (int i = 0; i < ntaxa; i++) dmBuilder.set(i, i, 0.0);
for (int i = 0; i < ntaxa - 1; i++) {
RangeMap headMap = taxonToVariantInfoMap.get(taxaWithInfo.get(i));
for (int j = i + 1; j < ntaxa; j++) {
RangeMap compMap = taxonToVariantInfoMap.get(taxaWithInfo.get(j));
int totalSiteCount = 0;
int diffSiteCount = 0;
int pos = currentRefRange.start();
int prevPos = 0;
while (pos <= currentRefRange.end()) {
if(pos > prevPos) {
prevPos = pos;
}
VariantInfo headInfo = headMap.get(pos);
VariantInfo compInfo = compMap.get(pos);
//if either is null increment pos, no information so cannot make a comparison
if (headInfo == null || compInfo == null) pos++;
//if !isVariant for both, then these are ref blocks
//increment total sites by min(head.end, comp.end) - current pos + 1
//set pos = min(head.end, comp.end) + 1
else if (!headInfo.isVariant() && !compInfo.isVariant()) {
int blockend = Math.min(headInfo.end(), compInfo.end());
if(pos>blockend) {
System.out.println("Pos:"+pos+" "+blockend);
}
blockend = Math.min(blockend, currentRefRange.end()); //ref blocks may extend beyond the end of the range
int coveredLength = blockend - pos + 1;
if (coveredLength < 1) {
String msg = String.format("CoveredLength < 1 at chr %s pos %d, taxa = %s, %s",
chr.getName(), pos, taxaWithInfo.taxaName(i), taxaWithInfo.taxaName((j)));
throw new IllegalArgumentException(msg);
}
totalSiteCount += coveredLength;
pos = blockend + 1;
}
//if head is not variant and comp is variant:
//if comp.start != pos, we landed in the middle of an indel, skip to comp.end + 1
//if comp.start == pos, increment total site count by 1
//set pos = comp.end + 1
else if (!headInfo.isVariant() && compInfo.isVariant()) {
if (compInfo.start() == pos) {
totalSiteCount++;
if (!compInfo.isIndel() && compInfo.genotypeString().equals(compInfo.altAlleleString())) {
diffSiteCount++;
}
}
pos = compInfo.end() + 1;
}
//same for head is variant and comp is not variant
else if (headInfo.isVariant() && !compInfo.isVariant()) {
if (headInfo.start() == pos) {
totalSiteCount++;
if (!headInfo.isIndel() && headInfo.genotypeString().equals(headInfo.altAlleleString())) {
diffSiteCount++;
}
}
pos = headInfo.end() + 1;
}
//if both are variant
//if head.start = comp.start = pos process otherwise skip to max(head.end, comp.end)
//compare genotypes add same site count total, add diff site count to total and diff
//set pos = max(head.end, comp.end) + 1
else {
if (headInfo.start() == compInfo.start() && compInfo.start() == pos) {
totalSiteCount++;
if(!headInfo.isIndel() && !compInfo.isIndel() && !headInfo.genotypeString().equals(compInfo.genotypeString()))
diffSiteCount++;
}
pos = Math.max(headInfo.end(), compInfo.end()) + 1;
}
}
double dist = (double) diffSiteCount / totalSiteCount;
dmBuilder.set(i, j, dist);
}
}
return dmBuilder.build();
}
/**
* Function to create a new DistanceMatrix setting NaNs to the maximum value in both its row and column.
* @param originalDM
* @return
*/
public static DistanceMatrix setNsToMax(DistanceMatrix originalDM) {
DistanceMatrixBuilder dmBuilder = DistanceMatrixBuilder.getInstance(originalDM.getTaxaList());
List nanCoords = new ArrayList<>();
for(int row = 0; row < originalDM.numberOfTaxa(); row++) {
for( int col = row; col < originalDM.numberOfTaxa();col++) {
if(Float.isNaN(originalDM.getDistance(row,col))) {
//Store coords for fixing later
nanCoords.add(new Integer[]{row,col});
}
else {
dmBuilder.set(row,col,originalDM.getDistance(row,col));
}
}
}
//Loop through the NanCoordinates
for(Integer[] nanCoordinate : nanCoords) {
dmBuilder.set(nanCoordinate[0],nanCoordinate[1],maxDistance(originalDM,nanCoordinate[0],nanCoordinate[1]));
}
return dmBuilder.build();
}
/**
* Function to figure out what the maximum row and col distance are for a given position in the oringinal distance Matrix.
* @param matrix
* @param row
* @param col
* @return
*/
public static float maxDistance(DistanceMatrix matrix,int row, int col) {
float currentMax = 0.0f;
for(int colCounter = 0; colCounter < matrix.numberOfTaxa(); colCounter++) {
if( !Float.isNaN(matrix.getDistance(row,colCounter)) && matrix.getDistance(row,colCounter) > currentMax) {
currentMax = matrix.getDistance(row,colCounter);
}
}
for( int rowCounter = 0; rowCounter < matrix.numberOfTaxa(); rowCounter++) {
if(!Float.isNaN(matrix.getDistance(rowCounter,col)) && matrix.getDistance(rowCounter,col) > currentMax) {
currentMax = matrix.getDistance(rowCounter,col);
}
}
return currentMax;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy