// net.maizegenetics.pangenome.api.VariantUtils
// From the "phg" artifact (Maven / Gradle / Ivy): PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.api;
import com.google.common.collect.ImmutableMap;
import htsjdk.variant.variantcontext.*;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.snp.score.AlleleDepthUtil;
import net.maizegenetics.pangenome.api.HaplotypeNode.VariantInfo;
import net.maizegenetics.pangenome.db_loading.VariantsProcessingUtils;
import net.maizegenetics.util.Tuple;
import org.apache.log4j.Logger;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Utility methods for decoding the long-encoded variants / reference blocks stored with PHG
 * haplotypes, and for loading variant and allele records from the PHG database.
 *
 * <p>Encoding (see {@link #decodeLongVariant(Long)}):
 * <ul>
 *   <li>non-negative long = variant: 4 bytes variant id | 1 byte refDepth | 1 byte altDepth | 2 bytes unused</li>
 *   <li>negative long (sign bit set) = reference block: 1 bit sign | 23 bits length | 1 byte DP | 4 bytes position</li>
 * </ul>
 */
public class VariantUtils {

    private static final Logger myLogger = Logger.getLogger(VariantUtils.class);

    /** Maximum fraction of total read depth the minority allele may carry before a call is set to missing. */
    public static double maxError = 0.2;

    /**
     * Decodes a list of long-encoded variants / reference blocks into htsjdk VariantContexts.
     *
     * @param variantList    long-encoded entries; negative values encode reference ranges,
     *                       non-negative values encode variants
     * @param chromosome     chromosome name used for every emitted VariantContext
     * @param refSequence    reference genome, used to fetch the allele at a reference-block start
     * @param taxon          sample name for the single genotype in each VariantContext
     * @param variantMapping variant id -> DBVariant record (alleles and start position)
     * @return one VariantContext per encoded entry, in input order
     */
    public static List<VariantContext> extractOutVariantContext(List<Long> variantList, String chromosome,
            GenomeSequence refSequence, String taxon, Map<Integer, DBVariant> variantMapping) {
        List<VariantContext> vcs = new ArrayList<>();
        // Loop through the variantList
        for (Long variantEncoded : variantList) {
            if (variantEncoded < 0) {
                // If variantEncoded is a negative number, it is a reference range
                vcs.add(parseRefRangeLong(variantEncoded, chromosome, refSequence, taxon));
            } else {
                vcs.add(parseVariant(variantEncoded, chromosome, taxon, variantMapping));
            }
        }
        return vcs;
    }

    /**
     * Builds a reference-block VariantContext from its long encoding.
     * Layout: 1 bit sign | 2 byte 7 bit (23-bit) length | 1 byte DP | 4 bytes position.
     */
    private static VariantContext parseRefRangeLong(long refBlockEncoded, String chromosome,
            GenomeSequence refSequence, String taxon) {
        // BUGFIX: the position occupies the low 4 bytes; the old mask 0xFFFF truncated any
        // position > 65,535 (decodeLongVariant reads the full 32 bits for the same field).
        int position = (int) (refBlockEncoded & 0xFFFFFFFFL);
        int depth = AlleleDepthUtil.depthByteToInt((byte) ((refBlockEncoded >>> 32) & 0xFF));
        // BUGFIX: the length field is 2 bytes + 7 bits = 23 bits wide; the old mask 0x7FF
        // truncated block lengths > 2,047. The unsigned shift also discards the sign bit.
        int length = (int) ((refBlockEncoded >>> 40) & 0x7FFFFF);
        Allele firstRefAllele = Allele.create(refSequence.genotypeAsString(Chromosome.instance(chromosome), position), true);
        Genotype gt = new GenotypeBuilder().name(taxon).alleles(Arrays.asList(firstRefAllele)).DP(depth).make();
        VariantContextBuilder vcb = new VariantContextBuilder().chr(chromosome)
                .start(position)
                .stop(position + length)
                .attribute("END", position + length)
                .alleles(Arrays.asList(firstRefAllele))
                .genotypes(gt);
        return vcb.make();
    }

    /**
     * Builds a variant VariantContext from its long encoding.
     * Layout: variant id 4 bytes | refDepth 1 byte | altDepth 1 byte | 2 bytes unused.
     * The genotype is called alt when altDepth > refDepth, otherwise ref.
     */
    private static VariantContext parseVariant(long variantEncoded, String chromosome, String taxon,
            Map<Integer, DBVariant> variantMapping) {
        // BUGFIX: the variant id occupies the full high 4 bytes (both getAllVariantIds
        // overloads use (int)(val >> 32)); the old mask 0xFFFF truncated ids > 65,535.
        int variantID = (int) (variantEncoded >>> 32);
        int refDepth = AlleleDepthUtil.depthByteToInt((byte) ((variantEncoded >>> 24) & 0xFF));
        int altDepth = AlleleDepthUtil.depthByteToInt((byte) ((variantEncoded >>> 16) & 0xFF));
        DBVariant currentVariantRecord = variantMapping.get(variantID);
        GenotypeBuilder genotypeBuilder = new GenotypeBuilder().name(taxon)
                .DP(refDepth + altDepth)
                .AD(new int[]{refDepth, altDepth});
        if (altDepth > refDepth) {
            genotypeBuilder.alleles(Arrays.asList(currentVariantRecord.getMyAltAllele()));
        } else {
            genotypeBuilder.alleles(Arrays.asList(currentVariantRecord.getMyRefAllele()));
        }
        VariantContextBuilder vcb = new VariantContextBuilder()
                .chr(chromosome)
                .start(currentVariantRecord.getMyStartPosition())
                .stop(currentVariantRecord.getMyStartPosition() + currentVariantRecord.getMyRefAllele().length() - 1)
                .alleles(Arrays.asList(currentVariantRecord.getMyRefAllele(), currentVariantRecord.getMyAltAllele()))
                .genotypes(genotypeBuilder.make());
        return vcb.make();
    }

    /**
     * Ensures every variant id referenced by the graph is present in Variant.masterVariantMap,
     * loading missing entries from the PHG database.
     */
    public static void addGraphVariantsToVariantMap(HaplotypeGraph hapGraph, Connection dbConn) {
        Set<Integer> variantIds = getAllVariantIds(hapGraph);
        if (variantIds.size() == 0) variantIds = getAllVariantIds(hapGraph, dbConn); // the variant lists were not loaded in PHG
        if (variantIds.size() > 0) addVariantIdsToVariantMap(variantIds, dbConn);
    }

    /**
     * Loads the given variant ids (with their ref/alt alleles) from the database into
     * Variant.masterVariantMap. Ids already present in the map are skipped.
     *
     * @throws IllegalArgumentException for a non-positive id, an id with no database row,
     *                                  or any SQL failure
     */
    private static void addVariantIdsToVariantMap(Collection<Integer> variantIds, Connection dbConn) {
        Map<Integer, AlleleInfo> alleleMap = getAlleleMap(dbConn);
        // First get variant info using connection, then get the alleles not in db
        String sqlString = "SELECT chrom, position, ref_allele_id, alt_allele_id FROM variants WHERE variant_id = ?";
        try (PreparedStatement st = dbConn.prepareStatement(sqlString)) {
            for (Integer id : variantIds) {
                // no need to query db if the variant is already in the map
                if (Variant.masterVariantMap.containsKey(id)) continue;
                if (id < 1) {
                    throw new IllegalArgumentException("addVariantIdsToVariantMap: Bad value for variantID: " + id);
                }
                st.setInt(1, id);
                int refAlleleId;
                int altAlleleId;
                String chrom;
                int pos;
                try (ResultSet rs = st.executeQuery()) {
                    if (!rs.next()) {
                        throw new IllegalArgumentException("addVariantIdsToVariantMap: no values returned for variantId " + id);
                    }
                    refAlleleId = rs.getInt("ref_allele_id");
                    altAlleleId = rs.getInt("alt_allele_id");
                    chrom = rs.getString("chrom");
                    pos = rs.getInt("position");
                } // result set closed here (even on error paths) to allow the allele query below
                // This method goes to the DB if the allele data is not on the map.
                Tuple<AlleleInfo, AlleleInfo> refAltAlleles = getAlleleInfoFromDB(dbConn, alleleMap, refAlleleId, altAlleleId);
                Variant.masterVariantMap.put(id, new Variant(id, chrom, pos, refAltAlleles.getX(), refAltAlleles.getY()));
            }
        } catch (SQLException e) {
            throw new IllegalArgumentException("Error querying PHG db getting variant data.", e);
        }
    }

    /**
     * Queries the variants table, optionally restricted to a chromosome and position range,
     * and returns an immutable map of variant id -> Variant.
     *
     * @param chromosome chromosome to filter on, or null for all chromosomes
     * @param start      start position (inclusive); negative means unbounded
     * @param end        end position (inclusive); negative means unbounded
     * @param database   open PHG database connection
     * @return immutable map of variant id to Variant
     * @throws IllegalArgumentException when end < start or on any SQL failure
     */
    public static Map<Integer, Variant> variantIdsToVariantMap(String chromosome, int start, int end, Connection database) {
        int startPos = start < 0 ? -1 : start;
        int endPos = end < 0 ? -1 : end;
        if (endPos < startPos) {
            throw new IllegalArgumentException("VariantUtils: variantIdsToVariantMap: end: " + end + " must be greater than start: " + start);
        }
        Map<Integer, AlleleInfo> alleleMap = getAlleleMap(database);
        // First get variant info using connection, then get the alleles not in db.
        // NOTE(review): chromosome is concatenated into the SQL string (a PreparedStatement
        // reportedly returns empty results with the sqlite driver - see getAllVariantIds);
        // do not pass untrusted input as the chromosome name.
        StringBuilder sqlString = new StringBuilder();
        sqlString.append("SELECT variant_id, chrom, position, ref_allele_id, alt_allele_id FROM variants");
        if (chromosome != null) {
            sqlString.append(" WHERE chrom = '");
            sqlString.append(chromosome);
            sqlString.append("'");
            if (startPos > -1) {
                sqlString.append(" AND position BETWEEN ");
                sqlString.append(startPos);
                sqlString.append(" AND ");
                sqlString.append(endPos);
            }
        }
        sqlString.append(";");
        String query = sqlString.toString();
        myLogger.info("variantIdsToVariantMap: query statement: " + query);
        ImmutableMap.Builder<Integer, Variant> result = new ImmutableMap.Builder<>();
        // FIX: the Statement is now closed via try-with-resources (it previously leaked).
        try (Statement stmt = database.createStatement(); ResultSet rs = stmt.executeQuery(query)) {
            while (rs.next()) {
                int variantId = rs.getInt("variant_id");
                int refAlleleId = rs.getInt("ref_allele_id");
                int altAlleleId = rs.getInt("alt_allele_id");
                String chrom = rs.getString("chrom");
                int pos = rs.getInt("position");
                // This method goes to the DB if the allele data is not on the map.
                Tuple<AlleleInfo, AlleleInfo> refAltAlleles = getAlleleInfoFromDB(database, alleleMap, refAlleleId, altAlleleId);
                result.put(variantId, new Variant(variantId, chrom, pos, refAltAlleles.getX(), refAltAlleles.getY()));
            }
        } catch (SQLException e) {
            myLogger.error(e.getMessage(), e);
            throw new IllegalArgumentException("VariantUtils: variantIdsToVariantMap: Error querying PHG db getting variant data.", e);
        }
        return result.build();
    }

    /**
     * Collects the distinct variant ids from the byte-encoded variant lists already loaded
     * on the graph's haplotype nodes. Empty when no node carries a variant list.
     */
    public static Set<Integer> getAllVariantIds(HaplotypeGraph hapGraph) {
        return hapGraph.nodeStream()
                .map(hn -> hn.byteEncodedVariants())
                .filter(ba -> ba != null)
                .flatMap(byteArray -> VariantsProcessingUtils.decodeByteArrayToVariantLongList(byteArray).stream())
                .filter(val -> val >= 0) // negative longs encode reference blocks, not variants
                .mapToInt(val -> (int) (val >> 32)) // variant id lives in the high 4 bytes
                .boxed()
                .collect(Collectors.toSet());
    }

    // NOTE: for the sqlite db, using a PreparedStatement for the select is returning an
    // empty result set. Consequently the select is created without one, by inlining the ids.
    /**
     * Fetches the variant ids for the graph's nodes directly from the haplotypes table,
     * batching the node ids into IN (...) clauses of at most 5000 ids.
     */
    public static Set<Integer> getAllVariantIds(HaplotypeGraph hapGraph, Connection dbConn) {
        int batchSize = 5000;
        int[] ids = hapGraph.nodeStream().mapToInt(node -> node.id()).toArray();
        int numberOfIds = ids.length;
        int numberOfBatches = numberOfIds / batchSize;
        Set<Integer> varidSet = new HashSet<>();
        try {
            for (int ndx = 0; ndx < numberOfBatches + 1; ndx++) {
                int start = ndx * batchSize;
                int end = Math.min(start + batchSize, ids.length);
                // BUGFIX: when numberOfIds is 0 or an exact multiple of batchSize the final
                // iteration is empty; skip it to avoid an invalid "IN ()" clause.
                if (start >= end) continue;
                int[] idSubset = Arrays.copyOfRange(ids, start, end);
                String inString = Arrays.stream(idSubset).mapToObj(Integer::toString).collect(Collectors.joining(","));
                StringBuilder sb = new StringBuilder();
                sb.append("SELECT variant_list FROM haplotypes WHERE haplotypes_id IN (");
                sb.append(inString);
                sb.append(")");
                // FIX: Statement/ResultSet now closed via try-with-resources (the Statement leaked).
                try (Statement stmt = dbConn.createStatement(); ResultSet rs = stmt.executeQuery(sb.toString())) {
                    while (rs.next()) {
                        byte[] encodedVariants = rs.getBytes("variant_list");
                        if (encodedVariants != null && encodedVariants.length > 0) {
                            List<Long> longList = VariantsProcessingUtils.decodeByteArrayToVariantLongList(encodedVariants);
                            for (Long val : longList) {
                                // negative values are reference ranges - we only want variants
                                // (>= 0 for consistency with getAllVariantIds(HaplotypeGraph))
                                if (val >= 0)
                                    varidSet.add((int) (val >> 32));
                            }
                        }
                    }
                }
            }
        } catch (SQLException sqle) {
            throw new IllegalArgumentException("Error processing sql statement to select variant_list from haplotypes for a specific haplotypes_id: ", sqle);
        }
        myLogger.info("getAllVariantIds: size of varidSet to return: " + varidSet.size());
        return varidSet;
    }

    /**
     * Runs the caller-supplied query, whose first column must be a byte-encoded variant list,
     * and returns the distinct variant ids found across all rows.
     */
    public static Set<Integer> getAllVariantIds(Connection dbConn, String variantQuery) {
        Set<Integer> varidSet = new HashSet<>();
        try (Statement stmt = dbConn.createStatement(); ResultSet rs = stmt.executeQuery(variantQuery)) {
            while (rs.next()) {
                List<Long> varLongs = VariantsProcessingUtils.decodeByteArrayToVariantLongList(rs.getBytes(1));
                List<Integer> ids = varLongs.stream()
                        // BUGFIX: negative longs encode reference blocks; for those
                        // decodeLongVariant(val)[1] is the block LENGTH, not a variant id,
                        // so they must be filtered out before collecting ids.
                        .filter(val -> val >= 0)
                        .mapToInt(val -> decodeLongVariant(val)[1])
                        .boxed().collect(Collectors.toList());
                varidSet.addAll(ids);
            }
        } catch (SQLException e) {
            throw new IllegalArgumentException("Problem querying database using: " + variantQuery, e);
        }
        return varidSet;
    }

    /**
     * @param variant long-encoded variant or reference block
     * @return int[] containing variant data. The content depends on whether variant is positive or negative.
     *
     * If variant is positive then the long encodes data for a variant.
     * The return int[] is 1, variant id, refDepth, altDepth.
     *
     * If the variant is negative then the long encodes data for a reference block.
     * The return int[] is -1, block length, read depth, the block chromosomal position.
     *
     * Depth values are the raw encoded bytes (no AlleleDepthUtil expansion is applied here).
     */
    public static int[] decodeLongVariant(Long variant) {
        int[] info = new int[4];
        long vmlong = variant.longValue();
        if (variant >= 0) {
            info[0] = 1;
            // Variant: 4 bytes = variant_mapping table id | 1 byte refDepth | 1 byte altDepth | 1 isIndel | 1 byte unused
            vmlong >>= 16;
            byte altDepthByte = (byte) (vmlong & 0xFF); // to correctly handle negative numbers, load first into byte
            info[3] = (int) altDepthByte; // altDepth
            vmlong >>= 8;
            byte refDepthByte = (byte) (vmlong & 0xFF);
            info[2] = (int) refDepthByte; // refDepth
            vmlong >>= 8;
            info[1] = (int) vmlong; // variant id from database
        } else {
            info[0] = -1;
            // ref: 1 bit = ref flag | 2 bytes 7 bits = refLength | 1 byte = refDepth | 4 bytes = position on chrom
            vmlong ^= 1L << 63; // clear the sign bit so the arithmetic shifts below stay non-negative
            info[3] = (int) (vmlong & 0xFFFFFFFFL); // position (low 4 bytes)
            vmlong >>= 32;
            byte refDepthByte = (byte) (vmlong & 0xFF);
            info[2] = (int) refDepthByte;
            vmlong >>= 8;
            info[1] = (int) vmlong; // block length (remaining 23 bits)
        }
        return info;
    }

    /**
     * Loads the small "seed" alleles (ids below 3906) into a map of allele id -> AlleleInfo.
     * Alleles not in this map are fetched on demand by getAlleleInfoFromDB.
     */
    public static Map<Integer, AlleleInfo> getAlleleMap(Connection dbConn) {
        Map<Integer, AlleleInfo> alleleInfoMap = new HashMap<>();
        // This assumes 5-mers were initially loaded to the DB. Only those alleles are initially stored
        // in the map. All others will be queried from the DB as needed.
        // The initial alleles are all k-mers (k = 1..5) over the alphabet A,C,G,T,N:
        // 5 + 5^2 + 5^3 + 5^4 + 5^5 = 5 + 25 + 125 + 625 + 3125 = 3905 alleles, hence ids < 3906.
        String sql = "SELECT allele_id, allele_string, display_string, len FROM alleles where allele_id < 3906";
        try (Statement stmt = dbConn.createStatement(); ResultSet rs = stmt.executeQuery(sql)) {
            while (rs.next()) {
                int id = rs.getInt(1);
                int len = rs.getInt("len");
                if (len < 100) {
                    // only adding alleles with length < 100.
                    // If the db was created with 5-mers as the initial alleles, all will be less than 100 (ie 5)
                    alleleInfoMap.put(id, new AlleleInfo(id, rs.getBytes(2), rs.getString(3), len));
                }
            }
        } catch (SQLException e) {
            throw new IllegalArgumentException("Error executing query: " + sql, e);
        }
        return alleleInfoMap;
    }

    /**
     * Calls a genotype from ref/alt read depths: the majority allele is returned only when
     * the minority depth is below maxError of the total depth; otherwise the call is missing.
     * (The method name's "Genotpe" typo is kept intentionally for source compatibility.)
     */
    public static String assignGenotpe(String refAllele, String altAllele, int refDepth, int altDepth) {
        if (refDepth > altDepth && altDepth < maxError * (refDepth + altDepth)) {
            return refAllele;
        }
        if (refDepth < altDepth && refDepth < maxError * (refDepth + altDepth)) {
            return altAllele;
        }
        return VariantInfo.missing;
    }

    /**
     * Takes refAlleleId and altAlleleId. Looks for the corresponding data in the alleleMap.
     * If not found, the allele data is queried from the DB; short alleles (length < 100)
     * fetched this way are cached back into alleleMap.
     *
     * @param dbConn      open PHG database connection
     * @param alleleMap   allele id -> AlleleInfo cache (mutated: fetched short alleles are added)
     * @param refAlleleId reference allele id
     * @param altAlleleId alternate allele id
     * @return Tuple of (ref AlleleInfo, alt AlleleInfo)
     */
    public static Tuple<AlleleInfo, AlleleInfo> getAlleleInfoFromDB(Connection dbConn, Map<Integer, AlleleInfo> alleleMap, int refAlleleId, int altAlleleId) {
        StringBuilder queryIds = new StringBuilder();
        AlleleInfo refAlleleInfo = alleleMap.get(refAlleleId);
        AlleleInfo altAlleleInfo = alleleMap.get(altAlleleId);
        if (refAlleleInfo == null) {
            queryIds.append(Integer.toString(refAlleleId));
            if (altAlleleInfo == null) {
                queryIds.append(",").append(Integer.toString(altAlleleId));
            }
        } else if (altAlleleInfo == null) {
            queryIds.append(Integer.toString(altAlleleId));
        }
        if (queryIds.length() > 0) {
            // query the DB for the info missing from the map (ids are ints, so safe to inline)
            StringBuilder querySB = new StringBuilder();
            querySB.append("SELECT allele_id, allele_string, display_string, len FROM alleles where allele_id IN (");
            querySB.append(queryIds.toString());
            querySB.append(" );");
            // FIX: Statement now closed via try-with-resources (only the ResultSet was closed before).
            try (Statement stmt = dbConn.createStatement(); ResultSet rs = stmt.executeQuery(querySB.toString())) {
                while (rs.next()) {
                    int id = rs.getInt("allele_id");
                    if (id == refAlleleId) {
                        refAlleleInfo = new AlleleInfo(id, rs.getBytes("allele_string"), rs.getString("display_string"), rs.getInt("len"));
                        if (refAlleleInfo.length() < 100) {
                            // allele strings larger than 100 are probably unique so no need to add to map
                            alleleMap.put(id, refAlleleInfo);
                        }
                    }
                    if (id == altAlleleId) {
                        altAlleleInfo = new AlleleInfo(id, rs.getBytes("allele_string"), rs.getString("display_string"), rs.getInt("len"));
                        if (altAlleleInfo.length() < 100) {
                            // allele strings larger than 100 are probably unique so no need to add to map
                            alleleMap.put(id, altAlleleInfo);
                        }
                    }
                }
            } catch (SQLException sqle) {
                throw new IllegalArgumentException("Error executing query: " + querySB.toString(), sqle);
            }
        }
        return new Tuple<>(refAlleleInfo, altAlleleInfo);
    }
}