net.maizegenetics.pangenome.db_loading.VariantsProcessingUtils
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.db_loading;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.ByteBuffer;
import java.sql.Connection;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import org.apache.log4j.Logger;
import org.xerial.snappy.Snappy;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.hapcollapse.GVCFUtils;
import net.maizegenetics.util.Tuple;
/**
* This class contains methods to aid in processing a VariantContext list
* into the PHG db variants, alleles, and haplotypes tables
* @author lcj34
*
*/
public class VariantsProcessingUtils {
private static final Logger myLogger = Logger.getLogger(VariantsProcessingUtils.class);
/**
* createAlleleList returns an array list containing just the ref allele string,
* if the VariantContext record is a ref record, or both the ref allele and first
* alt allele, if the VariantContext record is a variant record.
*
* This method does not check whether the allele already exists in the db. The returned list
* is ALL alleles - this assumes they will be added via an INSERT/IGNORE db command.
*
* @param vc the VariantContext record to process
* @return list of allele base strings for the record
*/
public static List<String> createAlleleList(VariantContext vc) {
List<String> alleles = new ArrayList<>();
String refAllele = vc.getReference().getBaseString();
if (GVCFUtils.isReferenceRecord(vc)) {
// only need to check the reference
alleles.add(refAllele);
} else {
List<Allele> alt = vc.getAlternateAlleles();
String altAllele;
// We are only processing 1 alternate allele per record
if (alt.size() == 1) {
altAllele = alt.get(0).getBaseString();
} else {
if (!alt.get(0).getBaseString().equals("")) { // "<NON_REF>" gets stripped and becomes ""
altAllele = alt.get(0).getBaseString();
} else { // this assumes NON_REF was the first alt and it was stripped to ""
altAllele = alt.get(1).getBaseString();
}
}
alleles.add(refAllele); // per the javadoc and the VariantInfo overload, the ref allele is also returned for variant records
alleles.add(altAllele);
}
return alleles;
}
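
// Usage sketch (illustrative, not part of the original source): building a minimal
// htsjdk VariantContext and collecting its allele strings with the method above.
// The chromosome, position, and alleles below are made-up values.
private static void exampleCreateAlleleListFromVC() {
    VariantContext vc = new htsjdk.variant.variantcontext.VariantContextBuilder()
            .source("example")
            .chr("1")
            .start(100)
            .stop(100)
            .alleles("A", "G")
            .make();
    List<String> alleles = createAlleleList(vc); // for a variant record this includes the first alt allele
    myLogger.info("alleles: " + alleles);
}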
/**
* Method to create a list of alleles based on whether the variant info represents
* a ref or a variant record.
* @param vi the VariantInfo record to process
* @return list of allele strings: the ref allele, plus the alt allele if this is a variant
*/
public static List<String> createAlleleList(HaplotypeNode.VariantInfo vi) {
List<String> alleles = new ArrayList<>();
String refAllele = vi.refAlleleString();
alleles.add(refAllele);
if (vi.isVariant()) {
String altAllele = vi.altAlleleString();
alleles.add(altAllele);
}
return alleles;
}
/**
* Method will return a Tuple of the hash of (chrom, start_position, refAlleleID, altAlleleID)
* and a VariantMappingData record.
*
* The alleleHashMap should contain the initial alleles pre-populated. If the alleleId
* cannot be found in this hash map, the db will be queried. Generally 2/3 of all
* alleles will be found in the hashmap.
*
* @param chrom chromosome name for this variant
* @param alleleHashMap map of allele MD5 hash to allele_id
* @param vi the VariantInfo record to process
* @param dbConn an open connection to the PHG db
* @return Tuple of the variant hash and its VariantMappingData record
*/
public static Tuple<String, VariantMappingData> getVariantData(String chrom, Map<String, Integer> alleleHashMap, HaplotypeNode.VariantInfo vi,
        Connection dbConn) {
String refAllele = vi.refAlleleString();
int refDepth = vi.depth()[0]; // should always be the ref
int other = 0; // additional data as yet undefined.
Integer refAlleleID = alleleHashMap.get(AnchorDataPHG.getChecksumForString(refAllele,"MD5"));
if (refAlleleID == null) {
// get it from the db
refAlleleID = findAlleleIDFromDB(refAllele,dbConn);
}
if (!vi.isVariant()) {
if (refAlleleID == null || refAlleleID == 0) { // means we didn't correctly populate the allele table
throw new IllegalStateException("VariantProcessingUtils:getVariantData: no value for refSTring on allele map " );
}
Integer altAlleleID = alleleHashMap.get(AnchorDataPHG.getChecksumForString("none","MD5"));
if (altAlleleID == null) {
myLogger.debug("VariantProcessingUtils:getVariantData: altAlleleID is NULL, alleleHashMap size: " + alleleHashMap.keySet().size());
throw new IllegalStateException("VariantsProcessingUtils:getVariantData: 0 NULL alleleID for VC record: " + vi.toString());
}
int refLen = (vi.end() - vi.start()) + 1;
VariantMappingData vmd = new VariantMappingData(Position.of(chrom, vi.start()), refAlleleID, altAlleleID,refDepth,0, true,refLen,(byte)0, other);
StringBuilder hashSB = new StringBuilder();
hashSB.append(chrom).append(":");
hashSB.append(vi.start()).append(":");
hashSB.append(refAlleleID).append(":");
hashSB.append(altAlleleID).append(":");
hashSB.append("-1"); // this is default - will need to be passed in when the field is actually used
String vmHash = AnchorDataPHG.getChecksumForString(hashSB.toString(),"MD5");
return new Tuple<>(vmHash,vmd);
}
// not a reference record
Integer altAlleleID = null;
byte isIndel = 0;
// We are only processing 1 alternate allele per record, we need the altAlleleID, not the alt allele string
String altAllele = vi.altAlleleString();
isIndel = (altAllele.length() > 1) ? (byte)1 : 0;
altAlleleID = alleleHashMap.get(AnchorDataPHG.getChecksumForString(altAllele,"MD5"));
if (altAlleleID == null) {
// check the db
altAlleleID = findAlleleIDFromDB(altAllele, dbConn);
if (altAlleleID == null || altAlleleID == 0) { // means we didn't correctly populate the allele table
throw new IllegalStateException("VariantProcessingUtils:getVariantData: no value for altAllele on allele map " );
}
}
// Get alternate allele depth, create VariantMappingData record
int[] alleleDepth = vi.depth();
int altDepth = 0;
if (alleleDepth != null && alleleDepth.length > 1) {
    refDepth = alleleDepth[0];
    altDepth = alleleDepth[1]; // first entry is the ref depth, second is the alt depth
} else {
    refDepth = alleleDepth[0]; // only the ref depth is reported
}
int refLen = (vi.end() - vi.start()) + 1;
VariantMappingData vmd = new VariantMappingData(Position.of(chrom, vi.start()), refAlleleID, altAlleleID,refDepth,altDepth, false, refLen, isIndel,other);
StringBuilder hashSB = new StringBuilder();
hashSB.append(chrom).append(":");
hashSB.append(vi.start()).append(":");
hashSB.append(refAlleleID).append(":");
hashSB.append(altAlleleID).append(":");
hashSB.append("-1");
String vmHash = AnchorDataPHG.getChecksumForString(hashSB.toString(),"MD5");
return new Tuple<>(vmHash,vmd);
}
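
// Illustrative sketch (not part of the original source): the variant hash returned by
// getVariantData is the MD5 of "chrom:start:refAlleleID:altAlleleID:-1".
// The chromosome, position, and allele ids below are made-up values.
private static String exampleVariantHash() {
    String hashInput = "1" + ":" + 4523 + ":" + 17 + ":" + 42 + ":" + "-1";
    return AnchorDataPHG.getChecksumForString(hashInput, "MD5");
}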
/**
* Takes a list of (variant hash, VariantMappingData) tuples and creates a list
* of long values, one encoding the variant data for each entry.
* @param variantMappingDataList list of Tuples of variant_hash to VariantMappingData
* @param hashIDMap Map of variant_hash to variantID
* @return sorted list of encoded longs, one per entry
*/
public static List<Long> getLongListOfVariantData(List<Tuple<String, VariantMappingData>> variantMappingDataList,
        Map<String, Integer> hashIDMap) {
// The list should be sorted for later vcf reconstruction.
// Entries were processed in parallel, so added to list in unknown order.
Collections.sort(variantMappingDataList, (Tuple<String, VariantMappingData> vm1, Tuple<String, VariantMappingData> vm2) ->
        vm1.getY().compareTo(vm2.getY()));
// Create the list of long. Need the hash to get the Variant_Mapping table record ID
List<Long> vmLongList = new ArrayList<>();
for (Tuple<String, VariantMappingData> entry : variantMappingDataList) {
Integer vmID = hashIDMap.get(entry.getX());
if (vmID == null) vmID = 0; // 0 means we have a reference record, not a variant (ref records are not in the variant_mapping table)
long vmdLong = getLongFromVMData(vmID,entry.getY());
vmLongList.add(vmdLong);
}
return vmLongList;
}
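
// Usage sketch (illustrative, not part of the original source): encoding a single
// (hash, VariantMappingData) tuple to its long representation. All values are made up;
// in the loading pipeline the hash-to-id map comes from the variant_mapping table.
private static List<Long> exampleGetLongList() {
    VariantMappingData vmd = new VariantMappingData(Position.of("1", 4523), 17, 42, 10, 8, false, 1, (byte) 0, 0);
    String vmHash = AnchorDataPHG.getChecksumForString("1:4523:17:42:-1", "MD5");
    List<Tuple<String, VariantMappingData>> tuples = new ArrayList<>();
    tuples.add(new Tuple<>(vmHash, vmd));
    Map<String, Integer> hashIDMap = new java.util.HashMap<>();
    hashIDMap.put(vmHash, 101); // pretend variant_mapping id
    return getLongListOfVariantData(tuples, hashIDMap);
}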
/**
* Takes a variant_mapping ID and a VariantMappingData object and encodes them
* into a single long value.
* @param vmID variant_mapping table id for this variant (0 for a reference record)
* @param vmd the VariantMappingData to encode
* @return the encoded long
*/
public static long getLongFromVMData(int vmID, VariantMappingData vmd) {
long vmLong = 0;
byte refDepth = vmd.refDepth() > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)vmd.refDepth();
byte altDepth = vmd.altDepth() > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)vmd.altDepth();
if (vmd.isReference()) {
vmLong = getLongRefRecord(vmd.refLen(), vmd.refDepth(), vmd.position().getPosition());
} else {
vmLong = getLongVariantRecord(vmID, vmd.refDepth(), vmd.altDepth(),vmd.isIndel(),vmd.otherData());
}
return vmLong;
}
/**
* Takes a reference length, depth and start position on the chromosome.
* Returns an encoded long holding this information.
*
* format: 1 bit=ref flag | 2 bytes 7 bits=refLength | 1 byte=refDepth | 4 bytes=position on chrom
*
* @param refLen length of the reference block
* @param refDepth read depth for the reference allele
* @param startPos start position on the chromosome
* @return the encoded long
*/
public static long getLongRefRecord(int refLen, int refDepth, int startPos) {
long vmLong = 0;
byte refDepthByte = refDepth > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)refDepth;
// create long for reference record
// format: 1 bit=ref flag | 2 bytes 7 bits=refLength | 1 byte=refDepth | 4 bytes=position on chrom
vmLong = refLen; // refLength gets the 23 bits below the flag bit
vmLong = (vmLong << 8) + Byte.toUnsignedLong(refDepthByte); // refDepth gets the next byte
vmLong = (vmLong << 32) + startPos; // position on chrom gets last 4 bytes
vmLong |= 1L << 63; // set upper bit to indicate this is a reference record
return vmLong;
}
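
// Worked example (illustrative, not part of the original source): unpacking the fields
// from a reference-record long, following the bit layout documented above.
private static void exampleUnpackRefRecord() {
    long encoded = getLongRefRecord(150, 12, 1_000_000);
    boolean isRef = (encoded >>> 63) == 1L;            // top bit = reference flag
    int refLen = (int) ((encoded >>> 40) & 0x7FFFFF);  // 23 bits of refLength -> 150
    int refDepth = (int) ((encoded >>> 32) & 0xFF);    // 1 byte of refDepth -> 12
    int startPos = (int) (encoded & 0xFFFFFFFFL);      // low 4 bytes = position -> 1000000
    myLogger.info("isRef=" + isRef + " refLen=" + refLen + " refDepth=" + refDepth + " startPos=" + startPos);
}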
/**
* Takes a variantMapping id, reference depth, alternate depth, a flag indicating whether it is an indel, and a dummy int.
* Returns a long formatted with this data.
* format: 4 bytes=variant_mapping table id | 1 byte=refDepth | 1 byte=altDepth | 1 byte=isIndel | 1 byte=otherData (currently unused)
*
* @param vmID variant_mapping table id
* @param refDepth read depth for the reference allele
* @param altDepth read depth for the alternate allele
* @param isIndel 1 if the variant is an indel, 0 otherwise
* @param otherData additional data, as yet undefined
* @return the encoded long
*/
public static long getLongVariantRecord(int vmID, int refDepth, int altDepth, byte isIndel, int otherData) {
long vmLong = 0;
byte refDepthByte = refDepth > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)refDepth;
byte altDepthByte = altDepth > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)altDepth;
// create long for variant record
// format: 4 bytes=variant_mapping table id | 1 byte=refDepth | 1 byte=altDepth | 1 byte=isIndel | 1 byte=otherData (currently unused)
vmLong = vmID; // id gets 4 upper bytes
vmLong = (vmLong << 8) + Byte.toUnsignedLong(refDepthByte); // refDepth gets next byte
vmLong = (vmLong << 8) + Byte.toUnsignedLong(altDepthByte); // altDepth gets a byte
vmLong = (vmLong << 8) + Byte.toUnsignedLong((byte)isIndel); // isIndel gets a byte
vmLong = (vmLong << 8) + Byte.toUnsignedLong((byte)otherData); // undefined other data as last byte
return vmLong;
}
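
// Worked example (illustrative, not part of the original source): unpacking the fields
// from a variant-record long, following the byte layout documented above.
private static void exampleUnpackVariantRecord() {
    long encoded = getLongVariantRecord(101, 10, 8, (byte) 0, 0);
    int vmID = (int) (encoded >>> 32);               // upper 4 bytes = variant_mapping id -> 101
    int refDepth = (int) ((encoded >>> 24) & 0xFF);  // next byte -> 10
    int altDepth = (int) ((encoded >>> 16) & 0xFF);  // next byte -> 8
    int isIndel = (int) ((encoded >>> 8) & 0xFF);    // next byte -> 0
    int other = (int) (encoded & 0xFF);              // last byte -> 0
    myLogger.info("vmID=" + vmID + " refDepth=" + refDepth + " altDepth=" + altDepth
            + " isIndel=" + isIndel + " other=" + other);
}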
/**
* Method takes a List of Long objects and converts it to a Snappy-compressed byte array
* @param variantLongList list of encoded variant longs
* @return Snappy-compressed serialized byte array
*/
public static byte[] encodeVariantLongListToByteArray(List<Long> variantLongList) {
try {
ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
ObjectOutputStream objectStream = new ObjectOutputStream(byteStream);
objectStream.writeObject(variantLongList);
byte[] serializedBytes = byteStream.toByteArray();
objectStream.close();
byteStream.close();
return Snappy.compress(serializedBytes);
} catch (Exception exc) {
throw new IllegalStateException("VariantsProcessingUtils:encodeVariantLongListToByteArray: failed to encode bytes: " + exc.getMessage());
}
}
/**
* Method takes a Snappy-compressed byte array and decodes it into a List of Long objects
* @param encodedByteArray Snappy-compressed serialized byte array
* @return the decoded List of Longs
*/
public static List<Long> decodeByteArrayToVariantLongList(byte[] encodedByteArray) {
try {
encodedByteArray = Snappy.uncompress(encodedByteArray);
ByteArrayInputStream byteStream = new ByteArrayInputStream(encodedByteArray);
ObjectInputStream objectStream = new ObjectInputStream(byteStream);
List<Long> variantLongList = (List<Long>) objectStream.readObject();
objectStream.close();
byteStream.close();
return variantLongList;
}
catch(Exception exc) {
throw new IllegalStateException("VariantsProcessingUtils:decodeByteArrayToVariantLongList: failed to decode bytes: " + exc.getMessage());
}
}
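
// Round-trip sketch (illustrative, not part of the original source): the Snappy
// encode/decode helpers above are inverses of each other.
private static void exampleSnappyRoundTrip() {
    List<Long> longs = new ArrayList<>();
    longs.add(42L);
    longs.add(7L);
    byte[] compressed = encodeVariantLongListToByteArray(longs);
    List<Long> restored = decodeByteArrayToVariantLongList(compressed);
    myLogger.info("Snappy round trip ok: " + longs.equals(restored));
}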
/**
* @param ListOfLongs a List or Collection of Longs to be converted into a byte[] array
* @return a byte[] array containing the big-endian encoded longs
*/
public static byte[] longListToByteArray(Collection<Long> ListOfLongs) {
ByteBuffer tempBuffer = ByteBuffer.allocate(ListOfLongs.size() * 8);
for (Long val : ListOfLongs) tempBuffer.putLong(val);
return tempBuffer.array();
}
/**
* @param byteArray an array of bytes encoding longs
* @return a List of the longs encoded by the bytes
*/
public static List<Long> byteArrayToLongList(byte[] byteArray) {
int arraylen = byteArray.length;
int numberOfLongs = arraylen / 8;
List<Long> longList = new ArrayList<>(numberOfLongs);
ByteBuffer tempBuffer = ByteBuffer.wrap(byteArray);
for (int i = 0; i < numberOfLongs; i++) {
longList.add(tempBuffer.getLong());
}
return longList;
}
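
// Round-trip sketch (illustrative, not part of the original source): the fixed-width
// converters above write each long as 8 big-endian bytes and read them back in order.
private static void exampleByteBufferRoundTrip() {
    List<Long> longs = new ArrayList<>();
    longs.add(Long.MAX_VALUE);
    longs.add(-1L);
    byte[] bytes = longListToByteArray(longs);        // 16 bytes total
    List<Long> restored = byteArrayToLongList(bytes); // [Long.MAX_VALUE, -1]
    myLogger.info("ByteBuffer round trip ok: " + longs.equals(restored));
}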
/**
* From an allele string, compute the hash value and search for
* a corresponding ID in the DB alleles table.
*
* @param allele the allele base string to look up
* @param dbConn an open connection to the PHG db
* @return the allele_id, or 0 if the allele is not present in the alleles table
*/
public static int findAlleleIDFromDB(String allele, Connection dbConn) {
int allele_id = 0;
StringBuilder sb = new StringBuilder();
String allele_hash = AnchorDataPHG.getChecksumForString(allele, "MD5");
sb.append("select allele_id from alleles where allele_hash='");
sb.append(allele_hash);
sb.append("';");
String query = sb.toString();
try (ResultSet rs = dbConn.createStatement().executeQuery(query)) {
if (rs.next()) {
allele_id = rs.getInt("allele_id");
}
return allele_id;
} catch (Exception exc) {
throw new IllegalStateException("findAlleleIDFromDB - allele not found in alleles tables: " + allele);
}
}
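
// Usage sketch (illustrative, not part of the original source): looking up an allele id
// with an open PHG db connection. The jdbc URL below is a made-up placeholder.
private static void exampleFindAlleleID() throws java.sql.SQLException {
    try (Connection conn = java.sql.DriverManager.getConnection("jdbc:sqlite:/path/to/phg.db")) {
        int alleleId = findAlleleIDFromDB("A", conn);
        myLogger.info("allele_id for 'A': " + alleleId + " (0 means not found)");
    }
}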
}