All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.db_loading.VariantsProcessingUtils Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
/**
 * 
 */
package net.maizegenetics.pangenome.db_loading;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.ByteBuffer;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;

import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.pangenome.hapcollapse.GVCFUtils;
import net.maizegenetics.util.Tuple;

import org.apache.log4j.Logger;
import org.xerial.snappy.Snappy;

/**
 * This class contains methods to aid in processing a VariantContext list
 * into the PHG db variants, alleles, and haplotypes tables
 * @author lcj34
 *
 */
public class VariantsProcessingUtils {

    private static final Logger myLogger = Logger.getLogger(VariantsProcessingUtils.class);
  
    /**
     * createALleleList returns an array list containing just the ref allele string,
     * if the VariantContext record is a ref record.  Or both the ref allele and first
     * alt allele, if the VariantContext record is a variant record.
     *
     * THis method does not check if the allele exists.  The returned list
     * is ALL alleles - this assumes they will be added via an INSERT/IGNORE db command.
     *
     * @param vc
     * @return
     */
    public static List createAlleleList( VariantContext vc) {
        List alleles = new ArrayList<>();
        String refAllele = vc.getReference().getBaseString();

        if (GVCFUtils.isReferenceRecord(vc)) {
            // only need to check the reference
            alleles.add(refAllele);

        } else {
            List alt = vc.getAlternateAlleles();
            String altAllele;
            // We are only processing 1 alternate allele per record
            if (alt.size() == 1) {
                altAllele = alt.get(0).getBaseString();
            } else {
                if (!alt.get(0).getBaseString().equals("")) { // "" gets stripped and becomes ""
                    altAllele = alt.get(0).getBaseString();

                } else { // this assumes NON_REF was the first alt and it was stripped to ""
                    altAllele = alt.get(1).getBaseString();
                }
            }
            alleles.add(altAllele);
        }
        return alleles;
    }

    /**
     * Method to create a list of alleles based on if the variant info is a ref or variant.
     * @param vi
     * @return
     */
    public static List createAlleleList( HaplotypeNode.VariantInfo vi) {
        List alleles = new ArrayList<>();
        String refAllele = vi.refAlleleString();
        alleles.add(refAllele);
        if (vi.isVariant()) {
            String altAllele = vi.altAlleleString();
            alleles.add(altAllele);

        }
        return alleles;
    }

    /**
     * Method will return a Tuple of the hash of (chrom, start_position, refAlleleID, altAlleleID)
     * and a VariantMappingData record
     * 
     * The alleleHashMap should contain the initial alleles pre-populated. If the alleleId
     * cannot be found from this hash map, the db will be queried.  Generally 2/3 of all
     * alleles will be found on the hashmap.
     * 
     * @param alleleHashMap
     * @param vi
     * @param dbConn
     * @return
     */
    public static Tuple getVariantData(String chrom, Map alleleHashMap, HaplotypeNode.VariantInfo vi,
                                                                  Connection dbConn) {

        String refAllele = vi.refAlleleString();


        int refDepth = vi.depth()[0]; // should always be the ref
        int other = 0; // additional data as yet undefined.
        Integer refAlleleID = alleleHashMap.get(AnchorDataPHG.getChecksumForString(refAllele,"MD5"));
        if (refAlleleID == null) {
            // get it from the db
            refAlleleID = findAlleleIDFromDB(refAllele,dbConn);
        }
        if (!vi.isVariant()) {

            if (refAlleleID == null || refAlleleID == 0) { // means we didn't correctly populate the allele table
                throw new IllegalStateException("VariantProcessingUtils:getVariantData: no value for refSTring on allele map " );
            }

            Integer altAlleleID = alleleHashMap.get(AnchorDataPHG.getChecksumForString("none","MD5"));

            if (altAlleleID == null) {
                myLogger.debug("VariantProcessingUtils:getVariantData: altAlleleID is NULL, alleleHashMap size: " + alleleHashMap.keySet().size());
                throw new IllegalStateException("VariantsProcessingUtils:getVariantData: 0 NULL alleleID for VC record: " + vi.toString());
            }
            int refLen = (vi.end() - vi.start()) + 1;

            VariantMappingData vmd = new VariantMappingData(Position.of(chrom, vi.start()), refAlleleID, altAlleleID,refDepth,0, true,refLen,(byte)0, other);
            StringBuilder hashSB = new StringBuilder();
            hashSB.append(chrom).append(":");
            hashSB.append(vi.start()).append(":");
            hashSB.append(refAlleleID).append(":");
            hashSB.append(altAlleleID).append(":");
            hashSB.append("-1"); // this is default - will need to be passed in when the field is actually used
            String vmHash = AnchorDataPHG.getChecksumForString(hashSB.toString(),"MD5");

            return new Tuple<>(vmHash,vmd);
        }

        // not a reference record
        Integer altAlleleID = null;
        byte isIndel = 0;

        // We are only processing 1 alternate allele per record, we need the altAlleleID, not the alt allele string
        String altAllele = vi.altAlleleString();
        isIndel =  (altAllele.length() > 1) ? (byte)1 : 0;

        altAlleleID = alleleHashMap.get(AnchorDataPHG.getChecksumForString(altAllele,"MD5"));
        if (altAlleleID == null) {
            // check the db
            altAlleleID = findAlleleIDFromDB(altAllele, dbConn);
            if (altAlleleID == null || altAlleleID == 0) { // means we didn't correctly populate the allele table
                throw new IllegalStateException("VariantProcessingUtils:getVariantData: no value for altAllele on allele map " );
            }

        }
        // Get alternate allele depth, create VariantMappingData record
        int[] alleleDepth = vi.depth();
        int altDepth = 1;
        if (alleleDepth != null && alleleDepth.length > 1) {
            refDepth = alleleDepth[0];
            altDepth = alleleDepth[1]; // first one is ref, second one is alt
        }
        else {
            refDepth = alleleDepth[0];
            altDepth = 0;
        }

        int refLen = (vi.end() - vi.start()) + 1;
        VariantMappingData vmd = new VariantMappingData(Position.of(chrom, vi.start()), refAlleleID, altAlleleID,refDepth,altDepth, false, refLen, isIndel,other);
        StringBuilder hashSB = new StringBuilder();
        hashSB.append(chrom).append(":");
        hashSB.append(vi.start()).append(":");
        hashSB.append(refAlleleID).append(":");
        hashSB.append(altAlleleID).append(":");
        hashSB.append("-1");
        String vmHash = AnchorDataPHG.getChecksumForString(hashSB.toString(),"MD5");

        return new Tuple<>(vmHash,vmd);

    }
    /**
     * Takes a list of variantIdhash to VariantMappingData and creates a list
     * of "long" values, that represents the variant context for each entry
     * @param variantMappingDataList
     * @param hashIDMap  Map of variant_hash to variantID
     * @return
     */
    public static List getLongListOfVariantData (List> variantMappingDataList,
            Map hashIDMap) {
        
        // The list should be sorted for later vcf reconstruction.
        // Entries were processed in parallel, so added to list in unknown order.
        Collections.sort(variantMappingDataList, (Tuple vm1, Tuple vm2 )->
                            vm1.getY().compareTo(vm2.getY()));
        
        // Create the list of long.  Need the hash to get the Variant_Mapping table record ID
        List vmLongList = new ArrayList<>();
        for (Tuple entry : variantMappingDataList) {           
            Integer vmID = hashIDMap.get(entry.getX());
            if (vmID == null) vmID = 0; // 0  means we have a reference record, not a variant (ref records are not in variant_mapping table
            long vmdLong = getLongFromVMData(vmID,entry.getY()); 
            vmLongList.add(vmdLong);
        }
        
        return vmLongList; 
    }
    
    /**
     * Takes an ID and a VariantMappingData object and creates a list
     * of variant_ids with additional data.
     * @param vmID
     * @param vmd
     * @return
     */
    public static long getLongFromVMData(int vmID, VariantMappingData vmd) {

        long vmLong = 0;
        byte refDepth = vmd.refDepth() > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)vmd.refDepth();
        byte altDepth = vmd.altDepth() > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)vmd.altDepth();
        if (vmd.isReference()) {
            vmLong = getLongRefRecord(vmd.refLen(), vmd.refDepth(), vmd.position().getPosition());
        } else {
            vmLong = getLongVariantRecord(vmID, vmd.refDepth(), vmd.altDepth(),vmd.isIndel(),vmd.otherData());
        }
        
        return vmLong;
    }

    /**
     * Takes a reference length, depth and start position on the chromosome.
     * Returns an encode long holding this information.
     *
     * format:  1bit=ref | 2 bytes 7 bits = refLength | 1 bytes=refDepth | 4 bytes=position on chrom
     *
     * @param refLen
     * @param refDepth
     * @param startPos
     * @return
     */
    public static long getLongRefRecord(int refLen, int refDepth, int startPos) {
        long vmLong = 0;
        byte refDepthByte = refDepth > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)refDepth;

        // create long for reference record
        // format:  1bit=ref | 2 bytes 7 bits = refLength | 1 bytes=refDepth | 4 bytes=position on chrom
        vmLong = refLen; // length get upper 3 bytes
        vmLong = (vmLong << 8) + Byte.toUnsignedLong(refDepthByte); // refDepth get next byte
        vmLong = (vmLong << 32) + startPos; // position on chrom gets last 4 bytes
        vmLong |= 1L << 63; // set upper bit to indicate this is a reference record

        return vmLong;
    }

    /**
     * Takes a variantMapping id, reference depth, alternate depth, indication as to if is an indel, and a dummy int
     * Returns a long formatted with this data.
     * format: 4 bytes= variant_mapping table id | 1 byte=refDepth | 1 byte=altDepth | 1 bytes=isIndel | 1 byte unused
     *
     * @param vmID
     * @param refDepth
     * @param altDepth
     * @param isIndel
     * @param otherData
     * @return
     */
    public static long getLongVariantRecord(int vmID, int refDepth, int altDepth, byte isIndel, int otherData) {
        long vmLong = 0;
        byte refDepthByte = refDepth > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)refDepth;
        byte altDepthByte = altDepth > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)altDepth;

        // create long for variant record
        // format: 4 bytes= variant_mapping table id | 1 byte=refDepth | 1 byte=altDepth | 1 bytes=isIndel | 1 byte unused
        vmLong = vmID; // id gets 4 upper bytes
        vmLong = (vmLong << 8) + Byte.toUnsignedLong(refDepthByte); // refDepth gets next byte
        vmLong = (vmLong << 8) + Byte.toUnsignedLong(altDepthByte); // altDepth gets a byte
        vmLong = (vmLong << 8) + Byte.toUnsignedLong((byte)isIndel); // isIndel gets a byte
        vmLong = (vmLong << 8) + Byte.toUnsignedLong((byte)otherData); // undefined other data as last byte

        return vmLong;
    }

    /**
     * Method takes a List of Long objects and converts to a Snappy compressed byte stream
     * @param variantLongList
     * @return
     */
    public static byte[] encodeVariantLongListToByteArray(List variantLongList) {
        
        try {
            ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
            ObjectOutputStream objectStream = new ObjectOutputStream(byteStream);

            objectStream.writeObject(variantLongList);
            byte[] serializedBytes = byteStream.toByteArray();

            objectStream.close();
            byteStream.close();

            return Snappy.compress(serializedBytes);
        } catch (Exception exc) {
            throw new IllegalStateException("VariantsProcessingUtils:encodeVariantLongListToByteArray: failed to encode bytes: " + exc.getMessage());
        }

    }
    
    /**
     * Method takes a Snappy compressed byte stream and decodes it into a List of Long objects
     * @param encodedByteArray
     * @return
     */
    public static List decodeByteArrayToVariantLongList(byte[] encodedByteArray) {
        try {
            encodedByteArray = Snappy.uncompress(encodedByteArray);
            ByteArrayInputStream byteStream = new ByteArrayInputStream(encodedByteArray);

            ObjectInputStream objectStream = new ObjectInputStream(byteStream);

            List variantLongList = (List)objectStream.readObject();

            objectStream.close();
            byteStream.close();

            return variantLongList;
        }
        catch(Exception exc) {
            throw new IllegalStateException("VariantsProcessingUtils:decodeByteArrayToVariantLongList: failed to decode bytes: " + exc.getMessage());           
        }

    }
    
    /**
     * @param ListOfLongs   a List or Collection of Longs to be converted into a byte[] array
     * @return  a byte[] array containing the big endian converted Longs
     */
    public static byte[] longListToByteArray(Collection ListOfLongs) {
        ByteBuffer tempBuffer = ByteBuffer.allocate(ListOfLongs.size() * 8);
        for (Long val : ListOfLongs) tempBuffer.putLong(val);
        return tempBuffer.array();
    }
    
    /**
     * @param byteArray an array of bytes converting longs
     * @return  a List of the longs encoded by the bytes
     */
    public static List byteArrayToLongList(byte[] byteArray) {
        int arraylen = byteArray.length;
        int numberOfLongs = arraylen /  8;
        List longList = new ArrayList<>(numberOfLongs);
        ByteBuffer tempBuffer = ByteBuffer.wrap(byteArray);
        for (int i = 0; i < numberOfLongs; i++) {
            longList.add(tempBuffer.getLong());
        }
        return longList;
    }

    /**
     * From an allele string, compute the hash value and search for
     * a corresponding ID in the DB alleles table.
     * 
     * @param allele
     * @param dbConn
     * @return
     */
    public static int findAlleleIDFromDB(String allele,Connection dbConn){
        int allele_id = 0;
        StringBuilder sb = new StringBuilder();
        String allele_hash = AnchorDataPHG.getChecksumForString(allele, "MD5");
        sb.append("select allele_id from alleles where allele_hash='");
        sb.append(allele_hash);
        sb.append("';");

        String query = sb.toString();
        try (ResultSet rs = dbConn.createStatement().executeQuery(query)) {

            if (rs.next()) {
                allele_id = rs.getInt("allele_id");                
            }
            return allele_id;
        } catch (Exception exc) {
            throw new IllegalStateException("findAlleleIDFromDB - allele not found in alleles tables: " + allele);
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy