All downloads are free. Search and download functionalities use the official Maven repository.

net.maizegenetics.pangenome.api.VariantUtils Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.api;

import com.google.common.collect.ImmutableMap;
import htsjdk.variant.variantcontext.*;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.snp.score.AlleleDepthUtil;
import net.maizegenetics.pangenome.api.HaplotypeNode.VariantInfo;
import net.maizegenetics.pangenome.db_loading.VariantsProcessingUtils;
import net.maizegenetics.util.Tuple;
import org.apache.log4j.Logger;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Static utilities for decoding the long-encoded variant / reference-block values stored
 * with PHG haplotypes, and for loading Variant and allele records from the PHG database.
 */
public class VariantUtils {
    private static final Logger myLogger = Logger.getLogger(VariantUtils.class);
    // Maximum fraction of the total read depth that the minority allele may carry for an
    // unambiguous genotype call (used by assignGenotpe).
    public static double maxError = 0.2;
    
    /**
     * Expands a list of long-encoded variants/reference blocks into htsjdk VariantContexts.
     *
     * @param variantList    long-encoded values; negative values encode reference blocks,
     *                       non-negative values encode variant calls
     * @param chromosome     chromosome name shared by every entry in the list
     * @param refSequence    reference genome used to look up reference-block alleles
     * @param taxon          sample name placed on each genotype
     * @param variantMapping variant id to DBVariant lookup for the encoded variant ids
     * @return one VariantContext per encoded value, in input order
     */
    public static List<VariantContext> extractOutVariantContext(List<Long> variantList, String chromosome, GenomeSequence refSequence, String taxon, Map<Integer, DBVariant> variantMapping) {
        List<VariantContext> vcs = new ArrayList<>();

        // Loop through the variantList
        for (Long variantEncoded : variantList) {
            if (variantEncoded < 0) {
                // A negative encoded value is a reference range
                vcs.add(parseRefRangeLong(variantEncoded, chromosome, refSequence, taxon));
            } else {
                vcs.add(parseVariant(variantEncoded, chromosome, taxon, variantMapping));
            }
        }

        return vcs;
    }

    /**
     * Decodes a negative long-encoded reference block into a VariantContext spanning the block.
     *
     * @param refBlockEncoded negative long holding the packed reference-block fields
     * @param chromosome      chromosome name
     * @param refSequence     reference genome used to fetch the allele at the block start
     * @param taxon           sample name for the genotype
     * @return reference-block VariantContext carrying an END attribute
     */
    private static VariantContext parseRefRangeLong(long refBlockEncoded, String chromosome, GenomeSequence refSequence, String taxon) {

        // RefBlocks are encoded like this:
        // 1 bit sign | 2 bytes 7 bits length | 1 byte DP | 4 bytes position
        // Masks must match decodeLongVariant: the position is the full low 4 bytes
        // (0xFFFF would truncate positions above 65535) and the length is 23 bits
        // (0x7FF would truncate lengths above 2047).
        int position = (int) (refBlockEncoded & 0xFFFFFFFFL);
        int depth = AlleleDepthUtil.depthByteToInt((byte) ((refBlockEncoded >> 32) & 0xFF));
        int length = (int) ((refBlockEncoded >> 40) & 0x7FFFFF);

        Allele firstRefAllele = Allele.create(refSequence.genotypeAsString(Chromosome.instance(chromosome), position), true);
        Genotype gt = new GenotypeBuilder().name(taxon).alleles(Arrays.asList(firstRefAllele)).DP(depth).make();

        VariantContextBuilder vcb = new VariantContextBuilder().chr(chromosome)
                .start(position)
                .stop(position + length)
                .attribute("END", position + length)
                .alleles(Arrays.asList(firstRefAllele))
                .genotypes(gt);

        return vcb.make();
    }

    /**
     * Decodes a non-negative long-encoded variant into a VariantContext, calling the
     * genotype by simple depth majority.
     *
     * @param variantEncoded non-negative long holding the packed variant fields
     * @param chromosome     chromosome name
     * @param taxon          sample name for the genotype
     * @param variantMapping variant id to DBVariant lookup; must contain the decoded id
     * @return VariantContext for the variant with ref/alt alleles and AD/DP annotations
     * @throws IllegalStateException if the decoded variant id is not in variantMapping
     */
    private static VariantContext parseVariant(long variantEncoded, String chromosome, String taxon, Map<Integer, DBVariant> variantMapping) {
        // Alt encoding is like this:
        // variantID 4 bytes | refDepth 1 byte | altDepth 1 byte | empty 2 bytes
        // The variant id is the full upper 4 bytes (a 0xFFFF mask would keep only 2 of them,
        // corrupting ids above 65535 - see getAllVariantIds, which uses val >> 32).
        int variantID = (int) (variantEncoded >>> 32);
        int refDepth = AlleleDepthUtil.depthByteToInt((byte) ((variantEncoded >> 24) & 0xFF));
        int altDepth = AlleleDepthUtil.depthByteToInt((byte) ((variantEncoded >> 16) & 0xFF));

        DBVariant currentVariantRecord = variantMapping.get(variantID);
        if (currentVariantRecord == null) {
            // Fail with context rather than an opaque NullPointerException below.
            throw new IllegalStateException("parseVariant: no mapping found for variant id: " + variantID);
        }

        GenotypeBuilder genotypeBuilder = new GenotypeBuilder().name(taxon).DP(refDepth + altDepth).AD(new int[]{refDepth, altDepth});

        // Simple majority call: the deeper allele wins (ties go to the reference).
        if (altDepth > refDepth) {
            genotypeBuilder.alleles(Arrays.asList(currentVariantRecord.getMyAltAllele()));
        } else {
            genotypeBuilder.alleles(Arrays.asList(currentVariantRecord.getMyRefAllele()));
        }

        VariantContextBuilder vcb = new VariantContextBuilder()
                .chr(chromosome)
                .start(currentVariantRecord.getMyStartPosition())
                .stop(currentVariantRecord.getMyStartPosition() + currentVariantRecord.getMyRefAllele().length() - 1)
                .alleles(Arrays.asList(currentVariantRecord.getMyRefAllele(), currentVariantRecord.getMyAltAllele()))
                .genotypes(genotypeBuilder.make());

        return vcb.make();
    }

    /**
     * Ensures every variant id referenced by the graph is present in Variant.masterVariantMap.
     * Ids are taken from the in-memory graph when its variant lists were loaded; otherwise
     * they are pulled from the haplotypes table of the database.
     *
     * @param hapGraph graph whose encoded variant lists supply the variant ids
     * @param dbConn   open PHG database connection used for any required lookups
     */
    public static void addGraphVariantsToVariantMap(HaplotypeGraph hapGraph, Connection dbConn) {
        Set<Integer> variantIds = getAllVariantIds(hapGraph);
        if (variantIds.isEmpty()) {
            // the variant lists were not loaded in the PHG graph - fall back to the db
            variantIds = getAllVariantIds(hapGraph, dbConn);
        }
        if (!variantIds.isEmpty()) {
            addVariantIdsToVariantMap(variantIds, dbConn);
        }
    }
    
    /**
     * Adds Variant records for the given variant ids to Variant.masterVariantMap,
     * querying the variants table for any id not already present.
     *
     * @param variantIds database variant_id values to load (each must be >= 1)
     * @param dbConn     open PHG database connection
     * @throws IllegalArgumentException if an id is invalid, an id is missing from the
     *                                  variants table, or a SQL error occurs
     */
    private static void addVariantIdsToVariantMap(Collection<Integer> variantIds, Connection dbConn) {
        Map<Integer, AlleleInfo> alleleMap = getAlleleMap(dbConn);

        // First get variant info using the connection, then fetch the alleles not in the map.
        String sqlString = "SELECT chrom, position, ref_allele_id, alt_allele_id FROM variants WHERE variant_id = ?";
        try (PreparedStatement st = dbConn.prepareStatement(sqlString)) {
            for (Integer id : variantIds) {
                // No need to query the db if the variant is already in the map.
                if (Variant.masterVariantMap.containsKey(id)) continue;
                if (id < 1) {
                    throw new IllegalArgumentException("addVariantIdsToVariantMap: Bad value for variantID: " + id);
                }
                st.setInt(1, id);

                int refAlleleId;
                int altAlleleId;
                String chrom;
                int pos;
                // try-with-resources closes the ResultSet before the allele query below is
                // issued - required by the sqlite driver, and also fixes the original's
                // leak of the ResultSet on the no-row exception path.
                try (ResultSet rs = st.executeQuery()) {
                    if (!rs.next()) {
                        throw new IllegalArgumentException("addVariantIdsToVariantMap: no values returned for variantId " + id);
                    }
                    refAlleleId = rs.getInt("ref_allele_id");
                    altAlleleId = rs.getInt("alt_allele_id");
                    chrom = rs.getString("chrom");
                    pos = rs.getInt("position");
                }

                // This method goes to the DB if the allele data is not in the map.
                Tuple<AlleleInfo, AlleleInfo> refAltAlleles = getAlleleInfoFromDB(dbConn, alleleMap, refAlleleId, altAlleleId);
                Variant.masterVariantMap.put(id, new Variant(id, chrom, pos, refAltAlleles.getX(), refAltAlleles.getY()));
            }
        } catch (SQLException e) {
            throw new IllegalArgumentException("Error querying PHG db getting variant data.", e);
        }
    }

    /**
     * Builds a map of variant_id to Variant for variants in the given chromosome region.
     *
     * @param chromosome chromosome to select; null selects all variants
     * @param start      start position (inclusive); any negative value means unbounded
     * @param end        end position (inclusive); any negative value means unbounded
     * @param database   open PHG database connection
     * @return immutable map of variant id to Variant
     * @throws IllegalArgumentException if end is less than start or a SQL error occurs
     */
    public static Map<Integer, Variant> variantIdsToVariantMap(String chromosome, int start, int end, Connection database) {

        int startPos = start < 0 ? -1 : start;
        int endPos = end < 0 ? -1 : end;

        if (endPos < startPos) {
            throw new IllegalArgumentException("VariantUtils: variantIdsToVariantMap: end: " + end + " must be greater than start: " + start);
        }

        Map<Integer, AlleleInfo> alleleMap = getAlleleMap(database);

        // First get variant info using the connection, then fetch the alleles not in the map.
        // NOTE(review): chromosome is concatenated into the SQL string. This is safe only while
        // callers pass trusted chromosome names; a PreparedStatement is avoided because the
        // sqlite db returns empty result sets for these prepared selects (see getAllVariantIds).
        StringBuilder sqlString = new StringBuilder();
        sqlString.append("SELECT variant_id, chrom, position, ref_allele_id, alt_allele_id FROM variants");
        if (chromosome != null) {
            sqlString.append(" WHERE chrom = '");
            sqlString.append(chromosome);
            sqlString.append("'");

            if (startPos > -1) {
                sqlString.append(" AND position BETWEEN ");
                sqlString.append(startPos);
                sqlString.append(" AND ");
                sqlString.append(endPos);
            }
        }
        sqlString.append(";");

        String query = sqlString.toString();
        myLogger.info("variantIdsToVariantMap: query statement: " + query);

        ImmutableMap.Builder<Integer, Variant> result = new ImmutableMap.Builder<>();

        // Close the Statement as well as the ResultSet (the original leaked the Statement).
        try (Statement stmt = database.createStatement(); ResultSet rs = stmt.executeQuery(query)) {

            while (rs.next()) {
                int variantId = rs.getInt("variant_id");
                int refAlleleId = rs.getInt("ref_allele_id");
                int altAlleleId = rs.getInt("alt_allele_id");
                String chrom = rs.getString("chrom");
                int pos = rs.getInt("position");

                // This method goes to the DB if the allele data is not in the map.
                Tuple<AlleleInfo, AlleleInfo> refAltAlleles = getAlleleInfoFromDB(database, alleleMap, refAlleleId, altAlleleId);
                result.put(variantId, new Variant(variantId, chrom, pos, refAltAlleles.getX(), refAltAlleles.getY()));
            }

        } catch (SQLException e) {
            myLogger.error(e.getMessage(), e);
            throw new IllegalArgumentException("VariantUtils: variantIdsToVariantMap: Error querying PHG db getting variant data.", e);
        }

        return result.build();
    }
    
    /**
     * Collects the distinct variant ids referenced by the byte-encoded variant lists held
     * on the graph's nodes. Negative encoded values (reference ranges) are skipped; the
     * variant id is the upper 32 bits of each non-negative encoded long.
     *
     * @param hapGraph graph whose nodes may carry byte-encoded variant lists
     * @return distinct variant ids; empty if no node has a variant list loaded
     */
    public static Set<Integer> getAllVariantIds(HaplotypeGraph hapGraph) {
        Set<Integer> variantIds = hapGraph.nodeStream()
                .map(node -> node.byteEncodedVariants())
                .filter(Objects::nonNull)
                .flatMap(encodedBytes -> VariantsProcessingUtils.decodeByteArrayToVariantLongList(encodedBytes).stream())
                .filter(encodedVariant -> encodedVariant >= 0L)
                .map(encodedVariant -> (int) (encodedVariant >> 32))
                .collect(Collectors.toSet());
        return variantIds;
    }
    
    // NOTE: for the sqlite db, using a PreparedStatement for this select returns an
    // empty result set. Consequently the select is built by concatenating the
    // (internally generated) haplotypes ids into the IN clause.
    /**
     * Reads the byte-encoded variant lists for every node in the graph directly from the
     * haplotypes table (used when the graph was built without variant lists) and returns
     * the distinct variant ids they reference.
     *
     * @param hapGraph graph supplying the haplotypes_id values to query
     * @param dbConn   open PHG database connection
     * @return distinct variant ids referenced by the graph's haplotypes
     * @throws IllegalArgumentException if a SQL error occurs
     */
    public static Set<Integer> getAllVariantIds(HaplotypeGraph hapGraph, Connection dbConn) {
        int batchSize = 5000;
        int[] ids = hapGraph.nodeStream().mapToInt(node -> node.id()).toArray();
        Set<Integer> varidSet = new HashSet<>();

        // Statement is closed via try-with-resources (the original leaked it and could
        // leak the ResultSet on an exception).
        try (Statement statement = dbConn.createStatement()) {

            // Iterating while start < ids.length avoids the original's empty final batch
            // (when ids.length is an exact multiple of batchSize, or zero), which produced
            // the invalid SQL "IN ()".
            for (int start = 0; start < ids.length; start += batchSize) {
                int end = Math.min(start + batchSize, ids.length);
                String inString = Arrays.stream(Arrays.copyOfRange(ids, start, end))
                        .mapToObj(Integer::toString)
                        .collect(Collectors.joining(","));
                String query = "SELECT variant_list FROM haplotypes WHERE haplotypes_id IN (" + inString + ")";
                try (ResultSet rs = statement.executeQuery(query)) {
                    while (rs.next()) {
                        byte[] encodedVariants = rs.getBytes("variant_list");
                        if (encodedVariants != null && encodedVariants.length > 0) {
                            List<Long> longList = VariantsProcessingUtils.decodeByteArrayToVariantLongList(encodedVariants);
                            for (Long val : longList) {
                                // negative values are reference ranges - we only want variants
                                if (val > 0) {
                                    varidSet.add((int) (val >> 32));
                                }
                            }
                        }
                    }
                }
            }
        } catch (SQLException sqle) {
            throw new IllegalArgumentException("Error processing sql statement to select variant_list from haplotypes for a specific haplotypes_id: ", sqle);
        }
        myLogger.info("getAllVariantIds: size of varidSet to return: " + varidSet.size());
        return varidSet;
    }
    
    /**
     * Decodes the variant ids from every byte-encoded variant list returned by the
     * caller-supplied query. Column 1 of the query must hold a byte-encoded variant
     * list (as stored in haplotypes.variant_list).
     *
     * @param dbConn       open PHG database connection
     * @param variantQuery SQL returning byte-encoded variant lists in column 1
     * @return distinct variant ids; reference-range entries are skipped
     * @throws IllegalArgumentException if a SQL error occurs
     */
    public static Set<Integer> getAllVariantIds(Connection dbConn, String variantQuery) {
        Set<Integer> varidSet = new HashSet<>();
        try (Statement stmt = dbConn.createStatement(); ResultSet rs = stmt.executeQuery(variantQuery)) {
            while (rs.next()) {
                for (Long val : VariantsProcessingUtils.decodeByteArrayToVariantLongList(rs.getBytes(1))) {
                    // Skip reference ranges (negative longs): for those, decodeLongVariant()[1]
                    // is the block length, not a variant id, and must not be collected
                    // (the other getAllVariantIds overloads filter them the same way).
                    if (val >= 0) {
                        varidSet.add(decodeLongVariant(val)[1]);
                    }
                }
            }
        } catch (SQLException e) {
            throw new IllegalArgumentException("Problem querying database using: " + variantQuery, e);
        }

        return varidSet;
    }
    
    /**
     * @param variant
     * @return int[] containing variant data. The content depends on the whether variant is positive or negative.
     * 

* If variant is positive then the long encodes data for a variant. * The return int[] is 1, variant id, refDepth, altDepth. *

* If the variant is negative then the long encodes data for a reference block * The return int[] is -1, block length, read depth, the block chromosomal position. */ public static int[] decodeLongVariant(Long variant) { int[] info = new int[4]; long vmlong = variant.longValue(); if (variant >= 0) { info[0] = 1; //Variant: 4 bytes= variant_mapping table id | 1 byte=refDepth | 1 byte=altDepth | 1 isIndel | 1 byte=unused vmlong >>= 16; byte altDepthByte = (byte)(vmlong & 0xFF); // to correctly handle negative numbers, load first into byte info[3] = (int) altDepthByte; //altDepth vmlong >>= 8; byte refDepthByte = (byte) (vmlong & 0xFF); info[2] = (int) refDepthByte; //refDepth vmlong >>= 8; info[1] = (int) vmlong; //variant id from database } else { info[0] = -1; //ref: 1bit=ref | 2 bytes 7 bits = refLength | 1 bytes=refDepth | 4 bytes=position on chrom vmlong ^= 1L << 63; info[3] = (int) (vmlong & 0xFFFFFFFF); vmlong >>= 32; byte refDepthByte = (byte) (vmlong & 0xFF); info[2] = (int) refDepthByte; vmlong >>= 8; info[1] = (int) vmlong; } return info; } public static Map getAlleleMap(Connection dbConn) { Map alleleInfoMap = new HashMap<>(); // This assumes 5-mers were initially loaded to the DB. Only those alleles are initially stored // in the map. All others will be queried from the DB as needed. // The 3906: initial alleles loaded to the DB will be 5-mers containing alleles A,C,G,T,N // 3906 comes from 5 + 5^2 + 5^3 + 5^4 + 5^5 = 5 + 25 + 125 + 625 + 3125 = 3905 String sql = "SELECT allele_id, allele_string, display_string, len FROM alleles where allele_id < 3906"; try (ResultSet rs = dbConn.createStatement().executeQuery(sql)) { while (rs.next()) { int id = rs.getInt(1); int len = rs.getInt("len"); if (len < 100) { // only adding alleles with length < 100. 
// If the db was created with 5-mers as the initial alleles, all will be less than 100 (ie 5 ) alleleInfoMap.put(id, new AlleleInfo(id, rs.getBytes(2), rs.getString(3), len)); } } } catch(SQLException e) { throw new IllegalArgumentException("Error executing query: " + sql, e); } return alleleInfoMap; } public static String assignGenotpe(String refAllele, String altAllele, int refDepth, int altDepth) { if (refDepth > altDepth && altDepth < maxError * (refDepth + altDepth) ) { return refAllele; } if (refDepth < altDepth && refDepth < maxError * (refDepth + altDepth) ) { return altAllele; } return VariantInfo.missing; } /** * Takes refAlleleId and altAlleleId. Looks for the corresponding data from the alleleMap. * If not found, the allele Data is queried from the DB. * * A Tuple of representing is returned. * @param dbConn * @param alleleMap * @param refAlleleId * @param altAlleleId * @return */ public static Tuple getAlleleInfoFromDB(Connection dbConn, Map alleleMap, int refAlleleId, int altAlleleId){ StringBuilder queryIds = new StringBuilder(); AlleleInfo refAlleleInfo = alleleMap.get(refAlleleId); AlleleInfo altAlleleInfo = alleleMap.get(altAlleleId); if (refAlleleInfo == null) { queryIds.append(Integer.toString(refAlleleId)); if (altAlleleInfo == null) { queryIds.append(",").append(Integer.toString(altAlleleId)); } } else if (altAlleleInfo == null) { queryIds.append(Integer.toString(altAlleleId)); } if (queryIds.length() > 0) { // query the DB for the info missing from the map StringBuilder querySB = new StringBuilder(); querySB.append("SELECT allele_id, allele_string, display_string, len FROM alleles where allele_id IN ("); querySB.append(queryIds.toString()); querySB.append(" );"); try (ResultSet rs = dbConn.createStatement().executeQuery(querySB.toString())) { while (rs.next()) { int id = rs.getInt("allele_id"); if (id == refAlleleId) { refAlleleInfo = new AlleleInfo(id, rs.getBytes("allele_string"), rs.getString("display_string"), rs.getInt("len")); if 
(refAlleleInfo.length() < 100) { // allele strings largest than 100 are probably unique so no need to add to map alleleMap.put(id, refAlleleInfo); } } if (id == altAlleleId) { altAlleleInfo = new AlleleInfo(id, rs.getBytes("allele_string"), rs.getString("display_string"), rs.getInt("len")); if (altAlleleInfo.length() < 100) { // allele strings largest than 100 are probably unique so no need to add to map alleleMap.put(id, altAlleleInfo); } } } // ResetSet is closed with try } catch(SQLException sqle) { throw new IllegalArgumentException("Error executing query: " + querySB.toString(), sqle); } } Tuple refAltAlleles = new Tuple(refAlleleInfo, altAlleleInfo); return refAltAlleles; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy