All Downloads are FREE. Search and download functionality uses the official Maven repository.

net.maizegenetics.pangenome.api.VariantUtils Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.api;

import com.google.common.collect.ImmutableMap;
import htsjdk.variant.vcf.VCFFileReader;
import net.maizegenetics.pangenome.api.HaplotypeNode.VariantInfo;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils;
import net.maizegenetics.pangenome.io.SFTPConnection;
import net.maizegenetics.util.Tuple;
import org.apache.log4j.Logger;

import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.*;

public class VariantUtils {
    private static final Logger myLogger = Logger.getLogger(VariantUtils.class);
    public static double maxError = 0.2;

    /**
     * Returns a map of a gvcf file name (NOT full path) to that file's stored
     * hash value, as recorded in the genome_file_data table.  This can be used
     * to verify that files with this name already stored to a local gvcf folder
     * have the same hash as the file referenced in the table.
     *
     * @param database open connection to the PHG database
     * @return immutable map of genome_file to file_checksum for every row whose
     *         type is {@code DBLoadingUtils.GenomeFileType.GVCF}
     * @throws IllegalStateException if the query fails; the underlying exception is attached as the cause
     */
    public static Map<String, String> gvcfFileNameToHash(Connection database) {
        String query = "SELECT  genome_file, file_checksum from genome_file_data where type=" + DBLoadingUtils.GenomeFileType.GVCF.getValue();
        ImmutableMap.Builder<String, String> result = new ImmutableMap.Builder<>();

        // The Statement is its own resource so that it is closed together with the ResultSet.
        try (Statement statement = database.createStatement();
             ResultSet rs = statement.executeQuery(query)) {

            while (rs.next()) {
                String file = rs.getString("genome_file");
                String hash = rs.getString("file_checksum");

                result.put(file, hash);
            }
        } catch (SQLException se) {
            // This helps debug when queries have a problem.  A separate cursor walks the
            // exception chain so that "se" is still available as the cause below.
            int count = 1;
            for (SQLException current = se; current != null; current = current.getNextException()) {
                myLogger.error("SQLException " + count);
                myLogger.error("Code: " + current.getErrorCode());
                myLogger.error("SqlState: " + current.getSQLState());
                myLogger.error("Error Message: " + current.getMessage());
                count++;
            }
            throw new IllegalStateException("error querying genome_file_data ", se);
        } catch (Exception exc) {
            myLogger.error(exc.getMessage(), exc);
            throw new IllegalStateException("VariantUtils: gvcfFileNameToHash: Error querying PHG db for genome_file_data:" + exc.getMessage(), exc);
        }
        return result.build();
    }

    /**
     * Returns all the gvcf file entries from the genome_file_data table.
     * These are the entries with type=2 (type=1 are assemblies).
     *
     * @param database open connection to the PHG database
     * @return immutable map of genome_file_data id to {@code genome_path + "/" + genome_file}
     *         for every row whose type is {@code DBLoadingUtils.GenomeFileType.GVCF}
     * @throws IllegalStateException if the query fails; the underlying exception is attached as the cause
     */
    public static Map<Integer, String> gvcfIdsToGvcfFileMap(Connection database) {
        String query = "SELECT id, genome_path, genome_file from genome_file_data where type=" + DBLoadingUtils.GenomeFileType.GVCF.getValue();
        ImmutableMap.Builder<Integer, String> result = new ImmutableMap.Builder<>();

        // The Statement is its own resource so that it is closed together with the ResultSet.
        try (Statement statement = database.createStatement();
             ResultSet rs = statement.executeQuery(query)) {

            while (rs.next()) {
                int gvcfId = rs.getInt("id");
                String genomePath = rs.getString("genome_path");
                String file = rs.getString("genome_file");

                // genome_path is of format:  semi-colon separated server and path, e.g. myserver.com;/path/to/gvcfs
                // so the stored value looks like myserver.com;/path/to/gvcfs/<file>
                String serverFilePath = genomePath + "/" + file;
                result.put(gvcfId, serverFilePath);
            }
        } catch (SQLException se) {
            // This helps debug when queries have a problem.  A separate cursor walks the
            // exception chain so that "se" is still available as the cause below.
            int count = 1;
            for (SQLException current = se; current != null; current = current.getNextException()) {
                myLogger.error("SQLException " + count);
                myLogger.error("Code: " + current.getErrorCode());
                myLogger.error("SqlState: " + current.getSQLState());
                myLogger.error("Error Message: " + current.getMessage());
                count++;
            }
            throw new IllegalStateException("error querying genome_file_data ", se);
        } catch (Exception exc) {
            myLogger.error(exc.getMessage(), exc);
            throw new IllegalStateException("VariantUtils: gvcfIdsToGvcfFileMap: Error querying PHG db for genome_file_data:" + exc.getMessage(), exc);
        }
        return result.build();
    }

    /**
     * Takes a String of the form {@code server;path} and returns
     * a tuple with Tuple.x=server and Tuple.y=path.  The split happens at the
     * first semi-colon; the server portion may be empty (e.g. {@code ;/path/to/file}).
     *
     * @param genomePath semi-colon separated server and path, e.g. {@code myserver.com;/path/to/gvcfs}
     * @return tuple of (server, path)
     * @throws IllegalArgumentException if genomePath is null or contains no semi-colon
     */
    public static Tuple<String, String> splitGenomePath(String genomePath) {
        // A null input is reported the same way as a string without a separator.
        int semiIndex = (genomePath == null) ? -1 : genomePath.indexOf(';');
        if (semiIndex < 0) {
            throw new IllegalArgumentException("genome path variable must be a semi-colon separated string, with the first portion indicating the server address, e.g. server;/path/to/file. Error on genomePath: " + genomePath);
        }

        String server = genomePath.substring(0, semiIndex);
        String path = genomePath.substring(semiIndex + 1);
        return new Tuple<>(server, path);
    }

//    /**
//     * @param variant
//     * @return int[] containing variant data. The content depends on whether variant is positive or negative.
//     *
//     * If variant is positive then the long encodes data for a variant.
//     * The return int[] is 1, variant id, refDepth, altDepth.
//     *
//     * If the variant is negative then the long encodes data for a reference block
//     * The return int[] is -1, block length, read depth, the block chromosomal position.
//     */
//    public static int[] decodeLongVariant(Long variant) {
//        int[] info = new int[4];
//        long vmlong = variant.longValue();
//        if (variant >= 0) {
//            info[0] = 1;
//            //Variant: 4 bytes= variant_mapping table id | 1 byte=refDepth | 1 byte=altDepth | 1 isIndel | 1 byte=unused
//            vmlong >>= 16;
//            byte altDepthByte = (byte)(vmlong & 0xFF);  // to correctly handle negative numbers, load first into byte
//            info[3] = (int) altDepthByte; //altDepth
//            vmlong >>= 8;
//            byte refDepthByte = (byte) (vmlong & 0xFF);
//            info[2] = (int) refDepthByte; //refDepth
//            vmlong >>= 8;
//            info[1] = (int) vmlong; //variant id from database
//        } else {
//            info[0] = -1;
//            //ref: 1bit=ref | 2 bytes 7 bits = refLength | 1 bytes=refDepth | 4 bytes=position on chrom
//            vmlong ^= 1L << 63;
//            info[3] = (int) (vmlong & 0xFFFFFFFF);
//            vmlong >>= 32;
//            byte refDepthByte = (byte) (vmlong & 0xFF);
//            info[2] = (int) refDepthByte;
//            vmlong >>= 8;
//            info[1] = (int) vmlong;
//        }
//
//        return info;
//    }

    /**
     * Chooses a genotype call from reference and alternate read depths.
     * The deeper allele is returned only when the shallower allele's depth is below
     * {@code maxError} times the total depth; otherwise the call is missing.
     *
     * @param refAllele allele returned when the reference depth dominates
     * @param altAllele allele returned when the alternate depth dominates
     * @param refDepth  read depth supporting the reference allele
     * @param altDepth  read depth supporting the alternate allele
     * @return refAllele, altAllele, or {@code VariantInfo.missing} when neither allele dominates
     */
    public static String assignGenotpe(String refAllele, String altAllele, int refDepth, int altDepth) {
        // Total depth is summed as a double so that very large depths cannot overflow int.
        double errorThreshold = maxError * ((double) refDepth + altDepth);

        if (refDepth > altDepth && altDepth < errorThreshold) {
            return refAllele;
        }
        if (refDepth < altDepth && refDepth < errorThreshold) {
            return altAllele;
        }
        return VariantInfo.missing;
    }

    /**
     * Takes the remote gvcf file paths as stored in the db genome_file_data table,
     * and translates them to a local path based on the user provided localGVCFFolder value.
     *
     * @param gvcfIdToRemotePath map of gvcf id to {@code server;path} string
     * @param localGVCFFolder    folder that holds (or will hold) the local copies of the gvcf files
     * @return mutable map of gvcf id to {@code localGVCFFolder + "/" + <file name>}
     * @throws IllegalArgumentException if any remote path lacks the semi-colon separator
     */
    public static Map<Integer, String> convertGVCFRemoteToLocalFiles(Map<Integer, String> gvcfIdToRemotePath, String localGVCFFolder) {
        Map<Integer, String> idToLocalFile = new HashMap<>();
        for (Map.Entry<Integer, String> entry : gvcfIdToRemotePath.entrySet()) {
            int gvcfId = entry.getKey();
            Tuple<String, String> serverPath = splitGenomePath(entry.getValue());

            // Only the file name survives; the remote directory is replaced by the local folder.
            String justName = new File(serverPath.getY()).getName();
            String localPathName = localGVCFFolder + "/" + justName;
            idToLocalFile.put(gvcfId, localPathName);
        }
        return idToLocalFile;
    }

    /**
     * Given a Map of gvcfFileIds to remote path (data from the genome_file_data table),
     * return a map of servers to the gvcf files to be downloaded from them.
     *
     * It is required that each "path" have a server indicated, even if it is local.
     * An empty server portion (e.g. {@code ;/path/to/file}) is grouped under the key "Local".
     *
     * @param gvcfIdToRemotePath map of gvcf id to {@code server;path} string
     * @return mutable map of server name to the list of remote file paths on that server
     * @throws IllegalArgumentException if any remote path lacks the semi-colon separator
     */
    public static Map<String, List<String>> getGVCFforDownload(Map<Integer, String> gvcfIdToRemotePath) {
        // This is a Map of a file server name, to a list of files to pull from that server.
        // Initially, only 1 server per db is supported, but multiple are allowed for here
        // in case that changes.
        Map<String, List<String>> serverToFileList = new HashMap<>();
        for (Map.Entry<Integer, String> entry : gvcfIdToRemotePath.entrySet()) {
            Tuple<String, String> serverPath = splitGenomePath(entry.getValue());

            // determine server name if there is one
            String server = "Local";
            if (!serverPath.getX().equals("")) {
                server = serverPath.getX();
            }

            List<String> remoteFileList = serverToFileList.get(server);
            if (remoteFileList == null) {
                remoteFileList = new ArrayList<>();
                serverToFileList.put(server, remoteFileList);
            }
            remoteFileList.add(serverPath.getY());
        }
        return serverToFileList;
    }

    /**
     * Returns both views of the gvcf file data in one call: Tuple.x is the map of
     * gvcfId to local file path, Tuple.y is the map of server name to the list of
     * remote files to pull from that server.
     * The remote path will be parsed later for downloading from servers.
     *
     * @param gvcfIdToRemotePath map of gvcf id to {@code server;path} string
     * @param LocalGVCFFolder    folder that holds (or will hold) the local copies of the gvcf files
     * @return tuple of (gvcfId to local path, server to remote file list)
     * @throws IllegalArgumentException if any remote path lacks the semi-colon separator
     */
    public static Tuple<Map<Integer, String>, Map<String, List<String>>> parseGVCFData(Map<Integer, String> gvcfIdToRemotePath, String LocalGVCFFolder) {
        // Both halves are exactly what the dedicated methods compute, so delegate rather than duplicate.
        Map<Integer, String> idToLocalFile = convertGVCFRemoteToLocalFiles(gvcfIdToRemotePath, LocalGVCFFolder);
        Map<String, List<String>> serverToFileList = getGVCFforDownload(gvcfIdToRemotePath);
        return new Tuple<>(idToLocalFile, serverToFileList);
    }

    /**
     * Downloads the given files from a server over SFTP into localPath.
     * A file that already exists locally is skipped when its MD5 checksum equals the
     * hash stored for its file name in fileNameHashValues; otherwise it is downloaded again.
     *
     * If needIndex is true, this is a gvcf and we assume there is an indexed file available
     * with the name convention of {@code <fileName>.tbi}
     *
     * @param server             server to connect to
     * @param user               user name for the connection
     * @param pwd                password for the connection
     * @param filesToDownload    remote file paths to download
     * @param localPath          local folder the files are written to
     * @param needIndex          when true, the matching .tbi file is downloaded after each file
     * @param fileNameHashValues map of file name (not full path) to the expected hash
     * @throws IllegalStateException if connecting or downloading fails; the underlying exception is attached as the cause
     */
    public static void downloadFilesFromServer(String server, String user, String pwd, List<String> filesToDownload,
                                               String localPath, boolean needIndex, Map<String, String> fileNameHashValues) {
        SFTPConnection sftpConnection = null;
        boolean transfersComplete = false;
        try {
            sftpConnection = new SFTPConnection();
            sftpConnection.createConnection(server, user, pwd, true);

            for (String file : filesToDownload) {
                File remoteFile = new File(file);
                String parentFile = remoteFile.getParent();
                String fileName = remoteFile.getName();
                String fullLocalFile = localPath + "/" + fileName;

                if (Files.exists(Paths.get(fullLocalFile))) {
                    // A file name with no stored hash cannot be verified, so it is downloaded again.
                    String storedHash = fileNameHashValues.get(fileName);
                    if (storedHash != null) {
                        String existingFileHash = DBLoadingUtils.getChecksumForFile(new File(fullLocalFile), "MD5");
                        if (storedHash.equals(existingFileHash)) {
                            myLogger.info("downloadFIlesFromServer: file " + fullLocalFile + " already exists with matching hash, skipping download.");
                            continue;
                        }
                    }
                }

                sftpConnection.downloadFile(localPath, parentFile, fileName);
                myLogger.info("downloadFilesFromServer: downloaded " + parentFile + "/" + fileName);

                if (needIndex) {
                    String indexFile = fileName + ".tbi";
                    sftpConnection.downloadFile(localPath, parentFile, indexFile);
                    myLogger.info("downloadFilesFromServer: downloaded " + parentFile + "/" + indexFile);
                }
            }

            transfersComplete = true;
            sftpConnection.close();
        } catch (Exception exc) {
            // Release the connection when the failure happened before the normal close() was reached.
            if (sftpConnection != null && !transfersComplete) {
                try {
                    sftpConnection.close();
                } catch (Exception closeExc) {
                    exc.addSuppressed(closeExc);
                }
            }
            myLogger.error("downloadFileFromServer: caught exception trying to download files from server: " + server, exc);
            throw new IllegalStateException("downloadFileFromServer, error downloading files: " + exc.getMessage(), exc);
        }
    }

    /**
     * This function takes a database connection and a local path that holds gvcf files.
     * It queries the db for GVCF file data from the genome_file_data table.
     * Then converts the paths to local paths
     * Then opens a VCFFileReader for each gvcf file, using {@code <file>.tbi} as its index.
     *
     * It returns a {@code Map<gvcfId, VCFFileReader>}, to be used by all nodes in the graph.
     * The caller is responsible for closing the readers, e.g. via
     * {@link #closeGVCFfileReaders(Map)}.
     *
     * @param database        open connection to the PHG database
     * @param localGVCFFolder folder holding the local gvcf files and their .tbi indexes
     * @return mutable map of gvcf id to an open reader on the corresponding local file
     */
    public static Map<Integer, VCFFileReader> createReadersForGVCFfiles(Connection database, String localGVCFFolder) {
        Map<Integer, String> gvcfIdToRemotePath = VariantUtils.gvcfIdsToGvcfFileMap(database);
        Map<Integer, String> gvcfIdToLocalFile = convertGVCFRemoteToLocalFiles(gvcfIdToRemotePath, localGVCFFolder);

        Map<Integer, VCFFileReader> gvcfIdToVCFReader = new HashMap<>();
        try {
            for (Map.Entry<Integer, String> entry : gvcfIdToLocalFile.entrySet()) {
                myLogger.debug("createReadersForGVCFfiles: adding reader for gvcfFileId " + entry.getKey());
                String vcfFile = entry.getValue();
                String vcfIndex = vcfFile + ".tbi";
                VCFFileReader variantReader = new VCFFileReader(new File(vcfFile), new File(vcfIndex));
                gvcfIdToVCFReader.put(entry.getKey(), variantReader);
            }
        } catch (RuntimeException exc) {
            // Do not leak the readers that were opened before the failing one.
            try {
                closeGVCFfileReaders(gvcfIdToVCFReader);
            } catch (RuntimeException closeExc) {
                exc.addSuppressed(closeExc);
            }
            throw exc;
        }
        return gvcfIdToVCFReader;
    }

    /**
     * Closes all open VCFFileReaders in the input map.  Null values are skipped.
     *
     * @param gvcfIdToVCFReader map of gvcf id to open reader
     */
    public static void closeGVCFfileReaders(Map<Integer, VCFFileReader> gvcfIdToVCFReader) {
        for (VCFFileReader reader : gvcfIdToVCFReader.values()) {
            if (reader != null) {
                reader.close();
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy