net.maizegenetics.pangenome.api.VariantUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.api;
import com.google.common.collect.ImmutableMap;
import htsjdk.variant.vcf.VCFFileReader;
import net.maizegenetics.pangenome.api.HaplotypeNode.VariantInfo;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils;
import net.maizegenetics.util.Tuple;
import org.apache.log4j.Logger;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
import net.maizegenetics.pangenome.io.SFTPConnection;
public class VariantUtils {
// log4j logger used by the static methods of this class for error reporting.
private static final Logger myLogger = Logger.getLogger(VariantUtils.class);
// Fraction of the total (ref + alt) read depth that the minor allele's depth must stay
// below for assignGenotpe to make a call. Public and non-final, so callers can change it.
public static double maxError = 0.2;
/**
 * Returns a map of gvcf file name (NOT full path) to that file's stored hash value,
 * read from the genome_file_data table for rows whose type is
 * DBLoadingUtils.GenomeFileType.GVCF. This can be used to verify that files with
 * this name already stored to a local gvcf folder have the same hash as the file
 * referenced in the table.
 *
 * @param database open connection to the PHG database; it is not closed by this method
 * @return immutable map of genome_file to file_checksum
 * @throws IllegalStateException if the query or the reading of its results fails;
 *         the underlying exception is attached as the cause
 */
public static Map<String,String> gvcfFileNameToHash(Connection database) {
    String query = "SELECT genome_file, file_checksum from genome_file_data where type=" + DBLoadingUtils.GenomeFileType.GVCF.getValue();
    ImmutableMap.Builder<String,String> result = new ImmutableMap.Builder<>();
    // Both the Statement and the ResultSet are managed here so neither is leaked.
    try (java.sql.Statement statement = database.createStatement();
         ResultSet rs = statement.executeQuery(query)) {
        while (rs.next()) {
            String file = rs.getString("genome_file");
            String hash = rs.getString("file_checksum");
            result.put(file, hash);
        }
    } catch (SQLException se) {
        // This helps debug when queries have a problem: log every exception in the chain.
        // A separate cursor is used so the original exception can still be passed on as the cause.
        int count = 1;
        for (SQLException current = se; current != null; current = current.getNextException()) {
            myLogger.error("SQLException " + count);
            myLogger.error("Code: " + current.getErrorCode());
            myLogger.error("SqlState: " + current.getSQLState());
            myLogger.error("Error Message: " + current.getMessage());
            count++;
        }
        throw new IllegalStateException("error querying genome_file_data ", se);
    } catch (Exception exc) {
        myLogger.error(exc.getMessage(), exc);
        throw new IllegalStateException("VariantUtils: gvcfFileNameToHash: Error querying PHG db for genome_file_data:" + exc.getMessage(), exc);
    }
    return result.build();
}
/**
 * Returns all the gvcf file entries from the genome_file_data table, i.e. the rows
 * whose type equals DBLoadingUtils.GenomeFileType.GVCF.
 * Each value is the row's genome_path joined to its genome_file with a "/".
 *
 * @param database open connection to the PHG database; it is not closed by this method
 * @return immutable map of genome_file_data id to "genome_path/genome_file"
 * @throws IllegalStateException if the query or the reading of its results fails;
 *         the underlying exception is attached as the cause
 */
public static Map<Integer,String> gvcfIdsToGvcfFileMap(Connection database) {
    String query = "SELECT id, genome_path, genome_file from genome_file_data where type=" + DBLoadingUtils.GenomeFileType.GVCF.getValue();
    ImmutableMap.Builder<Integer,String> result = new ImmutableMap.Builder<>();
    // Both the Statement and the ResultSet are managed here so neither is leaked.
    try (java.sql.Statement statement = database.createStatement();
         ResultSet rs = statement.executeQuery(query)) {
        while (rs.next()) {
            int gvcfId = rs.getInt("id");
            // genome_path is of format: semi-colon separated server and path, e.g. myserver.com;/path/to/gvcfs
            String genome_path = rs.getString("genome_path");
            String file = rs.getString("genome_file");
            String serverFilePath = genome_path + "/" + file;
            result.put(gvcfId, serverFilePath);
        }
    } catch (SQLException se) {
        // This helps debug when queries have a problem: log every exception in the chain.
        // A separate cursor is used so the original exception can still be passed on as the cause.
        int count = 1;
        for (SQLException current = se; current != null; current = current.getNextException()) {
            myLogger.error("SQLException " + count);
            myLogger.error("Code: " + current.getErrorCode());
            myLogger.error("SqlState: " + current.getSQLState());
            myLogger.error("Error Message: " + current.getMessage());
            count++;
        }
        throw new IllegalStateException("error querying genome_file_data ", se);
    } catch (Exception exc) {
        myLogger.error(exc.getMessage(), exc);
        throw new IllegalStateException("VariantUtils: gvcfIdsToGvcfFileMap: Error querying PHG db for genome_file_data:" + exc.getMessage(), exc);
    }
    return result.build();
}
/**
 * Takes a String of the form server;path (e.g. myserver.com;/path/to/file) and
 * returns a tuple with Tuple.x=server and Tuple.y=path. The split happens at the
 * first semi-colon only; any later semi-colons remain part of the path.
 *
 * @param genomePath semi-colon separated server and path
 * @return Tuple whose x is the text before the first semi-colon and whose y is the text after it
 * @throws IllegalArgumentException if genomePath contains no semi-colon
 */
public static Tuple<String,String> splitGenomePath(String genomePath) {
    int semiIndex = genomePath.indexOf(";");
    if (semiIndex < 0) {
        throw new IllegalArgumentException("genome path variable must be a semi-colon separated string, with the first portion indicating the server address, e.g. server;/path/to/file. Error on genomePath: " + genomePath);
    }
    String server = genomePath.substring(0, semiIndex);
    String path = genomePath.substring(semiIndex + 1);
    return new Tuple<>(server, path);
}
// /**
// * @param variant
// * @return int[] containing variant data. The content depends on the whether variant is positive or negative.
// *
// * If variant is positive then the long encodes data for a variant.
// * The return int[] is 1, variant id, refDepth, altDepth.
// *
// * If the variant is negative then the long encodes data for a reference block
// * The return int[] is -1, block length, read depth, the block chromosomal position.
// */
// public static int[] decodeLongVariant(Long variant) {
// int[] info = new int[4];
// long vmlong = variant.longValue();
// if (variant >= 0) {
// info[0] = 1;
// //Variant: 4 bytes= variant_mapping table id | 1 byte=refDepth | 1 byte=altDepth | 1 isIndel | 1 byte=unused
// vmlong >>= 16;
// byte altDepthByte = (byte)(vmlong & 0xFF); // to correctly handle negative numbers, load first into byte
// info[3] = (int) altDepthByte; //altDepth
// vmlong >>= 8;
// byte refDepthByte = (byte) (vmlong & 0xFF);
// info[2] = (int) refDepthByte; //refDepth
// vmlong >>= 8;
// info[1] = (int) vmlong; //variant id from database
// } else {
// info[0] = -1;
// //ref: 1bit=ref | 2 bytes 7 bits = refLength | 1 bytes=refDepth | 4 bytes=position on chrom
// vmlong ^= 1L << 63;
// info[3] = (int) (vmlong & 0xFFFFFFFF);
// vmlong >>= 32;
// byte refDepthByte = (byte) (vmlong & 0xFF);
// info[2] = (int) refDepthByte;
// vmlong >>= 8;
// info[1] = (int) vmlong;
// }
//
// return info;
// }
/**
 * Chooses an allele call from the reference and alternate read depths.
 * The deeper allele is returned only when the other allele's depth is strictly
 * below maxError * (refDepth + altDepth); in every other case, including equal
 * depths, VariantInfo.missing is returned.
 *
 * @param refAllele value returned when the reference allele is called
 * @param altAllele value returned when the alternate allele is called
 * @param refDepth read depth supporting the reference allele
 * @param altDepth read depth supporting the alternate allele
 * @return refAllele, altAllele, or VariantInfo.missing
 */
public static String assignGenotpe(String refAllele, String altAllele, int refDepth, int altDepth) {
    final double minorDepthLimit = maxError * (refDepth + altDepth);
    if (altDepth < refDepth) {
        // Reference is deeper: call it only if the alt depth is under the limit.
        return altDepth < minorDepthLimit ? refAllele : VariantInfo.missing;
    }
    if (refDepth < altDepth) {
        // Alternate is deeper: call it only if the ref depth is under the limit.
        return refDepth < minorDepthLimit ? altAllele : VariantInfo.missing;
    }
    return VariantInfo.missing;
}
/**
 * Takes the remote gvcf file paths as stored in the db genome_file_data table,
 * and translates them to a local path based on the user provided localGVCFFolder value.
 * Only the file name (last path element) of each remote path is kept; it is appended
 * to localGVCFFolder with a "/" separator.
 *
 * @param gvcfIdToRemotePath map of gvcf id to "server;path" string; each value must
 *        contain a semi-colon (see splitGenomePath)
 * @param localGVCFFolder folder the local file names are placed under
 * @return new mutable map of gvcf id to localGVCFFolder + "/" + file name
 * @throws IllegalArgumentException if a remote path has no semi-colon
 */
public static Map<Integer,String> convertGVCFRemoteToLocalFiles(Map<Integer,String> gvcfIdToRemotePath, String localGVCFFolder) {
    Map<Integer,String> idToLocalFile = new HashMap<>();
    for (Map.Entry<Integer,String> entry : gvcfIdToRemotePath.entrySet()) {
        int gvcfId = entry.getKey();
        Tuple<String,String> serverPath = splitGenomePath(entry.getValue());
        // populate the idToLocalFile map: strip the remote directories, keep just the file name
        String justName = new File(serverPath.getY()).getName();
        String localPathName = localGVCFFolder + "/" + justName;
        idToLocalFile.put(gvcfId, localPathName);
    }
    return idToLocalFile;
}
/**
* GIven a Map of gvcfFileIds to remote path (data from the genome_file_data table),
* return a list of servers and the gvcf files do be downloaded from them.
*
* It is required that each "path" have a server indicated, even if it is local.
* @param gvcfIdToRemotePath
* @return
*/
public static Map> getGVCFforDownload(Map gvcfIdToRemotePath) {
// This is a Map of a file server name, to a list of files to pull from that server
// Initially, we are only supported 1 server per db, but allowing for multiple here
// in case that changes.
Map> serverToFileList = new HashMap<>();
for (Map.Entry entry : gvcfIdToRemotePath.entrySet()) {
Tuple serverPath = splitGenomePath(entry.getValue());
// determine server name if there is one
String server = "Local";
if (!serverPath.getX().equals("")) {
server = serverPath.getX();
}
List remoteFileList = serverToFileList.get(server);
if (remoteFileList == null) {
remoteFileList = new ArrayList();
serverToFileList.put(server,remoteFileList);
}
remoteFileList.add(serverPath.getY());
}
return serverToFileList;
}
/**
* This needs to return a Map of gvcfId to Tuple
* The remote path will be parsed later for downloading from servers.
* @param gvcfIdToRemotePath
* @param LocalGVCFFolder
* @return
*/
public static Tuple