package net.maizegenetics.pangenome.db_loading;
import com.google.common.collect.*;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import kotlin.Pair;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.pangenome.hapcollapse.GVCFUtils;
import net.maizegenetics.plugindef.ParameterCache;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import org.sqlite.SQLiteConfig;
import org.xerial.snappy.Snappy;
import com.google.common.io.CharStreams;
import javax.json.Json;
import javax.json.JsonObject;
import javax.json.JsonObjectBuilder;
import javax.json.JsonReader;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.sql.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
* Common methods used by postgres and sqlite dbs for loading/retrieving
* data from the PHG dbs. This is the place where encoding/decoding methods
* for table data should be stored.
*
* Authors zrm22 and lcj34.
*/
public class DBLoadingUtils {
private static final Logger myLogger = Logger.getLogger(DBLoadingUtils.class);
public static final String REGION_REFERENCE_RANGE_GROUP = "refRegionGroup";
public static final String INTER_REGION_REFERENCE_RANGE_GROUP = "refInterRegionGroup";
// When pulling reference_ranges, the user may request just anchor, just inter-anchor, or both.
// These are now referred to as "focus" or "non-focus". This enum may be obsolete and
// thus removed in the future.
public static enum AnchorType {
INTER_ANCHOR(0),
ANCHOR(1),
BOTH(2) ;
int value;
private AnchorType(int typeValue) {
value = typeValue;
}
}
// Used to identify methods in the method table
// If adding new method types, add to the end.
// methods with type TEST_* will not be cached when running phg webKtor service
public static enum MethodType {
ANCHOR_HAPLOTYPES (1), // non-consensus, non-assembly methods loaded to the haplotypes table
ASSEMBLY_HAPLOTYPES(2), // methods used to load the assemblies to the haplotypes table
CONSENSUS_ANCHOR_SEQUENCE (3), // consensus methods loaded to the haplotypes table
EDGE(4),
READ_MAPPING(5), // read_mapping table methods
PATHS(6), // paths table methods
REF_RANGE_GROUP(7), // ref_range_groups method
TEST_ANCHOR_HAPLOTYPES (8), // anchor haplotypes testing
TEST_ASSEMBLY_HAPLOTYPES(9), // test method for assemblies
TEST_CONSENSUS_ANCHOR_SEQUENCE(10), // test method for consensus haplotypes
TEST_READ_MAPPING (11), // test method for read_mappings
TEST_PATHS (12); // test method for paths
int value;
private MethodType(int typeValue) {
value = typeValue;
}
public int getValue() {
return value;
}
}
// this enum identifies an entry in the genome_file_data
// table as either fasta or gvcf
public static enum GenomeFileType {
FASTA(1),
GVCF(2);
int value;
private GenomeFileType(int typeValue) {
value = typeValue;
}
public int getValue() {
return value;
}
}
/**
* Creates a database connection from the TASSEL ParameterCache. It is expected that only initial db loading methods
* will call this with "createNew" = true.
*
* @param createNew Indicates if the request is to connect to an existing db or to create a new one with the
* specified name.
*
* @return database connection
*/
public static Connection connection(boolean createNew) {
Optional<String> hostOpt = ParameterCache.value("host");
if (!hostOpt.isPresent()) {
throw new IllegalArgumentException("DBLoadingUtils: connection: host not defined by configuration file (ParameterCache)");
}
String host = hostOpt.get();
Optional<String> userOpt = ParameterCache.value("user");
if (!userOpt.isPresent()) {
throw new IllegalArgumentException("DBLoadingUtils: connection: user not defined by configuration file (ParameterCache)");
}
String user = userOpt.get();
Optional<String> passwordOpt = ParameterCache.value("password");
if (!passwordOpt.isPresent()) {
throw new IllegalArgumentException("DBLoadingUtils: connection: password not defined by configuration file (ParameterCache)");
}
String password = passwordOpt.get();
Optional<String> dbOpt = ParameterCache.value("DB");
if (!dbOpt.isPresent()) {
throw new IllegalArgumentException("DBLoadingUtils: connection: DB not defined by configuration file (ParameterCache)");
}
String dbName = dbOpt.get();
Optional<String> typeOpt = ParameterCache.value("DBtype");
if (!typeOpt.isPresent()) {
throw new IllegalArgumentException("DBLoadingUtils: connection: DBtype not defined by configuration file (ParameterCache)");
}
String type = typeOpt.get();
myLogger.info("first connection: dbName from config file = " + dbName +
" host: " + host + " user: " + user + " type: " + type);
return connection(host, user, password, dbName, type, createNew);
}
/**
* Creates a database connection given a properties file
* It is expected that only initial db loading methods will
* call this with "createNew" = true.
*
* @param propertiesFile properties file
* @param createNew Indicates if the request is to connect to an existing db
* or to create a new one with the specified name.
*
* @return database connection
*/
public static Connection connection(String propertiesFile, boolean createNew) {
Properties properties = new Properties();
try {
properties.load(Utils.getBufferedReader(propertiesFile));
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalArgumentException("DBLoadingUtils:connection: problem reading properties file: " + propertiesFile);
}
String host = properties.getProperty("host");
String user = properties.getProperty("user");
String password = properties.getProperty("password");
String dbName = properties.getProperty("DB");
String type = properties.getProperty("DBtype");
myLogger.info("first connection: dbName from config file = " + dbName +
" host: " + host + " user: " + user + " type: " + type);
return connection(host, user, password, dbName, type, createNew);
}
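// Illustrative usage sketch (not part of the original source): the properties/config file
// handed to connection(propertiesFile, createNew) is expected to supply the keys read above.
// The file name and values below are assumptions, shown only to document the expected keys.
//
//   # config.txt
//   host=localHost
//   user=sqlite
//   password=sqlite
//   DB=/path/to/phg.db
//   DBtype=sqlite
//
//   Connection conn = DBLoadingUtils.connection("config.txt", false); // connect to an existing db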
/**
*
* Creates a new database connection or returns a connection to an existing db.
* If createNew is FALSE and the db doesn't exist, an IllegalStateException is thrown.
*
* NOTE: for postgres, the user should never create a db whose name matches
* an existing db when lower-cased. This will cause errors as our
* db check verifies based on all-lower case.
*
* To get a camel-case db name, the db must be created and accessed using a quoted identifier.
* This is likely to cause confusion, so this code defaults to postgres all-lowercase db names.
*
* @param host hostname
* @param user user id
* @param password password
* @param dbName database name
* @param type database type (sqlite or postgres)
* @param createNew if true, delete old db if it exists; create new db from PHG schema
*
* @return database connection
*/
public static Connection connection(String host, String user, String password, String dbName, String type,
boolean createNew) {
Connection connection = null;
String url = "";
if (type.equalsIgnoreCase("sqlite")) {
connection = getSQLiteConnection( host, user, password, dbName, createNew);
} else if (type.equalsIgnoreCase("postgres")){
// template1 url used to check db existence via SELECT datname FROM pg_catalog.pg_database WHERE lower(datname) = lower('dbname');
url = "jdbc:postgresql://" + host + "/template1" ;
myLogger.info("DBLoadingUtils:connection attempting Postgres connection, url is " + url);
Connection postGresDB = getPostgresConnection( url, host, user, password, dbName, createNew) ;
return postGresDB;
} else {
throw new IllegalStateException("DBLoadingUtils:connection: DBType must be sqlite or postgres. Unsupported db type: " + type);
}
myLogger.info("Connected to database: " + url + "\n");
return connection;
}
private static Connection getSQLiteConnection( String host, String user, String password, String dbName,
boolean createNew) {
Connection connection;
try {
boolean doesDBExist= Files.exists(Paths.get(dbName));
if (!doesDBExist && !createNew) {
// Doesn't exist, and don't create a new one: Specified when user expects to be connecting
// to an existing db to retrieve data or add new.
throw new IllegalStateException("DBLoadingUtils:getSQLiteConnection: requested DB does not exist: " + dbName);
}
if (doesDBExist && createNew) {
// DB exists, new one requested. Generally just used when initially loading from scratch
// Called from LoadGenomeIntervalsToPHGdbPlugin
try {
myLogger.info("\ndeleting old db\n");
Files.delete(Paths.get(dbName));
doesDBExist = false;
} catch (Exception exc){
myLogger.error("LoadGenomeIntervalsToPHGdbPluginError when trying to delete database file: " + dbName);
myLogger.error("File delete error: " + exc.getMessage());
throw new IllegalStateException ("DBLoadingUtils: getSQLiteConnection: could not delete old SQLite db: " + dbName);
}
}
// code returns an existing db, or creates a new DB instance with the user specified name
String url = "jdbc:sqlite:" + dbName;
myLogger.info("Database URL: " + url);
Class.forName("org.sqlite.JDBC");
SQLiteConfig config = new SQLiteConfig();
connection = DriverManager.getConnection(url, config.toProperties());
connection.setAutoCommit(true); //This has massive performance effects
Statement statement = connection.createStatement();
statement.setQueryTimeout(30); // set timeout to 30 sec.
if (!doesDBExist) {
String schema = CharStreams.toString(new InputStreamReader(PHGdbAccess.class.getResourceAsStream("PHGSchema.sql")));
myLogger.info("Database does not exist, creating new with schema: " + schema);
statement.executeUpdate(schema);
}
return connection;
} catch (Exception exc) {
myLogger.error("DBLoadingUtils: getSQLiteConnection error: " + exc.getMessage());
throw new IllegalStateException ("DBLoadingUtils: getSQLiteConnection: could not get SQLite db: " + dbName);
}
}
private static Connection getPostgresConnection(String url, String host, String user, String password, String dbName,
boolean createNew) {
Connection connection;
try {
Class.forName("org.postgresql.Driver");
connection = DriverManager.getConnection(url,user,password);
Statement statement = connection.createStatement();
// Note on Postgres: If you don't put quotes around the db name when adding it,
// the db name is stored in all lower case. The names ARE case-sensitive, but to get
// mixed-case in the name you have to use quotes.
// CREATE DATABASE testTemp
// Above creates a db named testtemp
//
// If you want the db to be named testTemp, you must create it via:
// CREATE DATABASE "testTemp"
// When searching for the db, the same applies. If it was created with quotes, you need to look for it via:
// select datname from pg_catalog.pg_database where datname = 'testTemplate';
// Otherwise, look for all lower case via:
// select datname from pg_catalog.pg_database where datname = 'testtemplate';
// Dropping is the same: If you want case-sensitive name, use:
// drop database "testTemplate";
// otherwise use:
// drop database testTemplate;
// This one above will look for "testtemplate" and drop that.
//
// NOTE: In this code, I am letting all default to lower case. It seems a bad idea to require all DB access
// to quote each reference to the db. Better to assume all is lower case.
// Check for db existence
String query = "SELECT datname FROM pg_catalog.pg_database WHERE lower(datname) = '" + dbName.toLowerCase() + "'";
myLogger.info("Query: " + query);
ResultSet rs = statement.executeQuery(query);
if (rs.next()) {
// DB exists - create and return connection
String dbNameLower = dbName.toLowerCase(); // need to figure out the lower/upper case stuff.
if (createNew) {
myLogger.info("Dropping old database " + dbNameLower);
query = "DROP DATABASE " + dbNameLower;
connection.createStatement().executeUpdate(query);
} else { // user wants the existing one
url = "jdbc:postgresql://" + host + "/" + dbNameLower;
myLogger.info("Database exists, Database URL: " + url);
rs.close();
connection.close(); // closing original connection to template1 db
Connection dbConnection = DriverManager.getConnection(url, user, password); // get connection to user specified db
return dbConnection;
}
}
myLogger.info("Database does NOT exist or was deleted per request: create it");
rs.close();
if (createNew) {
// New command - create db, load the schema
String dbNameLower = dbName.toLowerCase(); // need to figure out the lower/upper case stuff.
// if you don't specify template0, it adds tables already in existence in other dbs!
// https://www.postgresql.org/docs/9.0/static/sql-createdatabase.html
query = "CREATE DATABASE " + dbName + " with template template0";
myLogger.info("Createdb query, NOTE: db will be all lowercase when created: " + query);
statement.executeUpdate(query); // if it doesn't work, an exception will be thrown
statement.close();
connection.close(); // close connection to template1 db
// read schema into db
myLogger.info("Database successfully created, now add schema " + dbName);
url = "jdbc:postgresql://" + host + "/" + dbNameLower;
Connection dbConnection = DriverManager.getConnection(url, user, password);
String schema = CharStreams.toString(new InputStreamReader(DBLoadingUtils.class.getResourceAsStream("PHGPostgreSQLSchema.sql")));
// myLogger.info("Adding schema : \n" + schema);
//dbConnection.createStatement().executeUpdate(schema);
dbConnection.createStatement().executeUpdate(schema);
return dbConnection;
} else { // old db doesn't exist, user doesn't want a new one.
throw new IllegalStateException("Database " + dbName + " does not exist, returning null");
}
} catch (Exception exc) {
myLogger.error("DBLoadingUtils:getPostgresconnection: exception thrown, " + exc.getMessage());
throw new IllegalStateException("Could not get create/retrieve database " + dbName + ", error: " + exc.getMessage());
}
}
// Method to verify anchors for genome intervals have no overlapping positions
// Overlapping intervals are not supported in the PHG
public static Set<String> verifyIntervalRanges(String anchorFile) {
Set<String> overlappingPositions = new HashSet<>(); // overlaps to be returned
RangeSet<Position> intervalRanges = TreeRangeSet.create();
// Read the anchor file, store to RangeSet, check for overlaps as you add
// Store overlapping anchors to a Set to be returned to calling method
try (BufferedReader br = Utils.getBufferedReader(anchorFile)) {
String curLine;
while ((curLine = br.readLine()) != null) {
if (curLine.toUpperCase().contains("CHROMSTART")) continue;
String[] tokens = curLine.split("\\t");
Chromosome chrom = Chromosome.instance(tokens[0]);
Range<Position> interval = Range.closedOpen(Position.of(chrom, Integer.parseInt(tokens[1])), Position.of(chrom, Integer.parseInt(tokens[2])));
if (intervalRanges.intersects(interval)) {
overlappingPositions.add(curLine);
}
intervalRanges.add(interval);
}
} catch (Exception exc) {
throw new IllegalArgumentException("DBLoadingUtils : error reading anchors file " + exc.getMessage());
}
return overlappingPositions;
}
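// Illustrative sketch (assumption, not from the original source): verifyIntervalRanges()
// expects a tab-delimited BED-like anchor file, optionally with a header line containing
// "chromStart" (the header is skipped), e.g.
//
//   chrom   chromStart  chromEnd
//   1       100         250
//   1       240         400     <- overlaps the previous interval; this line is returned in the Set
//
//   Set<String> overlaps = DBLoadingUtils.verifyIntervalRanges("anchors.bed");
//   if (!overlaps.isEmpty()) { /* reject the interval file */ }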
// This method returns a byte array containing only entries that fall within
// the specified interval range
public static byte[] encodeSelectedVCFRegionsToByteArray(String fileName, boolean onlyVariants,
boolean mergeRefRanges, Range<Position> interval) {
try {
VCFFileReader vcfReader = new VCFFileReader(new File(fileName), false);
CloseableIterator<VariantContext> vc = vcfReader.query(interval.lowerEndpoint().getChromosome().getName(),
interval.lowerEndpoint().getPosition(), interval.upperEndpoint().getPosition());
Stream<VariantContext> variantStream = vc.stream();
byte[] regionBytes = encodeVariantContextStreamToByteArray(variantStream,onlyVariants,mergeRefRanges);
vc.close();
vcfReader.close();
return regionBytes;
} catch (Exception exc) {
myLogger.error("DBLoadingUtils:encodeSelectedVCFRegionsToByteArray: exception thrown, " + exc.getMessage());
throw new IllegalStateException ("DBLoadingUtils:encodeSelectedVCFRegionsToByteArray: unable to create vcfReader for file " + fileName);
}
}
public static byte[] encodeVCFFileToByteArray(String fileName, boolean onlyVariants, boolean mergeRefRanges) {
try{
VCFFileReader vcfReader = new VCFFileReader(new File(fileName), false);
CloseableIterator<VariantContext> vcfIterator = vcfReader.iterator();
Stream<VariantContext> variantStream = vcfIterator.stream();
byte[] vcfByteArray = encodeVariantContextStreamToByteArray(variantStream,onlyVariants,mergeRefRanges);
vcfIterator.close();
vcfReader.close();
return vcfByteArray;
}
catch(Exception e) {
e.printStackTrace();
}
return null;
}
//LoadHapSequences (non-assembly) has a gvcf file to process.
public static byte[] encodeVariantContextStreamToByteArray(Stream<VariantContext> variantStream, boolean onlyVariants, boolean mergeRefRanges) throws IOException {
//Check to see if we only want to store the variants
if(onlyVariants) {
variantStream = variantStream.filter(variantContext -> checkVariant(variantContext));
}
List<VariantContext> listOfVariants = variantStream.collect(Collectors.toList());
byte[] compressedStream = encodeVariantContextListToByteArray( listOfVariants, mergeRefRanges);
return compressedStream;
}
// Assembly processing stores the List<VariantContext> directly.
public static byte[] encodeVariantContextListToByteArray(List<VariantContext> listOfVariants, boolean mergeRefRanges) throws IOException {
if(mergeRefRanges) {
listOfVariants = GVCFUtils.convertVCFToGVCF(listOfVariants);
}
ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
ObjectOutputStream objectStream = new ObjectOutputStream(byteStream);
objectStream.writeObject(listOfVariants);
byte[] serializedBytes = byteStream.toByteArray();
objectStream.close();
byteStream.close();
return Snappy.compress(serializedBytes);
}
private static boolean checkVariant(VariantContext vc) {
boolean isVariant = true;
if(!vc.isVariant()) {
isVariant = false;
}
//if we only have 2
return isVariant;
}
public static List<VariantContext> decodeByteArrayToListOfVariantContext(byte[] encodedByteArray) {
try {
encodedByteArray = Snappy.uncompress(encodedByteArray);
ByteArrayInputStream byteStream = new ByteArrayInputStream(encodedByteArray);
ObjectInputStream objectStream = new ObjectInputStream(byteStream);
List<VariantContext> vcfList = (List<VariantContext>) objectStream.readObject();
objectStream.close();
byteStream.close();
return vcfList;
}
catch(Exception e) {
e.printStackTrace();
}
return null;
}
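// Round-trip sketch (illustrative only; the gvcf path is an assumption): encode a gvcf file
// for storage in a BLOB column, then decode it back to VariantContext records.
//
//   byte[] blob = DBLoadingUtils.encodeVCFFileToByteArray("/path/to/sample.g.vcf", false, false);
//   List<VariantContext> variants = DBLoadingUtils.decodeByteArrayToListOfVariantContext(blob);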
/**
* This method takes 2 multisets of HaplotypeNode objects: one indicating inclusion counts
* for a haplotype, the other indicating exclusion counts. These sets are on a per-taxon basis.
*
* The data will be written compressed to a byte array for storage in the PHG db haplotype_counts
* table. If indicated, the data will also be written to files.
* @return Snappy-compressed byte array of (hapid, inclusion count, exclusion count) triples
*/
public static byte[] encodeHapCountsArrayFromMultiset(Multiset<HaplotypeNode> perfectHitSet,
Multiset<HaplotypeNode> exclusionHitSet) {
// Create a list of all nodes included in at least 1 of the sets.
List<HaplotypeNode> toSort = new ArrayList<>(Sets.union(perfectHitSet.elementSet(), exclusionHitSet.elementSet()));
toSort.sort(Comparator.comparingInt(HaplotypeNode::id));
ByteBuffer bBuff=ByteBuffer.allocate((Integer.SIZE/8) + (Integer.SIZE/8)*toSort.size() * 3);
bBuff.putInt(toSort.size());
// Add them in the order of a 3xN array
for (HaplotypeNode haplotypeNode : toSort) {
bBuff.putInt(haplotypeNode.id());
bBuff.putInt(perfectHitSet.count(haplotypeNode)); // returns # times elements occurs, or 0
bBuff.putInt(exclusionHitSet.count(haplotypeNode));
}
byte[] dataAsByteArray;
try {
dataAsByteArray = Snappy.compress(Arrays.copyOf(bBuff.array(), bBuff.position()));
} catch (IOException e) {
throw new IllegalStateException("Could not compress byte array:");
}
return dataAsByteArray;
}
public static byte[] encodeHapCountsArrayFromFile(String fileName) {
// For this genoid id we take all the values in the file, store them
// into an array. Then that array is compressed into a byte buffer
List<String> data = new ArrayList<>();
try (BufferedReader br = Utils.getBufferedReader(fileName)) {
String line = null;
while ((line = br.readLine()) != null) {
data.add(line);
}
} catch (IOException ioe) {
myLogger.error("DBLoadingUtils:encodeHapCOutnsArrayFromFile: error reading input file " + fileName + ", error:" + ioe.getMessage());
}
// Compress to a byte array.
// Buffer size in bytes: number of entries in the haplotype_counts file,
// times 3 as there are 3 values per line, times the number of bytes it takes
// to hold an integer (integer.SIZE/8) plus 1 additional integer to hold the array size
System.out.println("Encoded values to BB: ");
ByteBuffer bBuff=ByteBuffer.allocate((Integer.SIZE/8) + (Integer.SIZE/8)*data.size() * 3);
bBuff.putInt(data.size()); // store the datasize . Needed when decode the data
for (int idx = 0; idx < data.size(); idx++) {
String[] dataValues = data.get(idx).split("\\t");
bBuff.putInt(Integer.parseInt(dataValues[0])); // haplotype id
bBuff.putInt(Integer.parseInt(dataValues[1])); // inclusion count
bBuff.putInt(Integer.parseInt(dataValues[2])); // exclusion count
//System.out.println("dataValues0 " + dataValues[0] + ", values1: " + dataValues[1] + ", values2: " + dataValues[2]);
}
try {
return Snappy.compress(Arrays.copyOf(bBuff.array(), bBuff.position()));
} catch (IOException e) {
throw new IllegalStateException("Could not compress byte array:");
}
}
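// Illustrative input sketch (assumption, not from the original source): the counts file read
// above is expected to hold one tab-delimited line per haplotype:
//
//   <haplotype id> <inclusion count> <exclusion count>
//   e.g.  1017    42    3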
/*
* This method takes a Snappy compressed byte array from the DB.
* The byte array is decompressed and turned into a 3xN array with the "3" being
* haplotype_id, inclusion_count, exclusion_count
*/
public static int[][] decodeHapCountsArray(byte[]dataAsByteArray) {
ByteBuffer bb;
try {
bb = ByteBuffer.wrap(Snappy.uncompress(dataAsByteArray));
} catch (IOException e) {
throw new IllegalStateException("encodeHapCountsArrayFromFile: could not uncompress dataAsByteArray");
}
bb.rewind();
int bbSize = bb.getInt(); // number of hapids is first value stored in ByteBuffer
int idx = 0;
int[][] countsData2 =new int[3][bbSize];
//System.out.println("\nReading into countsData2 values: ");
while (bb.hasRemaining()) {
// Read 3 at a time into a dataline for the array
countsData2[0][idx] =bb.getInt();
countsData2[1][idx] =bb.getInt();
countsData2[2][idx]=bb.getInt();
//System.out.print(" " + countsData2[0][idx] + " " + countsData2[1][idx] + " " + countsData2[2][idx] );
idx++;
}
System.out.println("\nFinished: countsData2 length: " + countsData2.length + ", countsData2[0].size " + countsData2[1].length + ", bb.size: " + bbSize);
return countsData2;
}
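// Round-trip sketch (illustrative only; the multiset names are assumptions): the decoded
// 3xN array is indexed as [0][i] = haplotype id, [1][i] = inclusion count, [2][i] = exclusion count.
//
//   byte[] blob = DBLoadingUtils.encodeHapCountsArrayFromMultiset(hits, misses); // Multiset<HaplotypeNode>
//   int[][] counts = DBLoadingUtils.decodeHapCountsArray(blob);
//   int firstHapid = counts[0][0];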
// Method to encode a taxon's path into a compressed byte array.
// This data will be stored in the paths table as the haplotype_paths BLOB.
public static byte[] encodePathArrayFromSet(Set<HaplotypeNode> paths) {
ByteBuffer bBuff;
byte[] dataAsByteArray;
try {
myLogger.debug("encodePathARrayFromSet: Extracting the haplotypeIds");
SortedSet sortedIdSet = paths.stream()
.map(haplotypeNode -> haplotypeNode.id())
.filter(hapId -> hapId != -1)
.collect(Collector.of(TreeSet::new,
(set, hapId) -> set.add(hapId),
(leftSet, rightSet) -> {
leftSet.addAll(rightSet);
return leftSet;
}));
myLogger.debug("encodePathArrayFromSet: created the compressed path data");
bBuff=ByteBuffer.allocate((Integer.SIZE/8 + Integer.SIZE/8) *sortedIdSet.size() );
bBuff.putInt(sortedIdSet.size()); // store the size (number of hapids) as first value
for (Integer hapId : sortedIdSet) {
bBuff.putInt(hapId);
}
try {
return Snappy.compress(Arrays.copyOf(bBuff.array(), bBuff.position()));
} catch (IOException e) {
throw new IllegalStateException("Could not compress byte array:");
}
} catch (Exception exc) {
throw new IllegalStateException("DBLoadingUtils:encodePathArrayFromSet: error creating compressed data, " + exc.getMessage());
}
}
/**
* This method takes a list of haplotype ids and compresses them to a byte array.
*
* @param paths list of haplotype ids
* @return Snappy-compressed byte array of the hapids, preceded by the count
*/
public static byte[] encodePathsFromIntArray(List<Integer> paths) {
ByteBuffer bBuff;
bBuff=ByteBuffer.allocate((Integer.SIZE/8 + Integer.SIZE/8) *paths.size() );
bBuff.putInt(paths.size()); // store the size (number of hapids) as first value
for (Integer hapId : paths) {
bBuff.putInt(hapId);
}
try {
return Snappy.compress(Arrays.copyOf(bBuff.array(), bBuff.position()));
} catch (IOException e) {
throw new IllegalStateException("Could not compress byte array:");
}
}
// This method takes the "BLOB" stored for the haplotype_paths field of the paths
// table and decodes it into an int array containing the hapids that comprise the path.
public static int[] decodePathsArray(byte[]dataAsByteArray) {
ByteBuffer bb;
try {
bb = ByteBuffer.wrap(Snappy.uncompress(dataAsByteArray));
} catch (IOException e) {
throw new IllegalStateException("decodePathsArray: could not uncompress dataAsByteArray");
}
bb.rewind();
int bbSize = bb.getInt(); // number of hapids is first value stored in ByteBuffer
int idx = 0;
int[] paths =new int[bbSize];
while (bb.hasRemaining()) {
// each entry is a hapid on the path
paths[idx] =bb.getInt();
idx++;
}
return paths;
}
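// Round-trip sketch (illustrative only): a single-gamete path stored as hapids.
//
//   byte[] blob = DBLoadingUtils.encodePathsFromIntArray(Arrays.asList(101, 205, 333));
//   int[] hapids = DBLoadingUtils.decodePathsArray(blob); // {101, 205, 333}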
// Method to encode a taxon's path into a compressed byte array.
// This method allows for multiple path lists. The first int in the encoded
// array indicates how many lists are present, stored as the negative of the
// actual number. The negative value distinguishes path lists encoded by this
// method (which supports multiple path lists) from those encoded when only a
// single gamete path was supported. This is necessary for decodePathsForMultipleLists
// to understand the blob encoding.
// This data will be stored in the paths table as the haplotype_paths BLOB.
public static byte[] encodePathArrayForMultipleLists(List<List<HaplotypeNode>> paths) {
try {
myLogger.debug("encodePathArrayForMultipleLists: Extracting the haplotypeIds");
// Storing a negative number for number of lists. This is to distinguish
// new MultipleLists (supporting diploids) from the previous encoding which
// only supported a single gamete list. The decoding method will check for
// a negative number and use that to determine how to decode. This facilitates
// processing existing path "blob" data from old dbs.
int numSets = paths.size() * -1;
// Do not use Sets as the same hapid may appear on multiple lists
List<Integer> combinedLists = new ArrayList<>();
int setSize = 0;
// Loop through the Lists, adding all data to the combined List
for (List<HaplotypeNode> hapList : paths) {
List<Integer> hapIdList = hapList.stream()
.map(haplotypeNode -> haplotypeNode.id())
.filter(hapId -> hapId != -1)
.collect(Collector.of(ArrayList::new,
(set, hapId) -> set.add(hapId),
(leftSet, rightSet) -> {
leftSet.addAll(rightSet);
return leftSet;
}));
// The number of hapids per gamete may vary. Store the number in each
// set before storing the hapids themselves.
setSize = hapIdList.size();
combinedLists.add(setSize);
combinedLists.addAll(hapIdList);
myLogger.info("encodePathArrayForMultipleLists: setSize is " + setSize + ", combinedSortedSet size: " + combinedLists.size());
}
ByteBuffer bBuff=ByteBuffer.allocate((Integer.SIZE/8) + (Integer.SIZE/8) *combinedLists.size() );
bBuff.putInt(numSets); // store number of sets (handles diploids)
for (Integer hapId : combinedLists) {
bBuff.putInt(hapId);
}
try {
return Snappy.compress(bBuff.array());
} catch (IOException e) {
throw new IllegalStateException("Could not compress byte array:");
}
} catch (Exception exc) {
throw new IllegalStateException("DBLoadingUtils:encodePathArrayForMultipleLists: error creating compressed data, " + exc.getMessage());
}
}
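// Worked layout example (illustrative only): for two gamete lists with hapids {1,2,3} and {4,5},
// the ints written to the buffer before Snappy compression are:
//
//   -2  3  1  2  3  2  4  5
//   ^ numLists (negated), then size of list 1, its hapids, size of list 2, its hapids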
// This method takes the "BLOB" stored for the haplotype_paths field of the paths
// table and decodes it into a list of int arrays containing the hapids that comprise the path.
// This method decodes paths created via encodePathArrayForMultipleLists() above.
// The uncompressed paths data is of the form:
// <number of lists (negated)> <set 1 size> <hapids for set 1> <set 2 size> <hapids for set 2> ...
public static List<List<Integer>> decodePathsForMultipleLists(byte[] dataAsByteArray) {
ByteBuffer bb;
try {
bb = ByteBuffer.wrap(Snappy.uncompress(dataAsByteArray));
} catch (IOException e) {
throw new IllegalStateException("decodePathsArray: could not uncompress dataAsByteArray");
}
bb.rewind();
List<List<Integer>> hapidLists = new ArrayList<>();
int numLists = bb.getInt(); // first int is number of lists
// Check for old blob encoding: decode using previous method
if (numLists > 0) {
int[] paths = decodePathsArray(dataAsByteArray);
List<Integer> pathList = IntStream.of(paths) // returns IntStream
.boxed()
.collect(Collectors.toList());
hapidLists.add(pathList);
return hapidLists;
} else {
// lists encoded via encodePathArrayForMultipleLists use negative
// numbers for the number of lists. Flip to positive for processing below.
numLists = numLists * -1;
}
// Process the buffer, splitting data into separate lists
while (bb.hasRemaining()) {
// each entry is a hapid on the path
List<Integer> hapList = new ArrayList<>();
int listSize = bb.getInt(); // before each list is an int indicating list size
for (int idx = 0; idx < listSize; idx++) {
hapList.add(bb.getInt());
}
hapidLists.add(hapList);
}
return hapidLists;
}
public static List<String> splitCigar(String cigarString) {
//One cigar component is one or more digits followed by a letter or '='
Pattern cigarPattern = Pattern.compile("[\\d]+[a-zA-Z|=]");
ArrayList<String> cigarElems = new ArrayList<>();
Matcher matcher = cigarPattern.matcher(cigarString);
while (matcher.find()) {
cigarElems.add( matcher.group() );
}
return cigarElems;
}
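// Example (illustrative only): splitCigar("3M1D2M") returns ["3M", "1D", "2M"].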
/**
* This method creates a list of allele strings based on the allele
* set of A,C,G,T,N
*
* The size of the set will be 5 + 5^2 + 5^3 + ... + 5^n where "n" is
* maxKmerLen passed in and "5^n" is 5 to the nth power.
*
* For example: if maxKmerLen = 3, size of initial Allele list is: 5 + 25 + 125 = 155;
* if maxKmerLen = 5, size of initial Allele list is: 5 + 25 + 125 + 625 + 3125 = 3905
*
* @param maxKmerLen maximum kmer length
* @return list of all allele strings of length 1 through maxKmerLen
*/
public static List<String> createInitialAlleles(int maxKmerLen) {
String[] alleleList = {"A","C","G","T","N"};
List<String> initialAlleles = new ArrayList<>();
// Build kmers of length 1 through maxKmerLen
List<String> currentList = new ArrayList<>();
// create first list:
for (String allele : alleleList) {
currentList.add(allele);
}
initialAlleles.addAll(currentList);
// now, run a loop for the number of times we want these added up
// This goes to maxKmerLen-1 because we already added the first list above
for (int kmerIdx = 0; kmerIdx < maxKmerLen-1; kmerIdx++) {
List<String> tempList = new ArrayList<>();
tempList.addAll(currentList);
currentList.clear();
for (String allele : alleleList) {
for (int idx = 0; idx < tempList.size(); idx++) {
String newAllele = tempList.get(idx) + allele;
currentList.add(newAllele);
}
}
initialAlleles.addAll(currentList);
}
return initialAlleles;
}
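// Example (illustrative only): createInitialAlleles(2) returns 5 + 25 = 30 strings:
// the single bases A,C,G,T,N followed by all 2-mers (AA, CA, GA, TA, NA, AC, CC, ...).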
/**
* This method takes a Map of parameterName to parameterValue, and formats them into a JSON string.
* This string will be used by the calling method as the description entry
* for the PHG methods table.
* @param parameterList map of parameter name to parameter value
* @return JSON string of the parameter name/value pairs
*/
public static String formatMethodParamsToJSON(Map<String,String> parameterList) {
JsonObjectBuilder objectBuilder = Json.createObjectBuilder();
parameterList.keySet().stream().forEach ( item -> {
objectBuilder.add(item,(parameterList.get(item)==null)? "null" : parameterList.get(item));
});
JsonObject jsonObject = objectBuilder.build();
String jsonString;
try(Writer writer = new StringWriter()) {
Json.createWriter(writer).write(jsonObject);
jsonString = writer.toString();
} catch (Exception exc) {
throw new IllegalArgumentException("formatMethodParamsToJSON: could not create json string");
}
return jsonString;
}
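// Usage sketch (illustrative only; the parameter name is an assumption). Null values are
// stored as the string "null".
//
//   Map<String,String> params = new HashMap<>();
//   params.put("minTaxa", "2");
//   String json = DBLoadingUtils.formatMethodParamsToJSON(params); // {"minTaxa":"2"}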
/**
* Takes a passed method description string from a PHG db methods table entry,
* and formats the JSON key/value pairs into a Map for the user.
* If the string does not parse to JSON, a single map entry of "notes":methodDescription
* will be created and returned.
* @param methodDescription method description string from the methods table
* @return map of parameter name to parameter value
*/
public static Map<String,String> parseMethodJsonParamsToString(String methodDescription) {
Map<String,String> pluginParams = new HashMap<>();
JsonReader reader = Json.createReader(new StringReader(methodDescription));
try {
// This will throw an "Unexpected char .. at .." error if the string is not JSON
// This will happen for older dbs where we stored a string for the description.
// Catch the error, create a map with a single entry for the JSON pair
JsonObject descObject = reader.readObject();
Set<String> keys = descObject.keySet();
keys.stream().forEach( key -> {
pluginParams.put(key.toString(),descObject.get(key).toString());
});
} catch (Exception exc){
myLogger.info("parseMethodJsonParamsToString: methodDescription is not JSON, creating single map entry as notes:methodDescription for " + methodDescription);
pluginParams.put("notes",methodDescription);
}
return pluginParams;
}
/**
* This method connects to a database, finds the haplotypes for a specific gamete group,
* and creates an ordered-by-ref-range list of haplotype ids.
*
* The intended use is for path creation for Assembly and WGS input.
* @param taxon taxon name
* @param conn database connection
* @param gamete_grp_id gamete group id whose haplotypes are requested
* @return list of haplotype ids ordered by reference range (chrom, range_start)
*/
public static List<Integer> createPathNodesForGameteGrp(String taxon, Connection conn, int gamete_grp_id) {
List<Integer> hapNodes = new ArrayList<>();
StringBuilder sb = new StringBuilder();
sb.append("select haplotypes_id, haplotypes.ref_range_id, chrom, range_start, range_end from haplotypes, reference_ranges ");
sb.append(" WHERE haplotypes.ref_range_id=reference_ranges.ref_range_id ");
sb.append(" AND haplotypes.gamete_grp_id=");
sb.append(gamete_grp_id);
sb.append(" order by chrom,range_start");
String query = sb.toString();
try (ResultSet rs = conn.createStatement().executeQuery(query)) {
while (rs.next()) {
int hapid = rs.getInt("haplotypes_id");
int id = rs.getInt("ref_range_id");
String chromosome = rs.getString("chrom");
int start = rs.getInt("range_start");
int end = rs.getInt("range_end");
hapNodes.add(hapid);
}
} catch (Exception exc) {
myLogger.debug(exc.getMessage(), exc);
throw new IllegalStateException("AssemblyHaplotypesPlugin: referenceRanges: Problem querying the database: " + exc.getMessage());
}
return hapNodes;
}
// Used for creating checksum on assembly fasta files.
// For BrAPI compatibility, PHG uses MD5
public static String getChecksumForFile(File file, String protocol) {
// from https://howtodoinjava.com/java/io/sha-md5-file-checksum-hash/
try {
FileInputStream fis = new FileInputStream(file);
MessageDigest md = MessageDigest.getInstance(protocol);
//Create byte array to read data in chunks
byte[] byteArray = new byte[1024];
int bytesCount = 0;
//Read file data and update in message digest
while ((bytesCount = fis.read(byteArray)) != -1) {
md.update(byteArray, 0, bytesCount);
}
fis.close();
byte[] byteData = md.digest();
// convert the byte to hex format
StringBuffer sb = new StringBuffer();
for (int idx = 0; idx < byteData.length; idx++) {
sb.append(Integer.toString((byteData[idx] & 0xff) + 0x100, 16).substring(1));
}
return sb.toString();
} catch (Exception exc) {
myLogger.error("getChecksumForString: problem getting checksum: " + exc.getMessage());
throw new IllegalStateException("CheckSum: getChecksumForFile: error: " + exc.getMessage());
}
}
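// Usage sketch (illustrative only; the file path is an assumption). PHG uses MD5 for BrAPI compatibility.
//
//   String md5 = DBLoadingUtils.getChecksumForFile(new File("/path/to/assembly.fa"), "MD5");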
public static byte[] encodeHapidListToByteArray(List<Integer> hapidList) {
try {
ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
ObjectOutputStream objectStream = new ObjectOutputStream(byteStream);
objectStream.writeObject(hapidList);
byte[] serializedBytes = byteStream.toByteArray();
objectStream.close();
byteStream.close();
return Snappy.compress(serializedBytes);
} catch (Exception exc) {
throw new IllegalStateException("DBLoadingUtils:encodeHapidListToByteArray: failed to encode bytes: " + exc.getMessage());
}
}
public static List<Integer> decodeHapidList(byte[] encodedByteArray) {
try {
encodedByteArray = Snappy.uncompress(encodedByteArray);
ByteArrayInputStream byteStream = new ByteArrayInputStream(encodedByteArray);
ObjectInputStream objectStream = new ObjectInputStream(byteStream);
List<Integer> hapidIntList = (List<Integer>) objectStream.readObject();
objectStream.close();
byteStream.close();
return hapidIntList;
}
catch(Exception exc) {
throw new IllegalStateException("DBLoadingUtils:decodeHapidList: failed to decode bytes: " + exc.getMessage());
}
}
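// Round-trip sketch (illustrative only):
//
//   byte[] blob = DBLoadingUtils.encodeHapidListToByteArray(Arrays.asList(12, 34, 56));
//   List<Integer> hapids = DBLoadingUtils.decodeHapidList(blob); // [12, 34, 56]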
}