package net.maizegenetics.pangenome.db_loading;
import com.google.common.collect.*;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import kotlin.Pair;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.pangenome.hapcollapse.GVCFUtils;
import net.maizegenetics.plugindef.ParameterCache;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import org.sqlite.SQLiteConfig;
import org.xerial.snappy.Snappy;
import com.google.common.io.CharStreams;
import javax.json.Json;
import javax.json.JsonObject;
import javax.json.JsonObjectBuilder;
import javax.json.JsonReader;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.sql.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
* Common methods used by postgres and sqlite dbs for loading/retrieving
* data from the PHG dbs. This is the place where encoding/decoding methods
* for table data should be stored.
*
* Authors zrm22 and lcj34.
*/
public class DBLoadingUtils {
private static final Logger myLogger = Logger.getLogger(DBLoadingUtils.class);
public static final String REGION_REFERENCE_RANGE_GROUP = "refRegionGroup";
public static final String INTER_REGION_REFERENCE_RANGE_GROUP = "refInterRegionGroup";
// When pulling reference_ranges, the user may request just anchor, just inter-anchor, or both.
// These are now referred to as "focus" or "non-focus". This enum may be obsolete and
// thus removed in the future.
public static enum AnchorType {
INTER_ANCHOR(0),
ANCHOR(1),
BOTH(2) ;
int value;
private AnchorType(int typeValue) {
value = typeValue;
}
}
// Used to identify methods in the method table
// If adding new method types, add to the end.
// methods with type TEST_* will not be cached when running phg webKtor service
public static enum MethodType {
ANCHOR_HAPLOTYPES (1), // non-consensus, non-assembly methods loaded to the haplotypes table
ASSEMBLY_HAPLOTYPES(2), // methods used to load the assemblies to the haplotypes table
CONSENSUS_ANCHOR_SEQUENCE (3), // consensus methods loaded to the haplotypes table
EDGE(4),
READ_MAPPING(5), // read_mapping table methods
PATHS(6), // paths table methods
REF_RANGE_GROUP(7), // ref_range_groups method
TEST_ANCHOR_HAPLOTYPES (8), // anchor haplotypes testing
TEST_ASSEMBLY_HAPLOTYPES(9), // test method for assemblies
TEST_CONSENSUS_ANCHOR_SEQUENCE(10), // test method for consensus haplotypes
TEST_READ_MAPPING (11), // test method for read_mappings
TEST_PATHS (12); // test method for paths
int value;
private MethodType(int typeValue) {
value = typeValue;
}
public int getValue() {
return value;
}
}
// this enum identifies an entry in the genome_file_data
// table as either fasta or gvcf
public static enum GenomeFileType {
FASTA(1),
GVCF(2);
int value;
private GenomeFileType(int typeValue) {
value = typeValue;
}
public int getValue() {
return value;
}
}
/**
* Creates a database connection from the TASSEL ParameterCache. It is expected that only initial db loading methods
* will call this with "createNew" = true.
*
* @param createNew Indicates if the request is to connect to an existing db or to create a new one with the
* specified name.
*
* @return database connection
*/
public static Connection connection(boolean createNew) {
Optional<String> hostOpt = ParameterCache.value("host");
if (!hostOpt.isPresent()) {
throw new IllegalArgumentException("DBLoadingUtils: connection: host not defined by configuration file (ParameterCache)");
}
String host = hostOpt.get();
Optional<String> userOpt = ParameterCache.value("user");
if (!userOpt.isPresent()) {
throw new IllegalArgumentException("DBLoadingUtils: connection: user not defined by configuration file (ParameterCache)");
}
String user = userOpt.get();
Optional<String> passwordOpt = ParameterCache.value("password");
if (!passwordOpt.isPresent()) {
throw new IllegalArgumentException("DBLoadingUtils: connection: password not defined by configuration file (ParameterCache)");
}
String password = passwordOpt.get();
Optional<String> dbOpt = ParameterCache.value("DB");
if (!dbOpt.isPresent()) {
throw new IllegalArgumentException("DBLoadingUtils: connection: DB not defined by configuration file (ParameterCache)");
}
String dbName = dbOpt.get();
Optional<String> typeOpt = ParameterCache.value("DBtype");
if (!typeOpt.isPresent()) {
throw new IllegalArgumentException("DBLoadingUtils: connection: DBtype not defined by configuration file (ParameterCache)");
}
String type = typeOpt.get();
myLogger.info("first connection: dbName from config file = " + dbName +
" host: " + host + " user: " + user + " type: " + type);
return connection(host, user, password, dbName, type, createNew);
}
/**
* Creates a database connection given a properties file
* It is expected that only initial db loading methods will
* call this with "createNew" = true.
*
* @param propertiesFile properties file
* @param createNew Indicates if the request is to connect to an existing db
* or to create a new one with the specified name.
*
* @return database connection
*/
public static Connection connection(String propertiesFile, boolean createNew) {
Properties properties = new Properties();
try {
properties.load(Utils.getBufferedReader(propertiesFile));
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalArgumentException("DBLoadingUtils:connection: problem reading properties file: " + propertiesFile);
}
String host = properties.getProperty("host");
String user = properties.getProperty("user");
String password = properties.getProperty("password");
String dbName = properties.getProperty("DB");
String type = properties.getProperty("DBtype");
myLogger.info("first connection: dbName from config file = " + dbName +
" host: " + host + " user: " + user + " type: " + type);
return connection(host, user, password, dbName, type, createNew);
}
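// Illustrative usage sketch (not part of the original source): the properties/config file
// handed to connection(propertiesFile, createNew) is expected to supply the keys read above.
// The file name and values below are assumptions, shown only to document the expected keys.
//
//   # config.txt
//   host=localHost
//   user=sqlite
//   password=sqlite
//   DB=/path/to/phg.db
//   DBtype=sqlite
//
//   Connection conn = DBLoadingUtils.connection("config.txt", false); // connect to an existing db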
/**
*
* Creates a new database connection or returns a connection to an existing db.
* If createNew is FALSE and the db doesn't exist, an IllegalStateException is thrown.
*
* NOTE: for postgres, the user should never create a db whose name matches
* an existing db when lower-cased. This will cause errors as our
* db check verifies based on all-lower case.
*
* To get a camel-case db name, the db must be created and accessed using a quoted identifier.
* This is likely to cause confusion, so this code defaults to postgres all-lowercase db names.
*
* @param host hostname
* @param user user id
* @param password password
* @param dbName database name
* @param type database type (sqlite or postgres)
* @param createNew if true, delete old db if it exists; create new db from PHG schema
*
* @return database connection
*/
public static Connection connection(String host, String user, String password, String dbName, String type,
boolean createNew) {
Connection connection = null;
String url = "";
if (type.equalsIgnoreCase("sqlite")) {
connection = getSQLiteConnection( host, user, password, dbName, createNew);
} else if (type.equalsIgnoreCase("postgres")){
// template1 url used to check db existence via SELECT datname FROM pg_catalog.pg_database WHERE lower(datname) = lower('dbname');
url = "jdbc:postgresql://" + host + "/template1" ;
myLogger.info("DBLoadingUtils:connection attempting Postgres connection, url is " + url);
Connection postGresDB = getPostgresConnection( url, host, user, password, dbName, createNew) ;
return postGresDB;
} else {
throw new IllegalStateException("DBLoadingUtils:connection: DBType must be sqlite or postgres. Unsupported db type: " + type);
}
myLogger.info("Connected to database: " + url + "\n");
return connection;
}
private static Connection getSQLiteConnection( String host, String user, String password, String dbName,
boolean createNew) {
Connection connection;
try {
boolean doesDBExist= Files.exists(Paths.get(dbName));
if (!doesDBExist && !createNew) {
// Doesn't exist, and don't create a new one: Specified when user expects to be connecting
// to an existing db to retrieve data or add new.
throw new IllegalStateException("DBLoadingUtils:getSQLiteConnection: requested DB does not exist: " + dbName);
}
if (doesDBExist && createNew) {
// DB exists, new one requested. Generally just used when initially loading from scratch
// Called from LoadGenomeIntervalsToPHGdbPlugin
try {
myLogger.info("\ndeleting old db\n");
Files.delete(Paths.get(dbName));
doesDBExist = false;
} catch (Exception exc){
myLogger.error("LoadGenomeIntervalsToPHGdbPluginError when trying to delete database file: " + dbName);
myLogger.error("File delete error: " + exc.getMessage());
throw new IllegalStateException ("DBLoadingUtils: getSQLiteConnection: could not delete old SQLite db: " + dbName);
}
}
// code returns an existing db, or creates a new DB instance with the user specified name
String url = "jdbc:sqlite:" + dbName;
myLogger.info("Database URL: " + url);
Class.forName("org.sqlite.JDBC");
SQLiteConfig config = new SQLiteConfig();
connection = DriverManager.getConnection(url, config.toProperties());
connection.setAutoCommit(true); //This has massive performance effects
Statement statement = connection.createStatement();
statement.setQueryTimeout(30); // set timeout to 30 sec.
if (!doesDBExist) {
String schema = CharStreams.toString(new InputStreamReader(PHGdbAccess.class.getResourceAsStream("PHGSchema.sql")));
myLogger.info("Database does not exist, creating new with schema: " + schema);
statement.executeUpdate(schema);
}
return connection;
} catch (Exception exc) {
myLogger.error("DBLoadingUtils: getSQLiteConnection error: " + exc.getMessage());
throw new IllegalStateException ("DBLoadingUtils: getSQLiteConnection: could not get SQLite db: " + dbName);
}
}
private static Connection getPostgresConnection(String url, String host, String user, String password, String dbName,
boolean createNew) {
Connection connection;
try {
Class.forName("org.postgresql.Driver");
connection = DriverManager.getConnection(url,user,password);
Statement statement = connection.createStatement();
// Note on Postgres: If you don't put quotes around the db name when adding it,
// the db name is stored in all lower case. The names ARE case-sensitive, but to get
// mixed-case in the name you have to use quotes.
// CREATE DATABASE testTemp
// Above creates a db named testtemp
//
// If you want the db to be named testTemp, you must create it via:
// CREATE DATABASE "testTemp"
// When searching for the db, the same applies. If it was created with quotes, you need to look for it via:
// select datname from pg_catalog.pg_database where datname = 'testTemplate';
// Otherwise, look for all lower case via:
// select datname from pg_catalog.pg_database where datname = 'testtemplate';
// Dropping is the same: If you want case-sensitive name, use:
// drop database "testTemplate";
// otherwise use:
// drop database testTemplate;
// This one above will look for "testtemplate" and drop that.
//
// NOTE: In this code, I am letting all default to lower case. It seems a bad idea to require all DB access
// to quote each reference to the db. Better to assume all is lower case.
// Check for db existence
String query = "SELECT datname FROM pg_catalog.pg_database WHERE lower(datname) = '" + dbName.toLowerCase() + "'";
myLogger.info("Query: " + query);
ResultSet rs = statement.executeQuery(query);
if (rs.next()) {
// DB exists - create and return connection
String dbNameLower = dbName.toLowerCase(); // need to figure out the lower/upper case stuff.
if (createNew) {
myLogger.info("Dropping old database " + dbNameLower);
query = "DROP DATABASE " + dbNameLower;
connection.createStatement().executeUpdate(query);
} else { // user wants the existing one
url = "jdbc:postgresql://" + host + "/" + dbNameLower;
myLogger.info("Database exists, Database URL: " + url);
rs.close();
connection.close(); // closing original connection to template1 db
Connection dbConnection = DriverManager.getConnection(url, user, password); // get connection to user specified db
return dbConnection;
}
}
myLogger.info("Database does NOT exist or was deleted per request: create it");
rs.close();
if (createNew) {
// New command - create db, load the schema
String dbNameLower = dbName.toLowerCase(); // need to figure out the lower/upper case stuff.
// if you don't specify template0, it adds tables already in existence in other dbs!
// https://www.postgresql.org/docs/9.0/static/sql-createdatabase.html
query = "CREATE DATABASE " + dbName + " with template template0";
myLogger.info("Createdb query, NOTE: db will be all lowercase when created: " + query);
statement.executeUpdate(query); // if it doesn't work, an exception will be thrown
statement.close();
connection.close(); // close connection to template1 db
// read schema into db
myLogger.info("Database successfully created, now add schema " + dbName);
url = "jdbc:postgresql://" + host + "/" + dbNameLower;
Connection dbConnection = DriverManager.getConnection(url, user, password);
String schema = CharStreams.toString(new InputStreamReader(DBLoadingUtils.class.getResourceAsStream("PHGPostgreSQLSchema.sql")));
// myLogger.info("Adding schema : \n" + schema);
//dbConnection.createStatement().executeUpdate(schema);
dbConnection.createStatement().executeUpdate(schema);
return dbConnection;
} else { // old db doesn't exist, user doesn't want a new one.
throw new IllegalStateException("Database " + dbName + " does not exist, returning null");
}
} catch (Exception exc) {
myLogger.error("DBLoadingUtils:getPostgresconnection: exception thrown, " + exc.getMessage());
throw new IllegalStateException("Could not get create/retrieve database " + dbName + ", error: " + exc.getMessage());
}
}
// Method to verify anchors for genome intervals have no overlapping positions
// Overlapping intervals are not supported in the PHG
public static Set<String> verifyIntervalRanges(String anchorFile) {
Set<String> overlappingPositions = new HashSet<>(); // overlaps to be returned
RangeSet<Position> intervalRanges = TreeRangeSet.create();
// Read the anchor file, store to RangeSet, check for overlaps as you add
// Store overlapping anchors to a Set to be returned to calling method
try (BufferedReader br = Utils.getBufferedReader(anchorFile)) {
String curLine;
while ((curLine = br.readLine()) != null) {
if (curLine.toUpperCase().contains("CHROMSTART")) continue;
String[] tokens = curLine.split("\\t");
Chromosome chrom = Chromosome.instance(tokens[0]);
Range<Position> interval = Range.closedOpen(Position.of(chrom, Integer.parseInt(tokens[1])), Position.of(chrom, Integer.parseInt(tokens[2])));
if (intervalRanges.intersects(interval)) {
overlappingPositions.add(curLine);
}
intervalRanges.add(interval);
}
} catch (Exception exc) {
throw new IllegalArgumentException("DBLoadingUtils : error reading anchors file " + exc.getMessage());
}
return overlappingPositions;
}
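// Illustrative sketch (assumption, not from the original source): verifyIntervalRanges()
// expects a tab-delimited BED-like anchor file, optionally with a header line containing
// "chromStart" (the header is skipped), e.g.
//
//   chrom   chromStart  chromEnd
//   1       100         250
//   1       240         400     <- overlaps the previous interval; this line is returned in the Set
//
//   Set<String> overlaps = DBLoadingUtils.verifyIntervalRanges("anchors.bed");
//   if (!overlaps.isEmpty()) { /* reject the interval file */ }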
// This method returns a byte array containing only entries that fall within
// the specified interval range
public static byte[] encodeSelectedVCFRegionsToByteArray(String fileName, boolean onlyVariants,
boolean mergeRefRanges, Range<Position> interval) {
try {
VCFFileReader vcfReader = new VCFFileReader(new File(fileName), false);
CloseableIterator<VariantContext> vc = vcfReader.query(interval.lowerEndpoint().getChromosome().getName(),
interval.lowerEndpoint().getPosition(), interval.upperEndpoint().getPosition());
Stream<VariantContext> variantStream = vc.stream();
byte[] regionBytes = encodeVariantContextStreamToByteArray(variantStream,onlyVariants,mergeRefRanges);
vc.close();
vcfReader.close();
return regionBytes;
} catch (Exception exc) {
myLogger.error("DBLoadingUtils:encodeSelectedVCFRegionsToByteArray: exception thrown, " + exc.getMessage());
throw new IllegalStateException ("DBLoadingUtils:encodeSelectedVCFRegionsToByteArray: unable to create vcfReader for file " + fileName);
}
}
public static byte[] encodeVCFFileToByteArray(String fileName, boolean onlyVariants, boolean mergeRefRanges) {
try{
VCFFileReader vcfReader = new VCFFileReader(new File(fileName), false);
CloseableIterator<VariantContext> vcfIterator = vcfReader.iterator();
Stream<VariantContext> variantStream = vcfIterator.stream();
byte[] vcfByteArray = encodeVariantContextStreamToByteArray(variantStream,onlyVariants,mergeRefRanges);
vcfIterator.close();
vcfReader.close();
return vcfByteArray;
}
catch(Exception e) {
e.printStackTrace();
}
return null;
}
//LoadHapSequences (non-assembly) has a gvcf file to process.
public static byte[] encodeVariantContextStreamToByteArray(Stream<VariantContext> variantStream, boolean onlyVariants, boolean mergeRefRanges) throws IOException {
//Check to see if we only want to store the variants
if(onlyVariants) {
variantStream = variantStream.filter(variantContext -> checkVariant(variantContext));
}
List<VariantContext> listOfVariants = variantStream.collect(Collectors.toList());
byte[] compressedStream = encodeVariantContextListToByteArray( listOfVariants, mergeRefRanges);
return compressedStream;
}
// Assembly processing stores the List<VariantContext> directly.
public static byte[] encodeVariantContextListToByteArray(List<VariantContext> listOfVariants, boolean mergeRefRanges) throws IOException {
if(mergeRefRanges) {
listOfVariants = GVCFUtils.convertVCFToGVCF(listOfVariants);
}
ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
ObjectOutputStream objectStream = new ObjectOutputStream(byteStream);
objectStream.writeObject(listOfVariants);
byte[] serializedBytes = byteStream.toByteArray();
objectStream.close();
byteStream.close();
return Snappy.compress(serializedBytes);
}
private static boolean checkVariant(VariantContext vc) {
boolean isVariant = true;
if(!vc.isVariant()) {
isVariant = false;
}
//if we only have 2
return isVariant;
}
public static List<VariantContext> decodeByteArrayToListOfVariantContext(byte[] encodedByteArray) {
try {
encodedByteArray = Snappy.uncompress(encodedByteArray);
ByteArrayInputStream byteStream = new ByteArrayInputStream(encodedByteArray);
ObjectInputStream objectStream = new ObjectInputStream(byteStream);
List<VariantContext> vcfList = (List<VariantContext>) objectStream.readObject();
objectStream.close();
byteStream.close();
return vcfList;
}
catch(Exception e) {
e.printStackTrace();
}
return null;
}
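// Round-trip sketch (illustrative only; the gvcf path is an assumption): encode a gvcf file
// for storage in a BLOB column, then decode it back to VariantContext records.
//
//   byte[] blob = DBLoadingUtils.encodeVCFFileToByteArray("/path/to/sample.g.vcf", false, false);
//   List<VariantContext> variants = DBLoadingUtils.decodeByteArrayToListOfVariantContext(blob);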
/**
* This method takes 2 multisets of HaplotypeNode objects: one indicating inclusion counts
* for a haplotype, the other indicating exclusion counts. These sets are on a per-taxon basis.
*
* The data will be written compressed to a byte array for storage in the PHG db haplotype_counts
* table. If indicated, the data will also be written to files.
* @return Snappy-compressed byte array of (hapid, inclusion count, exclusion count) triples
*/
public static byte[] encodeHapCountsArrayFromMultiset(Multiset<HaplotypeNode> perfectHitSet,
Multiset<HaplotypeNode> exclusionHitSet) {
// Create a list of all nodes included in at least 1 of the sets.
List<HaplotypeNode> toSort = new ArrayList<>(Sets.union(perfectHitSet.elementSet(), exclusionHitSet.elementSet()));
toSort.sort(Comparator.comparingInt(HaplotypeNode::id));
ByteBuffer bBuff=ByteBuffer.allocate((Integer.SIZE/8) + (Integer.SIZE/8)*toSort.size() * 3);
bBuff.putInt(toSort.size());
// Add them in the order of a 3xN array
for (HaplotypeNode haplotypeNode : toSort) {
bBuff.putInt(haplotypeNode.id());
bBuff.putInt(perfectHitSet.count(haplotypeNode)); // returns # times elements occurs, or 0
bBuff.putInt(exclusionHitSet.count(haplotypeNode));
}
byte[] dataAsByteArray;
try {
dataAsByteArray = Snappy.compress(Arrays.copyOf(bBuff.array(), bBuff.position()));
} catch (IOException e) {
throw new IllegalStateException("Could not compress byte array:");
}
return dataAsByteArray;
}
public static byte[] encodeHapCountsArrayFromFile(String fileName) {
// For this genoid id we take all the values in the file, store them
// into an array. Then that array is compressed into a byte buffer
List<String> data = new ArrayList<>();
try (BufferedReader br = Utils.getBufferedReader(fileName)) {
String line = null;
while ((line = br.readLine()) != null) {
data.add(line);
}
} catch (IOException ioe) {
myLogger.error("DBLoadingUtils:encodeHapCOutnsArrayFromFile: error reading input file " + fileName + ", error:" + ioe.getMessage());
}
// Compress to a byte array.
// Buffer size in bytes: number of entries in the haplotype_counts file,
// times 3 as there are 3 values per line, times the number of bytes it takes
// to hold an integer (integer.SIZE/8) plus 1 additional integer to hold the array size
System.out.println("Encoded values to BB: ");
ByteBuffer bBuff=ByteBuffer.allocate((Integer.SIZE/8) + (Integer.SIZE/8)*data.size() * 3);
bBuff.putInt(data.size()); // store the datasize . Needed when decode the data
for (int idx = 0; idx < data.size(); idx++) {
String[] dataValues = data.get(idx).split("\\t");
bBuff.putInt(Integer.parseInt(dataValues[0])); // haplotype id
bBuff.putInt(Integer.parseInt(dataValues[1])); // inclusion count
bBuff.putInt(Integer.parseInt(dataValues[2])); // exclusion count
//System.out.println("dataValues0 " + dataValues[0] + ", values1: " + dataValues[1] + ", values2: " + dataValues[2]);
}
try {
return Snappy.compress(Arrays.copyOf(bBuff.array(), bBuff.position()));
} catch (IOException e) {
throw new IllegalStateException("Could not compress byte array:");
}
}
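// Illustrative input sketch (assumption, not from the original source): the counts file read
// above is expected to hold one tab-delimited line per haplotype:
//
//   <haplotype id> <inclusion count> <exclusion count>
//   e.g.  1017    42    3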
/*
* This method takes a Snappy compressed byte array from the DB.
* The byte array is decompressed and turned into a 3xN array with the "3" being
* haplotype_id, inclusion_count, exclusion_count
*/
public static int[][] decodeHapCountsArray(byte[]dataAsByteArray) {
ByteBuffer bb;
try {
bb = ByteBuffer.wrap(Snappy.uncompress(dataAsByteArray));
} catch (IOException e) {
throw new IllegalStateException("encodeHapCountsArrayFromFile: could not uncompress dataAsByteArray");
}
bb.rewind();
int bbSize = bb.getInt(); // number of hapids is first value stored in ByteBuffer
int idx = 0;
int[][] countsData2 =new int[3][bbSize];
//System.out.println("\nReading into countsData2 values: ");
while (bb.hasRemaining()) {
// Read 3 at a time into a dataline for the array
countsData2[0][idx] =bb.getInt();
countsData2[1][idx] =bb.getInt();
countsData2[2][idx]=bb.getInt();
//System.out.print(" " + countsData2[0][idx] + " " + countsData2[1][idx] + " " + countsData2[2][idx] );
idx++;
}
System.out.println("\nFinished: countsData2 length: " + countsData2.length + ", countsData2[0].size " + countsData2[1].length + ", bb.size: " + bbSize);
return countsData2;
}
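// Round-trip sketch (illustrative only; the multiset names are assumptions): the decoded
// 3xN array is indexed as [0][i] = haplotype id, [1][i] = inclusion count, [2][i] = exclusion count.
//
//   byte[] blob = DBLoadingUtils.encodeHapCountsArrayFromMultiset(hits, misses); // Multiset<HaplotypeNode>
//   int[][] counts = DBLoadingUtils.decodeHapCountsArray(blob);
//   int firstHapid = counts[0][0];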
// Method to encode a taxon's path into a compressed byte array.
// This data will be stored in the paths table as the haplotype_paths BLOB.
public static byte[] encodePathArrayFromSet(Set<HaplotypeNode> paths) {
ByteBuffer bBuff;
byte[] dataAsByteArray;
try {
myLogger.debug("encodePathARrayFromSet: Extracting the haplotypeIds");
SortedSet sortedIdSet = paths.stream()
.map(haplotypeNode -> haplotypeNode.id())
.filter(hapId -> hapId != -1)
.collect(Collector.of(TreeSet::new,
(set, hapId) -> set.add(hapId),
(leftSet, rightSet) -> {
leftSet.addAll(rightSet);
return leftSet;
}));
myLogger.debug("encodePathArrayFromSet: created the compressed path data");
bBuff=ByteBuffer.allocate((Integer.SIZE/8 + Integer.SIZE/8) *sortedIdSet.size() );
bBuff.putInt(sortedIdSet.size()); // store the size (number of hapids) as first value
for (Integer hapId : sortedIdSet) {
bBuff.putInt(hapId);
}
try {
return Snappy.compress(Arrays.copyOf(bBuff.array(), bBuff.position()));
} catch (IOException e) {
throw new IllegalStateException("Could not compress byte array:");
}
} catch (Exception exc) {
throw new IllegalStateException("DBLoadingUtils:encodePathArrayFromSet: error creating compressed data, " + exc.getMessage());
}
}
/**
* This method takes a list of haplotype ids and compresses them to a byte array.
*
* @param paths list of haplotype ids
* @return Snappy-compressed byte array of the hapids, preceded by the count
*/
public static byte[] encodePathsFromIntArray(List<Integer> paths) {
ByteBuffer bBuff;
bBuff=ByteBuffer.allocate((Integer.SIZE/8 + Integer.SIZE/8) *paths.size() );
bBuff.putInt(paths.size()); // store the size (number of hapids) as first value
for (Integer hapId : paths) {
bBuff.putInt(hapId);
}
try {
return Snappy.compress(Arrays.copyOf(bBuff.array(), bBuff.position()));
} catch (IOException e) {
throw new IllegalStateException("Could not compress byte array:");
}
}
// This method takes the "BLOB" stored for the haplotype_paths field of the paths
// table and decodes it into an int array containing the hapids that comprise the path.
public static int[] decodePathsArray(byte[]dataAsByteArray) {
ByteBuffer bb;
try {
bb = ByteBuffer.wrap(Snappy.uncompress(dataAsByteArray));
} catch (IOException e) {
throw new IllegalStateException("decodePathsArray: could not uncompress dataAsByteArray");
}
bb.rewind();
int bbSize = bb.getInt(); // number of hapids is first value stored in ByteBuffer
int idx = 0;
int[] paths =new int[bbSize];
while (bb.hasRemaining()) {
// each entry is a hapid on the path
paths[idx] =bb.getInt();
idx++;
}
return paths;
}
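// Round-trip sketch (illustrative only): a single-gamete path stored as hapids.
//
//   byte[] blob = DBLoadingUtils.encodePathsFromIntArray(Arrays.asList(101, 205, 333));
//   int[] hapids = DBLoadingUtils.decodePathsArray(blob); // {101, 205, 333}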
// Method to encode a taxon's path into a compressed byte array.
// This method allows for multiple path lists. The first int in the encoded
// array indicates how many lists are present, stored as the negative of the
// actual number. The negative value distinguishes path lists encoded by this
// method (which supports multiple path lists) from those encoded when only a
// single gamete path was supported. This is necessary for decodePathsForMultipleLists
// to understand the blob encoding.
// This data will be stored in the paths table as the haplotype_paths BLOB.
public static byte[] encodePathArrayForMultipleLists(List<List<HaplotypeNode>> paths) {
try {
myLogger.debug("encodePathArrayForMultipleLists: Extracting the haplotypeIds");
// Storing a negative number for number of lists. This is to distinguish
// new MultipleLists (supporting diploids) from the previous encoding which
// only supported a single gamete list. The decoding method will check for
// a negative number and use that to determine how to decode. This facilitates
// processing existing path "blob" data from old dbs.
int numSets = paths.size() * -1;
// Do not use Sets as the same hapid may appear on multiple lists
List<Integer> combinedLists = new ArrayList<>();
int setSize = 0;
// Loop through the Lists, adding all data to the combined List
for (List<HaplotypeNode> hapList : paths) {
List<Integer> hapIdList = hapList.stream()
.map(haplotypeNode -> haplotypeNode.id())
.filter(hapId -> hapId != -1)
.collect(Collector.of(ArrayList::new,
(set, hapId) -> set.add(hapId),
(leftSet, rightSet) -> {
leftSet.addAll(rightSet);
return leftSet;
}));
// The number of hapids per gamete may vary. Store the number in each
// set before storing the hapids themselves.
setSize = hapIdList.size();
combinedLists.add(setSize);
combinedLists.addAll(hapIdList);
myLogger.info("encodePathArrayForMultipleLists: setSize is " + setSize + ", combinedSortedSet size: " + combinedLists.size());
}
ByteBuffer bBuff=ByteBuffer.allocate((Integer.SIZE/8) + (Integer.SIZE/8) *combinedLists.size() );
bBuff.putInt(numSets); // store number of sets (handles diploids)
for (Integer hapId : combinedLists) {
bBuff.putInt(hapId);
}
try {
return Snappy.compress(bBuff.array());
} catch (IOException e) {
throw new IllegalStateException("Could not compress byte array:");
}
} catch (Exception exc) {
throw new IllegalStateException("DBLoadingUtils:encodePathArrayForMultipleLists: error creating compressed data, " + exc.getMessage());
}
}
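// Worked layout example (illustrative only): for two gamete lists with hapids {1,2,3} and {4,5},
// the ints written to the buffer before Snappy compression are:
//
//   -2  3  1  2  3  2  4  5
//   ^ numLists (negated), then size of list 1, its hapids, size of list 2, its hapids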
// This method takes the "BLOB" stored for the haplotype_paths field of the paths
// table and decodes it into a list of int arrays containing the hapids that comprise the path.
// This method decodes paths created via encodePathArrayForMultipleLists() above.
// The uncompressed paths data is of the form:
// <number of lists (negated)> <set 1 size> <hapids for set 1> <set 2 size> <hapids for set 2> ...
public static List<List<Integer>> decodePathsForMultipleLists(byte[] dataAsByteArray) {
ByteBuffer bb;
try {
bb = ByteBuffer.wrap(Snappy.uncompress(dataAsByteArray));
} catch (IOException e) {
throw new IllegalStateException("decodePathsArray: could not uncompress dataAsByteArray");
}
bb.rewind();
List<List<Integer>> hapidLists = new ArrayList<>();
int numLists = bb.getInt(); // first int is number of lists
// Check for old blob encoding: decode using previous method
if (numLists > 0) {
int[] paths = decodePathsArray(dataAsByteArray);
List<Integer> pathList = IntStream.of(paths) // returns IntStream
.boxed()
.collect(Collectors.toList());
hapidLists.add(pathList);
return hapidLists;
} else {
// lists encoded via encodePathArrayForMultipleLists use negative
// numbers for the number of lists. Flip to positive for processing below.
numLists = numLists * -1;
}
// Process the buffer, splitting data into separate lists
while (bb.hasRemaining()) {
// each entry is a hapid on the path
List<Integer> hapList = new ArrayList<>();
int listSize = bb.getInt(); // before each list is an int indicating list size
for (int idx = 0; idx < listSize; idx++) {
hapList.add(bb.getInt());
}
hapidLists.add(hapList);
}
return hapidLists;
}
public static List<String> splitCigar(String cigarString) {
//One cigar component is one or more digits followed by a letter or '='
Pattern cigarPattern = Pattern.compile("[\\d]+[a-zA-Z|=]");
ArrayList<String> cigarElems = new ArrayList<>();
Matcher matcher = cigarPattern.matcher(cigarString);
while (matcher.find()) {
cigarElems.add( matcher.group() );
}
return cigarElems;
}
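// Example (illustrative only): splitCigar("3M1D2M") returns ["3M", "1D", "2M"].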
/**
* This method creates a list of allele strings based on the allele
* set of A,C,G,T,N
*
* The size of the set will be 5 + 5^2 + 5^3 + ... + 5^n where "n" is
* maxKmerLen passed in and "5^n" is 5 to the nth power.
*
* For example: if maxKmerLen = 3, size of initial Allele list is: 5 + 25 + 125 = 155;
* if maxKmerLen = 5, size of initial Allele list is: 5 + 25 + 125 + 625 + 3125 = 3905
*
* @param maxKmerLen maximum kmer length
* @return list of all allele strings of length 1 through maxKmerLen
*/
public static List<String> createInitialAlleles(int maxKmerLen) {
String[] alleleList = {"A","C","G","T","N"};
List<String> initialAlleles = new ArrayList<>();
// Build kmers of length 1 through maxKmerLen
List<String> currentList = new ArrayList<>();
// create first list:
for (String allele : alleleList) {
currentList.add(allele);
}
initialAlleles.addAll(currentList);
// now, run a loop for the number of times we want these added up
// This goes to maxKmerLen-1 because we already added the first list above
for (int kmerIdx = 0; kmerIdx < maxKmerLen-1; kmerIdx++) {
List<String> tempList = new ArrayList<>();
tempList.addAll(currentList);
currentList.clear();
for (String allele : alleleList) {
for (int idx = 0; idx < tempList.size(); idx++) {
String newAllele = tempList.get(idx) + allele;
currentList.add(newAllele);
}
}
initialAlleles.addAll(currentList);
}
return initialAlleles;
}
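// Example (illustrative only): createInitialAlleles(2) returns 5 + 25 = 30 strings:
// the single bases A,C,G,T,N followed by all 2-mers (AA, CA, GA, TA, NA, AC, CC, ...).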
/**
* This method takes a Map of parameterName to parameterValue, and formats them into a JSON string.
* This string will be used by the calling method as the description entry
* for the PHG methods table.
* @param parameterList map of parameter name to parameter value
* @return JSON string of the parameter name/value pairs
*/
public static String formatMethodParamsToJSON(Map<String,String> parameterList) {
JsonObjectBuilder objectBuilder = Json.createObjectBuilder();
parameterList.keySet().stream().forEach ( item -> {
objectBuilder.add(item,(parameterList.get(item)==null)? "null" : parameterList.get(item));
});
JsonObject jsonObject = objectBuilder.build();
String jsonString;
try(Writer writer = new StringWriter()) {
Json.createWriter(writer).write(jsonObject);
jsonString = writer.toString();
} catch (Exception exc) {
throw new IllegalArgumentException("formatMethodParamsToJSON: could not create json string");
}
return jsonString;
}
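// Usage sketch (illustrative only; the parameter name is an assumption). Null values are
// stored as the string "null".
//
//   Map<String,String> params = new HashMap<>();
//   params.put("minTaxa", "2");
//   String json = DBLoadingUtils.formatMethodParamsToJSON(params); // {"minTaxa":"2"}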
/**
* Takes a passed method description string from a PHG db methods table entry,
* and formats the JSON key/value pairs into a Map for the user.
* If the string does not parse to JSON, a single map entry of "notes":methodDescription
* will be created and returned.
* @param methodDescription method description string from the methods table
* @return map of parameter name to parameter value
*/
public static Map<String,String> parseMethodJsonParamsToString(String methodDescription) {
Map<String,String> pluginParams = new HashMap<>();
JsonReader reader = Json.createReader(new StringReader(methodDescription));
try {
// This will throw an "Unexpected char .. at .." error if the string is not JSON
// This will happen for older dbs where we stored a string for the description.
// Catch the error, create a map with a single entry for the JSON pair
JsonObject descObject = reader.readObject();
Set<String> keys = descObject.keySet();
keys.stream().forEach( key -> {
pluginParams.put(key.toString(),descObject.get(key).toString());
});
} catch (Exception exc){
myLogger.info("parseMethodJsonParamsToString: methodDescription is not JSON, creating single map entry as notes:methodDescription for " + methodDescription);
pluginParams.put("notes",methodDescription);
}
return pluginParams;
}
/**
* This method connects to a database, finds the haplotypes for a specific gamete group,
* and creates an ordered-by-ref-range list of haplotype ids.
*
* The intended use is for path creation for Assembly and WGS input.
* @param taxon taxon name
* @param conn database connection
* @param gamete_grp_id gamete group id whose haplotypes are requested
* @return list of haplotype ids ordered by reference range (chrom, range_start)
*/
public static List<Integer> createPathNodesForGameteGrp(String taxon, Connection conn, int gamete_grp_id) {
List<Integer> hapNodes = new ArrayList<>();
StringBuilder sb = new StringBuilder();
sb.append("select haplotypes_id, haplotypes.ref_range_id, chrom, range_start, range_end from haplotypes, reference_ranges ");
sb.append(" WHERE haplotypes.ref_range_id=reference_ranges.ref_range_id ");
sb.append(" AND haplotypes.gamete_grp_id=");
sb.append(gamete_grp_id);
sb.append(" order by chrom,range_start");
String query = sb.toString();
try (ResultSet rs = conn.createStatement().executeQuery(query)) {
while (rs.next()) {
int hapid = rs.getInt("haplotypes_id");
int id = rs.getInt("ref_range_id");
String chromosome = rs.getString("chrom");
int start = rs.getInt("range_start");
int end = rs.getInt("range_end");
hapNodes.add(hapid);
}
} catch (Exception exc) {
myLogger.debug(exc.getMessage(), exc);
throw new IllegalStateException("AssemblyHaplotypesPlugin: referenceRanges: Problem querying the database: " + exc.getMessage());
}
return hapNodes;
}
// Used for creating checksum on assembly fasta files.
// For BrAPI compatibility, PHG uses MD5
public static String getChecksumForFile(File file, String protocol) {
// from https://howtodoinjava.com/java/io/sha-md5-file-checksum-hash/
try {
FileInputStream fis = new FileInputStream(file);
MessageDigest md = MessageDigest.getInstance(protocol);
//Create byte array to read data in chunks
byte[] byteArray = new byte[1024];
int bytesCount = 0;
//Read file data and update in message digest
while ((bytesCount = fis.read(byteArray)) != -1) {
md.update(byteArray, 0, bytesCount);
}
fis.close();
byte[] byteData = md.digest();
// convert the byte to hex format
StringBuffer sb = new StringBuffer();
for (int idx = 0; idx < byteData.length; idx++) {
sb.append(Integer.toString((byteData[idx] & 0xff) + 0x100, 16).substring(1));
}
return sb.toString();
} catch (Exception exc) {
myLogger.error("getChecksumForString: problem getting checksum: " + exc.getMessage());
throw new IllegalStateException("CheckSum: getChecksumForFile: error: " + exc.getMessage());
}
}
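// Usage sketch (illustrative only; the file path is an assumption). PHG uses MD5 for BrAPI compatibility.
//
//   String md5 = DBLoadingUtils.getChecksumForFile(new File("/path/to/assembly.fa"), "MD5");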
public static byte[] encodeHapidListToByteArray(List<Integer> hapidList) {
try {
ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
ObjectOutputStream objectStream = new ObjectOutputStream(byteStream);
objectStream.writeObject(hapidList);
byte[] serializedBytes = byteStream.toByteArray();
objectStream.close();
byteStream.close();
return Snappy.compress(serializedBytes);
} catch (Exception exc) {
throw new IllegalStateException("DBLoadingUtils:encodeHapidListToByteArray: failed to encode bytes: " + exc.getMessage());
}
}
public static List<Integer> decodeHapidList(byte[] encodedByteArray) {
try {
encodedByteArray = Snappy.uncompress(encodedByteArray);
ByteArrayInputStream byteStream = new ByteArrayInputStream(encodedByteArray);
ObjectInputStream objectStream = new ObjectInputStream(byteStream);
List<Integer> hapidIntList = (List<Integer>) objectStream.readObject();
objectStream.close();
byteStream.close();
return hapidIntList;
}
catch(Exception exc) {
throw new IllegalStateException("DBLoadingUtils:decodeHapidList: failed to decode bytes: " + exc.getMessage());
}
}
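// Round-trip sketch (illustrative only):
//
//   byte[] blob = DBLoadingUtils.encodeHapidListToByteArray(Arrays.asList(12, 34, 56));
//   List<Integer> hapids = DBLoadingUtils.decodeHapidList(blob); // [12, 34, 56]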
}