package net.maizegenetics.pangenome.db_loading;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeSet;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.ConvertVariantContextToVariantInfo;
import net.maizegenetics.pangenome.fastaExtraction.GVCFSequence;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import javax.swing.*;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.File;
import java.sql.Connection;
import java.util.*;
import java.util.stream.Collectors;
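/*
 * A minimal usage sketch (file paths are hypothetical; in practice this deprecated plugin
 * is normally driven through the TASSEL/PHG pipeline rather than instantiated directly):
 *
 * DataSet result = new LoadHapSequencesFromGVCFPlugin(null, false)
 *         .inputFile("/path/to/sample.g.vcf.gz")
 *         .reference("/path/to/reference.fa")
 *         .dbConfigFile("/path/to/db.config")
 *         .loadDataFile("/path/to/load_data.txt")
 *         .bedFile("/path/to/ranges.bed")
 *         .performFunction(null);
 */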
/**
 * Plugin that uploads a GVCF file to the DB. The GVCF may be filtered or unfiltered; either way its records are stored as haplotypes in the database.
 * This plugin should work with GATK/Sentieon GVCFs as well as assembly GVCFs. It currently will not work with consensus haplotypes.
 * Created by Zack Miller [email protected] June 25th, 2018
 */
@Deprecated
public class LoadHapSequencesFromGVCFPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(LoadHapSequencesFromGVCFPlugin.class);
private PluginParameter<String> myInputFile = new PluginParameter.Builder<>("inputGVCFFile", null, String.class)
.description("GVCF file to be loaded as haplotypes.")
.inFile()
.required(true)
.build();
private PluginParameter<String> myReference = new PluginParameter.Builder<>("ref", null, String.class)
.description("Input Reference Fasta")
.required(true)
.inFile()
.build();
private PluginParameter<String> dbConfigFile = new PluginParameter.Builder<>("dbConfigFile", null, String.class)
.description("File holding the DB config information")
.required(true)
.inFile()
.build();
private PluginParameter<String> loadDataFile = new PluginParameter.Builder<>("loadDataFile", null, String.class)
.description("File holding the genotype load data (line name, ploidy, haplotype number, method, etc.)")
.required(true)
.inFile()
.build();
private PluginParameter<String> bedFile = new PluginParameter.Builder<>("bedFile", null, String.class)
.description("File holding the Reference Range Information")
.required(true)
.inFile()
.build();
public LoadHapSequencesFromGVCFPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
public DataSet processData(DataSet input) {
//Steps:
//Walk through the BED file to get the list of regions we want to extract
RangeSet<Position> focusRanges = parseBedFileIntoRangeSet(bedFile());
//Open up the loadDataFile and parse out the needed fields
Map<String, String> loadDataMap = parseLoadData(loadDataFile());
Properties configProperties = new Properties();
try {
configProperties.load(Utils.getBufferedReader(dbConfigFile()));
} catch (Exception e) {
throw new IllegalStateException("Error reading the DB config file: " + dbConfigFile(), e);
}
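/*
 * The DB config file is a standard Java properties file. A minimal sketch, with key
 * names assumed from the usual PHG conventions (the values are hypothetical):
 *
 * host=localHost
 * user=sqlite
 * password=sqlite
 * DB=/path/to/phg.db
 * DBtype=sqlite
 */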
GenomeSequence gs = GVCFSequence.instance(reference(),inputFile());
myLogger.info("Begin LoadHapSequencesFromGVCFPlugin ...");
try (Connection dbConnect = DBLoadingUtils.connection(dbConfigFile(), false)) {
//Set up the objects needed when we start iterating through the BED file
PHGDataWriter phg = new PHGdbAccess(dbConnect);
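// requireIndex=false lets the reader open without checking for an index up front, but the
// query() calls below still need a tabix (.tbi) or tribble (.idx) index next to the GVCF.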
VCFFileReader gvcfFile = new VCFFileReader(new File(inputFile()),false);
Map<Integer, AnchorDataPHG> anchorSequences = new HashMap<>();
RangeMap<Position, Integer> chromAnchorStartEnd = null;
// First load method, genotype, gametes, gamete_groups tables
GenoHaploData ghd = new GenoHaploData(Integer.parseInt(loadDataMap.get("ploidy")),
Boolean.parseBoolean(loadDataMap.get("is_ref")),
loadDataMap.get("line"),
loadDataMap.get("line_data"),
Boolean.parseBoolean(loadDataMap.get("genesPhased")),
Boolean.parseBoolean(loadDataMap.get("chromsPhased")),
Integer.parseInt(loadDataMap.get("hapNumber")),
Float.parseFloat(loadDataMap.get("conf")));
phg.putGenoAndHaploTypeData(ghd);
// Load the gamete_groups and gamete_haplotypes table
String nameWithHap = loadDataMap.get("line") + "_" + loadDataMap.get("hapNumber"); // ref line is always hap1
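// e.g. line "B73" with hapNumber 1 yields the gamete name "B73_1"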
List<String> gameteGroupList = new ArrayList<>();
gameteGroupList.add(nameWithHap);
phg.putGameteGroupAndHaplotypes(gameteGroupList);
// Put the method data - identifies for each haplotype how the anchors were created
// anchor and inter-anchor are both method_type "anchor_haplotypes"
Map<String, String> pluginParams = pluginParameters();
pluginParams.put("notes",loadDataMap.get("method_details"));
int method_id = phg.putMethod(loadDataMap.get("method"), DBLoadingUtils.MethodType.ANCHOR_HAPLOTYPES,pluginParams);
// Add info to group tables
int gamete_grp_id = phg.getGameteGroupIDFromTaxaList(gameteGroupList);
//For each range in the bed file, process
String currentChromosome = "";
String prevChromosome = "";
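// Haplotypes are accumulated in anchorSequences one chromosome at a time; when the BED
// ranges cross into a new chromosome, the previous chromosome's data is flushed to the DB.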
for (Range<Position> currentRange : focusRanges.asRanges()) {
if (currentRange.lowerEndpoint().getChromosome() == null || !currentRange.lowerEndpoint().getChromosome().getName().equals(currentChromosome)) {
chromAnchorStartEnd = phg.getIntervalRangesWithIDForChrom(currentRange.lowerEndpoint().getChromosome().getName());
prevChromosome = currentChromosome;
currentChromosome = chromAnchorStartEnd.asMapOfRanges()
.keySet()
.iterator()
.next()
.lowerEndpoint()
.getChromosome()
.getName();
if (anchorSequences.size() > 0) { // load previous chromosome's data
myLogger.info("\ncalling putHaplotypes for chrom " + prevChromosome);
phg.putHaplotypesData(gamete_grp_id, method_id, anchorSequences, prevChromosome);
anchorSequences.clear(); // these have been processed, clear the map, start fresh with next chrom
}
}
//Extract out the List<VariantContext> of GVCF records overlapping this range
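//Note: VCFFileReader.query expects 1-based, inclusive coordinates, which is what Position stores.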
List<VariantContext> gvcfContexts = gvcfFile.query(currentRange.lowerEndpoint().getChromosome().getName(),
currentRange.lowerEndpoint().getPosition(),
currentRange.upperEndpoint().getPosition())
.stream()
.collect(Collectors.toList());
//Extract out the fasta sequence:
// This looks wrong - shouldn't it be coordinates of the hap-line, using hap fasta?
String sequence = gs.genotypeAsString(currentRange.lowerEndpoint().getChromosome(),
currentRange.lowerEndpoint().getPosition(),
currentRange.upperEndpoint().getPosition());
//Add the sequences and the encoded GVCF to the datastructure
Map.Entry<Range<Position>, Integer> mapEntry
= chromAnchorStartEnd.getEntry(currentRange.lowerEndpoint());
int anchorId = mapEntry.getValue();
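// AnchorDataPHG bundles the data stored per reference range: the range itself, the source
// GVCF path, the encoded variant list, and the haplotype sequence; the null arguments are
// fields this plugin does not populate.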
AnchorDataPHG aData = new AnchorDataPHG(currentRange, null, inputFile(), null,
ConvertVariantContextToVariantInfo.convertVCListToVariantInfoList(gvcfContexts, loadDataMap.get("line"), Integer.parseInt(loadDataMap.get("hapNumber"))),
sequence);
anchorSequences.put(anchorId, aData);
}
if (anchorSequences.size() > 0) {
// load the data on per-chrom basis
myLogger.info("\ncalling putHaplotypes for chrom " + currentChromosome);
phg.putHaplotypesData(gamete_grp_id, method_id, anchorSequences, currentChromosome);
}
((PHGdbAccess)phg).close();
} catch (Exception exc) {
myLogger.debug(exc.getMessage(), exc);
throw new IllegalStateException("Error writing haplotypes to the DB: ", exc);
}
return null;
}
/**
 * Simple method to parse a BED file into a RangeSet of Positions for use in the DB uploading.
 * @param bedFile path to the BED file of reference ranges
 * @return the set of 1-based, inclusive ranges to load
 */
private RangeSet<Position> parseBedFileIntoRangeSet(String bedFile) {
RangeSet<Position> focusRanges = TreeRangeSet.create();
try(BufferedReader reader = Utils.getBufferedReader(bedFile)) {
String currentLine = "";
while((currentLine = reader.readLine()) != null) {
//Split the bed file by tabs
String[] bedRecordSplit = currentLine.split("\t");
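// BED coordinates are 0-based and half-open; Position is 1-based and inclusive,
// so shift the start by +1 and keep the end as-is.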
Position startPos = Position.of(bedRecordSplit[0], Integer.parseInt(bedRecordSplit[1]) + 1);
Position endPos = Position.of(bedRecordSplit[0], Integer.parseInt(bedRecordSplit[2]));
focusRanges.add(Range.closed(startPos, endPos));
}
}
catch (Exception exc) {
myLogger.debug("Error loading the BED file", exc);
throw new IllegalStateException("Error loading the BED file: ", exc);
}
return focusRanges;
}
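/*
 * A sketch of the expected load-data file: tab-delimited, one header line plus one data
 * line (shown here with spaces for readability; the values are hypothetical):
 *
 * Genotype  Hapnumber  Dataline   Ploidy  Reference  GenesPhased  ChromsPhased  Confidence  Method     MethodDetails
 * B73       1          gvcf_load  1       false      true         true          1           GATK_GVCF  haplotypes from a GATK gvcf
 */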
/**
 * Method to parse and verify the load data file input to this plugin.
 * @param loadDataFile path to the tab-delimited genotype metadata file
 * @return a map from field name (e.g. "line", "ploidy", "method") to its value
 */
private Map<String, String> parseLoadData(String loadDataFile) {
myLogger.info("parseLoadData: reading genomeDataFile: " + loadDataFile);
Map<String, String> loadDataMap = new HashMap<>();
try (BufferedReader br = Utils.getBufferedReader(loadDataFile)) {
// parse the header line to locate the required columns
String headers = br.readLine(); // read the header line
int lineIndex = -1;
int lineDataIndex = -1;
int ploidyIndex = -1;
int hapNumberIndex = -1;
int isRefIndex = -1;
int genesPhasedIndex = -1;
int chromPhasedIndex = -1;
int confIndex = -1;
int methodIndex = -1;
int methodDetailsIndex = -1;
int idx = 0;
myLogger.info("GenomeFile header line: " + headers);
for (String header : headers.split("\\t")) {
if (header.equalsIgnoreCase("Genotype")) {
lineIndex = idx;
} else if (header.equalsIgnoreCase("Hapnumber")) {
hapNumberIndex = idx;
} else if (header.equalsIgnoreCase("Dataline")) {
lineDataIndex = idx;
} else if (header.equalsIgnoreCase("ploidy")) {
ploidyIndex = idx;
} else if (header.equalsIgnoreCase("reference")) {
isRefIndex = idx;
} else if (header.equalsIgnoreCase("genesPhased")) {
genesPhasedIndex = idx;
} else if (header.equalsIgnoreCase("chromsPhased")) {
chromPhasedIndex = idx;
} else if (header.equalsIgnoreCase("confidence")) {
confIndex = idx;
} else if (header.equalsIgnoreCase("Method")) {
methodIndex = idx;
} else if (header.equalsIgnoreCase("MethodDetails")) {
methodDetailsIndex = idx;
}
idx++;
}
if (lineIndex == -1 || lineDataIndex == -1 || ploidyIndex == -1 ||
hapNumberIndex == -1 || isRefIndex == -1 || genesPhasedIndex == -1 ||
chromPhasedIndex == -1 || confIndex == -1 || methodIndex == -1 ||
methodDetailsIndex == -1 ) {
myLogger.error("ERROR - Genotype datafile does not contain the required 10 fields");
myLogger.error("Please check your file for the tab delimited, case-insensistive headers: ");
myLogger.error(" Genotype Hapnumber Dataline Ploidy Reference GenePhased ChromPhased Confidence Method MethodDetails RefVersion");
throw new IllegalArgumentException("Wrong number of header columns in genome data file");
}
// All headers are present - check there is data for the required columns
String dataLine = br.readLine();
String[] dataTokens = dataLine.split("\\t");
if (dataTokens.length != 10) {
myLogger.error("ERROR - wrong number of data items in genotype datafile, expecting 10, found " + dataTokens.length);
throw new IllegalArgumentException("ERROR - wrong number of data items in genotype datafile, expecting 10, found " + dataTokens.length);
}
loadDataMap.put("line",dataTokens[lineIndex]);
loadDataMap.put("line_data", dataTokens[lineDataIndex]);
loadDataMap.put("ploidy",dataTokens[ploidyIndex]);
loadDataMap.put("hapNumber",dataTokens[hapNumberIndex]);
loadDataMap.put("is_ref", (dataTokens[isRefIndex].equalsIgnoreCase("true")) ? "true" : "false");
loadDataMap.put("genesPhased",(dataTokens[genesPhasedIndex].equalsIgnoreCase("true")) ? "true" : "false");
loadDataMap.put("chromsPhased", (dataTokens[chromPhasedIndex].equalsIgnoreCase("true")) ? "true" : "false");
loadDataMap.put("conf", dataTokens[confIndex]);
loadDataMap.put("method", dataTokens[methodIndex]);
loadDataMap.put("method_details", dataTokens[methodDetailsIndex]);
} catch (Exception exc) {
myLogger.error("ERROR parsing fields in genomeData file");
myLogger.error(" Expecting a tab-delimited file with these columns: " );
myLogger.error(" Genotype/Hapnumber/Dataline/ploidy/reference/genePhased/chromPhased/confidence/Method/MethodDetails/RefVersion");
myLogger.error("Please fix the file and try again - aborting the run");
throw new IllegalArgumentException("ERROR parsing genomeData file");
}
return loadDataMap;
}
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return "Load GVCF file to DB";
}
@Override
public String getToolTipText() {
return "Load GVCF file to DB";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(LoadHapSequencesFromGVCFPlugin.class);
// }
/**
* Convenience method to run plugin with one return object.
*/
// TODO: Replace <Type> with specific type.
// public <Type> runPlugin(DataSet input) {
// return (<Type>) performFunction(input).getData(0).getData();
// }
/**
* GVCF file to be loaded as haplotypes.
*
* @return Input GVCF File
*/
public String inputFile() {
return myInputFile.value();
}
/**
* Set Input GVCF File. GVCF file to be loaded as haplotypes.
*
* @param value Input GVCF File
*
* @return this plugin
*/
public LoadHapSequencesFromGVCFPlugin inputFile(String value) {
myInputFile = new PluginParameter<>(myInputFile, value);
return this;
}
/**
* Input Reference Fasta
*
* @return Ref
*/
public String reference() {
return myReference.value();
}
/**
* Set Ref. Input Reference Fasta
*
* @param value Ref
*
* @return this plugin
*/
public LoadHapSequencesFromGVCFPlugin reference(String value) {
myReference = new PluginParameter<>(myReference, value);
return this;
}
/**
* File holding the DB config information
*
* @return Db Config File
*/
public String dbConfigFile() {
return dbConfigFile.value();
}
/**
* Set Db Config File. File holding the DB config information
*
* @param value Db Config File
*
* @return this plugin
*/
public LoadHapSequencesFromGVCFPlugin dbConfigFile(String value) {
dbConfigFile = new PluginParameter<>(dbConfigFile, value);
return this;
}
/**
* File holding the genotype load data (line name, ploidy, haplotype number, method, etc.)
*
* @return Load Data File
*/
public String loadDataFile() {
return loadDataFile.value();
}
/**
* Set Load Data File. File holding the genotype load data (line name, ploidy, haplotype number, method, etc.)
*
* @param value Load Data File
*
* @return this plugin
*/
public LoadHapSequencesFromGVCFPlugin loadDataFile(String value) {
loadDataFile = new PluginParameter<>(loadDataFile, value);
return this;
}
/**
* File holding the Reference Range Information
*
* @return Bed File
*/
public String bedFile() {
return bedFile.value();
}
/**
* Set Bed File. File holding the Reference Range Information
*
* @param value Bed File
*
* @return this plugin
*/
public LoadHapSequencesFromGVCFPlugin bedFile(String value) {
bedFile = new PluginParameter<>(bedFile, value);
return this;
}
}