package net.maizegenetics.pangenome.db_loading;


import java.awt.Frame;
import java.io.BufferedReader;
import java.io.File;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import javax.swing.ImageIcon;

import htsjdk.samtools.util.CloseableIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import net.maizegenetics.pangenome.api.ConvertVariantContextToVariantInfo;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import org.apache.log4j.Logger;

import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;

import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Utils;
/**
 * This plugin takes data processed through the GATK haplotype caller, or through
 * processAssemblyGenomes.Minimap2PipelinePlugin.  In either case, the data is
 * subsequently run through ComputeMedianGVCFAndFilter, which creates a fasta for
 * loading into the DB via this plugin.
 * 
 * The ID line in the fasta is used to associate each sequence with a DB 
 * genome interval id. Entries are added to the following tables for each haplotype:
 *   - genotypes
 *   - gametes
 *   - gamete_groups (only the single taxa/chromosome in each group)
 *   - gamete_haplotypes (associates gamete_grp_id to the gamete_id)
 *   - haplotypes (sequence related data for each genome_interval for the specified gamete is added)
 * 
 * INPUT:
 *   1. configuration file indicating db connection data
 *   2. a fasta file with haplotype anchor sequences.  Fasta idlines look as below.   
 *      >refChrom:refstart:refEnd gvcfFileName
 *   3. a tab-delimited file containing parameters specific to this haplotype (see the example below); columns are:
 *     Genotype/Hapnumber/Dataline/ploidy/reference/genesPhased/chromsPhased/confidence/Method/MethodDetails/RefVersion
 *   4. Path to directory containing the gvcf file
 *      
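 *   Example genomeData content (tab-delimited; the values shown are hypothetical,
 *   and header names are matched case-insensitively):
 *     Genotype  Hapnumber  Dataline  Ploidy  Reference  GenesPhased  ChromsPhased  Confidence  Method  MethodDetails
 *     B73       1          B73_data  2       true       false        false         1           anchor  GATK_v4_run
 * 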
 * 
 * OUTPUT:
 *   Data is written to the db tables listed above.
 * 
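 * Example invocation (a minimal sketch; paths are hypothetical).  The input
 * DataSet must carry an open java.sql.Connection as its first Datum:
 * 
 *   new LoadHapSequencesToDBPlugin(null, false)
 *       .fasta("/path/to/haplotypes.fa")
 *       .genomeData("/path/to/genomeData.txt")
 *       .gvcf("/path/to/gvcfDir/")
 *       .performFunction(dataSetWithDBConnection);
 * 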
 * @author lcj34
 *
 */
@Deprecated
public class LoadHapSequencesToDBPlugin extends AbstractPlugin {
    private static final Logger myLogger = Logger.getLogger(LoadHapSequencesToDBPlugin.class);
    
    private PluginParameter<String> fasta = new PluginParameter.Builder<String>("fasta", null, String.class).guiName("Fasta File").required(true).inFile()
            .description("Fasta file containing haplotype sequences").build();

    private PluginParameter<String> genomeData = new PluginParameter.Builder<String>("genomeData", null, String.class).guiName("Genome Data File").required(true).inFile()
            .description("Path to tab-delimited file containing genome specific data with header line:\nGenotype Hapnumber Dataline Ploidy Reference GenesPhased ChromsPhased Confidence Method MethodDetails RefVersion")
            .build();
    private PluginParameter<String> gvcf = new PluginParameter.Builder<String>("gvcf", null, String.class).guiName("GVCF Directory").required(true)
            .description("Directory containing the GVCF file used to create the haplotype fasta file.  Directory path only, including trailing /").build();

    // This data is populated from the genomeData file
    private String line; // refName to be stored as line_name in genotypes table, e.g. B73Ref
    private String line_data;
    private int ploidy;
    private int hapNumber;
    private boolean genesPhased;
    private boolean chromsPhased;
    private float conf;
    private String method;
    private String method_details;

    boolean is_ref = false;

    public LoadHapSequencesToDBPlugin() {
        super(null, false);
    }

    public LoadHapSequencesToDBPlugin(Frame parentFrame) {
        super(parentFrame, false);
    }

    public LoadHapSequencesToDBPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }
    private static int numMapped = 0;
    private static int notMapped = 0;

    @Override
    public void postProcessParameters() {
        myLogger.info("postProcessParameters: reading genomeDataFile: " + genomeData());
        try (BufferedReader br = Utils.getBufferedReader(genomeData())){

            // parse input file to find arguments
            myLogger.info("reading genomeDataFile: " + genomeData());
            
            String headers = br.readLine(); // read the header line
            int lineIndex = -1;
            int lineDataIndex = -1;
            int ploidyIndex = -1;
            int hapNumberIndex = -1;
            int isRefIndex = -1;
            int genesPhasedIndex = -1;
            int chromPhasedIndex = -1;
            int confIndex = -1;
            int methodIndex = -1;
            int methodDetailsIndex = -1;

            int idx = 0;
            myLogger.info("GenomeFile header line: " + headers);
            for (String header : headers.split("\\t")) {
                if (header.equalsIgnoreCase("Genotype")) {
                    lineIndex = idx;
                } else if (header.equalsIgnoreCase("Hapnumber")) {
                    hapNumberIndex = idx;
                } else if (header.equalsIgnoreCase("Dataline")) {
                    lineDataIndex = idx;
                } else if (header.equalsIgnoreCase("ploidy")) {
                    ploidyIndex = idx;
                } else if (header.equalsIgnoreCase("reference")) {
                    isRefIndex = idx;
                } else if (header.equalsIgnoreCase("genesPhased")) {
                    genesPhasedIndex = idx;
                } else if (header.equalsIgnoreCase("chromsPhased")) {
                    chromPhasedIndex = idx;
                } else if (header.equalsIgnoreCase("confidence")) {
                    confIndex = idx;
                } else if (header.equalsIgnoreCase("Method")) {
                    methodIndex = idx;
                } else if (header.equalsIgnoreCase("MethodDetails")) {
                    methodDetailsIndex = idx;
                } 
                idx++;
            }
            if (lineIndex == -1 || lineDataIndex == -1 || ploidyIndex == -1 ||
                    hapNumberIndex == -1 || isRefIndex == -1 || genesPhasedIndex == -1 ||
                    chromPhasedIndex == -1 || confIndex == -1 || methodIndex == -1 ||
                    methodDetailsIndex == -1 ) {
                myLogger.error("ERROR - Genotype datafile does not contain the required 10 fields");
                myLogger.error("Please check your file for the tab delimited, case-insensistive headers: ");
                myLogger.error("  Genotype Hapnumber Dataline Ploidy Reference GenePhased ChromPhased Confidence Method MethodDetails");
                throw new IllegalArgumentException("Wrong number of header columns in genome data file");
            }
            // All headers are present - check there is data for the required columns
            String dataLine = br.readLine();
            String[] dataTokens = dataLine.split("\\t");
            if (dataTokens.length != 10) {
                myLogger.error("ERROR - wrong number of data items in genotype datafile, expecting 10, found " + dataTokens.length);
                throw new IllegalArgumentException("ERROR - wrong number of data items in genotype datafile, expecting 10, found " + dataTokens.length);
            }
            line = dataTokens[lineIndex];
            line_data = dataTokens[lineDataIndex];          
            ploidy = Integer.parseInt(dataTokens[ploidyIndex]);
            hapNumber = Integer.parseInt(dataTokens[hapNumberIndex]);
            is_ref = dataTokens[isRefIndex].equalsIgnoreCase("true");
            genesPhased = dataTokens[genesPhasedIndex].equalsIgnoreCase("true");
            chromsPhased = dataTokens[chromPhasedIndex].equalsIgnoreCase("true");
            conf = Float.parseFloat(dataTokens[confIndex]);
            method = dataTokens[methodIndex];
            method_details = dataTokens[methodDetailsIndex];

        } catch (Exception exc) {
            myLogger.error("ERROR parsing fields in genomeData file");
            myLogger.error(" Expecting a tab-delimited file with these columns:  " ); 
            myLogger.error("  Genotype/Hapnumber/Dataline/ploidy/reference/genePhased/chromPhased/confidence/Method/MethodDetails/RefVersion");
            myLogger.error("Please fix the file and try again - aborting the run");
            throw new IllegalArgumentException("ERROR parsing genomeData file");
        }
    }

    @Override
    public DataSet processData(DataSet input) {

        long totalTime = System.nanoTime();
        long time=System.nanoTime();

        RangeMap<Position, Integer> chromPosIDMap = null;
        // Map<anchorId, AnchorDataPHG>
        Map<Integer, AnchorDataPHG> anchorSequences = new HashMap<>();
 
        Connection dbConnect = (Connection)input.getData(0).getData();
        
        if (dbConnect == null) {
           myLogger.error("LoadHapSequencesToDBPlugin:  No DB CONNECTION supplied !");
           return null;
        }
        myLogger.info("LoadHapSequencesToDBPlugin: have connection, create PHGdbAccess object");
        PHGDataWriter phg = new PHGdbAccess(dbConnect);

        // Read the fasta file.  Pull anchors on a per-chrom basis
        try (BufferedReader fastaBR = Utils.getBufferedReader(fasta())){
            String fLine;          
            String oldChrom = "-1";
            String refData = "none";
            String refChr = "none";
            
            StringBuilder seqSB = new StringBuilder();
            int idlineCount = 0;
            int fastaSeqCount = 0;
            int numUnknownSeq = 0;
            
            // Load initial data to genotypes, gametes, gamete_groups, method
            // add the genotype/haplotype info to DB
            GenoHaploData ghd = new GenoHaploData(ploidy,is_ref,line,line_data, genesPhased, chromsPhased, hapNumber, conf);
            phg.putGenoAndHaploTypeData(ghd);
            
            // Load the gamete_groups and gamete_haplotypes table
            String nameWithHap = line + "_" + hapNumber; // ref line is always hap1
            List<String> gameteGroupList = new ArrayList<>();
            gameteGroupList.add(nameWithHap);
            phg.putGameteGroupAndHaplotypes(gameteGroupList);
            
            // Put the method data - identifies for each haplotype how the anchors were created
            // anchor and inter-anchor are both method_type "anchor_haplotypes"
            int method_id = phg.putMethod(method, DBLoadingUtils.MethodType.ANCHOR_HAPLOTYPES,pluginParameters());
            
            // Add info to group tables           
            int gamete_grp_id = phg.getGameteGroupIDFromTaxaList(gameteGroupList);
            
            while ((fLine = fastaBR.readLine()) != null) {
                
                if (fLine.startsWith(">")) {  
                    // id line:  >refChrom:refstart:refEnd gvcfFileName
                    idlineCount++;
                    if (seqSB.length() > 0) { // process previous fasta sequence
                        fastaSeqCount++;
                        // look for anchorID. 
                        addSequenceToMap(chromPosIDMap, anchorSequences,seqSB.toString(),  refData);
                        seqSB.setLength(0); // clear the previous sequence
                    } 
                    
                    refData = fLine.replace(">","");
                    refChr = refData.split(":")[0]; // 8:09890:98900 = chr/start/end
                    if (!refChr.equals(oldChrom)) { 
                        myLogger.info("Time to process chrom " + oldChrom + ": " + (System.nanoTime() - time) / 1e9 + " seconds");
                        time = System.nanoTime();
                        
                        // In the DB, the chromosome name is stored from the chromosome object.
                        // The Chromosome class will parse off CHR or CHROMOSOME before creating the name
                        // Pass that here.
                        String refChrName = Chromosome.instance(refChr).getName();
                        myLogger.info("LoadHapSequencesToDBPlugin: getting anchors for chrom " + refChr + " as " + refChrName);
                        // Position object is key, anchorID is value
                        chromPosIDMap = phg.getIntervalRangesWithIDForChrom(refChrName); 
                        myLogger.info("Number of anchors for chrom " + refChr + " is " + chromPosIDMap.asMapOfRanges().keySet().size());
                        time = System.nanoTime();
                        oldChrom = refChr;
                        
                        if (anchorSequences.size() > 0) {
                            // With the variants table schema, we must load on a per-chrom basis
                            processGVCFVariants(anchorSequences);
                            myLogger.info("\ncalling putHaplotypes for chrom " + oldChrom + " with list of size: " + anchorSequences.size() );
                            phg.putHaplotypesData(gamete_grp_id, method_id, anchorSequences, oldChrom);
                            anchorSequences.clear(); // start fresh
                        }
                    }
                    
                } else {
                    seqSB.append(fLine);
                }
            }
            // Process last fasta entry, load last chromosome
            if (seqSB.length() > 0) {
                fastaSeqCount++;
                // look for anchorID, add to map 
                addSequenceToMap( chromPosIDMap, anchorSequences, seqSB.toString(),  refData);
                processGVCFVariants(anchorSequences);
                myLogger.info("\ncalling putHaplotypes for chrom " + refChr + " with list of size: " + anchorSequences.size());
                phg.putHaplotypesData(gamete_grp_id, method_id, anchorSequences, refChr);
                seqSB.setLength(0); // clear the previous sequence
            }

            ((PHGdbAccess)phg).close();
            myLogger.info("Time to load sequences into DB:  " + (System.nanoTime() - time) / 1e9 + " seconds");
            myLogger.info("\nTotal time for LoadHapSequencesToPHGdb:  " + (System.nanoTime() - totalTime) / 1e9 + " seconds");
        } catch (Exception exc) {
            throw new IllegalStateException("LoadHapSequences: error processing data: " + exc.getMessage());
        }
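        // Nothing is passed downstream; all output has been written to the DB tables.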
        return null;
    }
   
    // Create all the variant BLOBs for loading into the DB.
    private void processGVCFVariants(Map<Integer, AnchorDataPHG> anchorSequences) {
         
        anchorSequences.entrySet().parallelStream().forEach(entry -> {
            AnchorDataPHG aData = entry.getValue();
            // Encode gvcfFile into variant list bytes.  The boolean fields indicate whether
            // only variants should be pulled from the gvcf file and if refRanges should be merged.
            // For individual haplotypes (vs consensus) the full gvcf file is passed.
            // Only store in the variant list the entries related to this genome_interval region.
            String gvcf = aData.gvcf();

            try {
                VCFFileReader vcfReader = new VCFFileReader(new File(gvcf), false);
                Range<Position> interval = aData.intervalCoordinates();
                CloseableIterator<VariantContext> vc = vcfReader.query(interval.lowerEndpoint().getChromosome().getName(),
                        interval.lowerEndpoint().getPosition(), interval.upperEndpoint().getPosition());

                List<HaplotypeNode.VariantInfo> variants = vc.stream()
                        .map(variantContext -> ConvertVariantContextToVariantInfo.convertContextToInfo(variantContext, line, hapNumber))
                        .collect(Collectors.toList());
                int lastSlash = gvcf.lastIndexOf("/");
                gvcf = gvcf.substring(lastSlash + 1); // no longer need the directory path, just store file name

                AnchorDataPHG newAdata = new AnchorDataPHG(aData.intervalCoordinates(), aData.geneCoordinates(),
                        gvcf, null, variants, aData.sequence());
                entry.setValue(newAdata); // setValue() does not incur a concurrent modification exception
            } catch (Exception exc) {
                throw new IllegalStateException("Error processing GVCFVariants." , exc);
            }
        });       
    }
    
    private String getGVCFFileName(String[] idline) {
        // gvcf file is last item on idline, first item after the space
        // >refChrom:refStartPos:refEndPos gvcfFileName
               
        String gvcfFileNameOnly;
        
        if (idline.length < 2) {
            // Old version of fasta idlines don't have the gvcf name.  Infer it from
            // the fasta file name
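            // e.g. a hypothetical fasta path /dir/lineA.fa yields the gvcf name lineA.g.vcf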
            String fastaName = fasta();
            int lastSlash = fasta().lastIndexOf("/");
            if (lastSlash != -1) {
                fastaName = fasta().substring(lastSlash+1);
            } 
            int lastPeriod = fastaName.lastIndexOf(".");
            gvcfFileNameOnly = fastaName.substring(0, lastPeriod) + ".g.vcf";
        } else {
            gvcfFileNameOnly = idline[1];
        }                     
        return gvcfFileNameOnly;
    }
    
    // Look for a match in the genome intervals table.  There should always be a match.
    private void addSequenceToMap(RangeMap<Position, Integer> chromAnchorStartEnd,
            Map<Integer, AnchorDataPHG> anchorSequences, String seqString, String refData) {
        int anchorId = -1;

        String[] idlineData = refData.split(" ");
        String refInterval = idlineData[0];
        String refChr = refInterval.split(":")[0]; // 8:09890:98900 = chr/start/end
        String gvcfFileNameOnly = getGVCFFileName(idlineData);

        int refStart = Integer.parseInt(refInterval.split(":")[1]);
        Position startPos = Position.of(refChr, refStart);
        Map.Entry<Range<Position>, Integer> mapEntry
                    = chromAnchorStartEnd.getEntry(startPos);
        if (mapEntry != null) {
            // refStart pos must not only fall within a Position range, it must equal the start position
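            // e.g. an anchor stored as range [500, 900] matches an idline start of exactly 500, not 501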
            if (mapEntry.getKey().lowerEndpoint().getPosition() == refStart) {
                anchorId = mapEntry.getValue();
                int refEnd = Integer.parseInt(refInterval.split(":")[2]);
                Position endPos = Position.of(refChr, refEnd);
                Range<Position> intervalRange = Range.closed(startPos, endPos);

                Position geneStart = Position.of(refChr, 0);
                Position geneEnd = Position.of(refChr, 0);
                String gvcfFileWithPath = gvcf() + gvcfFileNameOnly;
                Range<Position> geneRange = Range.closed(geneStart, geneEnd); // not relevant for non-ref haplotypes
                AnchorDataPHG adata = new AnchorDataPHG(intervalRange, geneRange, gvcfFileWithPath, null, seqString);
                anchorSequences.put(anchorId, adata);
                numMapped++;
            } else {
                notMapped++;
                myLogger.warn("Anchor not found for chr " + refChr + " with start pos " + refStart);
            }
        } else {
            notMapped++;
            myLogger.warn("Anchor not found for chr " + refChr + " with start pos " + refStart);
        }
    }
    
    @Override
    public ImageIcon getIcon() {
        return null;
    }

    @Override
    public String getButtonName() {
        return ("Load haplotype sequences to database");
    }

    @Override
    public String getToolTipText() {
        return ("Load haplotype sequences to database");
    }
    
//    public static void main(String[] args) {
//        GeneratePluginCode.generate(LoadHapSequencesToDBPlugin.class);
//    }


    /**
     * Fasta file containing haplotype sequences 
     *
     * @return Fasta File
     */
    public String fasta() {
        return fasta.value();
    }

    /**
     * Set Fasta File. Fasta file containing haplotype sequences
     * 
     *
     * @param value Fasta File
     *
     * @return this plugin
     */
    public LoadHapSequencesToDBPlugin fasta(String value) {
        fasta = new PluginParameter<>(fasta, value);
        return this;
    }

    /**
     * Path to tab-delimited file containing genome specific
     * data with header line:
     * Genotype Hapnumber Dataline Ploidy Reference GenesPhased
     * ChromsPhased Confidence Method MethodDetails RefVersion
     *
     * @return Genome Data File
     */
    public String genomeData() {
        return genomeData.value();
    }

    /**
     * Set Genome Data File. Path to tab-delimited file containing
     * genome specific data with header line:
     * Genotype Hapnumber Dataline Ploidy Reference GenesPhased
     * ChromsPhased Confidence Method MethodDetails RefVersion
     *
     * @param value Genome Data File
     *
     * @return this plugin
     */
    public LoadHapSequencesToDBPlugin genomeData(String value) {
        genomeData = new PluginParameter<>(genomeData, value);
        return this;
    }

    /**
     * Directory containing the GVCF file used to create the
     * haplotype fasta file
     *
     * @return GVCF Directory
     */
    public String gvcf() {
        return gvcf.value();
    }

    /**
     * Set GVCF Directory. Directory containing the GVCF
     * file used to create the haplotype fasta file
     *
     * @param value GVCF Directory
     *
     * @return this plugin
     */
    public LoadHapSequencesToDBPlugin gvcf(String value) {
        gvcf = new PluginParameter<>(gvcf, value);
        return this;
    }
    
}