/*
 * net.maizegenetics.pangenome.db_loading.PHGDataWriter Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of phg Show documentation
 * PHG - Practical Haplotype Graph
 * There is a newer version: 1.10
 */
/**
 * 
 */
package net.maizegenetics.pangenome.db_loading;

import java.util.List;
import java.util.Map;
import java.util.Set;

import com.google.common.collect.Multimap;

import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.util.Tuple;

/**
 * @author lcj34
 *
 */
public interface PHGDataWriter extends PHGData {
    /**
     * Stores chrom, start pos, end pos to referenece_ranges table
     * isFocus identifies focus intervals from the user's bed file
     * 
     * @param anchorData
     * @param refGroupMethodID  method_id used for creating this ref_range_group
     * @return
     */            
    boolean putAllAnchors(List anchorData,   int refGroupMethodID);
    
    /**
     * Stores required data to the genotypes and haplotypes tables
     * for each entry on the list.
     * 
     * @param ghData
     * @return
     */
    boolean putGenoAndHaploTypeData(GenoHaploData ghData);
    
    /**
     * Fills in the haplotypes table
     * for the reference ranges.
     * 
     * @param line_name
     * @param hapnumber
     * @param hapMethod Name of method used to create anchors.  Maps to method table for id.
     * @param adata  Anchor data, including chrom, start/end positions
     * @param refGroupMethod  List of methods used to create the ref_range_group
     * @param gvcf String - name of gvcf file
     * @param variant_list String - name of file containing list of variants
     * @param genomeFileId int - id for ref fasta entry in the genome_file_data table
     * @paRAM gvcfFileId int - id for ref gvcf file enty in the genome_file_data table
     * @return
     */
    boolean putRefAnchorData(String line_name, int hapnumber, List adata,int hapMethod, 
             Set refGroupMethod,String gvcf, String variant_list, int genomeFileId, int gvcfFileId);

    /**
     * Adds a method, its type and its description to the anchor_methods table
     * These are used to identify how sequences were created,how the were combined into consensus
     * sequences,  how haplotype counts were scores, how paths through the graph were create 
     * or how an edge was created.
     * 
     * The "type" field identifies the table to which the method belongs.
     * 
     * @param name
     * @param type
     * @param description - a map of pluginParameter name to value(as String)
     * @return the methodId from the methods table
     */
    int putMethod(String name, DBLoadingUtils.MethodType type, Map description);

    /**
     * Adds inter-anchor sequences for the specified assembly to the anchor_sequences
     * and anchor_haplotypes table.
     * 
     * This method takes a multi-map as assembly.  Inter-anchors that do not map
     * to a reference inter-anchor are all given the anchorid 0.
     * 
     * @param line_name
     * @param hapNumber
     * @param method
     * @param anchorSequences
     * @return
     */
    boolean putAssemblyInterAnchorSequences(String line_name, int hapNumber, String method, Multimap anchorSequences);
    
    /**
     * This method takes a map of consensus data, finds the anchorIds based on Position,
     * finds the hapids of the taxa whose sequences at the specified anchorID map
     * to the consensus.  Adds the gamete_group and sequence data to the haplotpes table ;
     * adds entries to gamete_groups and gamete_hapltoypes.     * 
     * 
     * @param consensusMap Multimap>
     * @param methodId  method used for collapsing anchors
     * @return A list of hash codes loaded to the anchor_sequences table for the consensus sequences
     */
    void putConsensusSequences(Multimap>> consensusMap, 
            int methodId);

    /**
     * Takes a list of gametes and stores to the gamete_groups and gamete_haplotypes table
     * Skips if this grouping already exists
     * @param gametes list consisting of taxa/gamete number in the form taxaName_gameteNumber
     * @return true if successful
     */
    boolean putGameteGroupAndHaplotypes(List gametes);

    /**
     * Stores gamete sequence data to the haplotypes table
     * This method associates all entries with the single gamete_grp_id which is passed in.
     * It is used when loading reference_ranges sequences or haplotype sequences for a 
     * single line.
     * 
     * THe gidToVariantDataMap map is used to create the variant list blob for the db
     * 
     * @param gamete_grp_id 
     * @param method
     * @param anchorSequences
     * @param genomeFileId - genome_file_data table id for the fasta file this haplotype, -1 means none.
     * @param gvcfFileId - genome_file_data table id for the gvcf file for this haplotype, -1 mean none
     * @return
     */
    void putHaplotypesForGamete(int gamete_grp_id, int method,  Map anchorSequences,
             int genomeFileId, int gvcfFileId);

    /**
     * Add data to the haplotypes table.  Entries on the map are for different gamete groups.
     * The key is a Position item identifying the genome_interval id
     * The value is a Tuple consisting of (x) AnchorDataPHG object with sequence, gvcf, etc; and
     * (y) a List of taxa represented by the ANchorDataPHG sequence
     * 
     * @param mapWithGroupHash
     * @param method_id  Id in the methods table for this group of sequences
     */
    void putHaplotypesForMultipleGroups(Multimap> mapWithGroupHash, 
            int method_id);

    /**
     * This method adds data to the haplotype_counts table.
     * The "data" is a Snappy compressed byte buffer of a 3xn array, found in parameter "counts"
     * 
     * To see how this data is stored, examine DBLoadingUtils.encodeHapCountsArrayFromFile(),
     * DBLoadingUtils.encodeHapCountsArrayFromMultiset() and DBLoadingUtils.decodeHapCountsArray()
     * 
     * @param method
     * @param methodDetails
     * @param taxonName
     * @param fastqFile
     * @param counts
     */
    void putHaplotypeCountsData(String method, Map methodDetails, String taxonName, String fastqFile, byte[] counts);

    /**
     * This method stores paths data to the paths table.
     * @param method - Method Name for Path detemination process
     * @param methodDetails Details of how these paths were created
     * @param taxon Name of line for which data is being added
     * @param readMappingIds List of read_mapping_ids
     * @param pathBytes  Compressed byte array of paths data
     * @param isTestMethod Indicates if the method type should be PATHS ot TEST_PATHS
     */
    int putPathsData(String method, Map methodDetails, String taxon, List readMappingIds, byte[] pathBytes, boolean isTestMethod);

    /**
     * Takes a method id and a list of reference ranges.
     * Populates the ref_range_ref_range_method table.
     * @param group_method_id
     * @param refRangeList
     */
    void putRefRangeRefRangeMethod(int group_method_id, List refRangeList);

    /**
     * Takes a gamete_grp_id, method_id, list of haplotype sequences, a chromosome and a genomeFileId.
     * Starts the process of storing table data for the haplotypes to the db.
     * This will set maxEntries to 10000 and will call the putHaplotypesData version below
     *
     * putHaplotypesData(int gamete_grp_id, int method, Map anchorSequences, String chromosome, int genomeFileId, gvcfFIleId)
     * 
     * @param gamete_grp_id
     * @param method
     * @param anchorSequences
     * @param chromosome
     * @param genomeFileId
     */
    void putHaplotypesData(int gamete_grp_id, int method, Map anchorSequences, String chromosome, int genomeFileId, int gvcfFileId);

    /**
     * Takes a gamete_grp_id, method_id, list of haplotype sequences, a chromosome, genomeFileId, and a number of MaxEntries.
     * Starts the process of storing table data for the haplotypes to the db
     * @param gamete_grp_id
     * @param method
     * @param anchorSequences
     * @param chromosome
     * @param genomeFileId
     * @param maxEntries
     */
    void putHaplotypesData(int gamete_grp_id, int method, Map anchorSequences, String chromosome, int genomeFileId, int gvcfFileId,int maxEntries);

    /**
     * Takes a method name, method details string, taxon name (should exist in the genotypes table), file_group_name,
     * and a byte array of read mapping data.  This is stored to the PHG read_mapping table
     * @param method
     * @param methodDetails
     * @param taxon
     * @param file_group_name
     * @param mapping_data
     * @param haplotypeListId: id from the haplotype_list table
     * @param isTestMethod: indicates if method type should be set to a TEST method
     * @return
     */
    int putReadMappingData(String method, Map methodDetails, String taxon, String file_group_name, byte[] mapping_data,  boolean isTestMethod, int haplotypeListId);

    /**
     * This prompts a call to private method loadReadMappingHash() to update this hash table
     */
    void updateReadMappingHash();

    /**
     * Method takes information on a genome fasta file, stores
     * to the PHG db, returns the genome_file_data entry id created for the
     * table entry.
     * @param genome_path external server path and file name for genome
     * @param genome_file local path to file name used for MD5 calculation
     * @param genoid genoid associated with this genome data
     * @param type the type of file, ie FASTA or GVCF from DBLoadingUtils.GenomeFileType
     * @return
     */
    int putGenomeFileData(String genome_path, String genome_file, int genoid, int type);

    /**
     * Creates an entry in the taxa_groups table.  If one already
     * exists with the specified name, the id for it is returned.
     * @param group_name
     * @return id for this taxa group name
     */
    int putTaxaGroupName(String group_name);

    /**
     * Takes a taxa group name and a list of taxa.
     * Populates the taxa_groups and taxa_groups_genoid tables.
     * @param group_name
     * @param taxaList
     */
    void putTaxaTaxaGroups(String group_name, List taxaList);

    /**
     * Deletes from the read_mapping table based on the ids in the input List
     * @param readMappingIds
     * @return: nothing
     */
    void deleteReadMappingsById(List readMappingIds);

    /**
     * Deletes from the read_mapping_paths table the ids in the ReadMappingIds List
     * @param readMappingIds
     * @return  nothing
     */
    void deleteReadMappingPathsById(List readMappingIds);

    /**
     * Deletes paths based on a method name and taxa.  It allows for either
     * method or taxa to be null, but not both.  Entries are deleted from
     * both the read_mapping_paths and the paths
     * @param method delete paths which have this method
     * @param taxa  list of taxa for which the paths should be deleted
     */
    void deletePaths(String method, Listtaxa);

    /**
     * Takes a method name and deletes the entry for it from the methods table.
     * @param method
     * @return true or false indicating success
     */
    int deleteMethodByName(String method);

    /**
     * Delete read_mappings from the read_mapping table based on provided ids.
     * This will also delete entries from the read_mapping_paths and paths table
     * that are associated with these read_mappings
     * @param readMappingIds
     * @return
     */
    boolean deleteReadMappingsCascade(List readMappingIds  );

    /**
     * put data to the haplotype Lists table.
     * @param hapids - list of integers representing haplotype ids
     * @return the haplotype_list_id for this set of haplotypes (generated or existing)
     */
    int putHalotypeListData(List hapids);

}