/*
 * net.maizegenetics.pangenome.db_loading.PHGDataWriter (Maven / Gradle / Ivy)
 * PHG - Practical Haplotype Graph
 *
 * Navigation text captured from the artifact listing page along with this source:
 *   Go to download
 *   Show more of this group / Show more artifacts with this name
 *   Show all versions of phg / Show documentation
 *   Show all versions of phg / Show documentation
 */
/**
*
*/
package net.maizegenetics.pangenome.db_loading;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.google.common.collect.Multimap;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.util.Tuple;
/**
* @author lcj34
*
*/
public interface PHGDataWriter extends PHGData {
/**
* Stores chrom, start pos, end pos to referenece_ranges table
* isFocus identifies focus intervals from the user's bed file
*
* @param anchorData
* @param refGroupMethodID method_id used for creating this ref_range_group
* @return
*/
boolean putAllAnchors(List anchorData, int refGroupMethodID);
/**
* Stores required data to the genotypes and haplotypes tables
* for each entry on the list.
*
* @param ghData
* @return
*/
boolean putGenoAndHaploTypeData(GenoHaploData ghData);
/**
* Fills in the haplotypes table
* for the reference ranges.
*
* @param line_name
* @param hapnumber
* @param hapMethod Name of method used to create anchors. Maps to method table for id.
* @param adata Anchor data, including chrom, start/end positions
* @param refGroupMethod List of methods used to create the ref_range_group
* @param gvcf String - name of gvcf file
* @param variant_list String - name of file containing list of variants
* @param genomeFileId int - id for ref fasta entry in the genome_file_data table
* @paRAM gvcfFileId int - id for ref gvcf file enty in the genome_file_data table
* @return
*/
boolean putRefAnchorData(String line_name, int hapnumber, List adata,int hapMethod,
Set refGroupMethod,String gvcf, String variant_list, int genomeFileId, int gvcfFileId);
/**
* Adds a method, its type and its description to the anchor_methods table
* These are used to identify how sequences were created,how the were combined into consensus
* sequences, how haplotype counts were scores, how paths through the graph were create
* or how an edge was created.
*
* The "type" field identifies the table to which the method belongs.
*
* @param name
* @param type
* @param description - a map of pluginParameter name to value(as String)
* @return the methodId from the methods table
*/
int putMethod(String name, DBLoadingUtils.MethodType type, Map description);
/**
* Adds inter-anchor sequences for the specified assembly to the anchor_sequences
* and anchor_haplotypes table.
*
* This method takes a multi-map as assembly. Inter-anchors that do not map
* to a reference inter-anchor are all given the anchorid 0.
*
* @param line_name
* @param hapNumber
* @param method
* @param anchorSequences
* @return
*/
boolean putAssemblyInterAnchorSequences(String line_name, int hapNumber, String method, Multimap anchorSequences);
/**
* This method takes a map of consensus data, finds the anchorIds based on Position,
* finds the hapids of the taxa whose sequences at the specified anchorID map
* to the consensus. Adds the gamete_group and sequence data to the haplotpes table ;
* adds entries to gamete_groups and gamete_hapltoypes. *
*
* @param consensusMap Multimap>
* @param methodId method used for collapsing anchors
* @return A list of hash codes loaded to the anchor_sequences table for the consensus sequences
*/
void putConsensusSequences(Multimap>> consensusMap,
int methodId);
/**
* Takes a list of gametes and stores to the gamete_groups and gamete_haplotypes table
* Skips if this grouping already exists
* @param gametes list consisting of taxa/gamete number in the form taxaName_gameteNumber
* @return true if successful
*/
boolean putGameteGroupAndHaplotypes(List gametes);
/**
* Stores gamete sequence data to the haplotypes table
* This method associates all entries with the single gamete_grp_id which is passed in.
* It is used when loading reference_ranges sequences or haplotype sequences for a
* single line.
*
* THe gidToVariantDataMap map is used to create the variant list blob for the db
*
* @param gamete_grp_id
* @param method
* @param anchorSequences
* @param genomeFileId - genome_file_data table id for the fasta file this haplotype, -1 means none.
* @param gvcfFileId - genome_file_data table id for the gvcf file for this haplotype, -1 mean none
* @return
*/
void putHaplotypesForGamete(int gamete_grp_id, int method, Map anchorSequences,
int genomeFileId, int gvcfFileId);
/**
* Add data to the haplotypes table. Entries on the map are for different gamete groups.
* The key is a Position item identifying the genome_interval id
* The value is a Tuple consisting of (x) AnchorDataPHG object with sequence, gvcf, etc; and
* (y) a List of taxa represented by the ANchorDataPHG sequence
*
* @param mapWithGroupHash
* @param method_id Id in the methods table for this group of sequences
*/
void putHaplotypesForMultipleGroups(Multimap> mapWithGroupHash,
int method_id);
/**
* This method adds data to the haplotype_counts table.
* The "data" is a Snappy compressed byte buffer of a 3xn array, found in parameter "counts"
*
* To see how this data is stored, examine DBLoadingUtils.encodeHapCountsArrayFromFile(),
* DBLoadingUtils.encodeHapCountsArrayFromMultiset() and DBLoadingUtils.decodeHapCountsArray()
*
* @param method
* @param methodDetails
* @param taxonName
* @param fastqFile
* @param counts
*/
void putHaplotypeCountsData(String method, Map methodDetails, String taxonName, String fastqFile, byte[] counts);
/**
* This method stores paths data to the paths table.
* @param method - Method Name for Path detemination process
* @param methodDetails Details of how these paths were created
* @param taxon Name of line for which data is being added
* @param readMappingIds List of read_mapping_ids
* @param pathBytes Compressed byte array of paths data
* @param isTestMethod Indicates if the method type should be PATHS ot TEST_PATHS
*/
int putPathsData(String method, Map methodDetails, String taxon, List readMappingIds, byte[] pathBytes, boolean isTestMethod);
/**
* Takes a method id and a list of reference ranges.
* Populates the ref_range_ref_range_method table.
* @param group_method_id
* @param refRangeList
*/
void putRefRangeRefRangeMethod(int group_method_id, List refRangeList);
/**
* Takes a gamete_grp_id, method_id, list of haplotype sequences, a chromosome and a genomeFileId.
* Starts the process of storing table data for the haplotypes to the db.
* This will set maxEntries to 10000 and will call the putHaplotypesData version below
*
* putHaplotypesData(int gamete_grp_id, int method, Map anchorSequences, String chromosome, int genomeFileId, gvcfFIleId)
*
* @param gamete_grp_id
* @param method
* @param anchorSequences
* @param chromosome
* @param genomeFileId
*/
void putHaplotypesData(int gamete_grp_id, int method, Map anchorSequences, String chromosome, int genomeFileId, int gvcfFileId);
/**
* Takes a gamete_grp_id, method_id, list of haplotype sequences, a chromosome, genomeFileId, and a number of MaxEntries.
* Starts the process of storing table data for the haplotypes to the db
* @param gamete_grp_id
* @param method
* @param anchorSequences
* @param chromosome
* @param genomeFileId
* @param maxEntries
*/
void putHaplotypesData(int gamete_grp_id, int method, Map anchorSequences, String chromosome, int genomeFileId, int gvcfFileId,int maxEntries);
/**
* Takes a method name, method details string, taxon name (should exist in the genotypes table), file_group_name,
* and a byte array of read mapping data. This is stored to the PHG read_mapping table
* @param method
* @param methodDetails
* @param taxon
* @param file_group_name
* @param mapping_data
* @param haplotypeListId: id from the haplotype_list table
* @param isTestMethod: indicates if method type should be set to a TEST method
* @return
*/
int putReadMappingData(String method, Map methodDetails, String taxon, String file_group_name, byte[] mapping_data, boolean isTestMethod, int haplotypeListId);
/**
* This prompts a call to private method loadReadMappingHash() to update this hash table
*/
void updateReadMappingHash();
/**
* Method takes information on a genome fasta file, stores
* to the PHG db, returns the genome_file_data entry id created for the
* table entry.
* @param genome_path external server path and file name for genome
* @param genome_file local path to file name used for MD5 calculation
* @param genoid genoid associated with this genome data
* @param type the type of file, ie FASTA or GVCF from DBLoadingUtils.GenomeFileType
* @return
*/
int putGenomeFileData(String genome_path, String genome_file, int genoid, int type);
/**
* Creates an entry in the taxa_groups table. If one already
* exists with the specified name, the id for it is returned.
* @param group_name
* @return id for this taxa group name
*/
int putTaxaGroupName(String group_name);
/**
* Takes a taxa group name and a list of taxa.
* Populates the taxa_groups and taxa_groups_genoid tables.
* @param group_name
* @param taxaList
*/
void putTaxaTaxaGroups(String group_name, List taxaList);
/**
* Deletes from the read_mapping table based on the ids in the input List
* @param readMappingIds
* @return: nothing
*/
void deleteReadMappingsById(List readMappingIds);
/**
* Deletes from the read_mapping_paths table the ids in the ReadMappingIds List
* @param readMappingIds
* @return nothing
*/
void deleteReadMappingPathsById(List readMappingIds);
/**
* Deletes paths based on a method name and taxa. It allows for either
* method or taxa to be null, but not both. Entries are deleted from
* both the read_mapping_paths and the paths
* @param method delete paths which have this method
* @param taxa list of taxa for which the paths should be deleted
*/
void deletePaths(String method, Listtaxa);
/**
* Takes a method name and deletes the entry for it from the methods table.
* @param method
* @return true or false indicating success
*/
int deleteMethodByName(String method);
/**
* Delete read_mappings from the read_mapping table based on provided ids.
* This will also delete entries from the read_mapping_paths and paths table
* that are associated with these read_mappings
* @param readMappingIds
* @return
*/
boolean deleteReadMappingsCascade(List readMappingIds );
/**
* put data to the haplotype Lists table.
* @param hapids - list of integers representing haplotype ids
* @return the haplotype_list_id for this set of haplotypes (generated or existing)
*/
int putHalotypeListData(List hapids);
}