net.maizegenetics.pangenome.db_loading.PHGSchema.sql Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
-- Table: reference_ranges
-- This data pertains to the reference
-- The table links with the ref_range_ref_range_method table
-- Different reference_ranges can be grouped together to form
-- focus groups. The initial groups are "focus" from the user supplied
-- interval bed file, and non-focus, the remaining genomes ranges not
-- covered in the initial bed file.
CREATE TABLE reference_ranges (
ref_range_id INTEGER PRIMARY KEY,
chrom TEXT,
range_start INTEGER NOT NULL,
range_end INTEGER NOT NULL,
UNIQUE (chrom,range_start)
);
-- Method used to create the anchors, the alignments, the edges
-- and the consensus anchors.
-- description should include program run, e.g. BLAST,
-- method of handling hets, when to drop things, etc
-- The method_type field should be the table name for which
-- the method applies (ie, if this method is for
-- anchors/gametes, alignment(msa), consensus_anchors, edge)
CREATE TABLE methods (
method_id INTEGER PRIMARY KEY,
method_type INTEGER,
name TEXT UNIQUE,
description TEXT
);
-- Table: genotypes
-- Not everything that goes into this will
-- be lines (essentially haploids)
-- The "line_data" field is created to allow
-- additional descriptions about the line. THis is
-- especially useful if the line is a reference line.
-- Store in here AGPVx and other data.
CREATE TABLE genotypes (
genoid INTEGER PRIMARY KEY,
ploidy INTEGER NOT NULL,
is_reference BOOLEAN,
line_name TEXT NOT NULL UNIQUE,
line_data TEXT,
isPhasedAcrossGenes BOOLEAN,
isPhasedAcrossChromosomes BOOLEAN
);
-- Table: gametes
-- Only gametes are phased
-- hapNumber: generally 0 or 1. Inbreds all 0,
-- diploid heterozygous 0 (from mom) or 1 (from dad)
CREATE TABLE gametes (
gameteid INTEGER PRIMARY KEY,
genoid INTEGER NOT NULL,
hapNumber INTEGER NOT NULL,
phasingConfidence FLOAT,
UNIQUE (genoid,hapNumber),
FOREIGN KEY (genoid) REFERENCES genotypes(genoid)
);
--Table: ref_range_ref_range_method
CREATE TABLE ref_range_ref_range_method (
method_id INTEGER NOT NULL,
ref_range_id INTEGER NOT NULL,
FOREIGN KEY (method_id) REFERENCES methods(method_id),
FOREIGN KEY (ref_range_id) REFERENCES reference_ranges(ref_range_id)
);
-- Table: gamete_groups
-- Defines a groupg of gametes which map to a sequence
-- THe List_hash is created by taking the SHA1 has value of
-- a comma separated ordered list of the gametids
CREATE TABLE gamete_groups (
gamete_grp_id INTEGER PRIMARY KEY,
list_hash TEXT
);
--TABLE gamete_haplotypes:
-- Table linking gametids to a gamete_group
-- Each gamete will belong to multiple groups, including
-- groups consisting of just the individual gamete id
CREATE TABLE gamete_haplotypes (
gameteid INTEGER,
gamete_grp_id INTEGER,
UNIQUE(gameteid,gamete_grp_id),
FOREIGN KEY (gameteid) REFERENCES gametes(gameteid),
FOREIGN KEY (gamete_grp_id) REFERENCES gamete_groups(gamete_grp_id)
);
CREATE INDEX gamete_haplotypes_gamete_grp_id_index ON gamete_haplotypes (gamete_grp_id);
-- Table: haplotypes
-- sequence has been gzipped - accesss from java/kotlin as byte array, then decompress
-- the variant_list will be used for easy access to SNPs
CREATE TABLE haplotypes (
haplotypes_id INTEGER PRIMARY KEY,
gamete_grp_id INTEGER,
ref_range_id INTEGER NOT NULL,
genome_file_id INTEGER,
gvcf_file_id INTEGER,
asm_contig text,
asm_start_coordinate INTEGER,
asm_end_coordinate INTEGER,
asm_strand text,
sequence BLOB,
seq_len INTEGER,
seq_hash text,
method_id INTEGER,
UNIQUE (gamete_grp_id, ref_range_id, method_id),
FOREIGN KEY (ref_range_id) REFERENCES reference_ranges(ref_range_id),
FOREIGN KEY (method_id) REFERENCES methods(method_id)
);
CREATE INDEX haplotypes_method_id_index ON haplotypes (method_id);
CREATE INDEX haplotypes_ref_range_id_index ON haplotypes (ref_range_id);
-- Table: Edge
-- Holds edge information between anchor_haplotypes
-- fields "is_id1_anchor" and "is_id2_anchor" indicate if the
-- specified node is an anchor (true) or an inter-anchor node.
CREATE TABLE edge (
edge_id INTEGER PRIMARY KEY,
anchor_haplotype_id1 INTEGER,
anchor_haplotype_id2 INTEGER,
is_id1_anchor BOOLEAN,
is_id2_anchor BOOLEAN,
method_id INTEGER,
quality_score FLOAT,
FOREIGN KEY (anchor_haplotype_id1) REFERENCES haployptes(haplotypes_id),
FOREIGN KEY (anchor_haplotype_id2) REFERENCES haplotypes(haplotypes_id),
FOREIGN KEY (method_id) REFERENCES methods(method_id)
);
--
-- There will be multiple files for each genoid as
-- the files are per chromosome, and multiple chromosomes
-- per genotype
-- file_checksum is on contents of file, not name
-- genome_path is the remote server path
-- genome_file is just the file name
-- type is 1 (fasta) or 2(gvcf) from DBLoadingUtils.GenomeFileType
CREATE TABLE genome_file_data (
id INTEGER PRIMARY KEY,
genome_path text,
genome_file text,
file_checksum text,
genoid INTEGER,
type INTEGER,
UNIQUE (genoid,genome_file),
FOREIGN KEY (genoid) REFERENCES genotypes(genoid)
);
-- haplotype_list table: keeps a list of haplotypes that were
-- grouped together as a result of determining read_mappings.
-- The hapid_list is a comma separated string of hapids sorted
-- in ascending order, then compressed. The sorted order is important
-- to ensure any lists containing the same hapids result in the same
-- list_hash value.
-- The purpose of this table is to facilitate decoding the bitsets
-- of the read_mapping
CREATE TABLE haplotype_list (
haplotype_list_id INTEGER PRIMARY KEY,
list_hash TEXT UNIQUE,
hapid_list BLOB
);
-- read_mapping table: The "mapping_data" reflects all haplotype ids
-- hit by each read.
-- method_id is the method used when creating the read_mapping
-- file_group_name from the fastq file - taken from the key file (flowcell, lane, etc)
CREATE TABLE read_mapping (
read_mapping_id INTEGER PRIMARY KEY,
genoid INTEGER,
method_id INTEGER,
file_group_name TEXT,
mapping_data BLOB,
haplotype_list_id INTEGER,
FOREIGN KEY (method_id) REFERENCES methods(method_id),
FOREIGN KEY (genoid) REFERENCES genotypes(genoid)
);
CREATE UNIQUE INDEX read_mapping_idx on read_mapping (genoid, method_id, file_group_name);
CREATE INDEX read_mapping_genoid_idx on read_mapping (genoid);
-- Paths table: The haplotype_paths will be an
-- array list of [hapid1, hapid2], ordered by ref range,
-- condensed to a byte array to be stored as a blob
-- method_id is the method used when creating the path
-- The paths map to the read_mapping_path table via that
-- table's path_id
CREATE TABLE paths (
path_id INTEGER PRIMARY KEY,
genoid INTEGER,
method_id INTEGER,
paths_data BLOB,
UNIQUE(genoid, method_id),
FOREIGN KEY (method_id) REFERENCES methods(method_id),
FOREIGN KEY (genoid) REFERENCES genotypes(genoid)
);
-- links the read_mapping and paths tables
-- path_id maps to paths.path_id
-- read_mapping_id maps to read_mapping.read_mapping_id
CREATE TABLE read_mapping_paths (
read_mapping_id INTEGER,
path_id INTEGER,
UNIQUE(read_mapping_id,path_id),
FOREIGN KEY (read_mapping_id) REFERENCES read_mapping(read_mapping_id),
FOREIGN KEY (path_id) REFERENCES paths(path_id)
);
-- Haplotype_counts table: The "data" will be an nx3 array
-- of [hapid, inclusion count, exclusion count], condensed to a
-- byte array, stored in a BLOB
-- method_id is the method used when creating the haplotype counts
CREATE TABLE haplotype_counts (
haplotype_counts_id INTEGER PRIMARY KEY,
genoid INTEGER,
method_id INTEGER,
fastq_file TEXT,
data BLOB,
FOREIGN KEY (method_id) REFERENCES methods(method_id),
FOREIGN KEY (genoid) REFERENCES genotypes(genoid)
);
-- taxa_groups table: Matches a group id to a group name
CREATE TABLE taxa_groups (
taxa_grp_id INTEGER PRIMARY KEY,
taxa_grp_name TEXT UNIQUE
);
-- taxa_groups_genoid table: Keeps a list of genoids
-- associated with each taxa_group. A genoid may belong
-- to more than 1 taxa_grp_id
CREATE TABLE taxa_groups_genoid (
taxa_grp_id INTEGER,
genoid INTEGER,
UNIQUE(taxa_grp_id,genoid),
FOREIGN KEY (taxa_grp_id) REFERENCES taxa_groups(taxa_grp_id),
FOREIGN KEY (genoid) REFERENCES genotypes(genoid)
);