net.maizegenetics.pangenome.db_loading.PHGSchema.sql Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
There is a newer version: 1.10
-- Table: reference_ranges 
-- This data pertains to the reference
-- The table links with the ref_range_ref_range_method table 
-- Different reference_ranges can be grouped together to form
-- focus groups.  The initial groups are "focus" from the user supplied
-- interval bed file, and non-focus, the remaining genomes ranges not
-- covered in the initial bed file.

CREATE TABLE reference_ranges (
    ref_range_id INTEGER PRIMARY KEY,
    chrom	TEXT,   
    range_start INTEGER NOT NULL,
    range_end INTEGER NOT NULL,
    UNIQUE (chrom,range_start)
);

-- Method used to create the anchors, the alignments, the edges
-- and the consensus anchors.
-- description should include program run, e.g. BLAST,
-- method of handling hets, when to drop things, etc
-- The method_type field should be the table name for which
-- the method applies (ie, if this method is for
-- anchors/gametes, alignment(msa), consensus_anchors, edge)
CREATE TABLE methods (
    method_id INTEGER PRIMARY KEY,
    method_type INTEGER,
    name TEXT UNIQUE,
    description TEXT
);

-- Table: genotypes
-- Not everything that goes into this will
-- be lines (essentially haploids)
-- The "line_data" field is created to allow
-- additional descriptions about the line.  THis is
-- especially useful if the line is a reference line.
-- Store in here AGPVx and other data.

CREATE TABLE genotypes (
    genoid INTEGER   PRIMARY KEY,
    ploidy INTEGER NOT NULL,
    is_reference BOOLEAN,
    line_name TEXT NOT NULL UNIQUE,
    line_data TEXT,
    isPhasedAcrossGenes BOOLEAN,
    isPhasedAcrossChromosomes BOOLEAN
);

-- Table: gametes
-- Only gametes are phased
-- hapNumber: generally 0 or 1.  Inbreds all 0,
-- diploid heterozygous 0 (from mom) or 1 (from dad)
CREATE TABLE gametes (
    gameteid INTEGER PRIMARY KEY,
    genoid INTEGER NOT NULL,
    hapNumber INTEGER NOT NULL,
    phasingConfidence FLOAT,
    UNIQUE (genoid,hapNumber),
    FOREIGN KEY (genoid) REFERENCES genotypes(genoid)
);


--Table: ref_range_ref_range_method
CREATE TABLE ref_range_ref_range_method (
    method_id INTEGER NOT NULL,
    ref_range_id INTEGER NOT NULL,
    FOREIGN KEY (method_id) REFERENCES methods(method_id),
    FOREIGN KEY (ref_range_id) REFERENCES reference_ranges(ref_range_id)
);

-- Table: gamete_groups
-- Defines a groupg of gametes which map to a sequence
-- THe List_hash is created by taking the SHA1 has value of
-- a comma separated ordered list of the gametids
CREATE TABLE gamete_groups (
    gamete_grp_id INTEGER PRIMARY KEY,
    list_hash TEXT
);

--TABLE gamete_haplotypes:
-- Table linking gametids to a gamete_group
-- Each gamete will belong to multiple groups, including
-- groups consisting of just the individual gamete id
CREATE TABLE gamete_haplotypes (
    gameteid INTEGER,
    gamete_grp_id INTEGER,
    UNIQUE(gameteid,gamete_grp_id),
    FOREIGN KEY (gameteid) REFERENCES gametes(gameteid),
    FOREIGN KEY (gamete_grp_id) REFERENCES gamete_groups(gamete_grp_id)
);
CREATE INDEX gamete_haplotypes_gamete_grp_id_index ON gamete_haplotypes (gamete_grp_id);

-- Table: haplotypes
-- sequence has been gzipped - accesss from java/kotlin as byte array, then decompress
-- the variant_list will be used for easy access to SNPs
CREATE TABLE haplotypes (
    haplotypes_id INTEGER PRIMARY KEY,
    gamete_grp_id INTEGER,
    ref_range_id INTEGER NOT NULL,
    genome_file_id INTEGER,
    gvcf_file_id INTEGER,
    asm_contig text,
    asm_start_coordinate INTEGER,
    asm_end_coordinate INTEGER,
    asm_strand text,
    sequence BLOB,
    seq_len INTEGER,
    seq_hash text,
    method_id INTEGER,
    UNIQUE (gamete_grp_id, ref_range_id, method_id),
    FOREIGN KEY (ref_range_id) REFERENCES reference_ranges(ref_range_id),
    FOREIGN KEY (method_id) REFERENCES methods(method_id)
);
CREATE INDEX haplotypes_method_id_index ON haplotypes (method_id);
CREATE INDEX haplotypes_ref_range_id_index ON haplotypes (ref_range_id);

-- Table: Edge
-- Holds edge information between anchor_haplotypes
-- fields "is_id1_anchor" and "is_id2_anchor" indicate if the
-- specified node is an anchor (true) or an inter-anchor node.
CREATE TABLE edge (
    edge_id INTEGER PRIMARY KEY,
    anchor_haplotype_id1 INTEGER,
    anchor_haplotype_id2 INTEGER,
    is_id1_anchor BOOLEAN,
    is_id2_anchor BOOLEAN,
    method_id INTEGER,
    quality_score FLOAT,
    FOREIGN KEY (anchor_haplotype_id1) REFERENCES haployptes(haplotypes_id),
    FOREIGN KEY (anchor_haplotype_id2) REFERENCES haplotypes(haplotypes_id),
    FOREIGN KEY (method_id) REFERENCES methods(method_id)
);

--
-- There will be multiple files for each genoid as
-- the files are per chromosome, and multiple chromosomes
-- per genotype
-- file_checksum is on contents of file, not name
-- genome_path is the remote server path
-- genome_file is just the file name
-- type is 1 (fasta) or 2(gvcf) from DBLoadingUtils.GenomeFileType
CREATE TABLE genome_file_data (
    id  INTEGER PRIMARY KEY,
    genome_path text,
    genome_file text,
    file_checksum text,
    genoid INTEGER,
    type INTEGER,
    UNIQUE (genoid,genome_file),
    FOREIGN KEY (genoid) REFERENCES genotypes(genoid)
);

-- haplotype_list table: keeps a list of haplotypes that were
-- grouped together as a result of determining  read_mappings.
-- The hapid_list is a comma separated string of hapids sorted
-- in ascending order, then compressed.  The sorted order is important
-- to ensure any lists containing the same hapids result in the same
-- list_hash value.
-- The purpose of this table is to facilitate decoding the bitsets
-- of the read_mapping
CREATE TABLE haplotype_list (
    haplotype_list_id INTEGER PRIMARY KEY,
    list_hash TEXT UNIQUE,
    hapid_list BLOB
);

-- read_mapping table: The "mapping_data" reflects all haplotype ids
-- hit by each read.
-- method_id is the method used when creating the read_mapping
-- file_group_name from the fastq file - taken from the key file (flowcell, lane, etc)
CREATE TABLE read_mapping (
   read_mapping_id INTEGER PRIMARY KEY,
   genoid INTEGER,
   method_id INTEGER,
   file_group_name TEXT,
   mapping_data BLOB,
   haplotype_list_id INTEGER,
   FOREIGN KEY (method_id) REFERENCES methods(method_id),
   FOREIGN KEY (genoid) REFERENCES genotypes(genoid)
);
CREATE UNIQUE INDEX read_mapping_idx on read_mapping (genoid, method_id, file_group_name);
CREATE INDEX read_mapping_genoid_idx on read_mapping (genoid);

-- Paths table:  The haplotype_paths will be an
-- array list of [hapid1, hapid2], ordered by ref range,
-- condensed to a byte array to be stored as a blob
-- method_id is the method used when creating the path
-- The paths map to the read_mapping_path table via that
-- table's path_id
CREATE TABLE paths (
    path_id INTEGER PRIMARY KEY,
    genoid INTEGER,
    method_id INTEGER,
    paths_data BLOB,
    UNIQUE(genoid, method_id),
    FOREIGN KEY (method_id) REFERENCES methods(method_id),
    FOREIGN KEY (genoid) REFERENCES genotypes(genoid)
);

-- links the read_mapping and paths tables
-- path_id maps to paths.path_id
-- read_mapping_id maps to read_mapping.read_mapping_id
CREATE TABLE read_mapping_paths (
   read_mapping_id INTEGER,
   path_id INTEGER,
   UNIQUE(read_mapping_id,path_id),
   FOREIGN KEY (read_mapping_id) REFERENCES read_mapping(read_mapping_id),
   FOREIGN KEY (path_id) REFERENCES paths(path_id)
);

-- Haplotype_counts table: The "data" will be an nx3 array
-- of [hapid, inclusion count, exclusion count], condensed to a
-- byte array, stored in a BLOB
-- method_id is the method used when creating the haplotype counts
CREATE TABLE haplotype_counts (
   haplotype_counts_id INTEGER PRIMARY KEY,
   genoid INTEGER,
   method_id INTEGER,
   fastq_file TEXT,
   data BLOB,
   FOREIGN KEY (method_id) REFERENCES methods(method_id),
   FOREIGN KEY (genoid) REFERENCES genotypes(genoid)
);



-- taxa_groups table:  Matches a group id to a group name
CREATE TABLE taxa_groups (
   taxa_grp_id INTEGER PRIMARY KEY,
   taxa_grp_name TEXT UNIQUE
);

-- taxa_groups_genoid table:  Keeps a list of genoids
-- associated with each taxa_group.  A genoid may belong
-- to more than 1 taxa_grp_id
CREATE TABLE taxa_groups_genoid (
   taxa_grp_id INTEGER,
   genoid INTEGER,
   UNIQUE(taxa_grp_id,genoid),
   FOREIGN KEY (taxa_grp_id) REFERENCES taxa_groups(taxa_grp_id),
   FOREIGN KEY (genoid) REFERENCES genotypes(genoid)
);