net.maizegenetics.pangenome.db_loading.PHGPostgreSQLSchema.sql Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
There is a newer version: 1.10
--
-- PostgreSQL database dump
--

-- Dumped from database version 9.4.14
-- Dumped by pg_dump version 9.6.5
-- NOTE: This drops tables that already exists.  But if a table is exists in 
-- the old schema which is not here, it does NOT get dropped.  Really need to know
-- how to prevent template1 from getting overwritten with a new schema.

SET statement_timeout = 0;
SET lock_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SET check_function_bodies = false;
SET client_min_messages = warning;

--
-- Name: plpgsql; Type: EXTENSION; Schema: -; Owner: 
--

CREATE EXTENSION IF NOT EXISTS plpgsql WITH SCHEMA pg_catalog;


--
-- Name: EXTENSION plpgsql; Type: COMMENT; Schema: -; Owner: 
--

COMMENT ON EXTENSION plpgsql IS 'PL/pgSQL procedural language';


SET search_path = public, pg_catalog;

SET default_tablespace = '';

SET default_with_oids = false;

--
-- Name: methods; Type: TABLE; Schema: public; Owner: postgres
--

DROP TABLE IF EXISTS methods CASCADE;
CREATE TABLE methods (
    method_id SERIAL PRIMARY KEY,
    method_type integer,
    name text UNIQUE,
    description text
);

--
-- Name: genotypes; Type: TABLE; Schema: public; Owner: postgres
--

DROP TABLE IF EXISTS genotypes CASCADE;
CREATE TABLE genotypes (
    genoid SERIAL PRIMARY KEY,
    ploidy integer,
    is_reference smallint,
    isphasedacrossgenes smallint,
    isphasedacrosschromosomes smallint,
    line_name text UNIQUE,
    line_data text

);

--
-- Name: gamete_groups; Type: TABLE; Schema: public; Owner: postgres
--

DROP TABLE IF EXISTS gamete_groups CASCADE;
CREATE TABLE gamete_groups (
    gamete_grp_id  SERIAL PRIMARY KEY,
    list_hash text
);


--
-- Name: gametes; Type: TABLE; Schema: public; Owner: postgres
--
DROP TABLE IF EXISTS gametes CASCADE;
CREATE TABLE gametes (
    gameteid SERIAL PRIMARY KEY,
    genoid integer,
    hapnumber integer,
    phasingconfidence double precision,
    UNIQUE (genoid, hapNumber)
);

--
-- Name: gamete_haplotypes; Type: TABLE; Schema: public; Owner: postgres
--  NOTE - gameteid comes from gametes table, gamete_grp_id from
--  gamete_grps table.
--
DROP TABLE IF EXISTS gamete_haplotypes CASCADE;
CREATE TABLE gamete_haplotypes (
    gameteid integer,
    gamete_grp_id integer,
    UNIQUE(gameteid,gamete_grp_id)
);

--
-- Name: reference_ranges; Type: TABLE; Schema: public; Owner: postgres
--

DROP TABLE IF EXISTS reference_ranges CASCADE;
CREATE TABLE reference_ranges (
    ref_range_id SERIAL PRIMARY KEY,
    range_start integer,
    range_end integer,
    chrom text,
    UNIQUE (chrom, range_start)
);

--
-- Name: ref_range_ref_range_method; Type: TABLE; Schema: public; Owner: postgres
--

DROP TABLE IF EXISTS ref_range_ref_range_method CASCADE;
CREATE TABLE ref_range_ref_range_method (
    method_id integer, 
    ref_range_id integer
);


--
-- Name: haplotypes; Type: TABLE; Schema: public; Owner: postgres
-- The asm_strand is varChar(n) instead of char(n) as char(n) will
-- pad the output, but varChar(n) sets a character limit, but doesn't pad.
-- This must be compatible with SQL, so needs to allow for "+","-" or "."
--

DROP TABLE IF EXISTS haplotypes CASCADE;
CREATE TABLE haplotypes (
    haplotypes_id SERIAL PRIMARY KEY,
    gamete_grp_id integer,
    ref_range_id integer,
    genome_file_id integer,
    gvcf_file_id integer,
    asm_contig text,
    asm_start_coordinate integer,
    asm_end_coordinate integer,
    asm_strand varchar(1),
    method_id integer,
    sequence bytea,
    seq_len integer,
    seq_hash text,
    UNIQUE(gamete_grp_id, ref_range_id, method_id)
);

--
-- Name: edge; Type: TABLE; Schema: public; Owner: postgres
--
DROP TABLE IF EXISTS edge CASCADE;
CREATE TABLE edge (
    edge_id SERIAL PRIMARY KEY,
    anchor_haplotype_id1 integer,
    anchor_haplotype_id2 integer,
    method_id integer,
    quality_score double precision,
    is_id1_anchor smallint,
    is_id2_anchor smallint
);

--
-- There will be multiple files for each genoid as
-- the files are per chromosome, and multiple chromosomes
-- per genotype.
-- genome_path is the remote server path
-- genome_file is just the file name
-- type is 1 (fasta) or 2(gvcf) from DBLoadingUtils.GenomeFileType
--
DROP TABLE IF EXISTS genome_file_data CASCADE;
CREATE TABLE genome_file_data (
    id  SERIAL PRIMARY KEY,
    genome_path text,
    genome_file text,
    file_checksum text,
    genoid integer,
    type integer,
    UNIQUE (genoid, genome_file)
);


-- Haplotype_counts table: The "data" will be an nx3 array
-- of [hapid, inclusion count, exclusion count], condensed to a
-- byte array, stored in a BLOB
-- method_id is the method used when creating the haplotype counts
DROP TABLE IF EXISTS haplotype_counts CASCADE;
CREATE TABLE haplotype_counts (
   haplotype_counts_id SERIAL PRIMARY KEY,
   genoid INTEGER,
   method_id INTEGER,
   fastq_file TEXT,
   data bytea,
   UNIQUE (genoid, method_id, fastq_file)
);

-- Paths table:  The haplotype_paths will be an
-- array list of [hapid1, hapid2], ordered by ref range,
-- condensed to a byte array to be stored as a blob
-- rm_group_id maps to a read_mapping_group entry
-- method_id is the method used when creating the path
DROP TABLE IF EXISTS paths CASCADE;
CREATE TABLE paths (
   path_id SERIAL PRIMARY KEY,
   genoid INTEGER,
   method_id INTEGER,
   paths_data bytea,
   UNIQUE(genoid,method_id)
);

-- haplotype_list table: keeps a list of haplotypes that were
-- grouped together as a result of determining  read_mappings.
-- The hapid_list is a comma separated string of hapids sorted
-- in ascending order, then compressed.  The sorted order is important
-- to ensure any lists containing the same hapids result in the same
-- list_hash value.
-- The purpose of this table is to facilitate decoding the bitsets
-- of the read_mapping
DROP TABLE IF EXISTS haplotype_list CASCADE;
CREATE TABLE haplotype_list (
    haplotype_list_id SERIAL PRIMARY KEY,
    list_hash TEXT UNIQUE,
    hapid_list bytea
);

-- read_mapping table: The "mapping_data" reflects all haplotype ids
-- hit by each read.
-- method_id is the method used when creating the read_mapping
-- file_group_name from the fastq file - taken from the key file (flowcell, lane, etc)
DROP TABLE IF EXISTS read_mapping CASCADE;
CREATE TABLE read_mapping (
   read_mapping_id SERIAL PRIMARY KEY,
   genoid INTEGER,
   method_id INTEGER,
   file_group_name TEXT,
   haplotype_list_id INTEGER,
   mapping_data bytea
);

-- links the read_mapping and read_mapping_group tables
-- there can/will be many read_mapping_ids for each rm_group_id
-- rm_group_id maps to read_mapping_group.rm_group_id
-- read_mapping_id maps to read_mapping.read_mapping_id
CREATE TABLE read_mapping_paths (
   read_mapping_id INTEGER,
   path_id INTEGER,
   UNIQUE(read_mapping_id,path_id)
);

-- taxa_groups table:  Matches a group is to a group name
DROP TABLE IF EXISTS taxa_groups CASCADE;
CREATE TABLE taxa_groups (
   taxa_grp_id SERIAL PRIMARY KEY,
   taxa_grp_name TEXT UNIQUE
);

-- taxa_groups_genoid table:  Keeps a list of genoids
-- associated with each taxa_group.  A genoid may belong
-- to more than 1 taxa_grp_id
DROP TABLE IF EXISTS taxa_groups_genoid CASCADE;
CREATE TABLE taxa_groups_genoid (
   taxa_grp_id INTEGER,
   genoid INTEGER
);



CREATE INDEX gamete_haplotypes_gamete_grp_id_index ON gamete_haplotypes (gamete_grp_id);

CREATE INDEX haplotypes_method_id_index ON haplotypes (method_id);
CREATE INDEX haplotypes_ref_range_id_index ON haplotypes (ref_range_id);

CREATE UNIQUE INDEX read_mapping_idx on read_mapping (genoid, method_id, file_group_name);
CREATE INDEX read_mapping_genoid_idx on read_mapping (genoid);

CREATE UNIQUE INDEX taxa_groups_genoid_unique_idx on taxa_groups_genoid (taxa_grp_id,genoid);

-- FKsReadMapping
ALTER TABLE read_mapping add constraint fk_method_id FOREIGN KEY (method_id) REFERENCES methods (method_id);

-- FKsEdge
ALTER TABLE edge add constraint fk_anchor_haplotype_id1 FOREIGN KEY (anchor_haplotype_id1) REFERENCES haplotypes (haplotypes_id);
ALTER TABLE edge add constraint fk_anchor_haplotype_id2 FOREIGN KEY (anchor_haplotype_id2) REFERENCES haplotypes (haplotypes_id);
ALTER TABLE edge add constraint fk_method_id FOREIGN KEY (method_id) REFERENCES methods (method_id);

-- FKsGameteHaplotypes
ALTER TABLE gamete_haplotypes add constraint fk_gamete_haplotypes_gameteid FOREIGN KEY (gameteid) REFERENCES gametes(gameteid);
ALTER TABLE gamete_haplotypes add constraint fk_gamete_haplotypes_gameate_grp_id FOREIGN KEY (gamete_grp_id) REFERENCES gamete_groups(gamete_grp_id);

-- FKsGametes
ALTER TABLE gametes add constraint fk_gametes_genoid FOREIGN KEY (genoid) REFERENCES genotypes(genoid);

-- FKsRefRangeRefRangeMethod
ALTER TABLE ref_range_ref_range_method add constraint fk_ref_range_ref_range_method_method_id FOREIGN KEY (method_id) REFERENCES methods(method_id);
ALTER TABLE ref_range_ref_range_method add constraint fk_ref_range_ref_range_method_ref_range_id FOREIGN KEY (ref_range_id) REFERENCES reference_ranges(ref_range_id);

-- FKsHaplotypes
ALTER TABLE haplotypes add constraint fk_haplotypes_ref_range_id FOREIGN KEY (ref_range_id) REFERENCES reference_ranges(ref_range_id);
ALTER TABLE haplotypes add constraint fk_haplotypes_method_id FOREIGN KEY (method_id) REFERENCES methods(method_id);

-- FKsGenomeFileData
ALTER TABLE genome_file_data add constraint fk_genome_file_data_genoid FOREIGN KEY (genoid) REFERENCES genotypes(genoid);

-- FKsHaplotypeCounts
ALTER TABLE haplotype_counts add constraint fk_haplotype_counts_method_id FOREIGN KEY (method_id) REFERENCES methods(method_id);
ALTER TABLE haplotype_counts add constraint fk_haplotype_counts_genoid FOREIGN KEY (genoid) REFERENCES genotypes(genoid);

-- FKsReadMapping
ALTER TABLE read_mapping add constraint fk_read_mapping_method_id FOREIGN KEY (method_id) REFERENCES methods(method_id);
ALTER TABLE read_mapping add constraint fk_read_mapping_genoid FOREIGN KEY (genoid) REFERENCES genotypes(genoid);

-- FKsReadMappingPaths
ALTER TABLE read_mapping_paths add constraint fk_read_mapping_paths_read_mapping_id FOREIGN KEY (read_mapping_id) REFERENCES read_mapping(read_mapping_id);
ALTER TABLE read_mapping_paths add constraint fk_read_mapping_paths_path_id FOREIGN KEY (path_id) REFERENCES paths(path_id);

-- FKsPaths
ALTER TABLE paths add constraint fk_paths_method_id FOREIGN KEY (method_id) REFERENCES methods(method_id);
ALTER TABLE paths add constraint fk_paths_genoid FOREIGN KEY (genoid) REFERENCES genotypes(genoid);

-- FKsTaxaGroupsGenoid
ALTER TABLE taxa_groups_genoid add constraint fk_taxa_groups_genoid_genoid FOREIGN KEY (genoid) REFERENCES genotypes(genoid);

--
-- Name: public; Type: ACL; Schema: -; Owner: postgres
--

REVOKE ALL ON SCHEMA public FROM PUBLIC;
REVOKE ALL ON SCHEMA public FROM postgres;
GRANT ALL ON SCHEMA public TO postgres;
GRANT ALL ON SCHEMA public TO PUBLIC;


--
-- PostgreSQL database dump complete
--