net.maizegenetics.pangenome.db_loading.PHGPostgreSQLSchema.sql Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
--
-- PostgreSQL database dump
--
-- Dumped from database version 9.4.14
-- Dumped by pg_dump version 9.6.5
-- NOTE: This drops tables that already exists. But if a table is exists in
-- the old schema which is not here, it does NOT get dropped. Really need to know
-- how to prevent template1 from getting overwritten with a new schema.
SET statement_timeout = 0;
SET lock_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SET check_function_bodies = false;
SET client_min_messages = warning;
--
-- Name: plpgsql; Type: EXTENSION; Schema: -; Owner:
--
CREATE EXTENSION IF NOT EXISTS plpgsql WITH SCHEMA pg_catalog;
--
-- Name: EXTENSION plpgsql; Type: COMMENT; Schema: -; Owner:
--
COMMENT ON EXTENSION plpgsql IS 'PL/pgSQL procedural language';
SET search_path = public, pg_catalog;
SET default_tablespace = '';
SET default_with_oids = false;
--
-- Name: methods; Type: TABLE; Schema: public; Owner: postgres
--
DROP TABLE IF EXISTS methods CASCADE;
CREATE TABLE methods (
method_id SERIAL PRIMARY KEY,
method_type integer,
name text UNIQUE,
description text
);
--
-- Name: genotypes; Type: TABLE; Schema: public; Owner: postgres
--
DROP TABLE IF EXISTS genotypes CASCADE;
CREATE TABLE genotypes (
genoid SERIAL PRIMARY KEY,
ploidy integer,
is_reference smallint,
isphasedacrossgenes smallint,
isphasedacrosschromosomes smallint,
line_name text UNIQUE,
line_data text
);
--
-- Name: gamete_groups; Type: TABLE; Schema: public; Owner: postgres
--
DROP TABLE IF EXISTS gamete_groups CASCADE;
CREATE TABLE gamete_groups (
gamete_grp_id SERIAL PRIMARY KEY,
list_hash text
);
--
-- Name: gametes; Type: TABLE; Schema: public; Owner: postgres
--
DROP TABLE IF EXISTS gametes CASCADE;
CREATE TABLE gametes (
gameteid SERIAL PRIMARY KEY,
genoid integer,
hapnumber integer,
phasingconfidence double precision,
UNIQUE (genoid, hapNumber)
);
--
-- Name: gamete_haplotypes; Type: TABLE; Schema: public; Owner: postgres
-- NOTE - gameteid comes from gametes table, gamete_grp_id from
-- gamete_grps table.
--
DROP TABLE IF EXISTS gamete_haplotypes CASCADE;
CREATE TABLE gamete_haplotypes (
gameteid integer,
gamete_grp_id integer,
UNIQUE(gameteid,gamete_grp_id)
);
--
-- Name: reference_ranges; Type: TABLE; Schema: public; Owner: postgres
--
DROP TABLE IF EXISTS reference_ranges CASCADE;
CREATE TABLE reference_ranges (
ref_range_id SERIAL PRIMARY KEY,
range_start integer,
range_end integer,
chrom text,
UNIQUE (chrom, range_start)
);
--
-- Name: ref_range_ref_range_method; Type: TABLE; Schema: public; Owner: postgres
--
DROP TABLE IF EXISTS ref_range_ref_range_method CASCADE;
CREATE TABLE ref_range_ref_range_method (
method_id integer,
ref_range_id integer
);
--
-- Name: haplotypes; Type: TABLE; Schema: public; Owner: postgres
-- The asm_strand is varChar(n) instead of char(n) as char(n) will
-- pad the output, but varChar(n) sets a character limit, but doesn't pad.
-- This must be compatible with SQL, so needs to allow for "+","-" or "."
--
DROP TABLE IF EXISTS haplotypes CASCADE;
CREATE TABLE haplotypes (
haplotypes_id SERIAL PRIMARY KEY,
gamete_grp_id integer,
ref_range_id integer,
genome_file_id integer,
gvcf_file_id integer,
asm_contig text,
asm_start_coordinate integer,
asm_end_coordinate integer,
asm_strand varchar(1),
method_id integer,
sequence bytea,
seq_len integer,
seq_hash text,
UNIQUE(gamete_grp_id, ref_range_id, method_id)
);
--
-- Name: edge; Type: TABLE; Schema: public; Owner: postgres
--
DROP TABLE IF EXISTS edge CASCADE;
CREATE TABLE edge (
edge_id SERIAL PRIMARY KEY,
anchor_haplotype_id1 integer,
anchor_haplotype_id2 integer,
method_id integer,
quality_score double precision,
is_id1_anchor smallint,
is_id2_anchor smallint
);
--
-- There will be multiple files for each genoid as
-- the files are per chromosome, and multiple chromosomes
-- per genotype.
-- genome_path is the remote server path
-- genome_file is just the file name
-- type is 1 (fasta) or 2(gvcf) from DBLoadingUtils.GenomeFileType
--
DROP TABLE IF EXISTS genome_file_data CASCADE;
CREATE TABLE genome_file_data (
id SERIAL PRIMARY KEY,
genome_path text,
genome_file text,
file_checksum text,
genoid integer,
type integer,
UNIQUE (genoid, genome_file)
);
-- Haplotype_counts table: The "data" will be an nx3 array
-- of [hapid, inclusion count, exclusion count], condensed to a
-- byte array, stored in a BLOB
-- method_id is the method used when creating the haplotype counts
DROP TABLE IF EXISTS haplotype_counts CASCADE;
CREATE TABLE haplotype_counts (
haplotype_counts_id SERIAL PRIMARY KEY,
genoid INTEGER,
method_id INTEGER,
fastq_file TEXT,
data bytea,
UNIQUE (genoid, method_id, fastq_file)
);
-- Paths table: The haplotype_paths will be an
-- array list of [hapid1, hapid2], ordered by ref range,
-- condensed to a byte array to be stored as a blob
-- rm_group_id maps to a read_mapping_group entry
-- method_id is the method used when creating the path
DROP TABLE IF EXISTS paths CASCADE;
CREATE TABLE paths (
path_id SERIAL PRIMARY KEY,
genoid INTEGER,
method_id INTEGER,
paths_data bytea,
UNIQUE(genoid,method_id)
);
-- haplotype_list table: keeps a list of haplotypes that were
-- grouped together as a result of determining read_mappings.
-- The hapid_list is a comma separated string of hapids sorted
-- in ascending order, then compressed. The sorted order is important
-- to ensure any lists containing the same hapids result in the same
-- list_hash value.
-- The purpose of this table is to facilitate decoding the bitsets
-- of the read_mapping
DROP TABLE IF EXISTS haplotype_list CASCADE;
CREATE TABLE haplotype_list (
haplotype_list_id SERIAL PRIMARY KEY,
list_hash TEXT UNIQUE,
hapid_list bytea
);
-- read_mapping table: The "mapping_data" reflects all haplotype ids
-- hit by each read.
-- method_id is the method used when creating the read_mapping
-- file_group_name from the fastq file - taken from the key file (flowcell, lane, etc)
DROP TABLE IF EXISTS read_mapping CASCADE;
CREATE TABLE read_mapping (
read_mapping_id SERIAL PRIMARY KEY,
genoid INTEGER,
method_id INTEGER,
file_group_name TEXT,
haplotype_list_id INTEGER,
mapping_data bytea
);
-- links the read_mapping and read_mapping_group tables
-- there can/will be many read_mapping_ids for each rm_group_id
-- rm_group_id maps to read_mapping_group.rm_group_id
-- read_mapping_id maps to read_mapping.read_mapping_id
CREATE TABLE read_mapping_paths (
read_mapping_id INTEGER,
path_id INTEGER,
UNIQUE(read_mapping_id,path_id)
);
-- taxa_groups table: Matches a group is to a group name
DROP TABLE IF EXISTS taxa_groups CASCADE;
CREATE TABLE taxa_groups (
taxa_grp_id SERIAL PRIMARY KEY,
taxa_grp_name TEXT UNIQUE
);
-- taxa_groups_genoid table: Keeps a list of genoids
-- associated with each taxa_group. A genoid may belong
-- to more than 1 taxa_grp_id
DROP TABLE IF EXISTS taxa_groups_genoid CASCADE;
CREATE TABLE taxa_groups_genoid (
taxa_grp_id INTEGER,
genoid INTEGER
);
CREATE INDEX gamete_haplotypes_gamete_grp_id_index ON gamete_haplotypes (gamete_grp_id);
CREATE INDEX haplotypes_method_id_index ON haplotypes (method_id);
CREATE INDEX haplotypes_ref_range_id_index ON haplotypes (ref_range_id);
CREATE UNIQUE INDEX read_mapping_idx on read_mapping (genoid, method_id, file_group_name);
CREATE INDEX read_mapping_genoid_idx on read_mapping (genoid);
CREATE UNIQUE INDEX taxa_groups_genoid_unique_idx on taxa_groups_genoid (taxa_grp_id,genoid);
-- FKsReadMapping
ALTER TABLE read_mapping add constraint fk_method_id FOREIGN KEY (method_id) REFERENCES methods (method_id);
-- FKsEdge
ALTER TABLE edge add constraint fk_anchor_haplotype_id1 FOREIGN KEY (anchor_haplotype_id1) REFERENCES haplotypes (haplotypes_id);
ALTER TABLE edge add constraint fk_anchor_haplotype_id2 FOREIGN KEY (anchor_haplotype_id2) REFERENCES haplotypes (haplotypes_id);
ALTER TABLE edge add constraint fk_method_id FOREIGN KEY (method_id) REFERENCES methods (method_id);
-- FKsGameteHaplotypes
ALTER TABLE gamete_haplotypes add constraint fk_gamete_haplotypes_gameteid FOREIGN KEY (gameteid) REFERENCES gametes(gameteid);
ALTER TABLE gamete_haplotypes add constraint fk_gamete_haplotypes_gameate_grp_id FOREIGN KEY (gamete_grp_id) REFERENCES gamete_groups(gamete_grp_id);
-- FKsGametes
ALTER TABLE gametes add constraint fk_gametes_genoid FOREIGN KEY (genoid) REFERENCES genotypes(genoid);
-- FKsRefRangeRefRangeMethod
ALTER TABLE ref_range_ref_range_method add constraint fk_ref_range_ref_range_method_method_id FOREIGN KEY (method_id) REFERENCES methods(method_id);
ALTER TABLE ref_range_ref_range_method add constraint fk_ref_range_ref_range_method_ref_range_id FOREIGN KEY (ref_range_id) REFERENCES reference_ranges(ref_range_id);
-- FKsHaplotypes
ALTER TABLE haplotypes add constraint fk_haplotypes_ref_range_id FOREIGN KEY (ref_range_id) REFERENCES reference_ranges(ref_range_id);
ALTER TABLE haplotypes add constraint fk_haplotypes_method_id FOREIGN KEY (method_id) REFERENCES methods(method_id);
-- FKsGenomeFileData
ALTER TABLE genome_file_data add constraint fk_genome_file_data_genoid FOREIGN KEY (genoid) REFERENCES genotypes(genoid);
-- FKsHaplotypeCounts
ALTER TABLE haplotype_counts add constraint fk_haplotype_counts_method_id FOREIGN KEY (method_id) REFERENCES methods(method_id);
ALTER TABLE haplotype_counts add constraint fk_haplotype_counts_genoid FOREIGN KEY (genoid) REFERENCES genotypes(genoid);
-- FKsReadMapping
ALTER TABLE read_mapping add constraint fk_read_mapping_method_id FOREIGN KEY (method_id) REFERENCES methods(method_id);
ALTER TABLE read_mapping add constraint fk_read_mapping_genoid FOREIGN KEY (genoid) REFERENCES genotypes(genoid);
-- FKsReadMappingPaths
ALTER TABLE read_mapping_paths add constraint fk_read_mapping_paths_read_mapping_id FOREIGN KEY (read_mapping_id) REFERENCES read_mapping(read_mapping_id);
ALTER TABLE read_mapping_paths add constraint fk_read_mapping_paths_path_id FOREIGN KEY (path_id) REFERENCES paths(path_id);
-- FKsPaths
ALTER TABLE paths add constraint fk_paths_method_id FOREIGN KEY (method_id) REFERENCES methods(method_id);
ALTER TABLE paths add constraint fk_paths_genoid FOREIGN KEY (genoid) REFERENCES genotypes(genoid);
-- FKsTaxaGroupsGenoid
ALTER TABLE taxa_groups_genoid add constraint fk_taxa_groups_genoid_genoid FOREIGN KEY (genoid) REFERENCES genotypes(genoid);
--
-- Name: public; Type: ACL; Schema: -; Owner: postgres
--
REVOKE ALL ON SCHEMA public FROM PUBLIC;
REVOKE ALL ON SCHEMA public FROM postgres;
GRANT ALL ON SCHEMA public TO postgres;
GRANT ALL ON SCHEMA public TO PUBLIC;
--
-- PostgreSQL database dump complete
--