avro.variantMetadata.avdl Maven / Gradle / Ivy
The newest version!
@namespace("org.opencb.biodata.models.variant.metadata")
protocol VariantMetadataProtocol {
// We need to import metadata.avdl
import idl "metadata.avdl";
/**
 * Some studies do not provide real per-sample information.
 * Instead, only aggregated data is provided as file attributes.
 * This enum identifies the schema used to represent that aggregated data (if any).
 */
enum Aggregation {
    /** There is no aggregated data. */
    NONE,

    /**
     * Basic aggregated data.
     * Attributes used:
     *  - AC: Allele Count
     *  - AN: Allele Number
     *  - AF: Allele Frequency
     *  - GTC: Genotype Count
     *  - GTS: Genotypes Sort
     * The attributes may refer to different cohorts by means of a prefix or suffix.
     */
    BASIC,

    /**
     * EVS-like aggregated data.
     * Adds some attributes to the basic mode:
     *  - GROUPS_ORDER: Used to specify the order of the comma-separated values of cohorts in tags such as MAF.
     *  - MAF: Minor Allele Frequency for all the cohorts, ordered by GROUPS_ORDER
     */
    EVS,

    /**
     * ExAC-like aggregated data.
     * Adds some attributes to the basic mode:
     *  - HOM: Homozygous Counts
     *  - HET: Heterozygous Counts
     */
    EXAC
}
// /**
// Counts the number of variants within a certain frequency range.
// */
// record VariantsByFrequency {
// /** Inclusive frequency range start */
// float startFrequency;
//
// /** Exclusive frequency range end */
// float endFrequency;
//
// /** Number of variants with this frequency */
// int count;
// }
/**
 * Variant statistics for a set of variants.
 * The variant set can contain a whole study, a cohort, a sample, a region, ...
 */
record VariantSetStats {
    // NOTE(review): the map value types below had been lost (a bare `map` is not
    // valid Avro IDL, which requires `map<valueType>`). They are restored as `long`
    // for counts, mirroring the scalar counters of this record, and as `float` for
    // the density ratio, mirroring the other ratios here. Confirm against the
    // upstream schema before publishing.

    /** Number of variants in the variant set */
    long variantCount;

    /** Number of samples in the variant set */
    long sampleCount;

    /**
     * The number of occurrences for each FILTER value in files from this set.
     * Each file can contain more than one filter value (usually separated by ';').
     **/
    map<long> filterCount;

    /** Number of genotypes found for all samples in the variant set */
    map<long> genotypeCount = {};

    /** Number of files in the variant set */
    long filesCount;

    /** TiTvRatio = num. transitions / num. transversions */
    float tiTvRatio;

    /** Mean Quality for all the variants with quality */
    float qualityAvg;

    /** Standard Deviation of the quality */
    float qualityStdDev;

//    /**
//     array of elements to classify variants according to their 'rarity'
//     Typical frequency ranges:
//      - very rare     -> from 0 to 0.001
//      - rare          -> from 0.001 to 0.005
//      - low frequency -> from 0.005 to 0.05
//      - common        -> from 0.05
//     */
//    array<VariantsByFrequency> numRareVariants = [];

    /** Variants count grouped by type. e.g. SNP, INDEL, MNP, SNV, ... */
    map<long> typeCount = {};

    /** Variants count grouped by biotype. e.g. protein-coding, miRNA, lncRNA, ... */
    map<long> biotypeCount = {};

    /** Variants count grouped by consequence type. e.g. synonymous_variant, missense_variant, stop_lost, ... */
    map<long> consequenceTypeCount = {};

    /** Number of variants per chromosome */
    map<long> chromosomeCount = {};

    /** Total density of variants within the chromosome. counts / chromosome.length */
    map<float> chromosomeDensity = {};
}
/** Indel length grouped in ranges; each field holds the count for one range. */
record IndelLength {
    // NOTE(review): the field names presumably read "less than N" (lt*) and
    // "greater than or equal to 20" (gte20). Whether the lt* buckets are disjoint
    // (e.g. lt10 covering [5, 10)) or cumulative is not stated in this schema;
    // confirm with the code that populates them.
    int lt5;
    int lt10;
    int lt15;
    int lt20;
    int gte20;
}
// NOTE(review): presumably counts bucketed by read depth, using the same range
// naming as IndelLength (lt* = "less than N", gte20 = "greater than or equal to
// 20"), with `na` presumably counting entries whose depth is not available.
// None of this is stated in the schema; confirm with the code that populates it.
record DepthCount {
    int na;
    int lt5;
    int lt10;
    int lt15;
    int lt20;
    int gte20;
}
record SampleVariantStats {
/** Sample identifier **/
string id;
/** Number of variants where the sample has the main allele (i.e. 0/1, 1/1, ./1, 1/2, ...) */
int variantCount;
/** Number of variants per chromosome **/
// TODO: Should include chromosome density?
map chromosomeCount = {};
/** Variants count group by type. e.g. SNP, INDEL, MNP, SNV, ... */
map typeCount = {};
/** Number of variants per genotype. Only counts genotypes with the main allele. Phase is ignored. **/
map genotypeCount = {};
/** Indel length grouped in ranges **/
IndelLength indelLengthCount;
/**
* The number of occurrences for each FILTER value in files from this set.
* Each file can contain more than one filter value (usually separated by ';').
**/
map filterCount;
/** TiTvRatio = num. transitions / num. transversions */
float tiTvRatio;
/** Mean Quality for all the variants with quality */
float qualityAvg;
/** Standard Deviation of the quality */
float qualityStdDev;
// TODO ?
// /** Number of positions not sequenced **/
// int missingPositions;
//double missingnessScore ??
/**
* Heterozygosity rate as defined by PLINK: (N–O)/N
*
* N is the number of non-missing genotypes
* O is the observed number of homozygous genotypes for a given individual
**/
float heterozygosityRate;
/** Number of mendelian errors grouped by PLINK error codes grouped by Chromosome. **/
map