// protobuf.opencb.variant.proto (listing header: Maven / Gradle / Ivy)
// The newest version!
// This schema is written in proto3 syntax.
syntax = "proto3";

package protobuf.opencb;

// The only import in this file; presumably the source of the
// VariantAnnotation type used by Variant.annotation.
import "protobuf/opencb/variant_annotation.proto";

// Java code generation settings: target package and the name of the single
// outer class that wraps every generated message and enum.
option java_package = "org.opencb.biodata.models.variant.protobuf";
option java_outer_classname = "VariantProto";
// NOTE(review): this option is marked deprecated in recent protobuf
// releases -- confirm whether it is still needed.
option java_generate_equals_and_hash = true;
//option java_multiple_files = true;
/**
 * Type of variation, which depends mostly on its length.
 *
 * - SNVs involve a single nucleotide, without changes in length
 * - MNVs involve multiple nucleotides, without changes in length
 * - Indels are insertions or deletions of less than SV_THRESHOLD (50) nucleotides
 * - Structural variations are large changes of more than SV_THRESHOLD nucleotides
 * - Copy-number variations alter the number of copies of a region
 *
 * The numeric values are not sequential in declaration order. They identify
 * the values on the wire, so they must never be renumbered.
 */
enum VariantType {
  // As the NO_VARIATION is the most common value on gVCFs, being the first value,
  // protobuf will use this as default value and save some space.
  NO_VARIATION = 0;         // Defined in HTSJDK

  SNV = 2;                  // SO:0001483
  MNV = 4;                  // SO:0002007
  INDEL = 5;                // SO:1000032
  SV = 6;                   // SO:0001537
  COPY_NUMBER = 7;          // SO:0001019
  COPY_NUMBER_GAIN = 16;    // SO:0001742
  COPY_NUMBER_LOSS = 17;    // SO:0001743
  SYMBOLIC = 8;             // Defined in HTSJDK
  MIXED = 9;                // Defined in HTSJDK

  INSERTION = 10;           // SO:0000667
  DELETION = 11;            // SO:0000159
  TRANSLOCATION = 12;       // SO:0000199
  INVERSION = 13;           // SO:1000036
  DUPLICATION = 14;         // SO:1000035
  TANDEM_DUPLICATION = 18;  // SO:1000173
  BREAKEND = 15;

  // Deprecated values. They keep their names and numbers so existing data and
  // callers continue to work; the option makes generated code flag them.

  // Deprecated. Renamed to COPY_NUMBER.
  CNV = 20 [deprecated = true];
  // Deprecated. NOTE(review): presumably superseded by SNV -- confirm.
  SNP = 1 [deprecated = true];
  // Deprecated. NOTE(review): presumably superseded by MNV -- confirm.
  MNP = 3 [deprecated = true];
}
/**
 * Statistics of a variant computed over one cohort of samples.
 *
 * Field numbers do not follow declaration order. They are the wire identity
 * of each field and must not be changed.
 */
message VariantStats {
  /**
   * Unique cohort identifier within the study.
   **/
  string cohortId = 17;

  /**
   * Count of samples with non-missing genotypes in this variant from the cohort.
   * This value is used as denominator for genotypeFreq.
   **/
  int32 sampleCount = 18;

  /**
   * Count of files with samples from the cohort that reported this variant.
   * This value is used as denominator for filterFreq.
   **/
  int32 fileCount = 19;

  /**
   * Total number of alleles in called genotypes. It does not include missing alleles.
   * This value is used as denominator for refAlleleFreq and altAlleleFreq.
   **/
  int32 alleleCount = 1;

  /**
   * Number of reference alleles found in this variant.
   **/
  int32 refAlleleCount = 2;

  /**
   * Number of main alternate alleles found in this variant.
   * It does not include secondary alternates.
   **/
  int32 altAlleleCount = 3;

  /**
   * Reference allele frequency calculated from refAlleleCount and alleleCount,
   * in the range [0,1].
   **/
  float refAlleleFreq = 4;

  /**
   * Alternate allele frequency calculated from altAlleleCount and alleleCount,
   * in the range [0,1].
   **/
  float altAlleleFreq = 5;

  /**
   * Number of missing alleles.
   **/
  int32 missingAlleleCount = 8;

  /**
   * Number of genotypes with all alleles missing (e.g. ./.).
   * It does not count partially missing genotypes like "./0" or "./1".
   **/
  int32 missingGenotypeCount = 9;

  /**
   * Number of occurrences for each genotype.
   * This does not include genotypes with all alleles missing (e.g. ./.), but it
   * includes partially missing genotypes like "./0" or "./1".
   * Total sum of counts should be equal to the count of samples.
   *
   * NOTE(review): the map type parameters were missing; string keys (genotype)
   * and int32 values (count) are inferred from this description -- confirm.
   **/
  map<string, int32> genotypeCount = 6;

  /**
   * Genotype frequency for each genotype found, calculated from genotypeCount
   * and sampleCount, in the range [0,1].
   * The sum of frequencies should be 1.
   *
   * NOTE(review): the map type parameters were missing; string keys and float
   * values are inferred to match the other frequency fields -- confirm.
   **/
  map<string, float> genotypeFreq = 7;

  /**
   * The number of occurrences for each FILTER value in files from samples in
   * this cohort reporting this variant.
   * As each file can contain more than one filter value (usually separated by
   * ';'), the total sum of counts could be greater than the count of files.
   *
   * NOTE(review): the map type parameters were missing; string keys (filter)
   * and int32 values (count) are inferred from this description -- confirm.
   **/
  map<string, int32> filterCount = 14;

  /**
   * Frequency of each filter calculated from filterCount and fileCount,
   * in the range [0,1].
   *
   * NOTE(review): the map type parameters were missing; string keys and float
   * values are inferred to match the other frequency fields -- confirm.
   **/
  map<string, float> filterFreq = 15;

  /**
   * The number of files from samples in this cohort reporting this variant
   * with valid QUAL values.
   * This value is used as denominator to obtain the qualityAvg.
   **/
  int32 qualityCount = 20;

  /**
   * The average Quality value for files with valid QUAL values from samples in
   * this cohort reporting this variant.
   * Some files may not have defined the QUAL value, so the sampling could be
   * less than the fileCount.
   **/
  float qualityAvg = 16;

  /**
   * Minor allele frequency. Frequency of the less common allele between the
   * reference and the main alternate alleles.
   * This value does not take into account secondary alternates.
   **/
  float maf = 10;

  /**
   * Minor genotype frequency. Frequency of the less common genotype seen in
   * this variant.
   * This value takes into account all values from the genotypeFreq map.
   **/
  float mgf = 11;

  /**
   * Allele with minor frequency.
   **/
  string mafAllele = 12;

  /**
   * Genotype with minor frequency.
   **/
  string mgfGenotype = 13;
}
/**
 * Reference back to the variant call as it was before normalization.
 */
message OriginalCall {
  /**
   * ID of the original variant, prior to normalization, with all of its
   * secondary alternates included.
   */
  string variantId = 1;

  /**
   * Index of the alternate allele within the original multi-allelic variant
   * call that was decomposed to produce this variant.
   */
  int32 alleleIndex = 2;
}
/**
 * Information about a variant that is specific to one file.
 */
message FileEntry {
  /**
   * Identifier of the file.
   */
  string fileId = 1;

  /**
   * Original call this variant was derived from.
   * NOTE(review): presumably only set when normalization changed the call --
   * confirm against the producer.
   */
  OriginalCall call = 2;

  /**
   * File-level key/value data for this variant.
   *
   * NOTE(review): the map type parameters were missing in this copy of the
   * schema; string keys and string values are an inferred reconstruction --
   * confirm against the canonical definition.
   */
  map<string, string> data = 3;
}
/**
 * Coordinates and alleles of an alternate. StudyEntry uses this type for its
 * secondaryAlternates list.
 */
message AlternateCoordinate {
  // Chromosome, start and end of the alternate.
  // NOTE(review): presumably these follow the same conventions as the
  // matching fields of Variant -- confirm.
  string chromosome = 1;
  int32 start = 2;
  int32 end = 3;

  /**
   * Allele found in the reference.
   */
  string reference = 4;

  /**
   * Allele found as alternate.
   */
  string alternate = 5;

  /**
   * Kind of variation: single nucleotide, indel or structural variation.
   */
  VariantType type = 6;
}
/**
 * Data of one sample for a variant within a study.
 */
message SampleEntry {
  // Identifier of the sample.
  string sampleId = 1;

  // NOTE(review): presumably the position of the sample's file within
  // StudyEntry.files -- confirm.
  int32 fileIndex = 2;

  // Values for this sample.
  // NOTE(review): presumably ordered to match StudyEntry.sampleDataKeys --
  // confirm.
  repeated string data = 3;
}
/**
 * Everything known about a variant inside one study: its files, secondary
 * alternates, per-sample data and cohort statistics.
 */
message StudyEntry {
  // Identifier of the study.
  string studyId = 1;

  // Per-file information for this variant.
  repeated FileEntry files = 2;

  /**
   * Alternate alleles that occur together with a variant alternate.
   */
  repeated AlternateCoordinate secondaryAlternates = 3;

  // Keys of the per-sample data.
  // NOTE(review): presumably these name, in order, the entries of
  // SampleEntry.data -- confirm.
  repeated string sampleDataKeys = 4;

  // One entry per sample.
  repeated SampleEntry samples = 5;

  // Statistics entries; each VariantStats carries its own cohortId.
  repeated VariantStats stats = 6;
}
///**
// * Type of structural variation
// *
// * - COPY_NUMBER_GAIN for CNVs
// * - COPY_NUMBER_LOSS for CNVs
// * - TANDEM_DUPLICATION for DUP
// *
// */
//enum StructuralVariantType {
// unused = 0; // SO:0001742
//// COPY_NUMBER_GAIN = 0; // SO:0001742
//// COPY_NUMBER_LOSS = 1; // SO:0001743
//// TANDEM_DUPLICATION = 2; // SO:1000173
//}
/*
 * Orientation of the junction between a breakend and its mate.
 *
 *  SE | (Start -> End)   | s | t[p[ | piece extending to the right of p is joined after t
 *  SS | (Start -> Start) | s | t]p] | reverse comp piece extending left of p is joined after t
 *  ES | (End -> Start)   | s | ]p]t | piece extending to the left of p is joined before t
 *  EE | (End -> End)     | s | [p[t | reverse comp piece extending right of p is joined before t
 *
 * NOTE(review): the third and fourth columns look like the VCF REF and ALT
 * breakend notation -- confirm.
 */
enum BreakendOrientation {
  SE = 0;  // Start -> End
  SS = 1;  // Start -> Start
  ES = 2;  // End -> Start
  EE = 3;  // End -> End
}
/**
 * Location of the mate of a breakend.
 */
message BreakendMate {
  // Chromosome and position of the mate.
  string chromosome = 1;
  int32 position = 2;

  // NOTE(review): presumably the left and right bounds of the confidence
  // interval around `position` -- confirm.
  int32 ciPositionLeft = 3;
  int32 ciPositionRight = 4;
}
/**
 * Breakend description: the mate it joins to, the orientation of the
 * junction, and a sequence string.
 */
message Breakend {
  // Mate of this breakend.
  BreakendMate mate = 1;

  // How this breakend and its mate are joined; see BreakendOrientation.
  BreakendOrientation orientation = 2;

  // NOTE(review): presumably the sequence inserted at the junction -- confirm.
  string insSeq = 3;
}
/**
 * Additional information for structural variants.
 */
message StructuralVariation {
  // Field number 8 is assigned to the commented-out `StructuralVariantType type`
  // field below. It is reserved so that no new field can reuse the number with
  // a different meaning.
  reserved 8;

  // NOTE(review): presumably the left/right bounds of the confidence intervals
  // around the variant start and end positions -- confirm.
  int32 ciStartLeft = 1;
  int32 ciStartRight = 2;
  int32 ciEndLeft = 3;
  int32 ciEndRight = 4;

  /**
   * Number of copies for CNV variants.
   */
  int32 copyNumber = 5;

  /**
   * Inserted sequence for long INS
   **/
  string leftSvInsSeq = 6;
  // NOTE(review): presumably the right-hand counterpart of leftSvInsSeq --
  // confirm.
  string rightSvInsSeq = 7;

  // Disabled field, kept for reference:
  //   Structural variation type: COPY_NUMBER_GAIN, COPY_NUMBER_LOSS, TANDEM_DUPLICATION, ...
  //   StructuralVariantType type = 8;

  /**
   * Breakend information.
   */
  Breakend breakend = 9;
}
/**
 * A genomic variation: coordinates, alleles, type, per-study information and
 * annotation.
 *
 * Field numbers do not follow declaration order. They are the wire identity
 * of each field and must not be changed.
 */
message Variant {
  /**
   * Chromosome in which the genomic variation was found.
   */
  string chromosome = 1;

  /**
   * Normalized start position of the genomic variation.
   *
   * - SNVs: start and end are the same position
   * - Insertions: start is the last present position; when the first
   *   nucleotide is inserted in position 6, start is 5
   * - Deletions: start is the first previously present position; when the
   *   first deleted nucleotide is in position 6, start is 6
   */
  int32 start = 2;

  /**
   * Normalized end position of the genomic variation.
   *
   * - SNVs: start and end are the same position
   * - Insertions: end is the first present position; when the last
   *   nucleotide is inserted in position 9, end is 10
   * - Deletions: end is the last previously present position; when the
   *   last deleted nucleotide is in position 9, end is 9
   */
  int32 end = 3;

  /**
   * Allele found in the reference.
   */
  string reference = 4;

  /**
   * Allele found as alternate.
   */
  string alternate = 5;

  /**
   * Strand of the reference for this variant.
   */
  string strand = 6;

  /**
   * Structural variant information.
   */
  StructuralVariation sv = 14;

  /**
   * ID of the variant.
   */
  string id = 13;

  /**
   * Additional names by which this genomic variation is known.
   */
  repeated string names = 7;

  /**
   * Length of the genomic variation; its meaning depends on the type.
   *
   * - SNVs: 1 nucleotide
   * - Indels: length of the largest allele
   */
  int32 length = 8;

  /**
   * Kind of variation: single nucleotide, indel or structural variation.
   */
  VariantType type = 9;

  /**
   * Per-study information for each study the variant was read from, such as
   * samples or statistics.
   */
  repeated StudyEntry studies = 11;

  /**
   * Annotations attached to the genomic variation.
   */
  VariantAnnotation annotation = 12;
}
/**
 * Metadata describing a variant file and the study it belongs to.
 */
message VariantFileMetadata {
  // Identifier and name of the file.
  string fileId = 1;
  string fileName = 3;

  // Identifier and name of the study.
  string studyId = 2;
  string studyName = 4;

  // Samples of the file.
  // NOTE(review): presumably sample names or identifiers -- confirm.
  repeated string samples = 5;

  /**
   * Free-form key/value metadata.
   *
   * NOTE(review): the map type parameters were missing in this copy of the
   * schema; string keys and string values are an inferred reconstruction --
   * confirm against the canonical definition.
   */
  map<string, string> metadata = 6;
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy