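// protobuf.opencb.variant.proto Maven / Gradle / Ivy
//
// NOTE: the following lines are navigation text from the artifact download
// page this file was copied from; they are not part of the schema.
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of biodata-models Show documentation
// The newest version!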
syntax = "proto3";

package protobuf.opencb;

option java_package = "org.opencb.biodata.models.variant.protobuf";
option java_outer_classname = "VariantProto";
option java_generate_equals_and_hash = true;
//option java_multiple_files = true;

import "protobuf/opencb/variant_annotation.proto";

/**
 * Type of variation, which depends mostly on its length.
 *
 * SNVs involve a single nucleotide, without changes in length
 * MNVs involve multiple nucleotides, without changes in length
 * Indels are insertions or deletions of less than SV_THRESHOLD (50) nucleotides
 * Structural variations are large changes of more than SV_THRESHOLD nucleotides
 * Copy-number variations alter the number of copies of a region
 *
 * Values are declared in their historical order, not in numeric order. The
 * numbers are the wire contract: never renumber or reuse them.
 */
enum VariantType {
    // As the NO_VARIATION is the most common value on gVCFs, being the first value,
    // protobuf will use this as default value and save some space.
    NO_VARIATION   = 0;  // Defined in HTSJDK

    SNV            = 2;  // SO:0001483
    MNV            = 4;  // SO:0002007
    INDEL          = 5;  // SO:1000032
    SV             = 6;  // SO:0001537
    COPY_NUMBER    = 7;  // SO:0001019
    COPY_NUMBER_GAIN     = 16;  // SO:0001742
    COPY_NUMBER_LOSS     = 17;  // SO:0001743
    SYMBOLIC       = 8;  // Defined in HTSJDK
    MIXED          = 9;  // Defined in HTSJDK

    INSERTION      = 10;  // SO:0000667
    DELETION       = 11;  // SO:0000159
    TRANSLOCATION  = 12;  // SO:0000199
    INVERSION      = 13;  // SO:1000036
    DUPLICATION    = 14;  // SO:1000035
    TANDEM_DUPLICATION   = 18;  // SO:1000173
    BREAKEND       = 15;

    // Deprecated values. They keep their numbers so that previously written
    // data still decodes, but are flagged with the `deprecated` option so that
    // generated code warns on new uses.
    CNV            = 20 [deprecated = true]; // Deprecated. Renamed to COPY_NUMBER
    SNP            = 1 [deprecated = true];  // Deprecated. Presumably superseded by SNV - confirm
    MNP            = 3 [deprecated = true];  // Deprecated. Presumably superseded by MNV - confirm
}

/**
 * Statistics of a variant computed over the samples of one cohort.
 *
 * Fields are declared in logical order rather than numeric order; the field
 * numbers are the wire contract and must not be changed.
 */
message VariantStats {
    /**
     * Unique cohort identifier within the study.
     **/
    string cohortId = 17;

    /**
     * Count of samples with non-missing genotypes in this variant from the cohort.
     * This value is used as denominator for genotypeFreq.
     **/
    int32 sampleCount = 18;

    /**
     * Count of files with samples from the cohort that reported this variant.
     * This value is used as denominator for filterFreq.
     **/
    int32 fileCount = 19;

    /**
     * Total number of alleles in called genotypes. It does not include missing alleles.
     * This value is used as denominator for refAlleleFreq and altAlleleFreq.
     **/
    int32 alleleCount = 1;

    /**
     * Number of reference alleles found in this variant.
     **/
    int32 refAlleleCount = 2;

    /**
     * Number of main alternate alleles found in this variant. It does not include secondary alternates.
     **/
    int32 altAlleleCount = 3;

    /**
     * Reference allele frequency calculated from refAlleleCount and alleleCount, in the range [0,1]
     **/
    float refAlleleFreq = 4;

    /**
     * Alternate allele frequency calculated from altAlleleCount and alleleCount, in the range [0,1]
     **/
    float altAlleleFreq = 5;

    /**
     * Number of missing alleles
     **/
    int32 missingAlleleCount = 8;

    /**
     * Number of genotypes with all alleles missing (e.g. ./.). It does not count partially missing genotypes like "./0" or "./1".
     **/
    int32 missingGenotypeCount = 9;

    /**
     * Number of occurrences for each genotype.
     * This does not include genotype with all alleles missing (e.g. ./.), but it includes partially missing genotypes like "./0" or "./1".
     * Total sum of counts should be equal to the count of samples.
     *
     * Keyed by genotype string; values use the same int32 type as the other counts.
     **/
    map<string, int32> genotypeCount = 6;

    /**
     * Genotype frequency for each genotype found, calculated from the genotypeCount and sampleCount, in the range [0,1]
     * The sum of frequencies should be 1.
     *
     * Keyed by genotype string; values use the same float type as the other frequencies.
     **/
    map<string, float> genotypeFreq = 7;

    /**
     * The number of occurrences for each FILTER value in files from samples in this cohort reporting this variant.
     * As each file can contain more than one filter value (usually separated by ';'), the total sum of counts could be greater than the count of files.
     *
     * Keyed by FILTER value.
     **/
    map<string, int32> filterCount = 14;

    /**
     * Frequency of each filter calculated from the filterCount and fileCount, in the range [0,1]
     *
     * Keyed by FILTER value.
     **/
    map<string, float> filterFreq = 15;

    /**
     * The number of files from samples in this cohort reporting this variant with valid QUAL values.
     * This value is used as denominator to obtain the qualityAvg.
     */
    int32 qualityCount = 20;

    /**
     * The average Quality value for files with valid QUAL values from samples in this cohort reporting this variant.
     * Some files may not have defined the QUAL value, so the sampling could be less than the fileCount.
     **/
    float qualityAvg = 16;

    /**
     * Minor allele frequency. Frequency of the less common allele between the reference and the main alternate alleles.
     * This value does not take into account secondary alternates.
     **/
    float maf = 10;

    /**
     * Minor genotype frequency. Frequency of the less common genotype seen in this variant.
     * This value takes into account all values from the genotypeFreq map.
     **/
    float mgf = 11;

    /**
     * Allele with minor frequency
     **/
    string mafAllele = 12;

    /**
     * Genotype with minor frequency
     **/
    string mgfGenotype = 13;
}

/**
 * Reference back to the variant call as it was before normalization.
 */
message OriginalCall {
    /**
     * Original variant ID before normalization including all secondary alternates.
     */
    string variantId = 1;

    /**
     * Index of the alternate allele within the original multi-allelic variant
     * call from which this variant was decomposed.
     * NOTE(review): whether the index is 0-based or 1-based is not stated here - confirm.
     */
    int32 alleleIndex = 2;
}


/**
 * Per-file information attached to a variant within a study
 * (see StudyEntry.files).
 */
message FileEntry {
    // Identifier of the file.
    string fileId = 1;

    // Original call of the variant in this file, before normalization.
    OriginalCall call = 2;

    // Free-form key/value data for this file.
    // NOTE(review): key and value are both declared as string here; confirm
    // against the generated VariantProto classes.
    map<string, string> data = 3;
}

/**
 * Coordinates and alleles of an alternate allele. Used by
 * StudyEntry.secondaryAlternates.
 *
 * NOTE(review): chromosome, start and end presumably follow the same
 * conventions as the matching fields of Variant - confirm.
 */
message AlternateCoordinate {
    string chromosome = 1;
    int32 start = 2;
    int32 end = 3;

    /**
     * Reference allele.
     */
    string reference = 4;

    /**
     * Alternate allele.
     */
    string alternate = 5;

    /**
     * Type of variation: single nucleotide, indel or structural variation.
     */
    VariantType type = 6;
}

/**
 * Data of one sample for a variant within a study (see StudyEntry.samples).
 */
message SampleEntry {
    // Identifier of the sample.
    string sampleId = 1;
    // NOTE(review): presumably an index into StudyEntry.files - confirm.
    int32 fileIndex = 2;
    // NOTE(review): presumably one value per entry of StudyEntry.sampleDataKeys,
    // in the same order - confirm.
    repeated string data = 3;
}

/**
 * Information about a variant that is specific to one study: the files and
 * samples it was read from, and cohort statistics (see Variant.studies).
 */
message StudyEntry {
    // Identifier of the study.
    string studyId = 1;
    // One entry per file of the study.
    repeated FileEntry files = 2;
    /**
     * Alternate alleles that appear along with a variant alternate.
     */
    repeated AlternateCoordinate secondaryAlternates = 3;
    // NOTE(review): presumably the keys naming each position of
    // SampleEntry.data - confirm.
    repeated string sampleDataKeys = 4;
    // One entry per sample.
    repeated SampleEntry samples = 5;
    // Statistics entries; each VariantStats carries its own cohortId.
    repeated VariantStats stats = 6;
}

///**
// * Type of structural variation
// * 
// * COPY_NUMBER_GAIN for CNVs
// * COPY_NUMBER_LOSS for CNVs
// * TANDEM_DUPLICATION for DUP
// * 
// */
//enum StructuralVariantType {
//    unused   = 0;            // SO:0001742
////    COPY_NUMBER_GAIN   = 0;            // SO:0001742
////    COPY_NUMBER_LOSS   = 1;            // SO:0001743
////    TANDEM_DUPLICATION = 2;            // SO:1000173
//}


/*
* Orientation of a breakend junction.
*
* Columns: value | meaning | s | bracket notation | description.
* NOTE(review): the bracket notation presumably follows the VCF breakend ALT
* syntax - confirm.
*
* SE | (Start -> End)   | s | t[p[ | piece extending to the right of p is joined after t
* SS | (Start -> Start) | s | t]p] | reverse comp piece extending left of p is joined after t
* ES | (End -> Start)   | s | ]p]t | piece extending to the left of p is joined before t
* EE | (End -> End)     | s | [p[t | reverse comp piece extending right of p is joined before t
*
* SE is the zero value, so it is also what an unset orientation reads as.
*/
enum BreakendOrientation {
    SE = 0;
    SS = 1;
    ES = 2;
    EE = 3;
}

/**
 * Location of the mate of a breakend (see Breakend.mate).
 */
message BreakendMate {
    // Chromosome of the mate.
    string chromosome = 1;
    // Position of the mate.
    int32 position = 2;
    // NOTE(review): "ci" presumably stands for confidence interval around
    // position (left and right bounds) - confirm.
    int32 ciPositionLeft = 3;
    int32 ciPositionRight = 4;
}

/**
 * Breakend description attached to a structural variation
 * (see StructuralVariation.breakend).
 */
message Breakend {
    // Location of the mate breakend.
    BreakendMate mate = 1;
    // Orientation of the junction; see BreakendOrientation.
    BreakendOrientation orientation = 2;
    // NOTE(review): presumably the sequence inserted at the junction - confirm.
    string insSeq = 3;
}


/**
 * Extra information for structural variants (see Variant.sv).
 */
message StructuralVariation {
    // Field 8 used to hold `StructuralVariantType type` (COPY_NUMBER_GAIN,
    // COPY_NUMBER_LOSS, TANDEM_DUPLICATION). Those values are now members of
    // VariantType. The number is reserved so that it is never reused with a
    // different meaning.
    reserved 8;

    // NOTE(review): "ci" presumably stands for confidence interval around the
    // variant start and end (left and right bounds) - confirm.
    int32 ciStartLeft = 1;
    int32 ciStartRight = 2;
    int32 ciEndLeft = 3;
    int32 ciEndRight = 4;

    /**
     * Number of copies for CNV variants.
     */
    int32 copyNumber = 5;

    /**
     * Inserted sequence for long INS
     **/
    string leftSvInsSeq = 6;
    string rightSvInsSeq = 7;

    /**
     * Breakend information.
     */
    Breakend breakend = 9;
}

/**
 * A genomic variation: its normalized coordinates and alleles, plus per-study
 * information and annotation.
 *
 * Fields are not declared in numeric order; the field numbers are the wire
 * contract and must not be changed.
 * NOTE(review): field number 10 is not used in this message. If it belonged
 * to a removed field, it should be declared as `reserved 10;` - confirm.
 */
message Variant {
    /**
     * Chromosome where the genomic variation occurred.
     */
    string chromosome = 1;

    /**
     * Normalized position where the genomic variation starts.
     *
     * SNVs have the same start and end position
     * Insertions start in the last present position: if the first nucleotide
     * is inserted in position 6, the start is position 5
     * Deletions start in the first previously present position: if the first
     * deleted nucleotide is in position 6, the start is position 6
     */
    int32 start = 2;

    /**
     * Normalized position where the genomic variation ends.
     *
     * SNVs have the same start and end positions
     * Insertions end in the first present position: if the last nucleotide
     * is inserted in position 9, the end is position 10
     * Deletions end in the last previously present position: if the last
     * deleted nucleotide is in position 9, the end is position 9
     */
    int32 end = 3;

    /**
     * Reference allele.
     */
    string reference = 4;

    /**
     * Alternate allele.
     */
    string alternate = 5;

    /**
     * Reference strand for this variant
     */
    string strand = 6;

    /**
     * Information regarding Structural Variants
     */
    StructuralVariation sv = 14;

    /**
     * The variant ID.
     */
    string id = 13;

    /**
     * Other names used for this genomic variation.
     */
    repeated string names = 7;

    /**
     * Length of the genomic variation, which depends on the variation type.
     *
     * SNVs have a length of 1 nucleotide
     * Indels have the length of the largest allele
     */
    int32 length = 8;

    /**
     * Type of variation: single nucleotide, indel or structural variation.
     */
    VariantType type = 9;

    /**
     * Information specific to each study the variant was read from, such as
     * samples or statistics.
     */
    repeated StudyEntry studies = 11;

    /**
     * Annotations of the genomic variation.
     * VariantAnnotation is not defined in this file; presumably it comes from
     * the imported protobuf/opencb/variant_annotation.proto.
     */
    VariantAnnotation annotation = 12;
}

/**
 * Metadata describing one variant file and the study it belongs to.
 */
message VariantFileMetadata {
    // Identifier of the file.
    string fileId = 1;

    // Identifier of the study.
    string studyId = 2;

    // Name of the file.
    string fileName = 3;

    // Name of the study.
    string studyName = 4;

    // Samples of the file.
    // NOTE(review): presumably sample names or ids in file order - confirm.
    repeated string samples = 5;

    // Free-form key/value metadata for the file.
    // NOTE(review): key and value are both declared as string here; confirm
    // against the generated VariantProto classes.
    map<string, string> metadata = 6;
}