All downloads are free. Search and download functionality uses the official Maven repository.

protobuf.opencb.variant.proto Maven / Gradle / Ivy

The newest version!
syntax = "proto3";

package protobuf.opencb;

option java_package = "org.opencb.biodata.models.variant.protobuf";
option java_outer_classname = "VariantProto";
option java_generate_equals_and_hash = true;
//option java_multiple_files = true;

import "protobuf/opencb/variant_annotation.proto";

/**
 * Type of variation, which depends mostly on its length.
 *
 *  - SNVs involve a single nucleotide, without changes in length
 *  - MNVs involve multiple nucleotides, without changes in length
 *  - Indels are insertions or deletions of less than SV_THRESHOLD (50) nucleotides
 *  - Structural variations are large changes of more than SV_THRESHOLD nucleotides
 *  - Copy-number variations alter the number of copies of a region
 */
enum VariantType {
  // As the NO_VARIATION is the most common value on gVCFs, being the first value,
  // protobuf will use this as default value and save some space.
  NO_VARIATION = 0;         // Defined in HTSJDK

  SNV = 2;                  // SO:0001483
  MNV = 4;                  // SO:0002007
  INDEL = 5;                // SO:1000032
  SV = 6;                   // SO:0001537
  COPY_NUMBER = 7;          // SO:0001019
  COPY_NUMBER_GAIN = 16;    // SO:0001742
  COPY_NUMBER_LOSS = 17;    // SO:0001743
  SYMBOLIC = 8;             // Defined in HTSJDK
  MIXED = 9;                // Defined in HTSJDK
  INSERTION = 10;           // SO:0000667
  DELETION = 11;            // SO:0000159
  TRANSLOCATION = 12;       // SO:0000199
  INVERSION = 13;           // SO:1000036
  DUPLICATION = 14;         // SO:1000035
  TANDEM_DUPLICATION = 18;  // SO:1000173
  BREAKEND = 15;

  // Deprecated
  CNV = 20;                 // Deprecated. Renamed to COPY_NUMBER
  SNP = 1;                  // Deprecated
  MNP = 3;                  // Deprecated
}

/**
 * Statistics of a variant computed over the samples and files of one cohort.
 */
message VariantStats {
  /**
   * Unique cohort identifier within the study.
   */
  string cohortId = 17;

  /**
   * Count of samples with non-missing genotypes in this variant from the cohort.
   * This value is used as denominator for genotypeFreq.
   */
  int32 sampleCount = 18;

  /**
   * Count of files with samples from the cohort that reported this variant.
   * This value is used as denominator for filterFreq.
   */
  int32 fileCount = 19;

  /**
   * Total number of alleles in called genotypes. It does not include missing alleles.
   * This value is used as denominator for refAlleleFreq and altAlleleFreq.
   */
  int32 alleleCount = 1;

  /**
   * Number of reference alleles found in this variant.
   */
  int32 refAlleleCount = 2;

  /**
   * Number of main alternate alleles found in this variant.
   * It does not include secondary alternates.
   */
  int32 altAlleleCount = 3;

  /**
   * Reference allele frequency calculated from refAlleleCount and alleleCount,
   * in the range [0,1]
   */
  float refAlleleFreq = 4;

  /**
   * Alternate allele frequency calculated from altAlleleCount and alleleCount,
   * in the range [0,1]
   */
  float altAlleleFreq = 5;

  /**
   * Number of missing alleles
   */
  int32 missingAlleleCount = 8;

  /**
   * Number of genotypes with all alleles missing (e.g. ./.).
   * It does not count partially missing genotypes like "./0" or "./1".
   */
  int32 missingGenotypeCount = 9;

  /**
   * Number of occurrences for each genotype.
   * This does not include genotype with all alleles missing (e.g. ./.),
   * but it includes partially missing genotypes like "./0" or "./1".
   * Total sum of counts should be equal to the count of samples.
   */
  map<string, int32> genotypeCount = 6;

  /**
   * Genotype frequency for each genotype found calculated from the genotypeCount
   * and sampleCount, in the range [0,1]
   * The sum of frequencies should be 1.
   */
  map<string, float> genotypeFreq = 7;

  /**
   * The number of occurrences for each FILTER value in files from samples in this
   * cohort reporting this variant.
   * As each file can contain more than one filter value (usually separated by ';'),
   * the total sum of counts could be greater than the count of files.
   */
  map<string, int32> filterCount = 14;

  /**
   * Frequency of each filter calculated from the filterCount and fileCount,
   * in the range [0,1]
   */
  map<string, float> filterFreq = 15;

  /**
   * The number of files from samples in this cohort reporting this variant with
   * valid QUAL values.
   * This value is used as denominator to obtain the qualityAvg.
   */
  int32 qualityCount = 20;

  /**
   * The average Quality value for files with valid QUAL values from samples in this
   * cohort reporting this variant.
   * Some files may not have defined the QUAL value, so the sampling could be less
   * than the fileCount.
   */
  float qualityAvg = 16;

  /**
   * Minor allele frequency. Frequency of the less common allele between the
   * reference and the main alternate alleles.
   * This value does not take into account secondary alternates.
   */
  float maf = 10;

  /**
   * Minor genotype frequency. Frequency of the less common genotype seen in this
   * variant.
   * This value takes into account all values from the genotypeFreq map.
   */
  float mgf = 11;

  /**
   * Allele with minor frequency
   */
  string mafAllele = 12;

  /**
   * Genotype with minor frequency
   */
  string mgfGenotype = 13;
}

/**
 * Reference to the original (pre-normalization) call a variant was derived from.
 */
message OriginalCall {
  /**
   * Original variant ID before normalization including all secondary alternates.
   */
  string variantId = 1;

  /**
   * Alternate allele index of the original multi-allelic variant call from which
   * this variant was decomposed.
   */
  int32 alleleIndex = 2;
}

message FileEntry {
  string fileId = 1;
  OriginalCall call = 2;
  // NOTE(review): key/value types restored as string -> string; confirm against
  // the upstream schema.
  map<string, string> data = 3;
}

/**
 * Coordinates of an alternate allele, used for the secondary alternates of a study.
 */
message AlternateCoordinate {
  string chromosome = 1;
  int32 start = 2;
  int32 end = 3;

  /**
   * Reference allele.
   */
  string reference = 4;

  /**
   * Alternate allele.
   */
  string alternate = 5;

  /**
   * Type of variation: single nucleotide, indel or structural variation.
   */
  VariantType type = 6;
}

message SampleEntry {
  string sampleId = 1;
  int32 fileIndex = 2;
  repeated string data = 3;
}

/**
 * Information specific to one study the variant was read from, such as
 * files, samples or statistics.
 */
message StudyEntry {
  string studyId = 1;
  repeated FileEntry files = 2;

  /**
   * Alternate alleles that appear along with a variant alternate.
   */
  repeated AlternateCoordinate secondaryAlternates = 3;
  repeated string sampleDataKeys = 4;
  repeated SampleEntry samples = 5;
  repeated VariantStats stats = 6;
}

///**
// * Type of structural variation
// *
// *  - COPY_NUMBER_GAIN for CNVs
// *  - COPY_NUMBER_LOSS for CNVs
// *  - TANDEM_DUPLICATION for DUP
// */
//enum StructuralVariantType {
//  unused = 0;                 // SO:0001742
////  COPY_NUMBER_GAIN = 0;     // SO:0001742
////  COPY_NUMBER_LOSS = 1;     // SO:0001743
////  TANDEM_DUPLICATION = 2;   // SO:1000173
//}

/*
 * SE | (Start -> End)   | s | t[p[ | piece extending to the right of p is joined after t
 * SS | (Start -> Start) | s | t]p] | reverse comp piece extending left of p is joined after t
 * ES | (End -> Start)   | s | ]p]t | piece extending to the left of p is joined before t
 * EE | (End -> End)     | s | [p[t | reverse comp piece extending right of p is joined before t
 */
enum BreakendOrientation {
  SE = 0;
  SS = 1;
  ES = 2;
  EE = 3;
}

message BreakendMate {
  string chromosome = 1;
  int32 position = 2;
  int32 ciPositionLeft = 3;
  int32 ciPositionRight = 4;
}

message Breakend {
  BreakendMate mate = 1;
  BreakendOrientation orientation = 2;
  string insSeq = 3;
}

/**
 * Information regarding Structural Variants.
 */
message StructuralVariation {
  int32 ciStartLeft = 1;
  int32 ciStartRight = 2;
  int32 ciEndLeft = 3;
  int32 ciEndRight = 4;

  /**
   * Number of copies for CNV variants.
   */
  int32 copyNumber = 5;

  /**
   * Inserted sequence for long INS
   */
  string leftSvInsSeq = 6;
  string rightSvInsSeq = 7;

  // Structural variation type: COPY_NUMBER_GAIN, COPY_NUMBER_LOSS, TANDEM_DUPLICATION, ...
  // (this field is currently disabled)
  // StructuralVariantType type = 8;

  Breakend breakend = 9;
}

message Variant {
  /**
   * Chromosome where the genomic variation occurred.
   */
  string chromosome = 1;

  /**
   * Normalized position where the genomic variation starts.
   *
   *  - SNVs have the same start and end position
   *  - Insertions start in the last present position: if the first nucleotide
   *    is inserted in position 6, the start is position 5
   *  - Deletions start in the first previously present position: if the first
   *    deleted nucleotide is in position 6, the start is position 6
   */
  int32 start = 2;

  /**
   * Normalized position where the genomic variation ends.
   *
   *  - SNVs have the same start and end positions
   *  - Insertions end in the first present position: if the last nucleotide
   *    is inserted in position 9, the end is position 10
   *  - Deletions end in the last previously present position: if the last
   *    deleted nucleotide is in position 9, the end is position 9
   */
  int32 end = 3;

  /**
   * Reference allele.
   */
  string reference = 4;

  /**
   * Alternate allele.
   */
  string alternate = 5;

  /**
   * Reference strand for this variant
   */
  string strand = 6;

  /**
   * Information regarding Structural Variants
   */
  StructuralVariation sv = 14;

  /**
   * The variant ID.
   */
  string id = 13;

  /**
   * Other names used for this genomic variation.
   */
  repeated string names = 7;

  /**
   * Length of the genomic variation, which depends on the variation type.
   *
   *  - SNVs have a length of 1 nucleotide
   *  - Indels have the length of the largest allele
   */
  int32 length = 8;

  /**
   * Type of variation: single nucleotide, indel or structural variation.
   */
  VariantType type = 9;

  /**
   * Information specific to each study the variant was read from, such as
   * samples or statistics.
   */
  repeated StudyEntry studies = 11;

  /**
   * Annotations of the genomic variation.
   */
  VariantAnnotation annotation = 12;
}

message VariantFileMetadata {
  string fileId = 1;
  string studyId = 2;
  string fileName = 3;
  string studyName = 4;
  repeated string samples = 5;
  // NOTE(review): key/value types restored as string -> string; confirm against
  // the upstream schema.
  map<string, string> metadata = 6;
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy