// avro.variant.avdl -- Avro IDL definitions shipped in the biodata-models artifact.

@namespace("org.opencb.biodata.models.variant.avro")

protocol Variants {

    import idl "variantAnnotation.avdl";

    /**
     * Type of variation, which depends mostly on its length.
     *
     * SNVs involve a single nucleotide, without changes in length.
     * MNVs involve multiple nucleotides, without changes in length.
     * Indels are insertions or deletions of less than SV_THRESHOLD (50) nucleotides.
     * Structural variations are large changes of more than SV_THRESHOLD nucleotides.
     * Copy-number variations alter the number of copies of a region.
     *
     * The trailing "SO:" comments are presumably Sequence Ontology accessions for
     * each symbol (TODO confirm). SNP, MNP and CNV are deprecated symbols kept at
     * the end of the list.
     *
     * NOTE(review): Avro encodes an enum value by the position of its symbol, so new
     * symbols should be appended rather than inserted -- confirm against the Avro
     * specification before reordering anything here.
     */
    enum VariantType {
        SNV,                   // SO:0001483
        MNV,                   // SO:0002007
        INDEL,                 // SO:1000032
        SV,                    // SO:0001537
        INSERTION,             // SO:0000667
        DELETION,              // SO:0000159
        TRANSLOCATION,         // SO:0000199
        INVERSION,             // SO:1000036
        COPY_NUMBER,           // SO:0001019
        COPY_NUMBER_GAIN,      // SO:0001742
        COPY_NUMBER_LOSS,      // SO:0001743
        DUPLICATION,           // SO:1000035
        TANDEM_DUPLICATION,    // SO:1000173
        BREAKEND,              // No SO accession listed
        NO_VARIATION,          // Defined in HTSJDK
        SYMBOLIC,              // Defined in HTSJDK
        MIXED,                 // Defined in HTSJDK

        SNP,                   // @Deprecated (presumably superseded by SNV -- confirm)
        MNP,                   // @Deprecated (presumably superseded by MNV -- confirm)
        CNV                    // @Deprecated (presumably superseded by COPY_NUMBER -- confirm)
    }

    /**
     * Statistics of a variant computed over the samples and files of one cohort.
     */
    record VariantStats {
        /**
         * Unique cohort identifier within the study.
         */
        string cohortId;

        /**
         * Count of samples with non-missing genotypes in this variant from the cohort.
         * This value is used as denominator for genotypeFreq.
         */
        union { null, int } sampleCount;

        /**
         * Count of files with samples from the cohort that reported this variant.
         * This value is used as denominator for filterFreq.
         */
        union { null, int } fileCount;

        /**
         * Total number of alleles in called genotypes. It does not include missing alleles.
         * This value is used as denominator for refAlleleFreq and altAlleleFreq.
         */
        union { null, int } alleleCount;

        /**
         * Number of reference alleles found in this variant.
         */
        union { null, int } refAlleleCount;

        /**
         * Number of main alternate alleles found in this variant. It does not include secondary alternates.
         */
        union { null, int } altAlleleCount;

        /**
         * Reference allele frequency calculated from refAlleleCount and alleleCount, in the range [0,1]
         */
        union { null, float } refAlleleFreq;

        /**
         * Alternate allele frequency calculated from altAlleleCount and alleleCount, in the range [0,1]
         */
        union { null, float } altAlleleFreq;

        /**
         * Number of missing alleles.
         */
        union { null, int } missingAlleleCount;

        /**
         * Number of genotypes with all alleles missing (e.g. ./.). It does not count partially missing genotypes like "./0" or "./1".
         */
        union { null, int } missingGenotypeCount;

        /**
         * Number of occurrences for each genotype, keyed by genotype.
         * This does not include genotypes with all alleles missing (e.g. ./.), but it includes partially missing genotypes like "./0" or "./1".
         * Total sum of counts should be equal to the count of samples.
         */
        map<int> genotypeCount = {};

        /**
         * Genotype frequency for each genotype found, calculated from the genotypeCount and sampleCount, in the range [0,1]
         * The sum of frequencies should be 1.
         */
        map<float> genotypeFreq = {};

        /**
         * The number of occurrences for each FILTER value in files from samples in this cohort reporting this variant.
         * As each file can contain more than one filter value (usually separated by ';'), the total sum of counts could be greater than the count of files.
         */
        map<int> filterCount;

        /**
         * Frequency of each filter calculated from the filterCount and fileCount, in the range [0,1]
         */
        map<float> filterFreq;

        /**
         * The number of files from samples in this cohort reporting this variant with valid QUAL values.
         * This value is used as denominator to obtain the qualityAvg.
         */
        union { null, int } qualityCount;

        /**
         * The average Quality value for files with valid QUAL values from samples in this cohort reporting this variant.
         * Some files may not have defined the QUAL value, so the sampling could be less than the fileCount.
         */
        union { null, float } qualityAvg;

        /**
         * Minor allele frequency. Frequency of the less common allele between the reference and the main alternate alleles.
         * This value does not take into account secondary alternates.
         */
        union { null, float } maf;

        /**
         * Minor genotype frequency. Frequency of the less common genotype seen in this variant.
         * This value takes into account all values from the genotypeFreq map.
         */
        union { null, float } mgf;

        /**
         * Allele with minor frequency.
         */
        union { null, string } mafAllele;

        /**
         * Genotype with minor frequency.
         */
        union { null, string } mgfGenotype;
    }

    /**
     * A score assigned to a variant, calculated from one cohort and optionally
     * a second one.
     */
    record VariantScore {

        /**
         * Variant score ID.
         */
        string id;
        /**
         * Main cohort used for calculating the score.
         */
        string cohort1;
        /**
         * Optional secondary cohort used for calculating the score. Defaults to null.
         */
        union { null, string } cohort2 = null;
        /**
         * Score value.
         */
        float score;
        /**
         * P-value of the score. Optional; defaults to null.
         */
        union { null, float } pValue = null;
    }

    /**
     * Pointer back to a variant call as it was written before normalization.
     */
    record OriginalCall {
        /**
         * ID of the original variant, prior to normalization, with all of its
         * secondary alternates included.
         */
        string variantId;

        /**
         * Index of the alternate allele within the original multi-allelic call
         * from which this variant was decomposed.
         */
        union { null, int } alleleIndex;
    }

    /**
     * Information about one source file in which the variant was found.
     */
    record FileEntry {
        /**
         * Unique identifier of the source file.
         */
        union { null, string } fileId;

        /**
         * Original call position for the variant, if the file was normalized.
         *
         * {position}:{reference}:{alternate}(,{other_alternate})*:{allele_index}
         */
        union { null, OriginalCall } call;

        /**
         * Optional data that probably depend on the format of the file the
         * variant was initially read from, stored as string key/value pairs.
         */
        map<string> data;
    }

    /**
     * Coordinates of an alternate allele. Nullable fields fall back to the values
     * of the variant itself, as described on each field.
     */
    record AlternateCoordinate {

        /**
         * Chromosome of the alternate.
         * NOTE(review): presumably null means "same chromosome as the variant",
         * like the other nullable fields -- confirm.
         */
        union { null, string } chromosome;

        /**
         * First position 1-based of the alternate. If null, the start is the same of the variant.
         */
        union { null, int } start;

        /**
         * End position 1-based of the alternate. If null, the end is the same of the variant.
         */
        union { null, int } end;

        /**
         * Reference allele. If null, the reference is the same of the variant.
         */
        union { null, string } reference;

        /**
         * Alternate allele.
         */
        string alternate;

        /**
         * Variation type of this alternate allele.
         */
        VariantType type;
    }

    /**
     * Data stored for one sample of a study.
     */
    record SampleEntry {
        /**
         * Identifier of the sample. May be null.
         */
        union { null, string } sampleId;

        /**
         * Index of a file for this sample. May be null.
         * NOTE(review): presumably a position in the "files" array of the
         * enclosing StudyEntry -- confirm.
         */
        union { null, int } fileIndex;

        /**
         * Values stored for this sample.
         * NOTE(review): presumably in the same order as StudyEntry.sampleDataKeys -- confirm.
         */
        array<string> data;
    }

    /**
     * Kind of issue recorded in an IssueEntry.
     *
     * NOTE(review): Avro encodes an enum value by the position of its symbol, so
     * new symbols should be appended -- confirm before reordering.
     */
    enum IssueType {
        DUPLICATION,
        DISCREPANCY,
        MENDELIAN_ERROR,
        DE_NOVO,
        COMPOUND_HETEROZYGOUS
    }

    /**
     * An issue of a given type attached to a sample.
     */
    record IssueEntry {
        /**
         * Kind of issue.
         */
        IssueType type;

        /**
         * Sample entry the issue refers to.
         */
        SampleEntry sample;

        /**
         * Additional information about the issue, stored as string key/value pairs.
         */
        map<string> data;
    }

    /**
     * Information about a variant that is specific to one study: the files and
     * samples where it was seen, plus per-cohort statistics and scores.
     */
    record StudyEntry {
        /**
         * Unique identifier of the study.
         */
        union { null, string } studyId;

        /**
         * List of files from the study where the variant was present.
         */
        array<FileEntry> files = [];

        /**
         * Alternate alleles that appear along with a variant alternate.
         */
        union { null, array<AlternateCoordinate> } secondaryAlternates = null;

        /**
         * Fields stored for each sample.
         */
        array<string> sampleDataKeys;

        /**
         * Genotypes and other sample-related information. Each position is related
         * with one sample. The contents are lists of values in the same order as the
         * sampleDataKeys array. The length of these lists must be the same as the sampleDataKeys field.
         */
        array<SampleEntry> samples;

        /**
         * Issues recorded for samples of this study. Defaults to an empty list.
         */
        array<IssueEntry> issues = [];

        /**
         * Statistics of the genomic variation, such as its alleles/genotype count
         * or its minimum allele frequency, one entry per cohort.
         */
        array<VariantStats> stats;

        /**
         * Scores calculated for this variant in the study. Defaults to an empty list.
         */
        array<VariantScore> scores = [];
    }

//    /**
//     * Confidence interval around a position for imprecise variants
//     */
//    record ConfidenceInterval {
//       int right;
////       int behind;
//       int left;
////       int forward;
//    }

    /**
     * @Deprecated, use VariantType instead.
     *
     * All three symbols below are also declared in VariantType, with the same
     * "SO:" accession comments.
     */
    @javaAnnotation("Deprecated")
    enum StructuralVariantType {
        COPY_NUMBER_GAIN,            // SO:0001742
        COPY_NUMBER_LOSS,            // SO:0001743
        TANDEM_DUPLICATION            // SO:1000173
    }

    /**
     * Orientation of the two pieces joined at a breakend.
     *
     * Columns: symbol | meaning | reference | alternate notation | description
     * NOTE(review): the "s", "t" and "p" placeholders look like the VCF breakend
     * ALT notation (s = reference bases, t = replacement sequence, p = mate
     * position) -- confirm against the VCF specification.
     *
     * SE | (Start -> End)   | s | t[p[ | piece extending to the right of p is joined after t
     * SS | (Start -> Start) | s | t]p] | reverse comp piece extending left of p is joined after t
     * ES | (End -> Start)   | s | ]p]t | piece extending to the left of p is joined before t
     * EE | (End -> End)     | s | [p[t | reverse comp piece extending right of p is joined before t
     */
    enum BreakendOrientation {
        SE,
        SS,
        ES,
        EE
    }

    /**
     * Location of the mate of a breakend. Every field is nullable.
     *
     * NOTE(review): "ciPositionLeft"/"ciPositionRight" presumably bound a
     * confidence interval around "position" -- confirm.
     */
    record BreakendMate {
        union { null, string } chromosome;
        union { null, int } position;
        union { null, int } ciPositionLeft;
        union { null, int } ciPositionRight;
    }

    /**
     * Breakend details of a structural variation. Every field is nullable.
     */
    record Breakend {
        /**
         * Location of the mate breakend.
         */
        union { null, BreakendMate } mate;

        /**
         * How the two pieces are joined; see BreakendOrientation.
         */
        union { null, BreakendOrientation } orientation;

        /**
         * NOTE(review): presumably the sequence inserted at the junction -- confirm.
         */
        union { null, string } insSeq;
    }

    /**
     * Extra information carried by structural variants. Every field is nullable.
     *
     * NOTE(review): the four "ci*" fields presumably bound the confidence
     * intervals around the start and end positions of imprecise variants -- confirm.
     */
    record StructuralVariation {
        union { null, int } ciStartLeft;
        union { null, int } ciStartRight;
        union { null, int } ciEndLeft;
        union { null, int } ciEndRight;

        /**
         * Number of copies for CNV variants.
         */
        union { null, int } copyNumber;

        /**
         * Inserted sequence for long INS.
         * NOTE(review): presumably the left and right ends of the insertion
         * respectively -- confirm.
         */
        union { null, string } leftSvInsSeq;
        union { null, string } rightSvInsSeq;

        /**
         * @deprecated
         */
        union { null, StructuralVariantType } @javaAnnotation("Deprecated") type;

        /**
         * Breakend information. Defaults to null.
         */
        union { null, Breakend } breakend = null;
    }

    /**
     * A genomic variant: its normalized coordinates and alleles, its type, the
     * per-study information and an optional annotation.
     */
    record VariantAvro {

        /**
         * The variant ID.
         */
        union { null, string } id;

        /**
         * Other names used for this genomic variation.
         */
        array<string> names = [];

        /**
         * Chromosome where the genomic variation occurred.
         */
        string chromosome;

        /**
         * Normalized position where the genomic variation starts.
         *
         * SNVs have the same start and end position
         * Insertions start in the last present position: if the first nucleotide
         * is inserted in position 6, the start is position 5
         * Deletions start in the first previously present position: if the first
         * deleted nucleotide is in position 6, the start is position 6
         */
        int start;

        /**
         * Normalized position where the genomic variation ends.
         *
         * SNVs have the same start and end positions
         * Insertions end in the first present position: if the last nucleotide
         * is inserted in position 9, the end is position 10
         * Deletions end in the last previously present position: if the last
         * deleted nucleotide is in position 9, the end is position 9
         */
        int end;

        /**
         * Reference allele.
         */
        string reference;

        /**
         * Alternate allele.
         */
        string alternate;

        /**
         * Reference strand for this variant
         */
        union { null, string } strand = null;

        /**
         * Information regarding Structural Variants
         */
        union { null, StructuralVariation } sv = null;

        /**
         * Length of the genomic variation, which depends on the variation type.
         *
         * SNVs have a length of 1 nucleotide
         * Indels have the length of the largest allele
         */
        int length;

        /**
         * Type of variation: single nucleotide, indel or structural variation.
         */
        VariantType type;

        /**
         * Information specific to each study the variant was read from, such as
         * samples or statistics.
         */
        array<StudyEntry> studies;

        /**
         * Annotations of the genomic variation.
         */
        union { null, VariantAnnotation } annotation = null;
    }

}