org.opencb.cellbase.lib.variant.hgvs.TranscriptUtils Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cellbase-lib Show documentation
There is a newer version: 6.3.0
/*
 * Copyright 2015-2020 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.cellbase.lib.variant.hgvs;

import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.models.core.Exon;
import org.opencb.biodata.models.core.Transcript;
import org.opencb.biodata.models.core.Xref;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.cellbase.lib.variant.VariantAnnotationUtils;

import java.util.List;

public class TranscriptUtils {

    private Transcript transcript;

    public TranscriptUtils(Transcript transcript) {
        this.transcript = transcript;
    }

    public boolean isCoding() {
        // 0 in the cdnaCodingEnd means that the transcript doesn't
        // have a coding end <==> is non coding. Just annotating
        // coding transcripts in a first approach
        return transcript.getCdnaCodingEnd() != 0;
    }

    public boolean hasUnconfirmedStart() {
        return transcript.unconfirmedStart() || (transcript.getProteinSequence() != null
                && transcript.getProteinSequence().startsWith(VariantAnnotationUtils.UNKNOWN_AMINOACID));
    }

    public boolean hasUnconfirmedSEnd() {
        return transcript.unconfirmedEnd() || (transcript.getProteinSequence() != null
                && transcript.getProteinSequence().endsWith(VariantAnnotationUtils.UNKNOWN_AMINOACID));
    }

    public int cdnaToCds(int cdnaPosition) {
        int cdsPosition = 0;
        if (cdnaPosition >= transcript.getCdnaCodingStart()) {
            cdsPosition = transcript.getCdnaCodingStart() - cdnaPosition + 1;
        }
        return cdsPosition;
    }

    /**
     * Returns cDNA position when CDS is valid, 0 otherwise.
     * @param cdsPosition CDS position
     * @return CDS position or 0 if CDS is not valid
     */
    public int cdsToCdna(int cdsPosition) {
        int cdnaPosition = 0;
        // TODO We need to make sure CdsLength includes the STOP codon
        if (cdsPosition > 0 && cdsPosition <= transcript.getCdsLength() + 3) {
            cdnaPosition = cdsPosition + transcript.getCdnaCodingStart() - 1;
        }
        return cdnaPosition;
    }

    public int genomicsToCdna(int position) {
        int cds = genomicsToCds(position);
        return cdsToCdna(cds);
    }

    public int genomicsToCds(int position) {
        return 0;
    }

    public int getFirstCodonPosition() {
        int cdnaCodingStart = transcript.getCdnaCodingStart();
        // Unconfirmed start transcript always have a CdnaCodingStart=1, we need to increment the phase to get the first codon position
        if (transcript.unconfirmedStart()) {
            cdnaCodingStart += getFirstCodonPhase();
        }
        return cdnaCodingStart;
    }

    /**
     * Returns the exon phase for the first codon, this is always 0 for normal transcripts.
     * Different values are expected for unconfirmed transcripts.
     * @return Firs exon phase.
     */
    public int getFirstCodonPhase() {
        // This works fine for both normal and unconfirmed transcripts
        for (Exon exon : transcript.getExons()) {
            if (exon.getPhase() != -1) {
                return exon.getPhase();
            }
        }
        return -1;
    }

    public String getCodon(int codonPosition) {
        if (codonPosition > 0) {
            int cdsCodonStart = ((codonPosition - 1) * 3) + 1;
            if (hasUnconfirmedStart()) {
                // firstCodonPhase is 1. Codon 4 starts at position 8:
                //  cDNA        1    2      5      8      11
                //  codon       1    2      3      4      5
                //  sequence    G    CAG    ATG    GCT    TAT
                int firstCodonPhase = getFirstCodonPhase();

                if (codonPosition == 1 && firstCodonPhase != 0) {
                    return StringUtils.repeat('N', 3 - firstCodonPhase)
                            + transcript.getCdnaSequence().substring(0, firstCodonPhase);
                }

                // If first phase is not 0 we need to adjust codons to fetch a real one
                if (firstCodonPhase != 0) {
                    cdsCodonStart -= (3 - firstCodonPhase);
                }
            }
            int cdnaCodonStart = cdsToCdna(cdsCodonStart);
            // Adjust for manipulating strings, set to be zero base
            cdnaCodonStart = cdnaCodonStart - 1;

            // See ac289d2fcd10c8a0af3163b0fca6ffc149ba915a
            // Use cdnaCodonStart + 3, not transcript.getCdnaCodingEnd()
            // if the variant is the penultimate NT in the sequence, then transcript.cdnaCodingEnd can be too short and product a codon
            // of size = 2, generating an exception.
            int cdnaCodonEnd = cdnaCodonStart + 3;
            if (cdnaCodonStart >= 0 && transcript.getCdnaSequence().length() >= cdnaCodonEnd) {
                return transcript.getCdnaSequence().substring(cdnaCodonStart, cdnaCodonEnd);
            }
        }
        return "";
    }

    /**
     * Returns the codon position 1-based for a CDS position.
     * @param cdsPosition CDS position
     * @return Codon position
     */
    public int getCodonPosition(int cdsPosition) {
        // cdsPosition might need adjusting for transcripts with unclear start
        // Found GRCh38 transcript which does not have the unconfirmed start flag BUT the first aa is an X;
        // ENST00000618610 (ENSP00000484524)
        if (hasUnconfirmedStart()) {
            int firstCodonPhase = getFirstCodonPhase();
            // firstCodingExonPhase is the ENSEMBL's annotated phase for the transcript, which takes following values
            // - 0 if fits perfectly with the reading frame, i.e.
            // Sequence ---------ACTTACGGTC
            // Codons            ---|||---|||
            // - 1 if shifted one position, i.e.
            // Sequence ---------ACTTACGGTC
            // Codons             ---|||---||||
            // - 2 if shifted two positions, i.e.
            // Sequence ---------ACTTACGGTC
            // Codons              ---|||---|||
            if (firstCodonPhase != 0) {
                cdsPosition = cdsPosition + (3 - firstCodonPhase);
            }

//            if (cdsPosition > firstCodonPhase) {
//                cdsPosition += firstCodonPhase;
//            } else {
//                return 0;
//            }
        }
        // We add 1 to get the position not index
        return ((cdsPosition - 1) / 3) + 1;
    }

    /**
     * Returns the position 1-based in the codon.
     * @param cdsPosition CDS position
     * @return Valid values are 1, 2 or 3. When incomplete codon then 0 is returned.
     */
    public int getPositionAtCodon(int cdsPosition) {
        // phase might need adjusting for transcripts with unclear start
        // Found GRCh38 transcript which does not have the unconfirmed start flag BUT the first aa is an X;
        // ENST00000618610 (ENSP00000484524)
        if (hasUnconfirmedStart()) {
            int firstCodonPhase = getFirstCodonPhase();
            // firstCodingExonPhase is the ENSEMBL's annotated phase for the transcript, which takes following values
            // - 0 if fits perfectly with the reading frame, i.e.
            // Sequence ---------ACTTACGGTC
            // Codons            ---|||---|||
            // - 1 if shifted one position, i.e.
            // Sequence ---------ACTTACGGTC
            // Codons             ---|||---||||
            // - 2 if shifted two positions, i.e.
            // Sequence ---------ACTTACGGTC
            // Codons              ---|||---|||
//            if (firstCodonPhase != 0) {
//                cdsPosition = cdsPosition + (3 - firstCodonPhase);
//            }
//            System.out.println("cdsPosition = " + cdsPosition);
//            System.out.println("firstCodonPhase = " + firstCodonPhase);
            if (cdsPosition > firstCodonPhase) {
                cdsPosition -= firstCodonPhase;
//                cdsPosition += (3 - firstCodonPhase);
            } else {
                // If CDS position belongs to the first incomplete codon we return 0
                return 0;
            }
        }
        // We add 1 to get the position not index
        return ((cdsPosition - 1) % 3) + 1;
    }

    public String getXrefId(String sourceDatabase) {
        List xrefs = transcript.getXrefs();
         if (xrefs != null && !xrefs.isEmpty()) {
            for (Xref xref : xrefs) {
                if (sourceDatabase.equals(xref.getDbName())) {
                    return xref.getId();
                }
            }
        }
        return null;
    }

    public boolean isExonic(int genomicPosition) {
        for (Exon exon : transcript.getExons()) {
            if (genomicPosition >= exon.getStart() && genomicPosition <= exon.getEnd()) {
                return true;
            }
        }
        return false;
    }

    public boolean isExonic(int start, int end) {
        for (Exon exon : transcript.getExons()) {
            if (start == end) {
                if (start >= exon.getStart() && start <= exon.getEnd()) {
                    return true;
                }
            } else {
                if (start >= exon.getStart() && start <= exon.getEnd() || end >= exon.getStart() && end <= exon.getEnd()) {
                    return true;
                }
            }
        }
        return false;
    }

    public boolean isExonicSpliceSite(int position) {
        for (Exon exon : transcript.getExons()) {
            if (position > exon.getEnd() - 2 && position <= exon.getEnd()) {
                return true;
            }
        }
        return false;
    }

    public boolean isExonicSpliceSite(int start, int end) {
        for (Exon exon : transcript.getExons()) {
            if ((start > exon.getEnd() - 2 && start <= exon.getEnd()) || (end > exon.getEnd() - 2 && end <= exon.getEnd())) {
                return true;
            }
        }
        return false;
    }

    public String getFormattedCdnaSequence() {
        return this.getFormattedCdnaSequence(transcript.getCdnaSequence());
    }

    /**
     * Calculates de cDNA sequence applying the genomic variant provided. The type of variant (SNV, INDEL, ...), the transcript
     * unconfirmed start, the strand and the UTRs are taken care of.
     * @param variant A genomic variant with the coordinates dna the alleles
     * @return Alternate cDNA sequence caulculated.
     */
    protected String getAlternateCdnaSequence(Variant variant) {
        if (variant.getEnd() < transcript.getGenomicCodingStart() || variant.getStart() > transcript.getGenomicCodingEnd()) {
            return transcript.getCdnaSequence();
        }

        // Create a StringBuilder to easily mannipulate the string
        StringBuilder alternateCdnaSequence = new StringBuilder(transcript.getCdnaSequence());
        String alternate = transcript.getStrand().equals("+")
                ? variant.getAlternate()
                : VariantAnnotationUtils.reverseComplement(variant.getAlternate());

        // Genomic to cDNA
        int variantCdsPosition = HgvsCalculator.getCdsStart(transcript, variant.getStart());
        int cdnaVariantIndex = cdsToCdna(variantCdsPosition) - 1;

        switch (variant.getType()) {
            case SNV:
                if (transcript.getStrand().equals("-")) {
                    // FIXME a error on HgvsCalculator.getCdsStart
                    cdnaVariantIndex--;
                }
                alternateCdnaSequence.setCharAt(cdnaVariantIndex, alternate.charAt(0));
                break;
            case INSERTION:
            case DELETION:
            case INDEL:
                if (StringUtils.isBlank(variant.getReference()) || variant.getReference().equals("-")) {
                    // Insertion
                    alternateCdnaSequence.insert(cdnaVariantIndex, alternate);
                } else {
                    // Deletion
                    if (StringUtils.isBlank(variant.getAlternate()) || variant.getAlternate().equals("-")) {

                        String referenceAllele = variant.getReference();
                        if (transcript.getStrand().equals("+")) {
                            if (variant.getStart() < transcript.getGenomicCodingStart()
                                    && variant.getEnd() >= transcript.getGenomicCodingStart()) {
                                referenceAllele = referenceAllele.substring(transcript.getGenomicCodingStart() - variant.getStart());
                                variantCdsPosition = HgvsCalculator.getCdsStart(transcript, transcript.getGenomicCodingStart());
                            } else {
                                boolean intronExonBoundary = false;
                                Exon overlapExon = null;
                                for (Exon exon : transcript.getExons()) {
                                    if (variant.getStart() < exon.getStart() && variant.getEnd() >= exon.getEnd()) {
                                        intronExonBoundary = true;
                                        overlapExon = exon;
                                        break;
                                    }
                                }
                                if (intronExonBoundary) {
                                    referenceAllele = referenceAllele.substring(overlapExon.getGenomicCodingStart() - variant.getStart());
                                    variantCdsPosition = HgvsCalculator.getCdsStart(transcript, overlapExon.getGenomicCodingStart());
                                } else {
                                    // Normal exonic deletion
                                    variantCdsPosition = HgvsCalculator.getCdsStart(transcript, variant.getStart());
                                }
                            }
                        } else {
                            if (variant.getStart() <= transcript.getGenomicCodingEnd()
                                    && variant.getEnd() > transcript.getGenomicCodingEnd()) {
                                referenceAllele = referenceAllele.substring(0, transcript.getGenomicCodingEnd() - variant.getStart() + 1);
                                variantCdsPosition = HgvsCalculator.getCdsStart(transcript, transcript.getGenomicCodingEnd());
                            } else {
//                                boolean intronExonBoundary = false;
//                                Exon overlapExon = null;
//                                for (Exon exon : transcript.getExons()) {
//                                    if (variant.getStart() <= exon.getEnd() && variant.getEnd() > exon.getEnd()) {
//                                        intronExonBoundary = true;
//                                        overlapExon = exon;
//                                        break;
//                                    }
//                                }
//                                if (intronExonBoundary) {
//                                    referenceAllele = referenceAllele.substring(overlapExon.getGenomicCodingStart() - variant.getStart());
//                                    variantCdsPosition = HgvsCalculator.getCdsStart(transcript, overlapExon.getGenomicCodingEnd());
//                                } else {
//                                    // Normal exonic deletion
//                                    variantCdsPosition = HgvsCalculator.getCdsStart(transcript, variant.getEnd());
//                                }
                                variantCdsPosition = HgvsCalculator.getCdsStart(transcript, variant.getEnd());
                            }
                            // FIXME Method HgvsCalculator.getCdsStart wringly returns +1 for reverse strand
                            variantCdsPosition--;
                        }
                        cdnaVariantIndex = cdsToCdna(variantCdsPosition) - 1;
                        alternateCdnaSequence.replace(cdnaVariantIndex, cdnaVariantIndex + referenceAllele.length(), "");
                    } else {
                        System.out.println("No valid INDEL variant: " + variant.getId());
                        return null;
                    }
                }
                break;
            default:
                return null;
        }
        return alternateCdnaSequence.toString();
    }

    public String getFormattedCdnaSequence(String cdnaSequence) {
        StringBuilder cdnaPositions = new StringBuilder();
        StringBuilder codonPositions = new StringBuilder();
        StringBuilder formattedCdnaSequence = new StringBuilder();
        StringBuilder aaPositions = new StringBuilder();
        StringBuilder proteinSequence = new StringBuilder();
        StringBuilder proteinCodedSequence = new StringBuilder();

        String separator = "    ";

        int position = 1;
        int aaPosition = 1;
        int transcriptPhase = (transcript.getCdnaCodingStart() - 1) % 3;

        // Unconfirmed start transcripts ALWAYS start at position 1
        if (hasUnconfirmedStart()) {
            position = transcriptPhase != 0 ? 0 : 1;
            transcriptPhase = getFirstCodonPhase();
        }

        if (transcriptPhase > 0) {
            cdnaPositions.append(StringUtils.rightPad(String.valueOf(position == 0 ? 1 : position), transcriptPhase)).append(separator);
            codonPositions.append(StringUtils.rightPad(String.valueOf(position++), transcriptPhase)).append(separator);
            formattedCdnaSequence.append(cdnaSequence.substring(0, transcriptPhase)).append(separator);

            if (transcript.getProteinSequence().startsWith("X")) {
                aaPositions.append(StringUtils.rightPad("1", transcriptPhase)).append(separator);
                proteinSequence.append(StringUtils.rightPad("X", transcriptPhase)).append(separator);
                proteinCodedSequence.append(StringUtils.rightPad("X", transcriptPhase)).append(separator);
                aaPosition++;
            } else {
                aaPositions.append(StringUtils.repeat(' ', transcriptPhase)).append(separator);
                proteinSequence.append(StringUtils.repeat(' ', transcriptPhase)).append(separator);
                proteinCodedSequence.append(StringUtils.repeat(' ', transcriptPhase)).append(separator);
            }
        }

        String codon, aa;
        boolean insideCodingSequence = transcript.getCdnaCodingStart() == 1;
        for (int i = transcriptPhase; i < cdnaSequence.length(); i += 3) {
            cdnaPositions.append(StringUtils.rightPad(String.valueOf(i + 1), 4)).append("   ");
            codonPositions.append(StringUtils.rightPad(String.valueOf(position++), 4)).append("   ");
            codon = cdnaSequence.substring(i, Math.min(i + 3, cdnaSequence.length()));
            formattedCdnaSequence.append(codon).append(separator);

            // We are now inside the protein coding region
            if (!insideCodingSequence && i == transcript.getCdnaCodingStart() - 1) {
                insideCodingSequence = true;
            }

            if (insideCodingSequence) {
                aa = VariantAnnotationUtils.getAminoacid(VariantAnnotationUtils.MT.equals(transcript.getChromosome()), codon);
                if (aa != null && aa.equals("STOP")) {
                    // STOP has 4 letters so me need to remove 1 white space
                    proteinSequence.append(aa).append("   ");
                    insideCodingSequence = false;
                } else {
                    proteinSequence.append(aa).append(separator);
                }
                aaPositions.append(StringUtils.rightPad(String.valueOf(aaPosition++), 4)).append("   ");
                proteinCodedSequence.append(StringUtils.rightPad(VariantAnnotationUtils.TO_ABBREVIATED_AA.get(aa), 4)).append("   ");
            } else {
                aaPositions.append(" - ").append(separator);
                proteinSequence.append(" - ").append(separator);
                proteinCodedSequence.append(" - ").append(separator);
            }
        }

        return cdnaPositions.toString() + "\n"
                + codonPositions.toString() + "\n"
                + formattedCdnaSequence.toString() + "\n"
                + StringUtils.repeat('-', cdnaPositions.length()) + "\n"
                + aaPositions.toString() + "\n"
                + proteinSequence.toString() + "\n"
                + proteinCodedSequence.toString();
    }

    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder("TranscriptUtils{");
        sb.append("transcript=").append(transcript);
        sb.append('}');
        return sb.toString();
    }

    public Transcript getTranscript() {
        return transcript;
    }

    public TranscriptUtils setTranscript(Transcript transcript) {
        this.transcript = transcript;
        return this;
    }


}