All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.utils.codecs.gtf;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.Feature;
import htsjdk.tribble.annotation.Strand;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * A {@link GencodeGtfFeature} represents data in a GENCODE GTF file.
 *
 * Features are grouped logically by related data.
 * While the abstract class {@link GencodeGtfFeature} represents a single line
 * of a GENCODE GTF File, the concrete instantiations represent at least one line,
 * and often more than one.
 *
 * For example, a {@link GencodeGtfGeneFeature} represents all lines in the given
 * data file with information on a particular gene.  This includes all transcripts,
 * exons, coding regions, etc. in that gene.
 *
 * Similarly, a {@link GencodeGtfTranscriptFeature} represents all lines in the given
 * data file with information on a particular transcript.
 *
 * However, a {@link GencodeGtfSelenocysteineFeature} represents a particular line
 * in the given data file that contains information on a specific selenocysteine.
 *
 * The specification of a GTF file is defined here:
 * http://mblab.wustl.edu/GTF22.html
 *
 * Currently only supports GENCODE versions 19-26.
 *
 * Created by jonn on 7/21/17.
 */
public abstract class GencodeGtfFeature implements Feature, Comparable {

    private static final Logger logger = LogManager.getLogger(GencodeGtfFeature.class);

    // ===========================================================================


    public static final String ANNOTATION_SOURCE_ENSEMBL = "ENSEMBL";
    public static final String ANNOTATION_SOURCE_HAVANA = "HAVANA";
    public static final String ANNOTATION_SOURCE_ENA = "ena";

    // ===========================================================================

    // Metadata fields:

    private static final String FIELD_DELIMITER                 = "\t";

    public static final int NO_FEATURE_ORDER                    = -1;
    public static final int NO_EXON_NUMBER                      = -1;

    private static final int NUM_FIELDS                         = 9;

    private static final int CHROMOSOME_NAME_INDEX              = 0;
    private static final int ANNOTATION_SOURCE_INDEX            = 1;
    private static final int FEATURE_TYPE_INDEX                 = 2;
    private static final int START_LOCATION_INDEX               = 3;
    private static final int END_LOCATION_INDEX                 = 4;
    private static final int GENOMIC_STRAND_INDEX               = 6;
    private static final int GENOMIC_PHASE_INDEX                = 7;
    private static final int EXTRA_FIELDS_INDEX                 = 8;

    private static final String EXTRA_FIELD_DELIMITER           = ";";

    private static final int EXTRA_FIELD_KEY_INDEX              = 0;
    private static final int EXTRA_FIELD_VALUE_INDEX            = 1;
    public static final String EXTRA_FIELD_KEY_VALUE_SPLITTER   = " ";

    private static final Pattern NUMBER_PATTERN                 = Pattern.compile("\\d\\d*");

    private String ucscGenomeVersion =  null;
    @VisibleForTesting
    final GencodeGtfFeatureBaseData baseData;

    // ================================================================================================

    /**
     * Populate this GencodeGtfFeature with the given data.
     * @param gtfFields {@link String[]} containing an ordered list of fields to use to populate this {@link GencodeGtfFeature}.
     * @param gtfFileType A {@link String} containing the file type of the GTF data that created this {@link GencodeGtfFeature}.
     */
    protected GencodeGtfFeature(final String[] gtfFields, final String gtfFileType) {

        Utils.validateArg(gtfFields.length == NUM_FIELDS, "Unexpected number of fields: " + gtfFields.length + " != " + NUM_FIELDS);

        baseData = new GencodeGtfFeatureBaseData();

        try {
            baseData.genomicPosition = new SimpleInterval(
                    gtfFields[CHROMOSOME_NAME_INDEX],
                    Integer.valueOf(gtfFields[START_LOCATION_INDEX]),
                    Integer.valueOf(gtfFields[END_LOCATION_INDEX])
            );
        }
        catch (final NumberFormatException ex) {
            throw new UserException.MalformedFile("Cannot read integer value for start/end position!");
        }

        baseData.gtfSourceFileType       = gtfFileType;

        baseData.annotationSource        = gtfFields[ANNOTATION_SOURCE_INDEX];
        baseData.featureType             = GencodeGtfFeature.FeatureType.getEnum( gtfFields[FEATURE_TYPE_INDEX].toLowerCase() );
        baseData.genomicStrand           = convertStringToStrand( gtfFields[GENOMIC_STRAND_INDEX] );
        baseData.genomicPhase            = GenomicPhase.getEnum( gtfFields[GENOMIC_PHASE_INDEX] );

        // Get the extra fields from the last column:
        final String[] extraFields    = gtfFields[EXTRA_FIELDS_INDEX].split(EXTRA_FIELD_DELIMITER, -1);

        final StringBuilder anonymousOptionalFieldBuilder = new StringBuilder();

        // Now there are "optional" fields to go through (some actually required, some actually optional),
        // But we need to match up the field names to the fields themselves:
        for ( final String extraField : extraFields ) {

            final String trimmedExtraField = extraField.trim();
            if (trimmedExtraField.isEmpty()) {
                continue;
            }

            final int splitPoint = trimmedExtraField.indexOf(EXTRA_FIELD_KEY_VALUE_SPLITTER);
            if( splitPoint == -1 ) {
                throw new UserException.MalformedFile("Extraneous optional field data - not in a key/value pair: " + extraField);
            }

            final String fieldName = trimmedExtraField.substring(0, splitPoint).trim();

            // The value of the field may be between two quotes.
            // We remove them here.
            final String rawFieldValue = trimmedExtraField.substring(splitPoint + 1, trimmedExtraField.length());
            final String fieldValue = StringUtils.remove(rawFieldValue.trim(), '"');

            if( fieldValue.contains(EXTRA_FIELD_KEY_VALUE_SPLITTER) ){
                throw new UserException("Expected a key/value pair but found several values " + fieldName + "/" + fieldValue);
            }

            OptionalField optionalField = null;

            switch (fieldName) {
                // Find the right field to set:
                case GencodeGTFFieldConstants.GENE_ID:
                    baseData.geneId = fieldValue;
                    break;
                case GencodeGTFFieldConstants.TRANSCRIPT_ID:
                    baseData.transcriptId = fieldValue;
                    break;
                case GencodeGTFFieldConstants.GENE_TYPE:
                    baseData.geneType = fieldValue;
                    break;
                // For ENSEMBL GTF files:
                case GencodeGTFFieldConstants.GENE_BIOTYPE:
                    baseData.geneType = fieldValue;
                    break;
                case GencodeGTFFieldConstants.GENE_STATUS:
                    baseData.geneStatus = fieldValue;
                    break;
                case GencodeGTFFieldConstants.GENE_NAME:
                    baseData.geneName = fieldValue;
                    break;
                case GencodeGTFFieldConstants.TRANSCRIPT_TYPE:
                    baseData.transcriptType = fieldValue;
                    break;
                case GencodeGTFFieldConstants.TRANSCRIPT_BIOTYPE:
                    baseData.transcriptType = fieldValue;
                    break;
                case GencodeGTFFieldConstants.TRANSCRIPT_STATUS:
                    baseData.transcriptStatus = fieldValue;
                    break;
                case GencodeGTFFieldConstants.TRANSCRIPT_NAME:
                    baseData.transcriptName = fieldValue;
                    break;
                case GencodeGTFFieldConstants.EXON_NUMBER:
                    try {
                        baseData.exonNumber = Integer.valueOf(fieldValue);
                    }
                    catch (final NumberFormatException ex) {
                        throw new UserException.MalformedFile("Could not convert field value into integer: " + fieldValue);
                    }
                    break;
                case GencodeGTFFieldConstants.EXON_ID:
                    baseData.exonId = fieldValue;
                    break;
                case GencodeGTFFieldConstants.LEVEL:
                    baseData.locusLevel = fieldValue;
                    break;
                default:
                    optionalField = new OptionalField<>(fieldName, fieldValue);
                    break;
            }

            // If the optional field was good, we add it:
            if ( optionalField != null ) {
                baseData.optionalFields.add(optionalField);
            }
        }
    }

    /**
     * Converts the given {@link String} into a {@link Strand}.
     * @param s {@link String} to convert into a {@link Strand}.
     * @return The {@link Strand} corresponding to {@code s}.
     */
    private static Strand convertStringToStrand( final String s ) {
        if ( s.equals("+") ) {
            return Strand.POSITIVE;
        }
        else if ( s.equals("-") ) {
            return Strand.NEGATIVE;
        }
        else {
            throw new IllegalArgumentException("Unexpected value: " + s);
        }
    }

    /**
     * Populate this GencodeGtfFeature with the given data.
     */
    protected GencodeGtfFeature(final GencodeGtfFeatureBaseData baseData) {
        this.baseData = baseData;
    }

    // ================================================================================================

    /**
     * Create the appropriate {@link GencodeGtfFeature} object based on the given {@code baseData}
     * @param baseData A {@link GencodeGtfFeatureBaseData} object containing all data for a single line in a GENCODE GTF File.
     * @return A {@link GencodeGtfFeature} containing the data in {@code baseData}
     */
    public static GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
        Utils.nonNull(baseData);

        // Create our feature:
        return baseData.featureType.create(baseData);
    }

    /**
     * Create a {@link GencodeGtfFeature} based on a line from a Gencode GTF File.
     * @param gtfLine A line from a Gencode GTF File to convert into a {@link GencodeGtfFeature} object.
     * @param gtfFileType A {@link String} containing the file type of the GTF data that created this {@link GencodeGtfFeature}.
     * @return The {@link GencodeGtfFeature} representing the information in {@code gtfLine}
     */
    public static GencodeGtfFeature create(final String gtfLine, final String gtfFileType) {
        Utils.nonNull(gtfLine);
        return create(gtfLine.split(FIELD_DELIMITER), gtfFileType);
    }

    /**
     * Create a {@link GencodeGtfFeature} based on a line from a Gencode GTF File.
     * @param gtfFields A line from a Gencode GTF File split on the {@link #FIELD_DELIMITER} character.
     * @param gtfFileType A {@link String} containing the file type of the GTF data that created this {@link GencodeGtfFeature}.
     * @return The {@link GencodeGtfFeature} representing the information in {@code gtfLine}
     */
    public static GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
        Utils.nonNull(gtfFields);

        // Ensure that the input data are superficially well-formed:
        if ( gtfFields.length != GencodeGtfCodec.NUM_COLUMNS ) {
            throw new UserException.MalformedFile("Invalid number of fields in the given GENCODE line " +
                    " - Given: " + gtfFields.length + " Expected: " + GencodeGtfCodec.NUM_COLUMNS);
        }

        final FeatureType featureType = FeatureType.getEnum( gtfFields[FEATURE_TYPE_INDEX] );

        // Return our feature:
        return featureType.create(gtfFields, gtfFileType);
    }

    // ================================================================================================

    @Override
    public String getContig() {
        return baseData.genomicPosition.getContig();
    }

    @Override
    public int getStart() {
        return baseData.genomicPosition.getStart();
    }

    @Override
    public int getEnd() {
        return baseData.genomicPosition.getEnd();
    }

    // ================================================================================================

    /**
     * Get all the features from this {@link GencodeGtfFeature} itself.
     * This is useful to get any subfeatures included in this {@link GencodeGtfFeature}.
     * @return A {@link List} of the features represented in this {@link GencodeGtfFeature}.
     */
    @VisibleForTesting
    public List getAllFeatures() {
        final List list = new ArrayList<>();
        list.add(this);
        return list;
    }


    /**
     * Get all the {@link GencodeGtfFeatureBaseData} objects from this {@link GencodeGtfFeature} itself.
     * This is useful to get any subfeatures included in this {@link GencodeGtfFeature}.
     * @return A {@link List} of the {@link GencodeGtfFeatureBaseData} objects represented in this {@link GencodeGtfFeature}.
     */
    @VisibleForTesting
    List GencodeGtfFeatureBaseData() {
        final List baseDataParts = new ArrayList<>();
        baseDataParts.add(this.baseData);
        getAllFeatures().forEach(feature -> baseDataParts.add(feature.baseData));
        return baseDataParts;
    }

    /**
     * Serializes the base data in {@link GencodeGtfFeature} to a string.
     * @return a {@link String} representing this {@link GencodeGtfFeature}
     */
    private String serializeToStringHelper() {

        final StringBuilder stringBuilder = new StringBuilder();

        stringBuilder.append( baseData.genomicPosition.getContig() );
        stringBuilder.append( '\t' );
        stringBuilder.append( baseData.annotationSource );
        stringBuilder.append( '\t' );
        stringBuilder.append( baseData.featureType );
        stringBuilder.append( '\t' );
        stringBuilder.append( baseData.genomicPosition.getStart() );
        stringBuilder.append( '\t' );
        stringBuilder.append( baseData.genomicPosition.getEnd() );
        stringBuilder.append( "\t.\t" );
        stringBuilder.append( baseData.genomicStrand );
        stringBuilder.append( '\t' );
        stringBuilder.append( baseData.genomicPhase );
        stringBuilder.append( '\t' );

        if ( baseData.geneId != null ) {
            stringBuilder.append("gene_id \"");
            stringBuilder.append(baseData.geneId);
            stringBuilder.append( "\"; " );
        }
        if ( baseData.transcriptId != null) {
            stringBuilder.append("transcript_id \"");
            stringBuilder.append(baseData.transcriptId);
            stringBuilder.append( "\"; " );
        }
        if ( baseData.geneType != null ) {
            stringBuilder.append("gene_type \"");
            stringBuilder.append(baseData.geneType);
            stringBuilder.append( "\"; " );
        }
        if ( baseData.geneStatus != null ) {
            stringBuilder.append("gene_status \"");
            stringBuilder.append(baseData.geneStatus);
            stringBuilder.append( "\"; " );
        }
        if ( baseData.geneName != null ) {
            stringBuilder.append("gene_name \"");
            stringBuilder.append(baseData.geneName);
            stringBuilder.append( "\"; " );
        }
        if ( baseData.transcriptType != null ) {
            stringBuilder.append("transcript_type \"");
            stringBuilder.append(baseData.transcriptType);
            stringBuilder.append( "\"; " );
        }
        if ( baseData.transcriptStatus != null ) {
            stringBuilder.append("transcript_status \"");
            stringBuilder.append(baseData.transcriptStatus);
            stringBuilder.append( "\"; " );
        }
        if ( baseData.transcriptName != null ) {
            stringBuilder.append("transcript_name \"");
            stringBuilder.append(baseData.transcriptName);
            stringBuilder.append( "\"; " );
        }
        if ( baseData.exonNumber != NO_EXON_NUMBER ) {
            stringBuilder.append("exon_number ");
            stringBuilder.append(baseData.exonNumber);
            stringBuilder.append( "; " );
        }
        if ( baseData.exonId != null) {
            stringBuilder.append("exon_id \"");
            stringBuilder.append(baseData.exonId);
            stringBuilder.append( "\"; ");
        }
        if (baseData.locusLevel != null) {
            stringBuilder.append("level ");
            stringBuilder.append(baseData.locusLevel);
            stringBuilder.append("; ");
        }

        // = = = = = = = = = = = = = = = = = = = = = = =

        // Output our optional fields:
        stringBuilder.append(
                baseData.optionalFields.stream().map(Object::toString).collect(Collectors.joining(" "))
        );

        return stringBuilder.toString().trim();
    }

    /**
     * Serializes all data in {@link GencodeGtfFeature} to a string.
     * This includes all subfields of child classes.
     * @return a {@link String} representing this {@link GencodeGtfFeature}
     */
    public String serializeToString() {
        final StringBuilder stringBuilder = new StringBuilder();

        final List features = getAllFeatures();
        Collections.sort( features );

        for ( final GencodeGtfFeature feature : features ) {
            stringBuilder.append( feature.serializeToStringHelper() );
            stringBuilder.append("\n");
        }

        return stringBuilder.toString().trim();
    }

    @Override
    public String toString() {
        return serializeToString();
    }

    // ================================================================================================

    public String getGtfSourceFileType() { return baseData.gtfSourceFileType; }

    public String getUcscGenomeVersion() {
        return ucscGenomeVersion;
    }

    public void setUcscGenomeVersion(final String ucscGenomeVersion) {
        this.ucscGenomeVersion = ucscGenomeVersion;
    }

    public SimpleInterval getGenomicPosition() { return baseData.genomicPosition; }

    public int getFeatureOrderNumber() { return baseData.featureOrderNumber; }

    public String getChromosomeName() {
        return baseData.genomicPosition.getContig();
    }

    public String getAnnotationSource() {
        return baseData.annotationSource;
    }

    public FeatureType getFeatureType() {
        return baseData.featureType;
    }

    public int getGenomicStartLocation() {
        return baseData.genomicPosition.getStart();
    }

    public int getGenomicEndLocation() {
        return baseData.genomicPosition.getEnd();
    }

    public Strand getGenomicStrand() {
        return baseData.genomicStrand;
    }

    public GenomicPhase getGenomicPhase() {
        return baseData.genomicPhase;
    }

    public String getGeneId() {
        return baseData.geneId;
    }

    public String getTranscriptId() {
        return baseData.transcriptId;
    }

    public String getGeneType() {
        return baseData.geneType;
    }

    public String getGeneName() {
        return baseData.geneName;
    }

    public String getTranscriptType() {
        return baseData.transcriptType;
    }

    public String getTranscriptName() {
        return baseData.transcriptName;
    }

    public String getGeneStatus() {
        return baseData.geneStatus;
    }

    public String getTranscriptStatus() {
        return baseData.transcriptStatus;
    }

    public int getExonNumber() {
        return baseData.exonNumber;
    }

    public String getExonId() {
        return baseData.exonId;
    }

    public String getLocusLevel() {
        return baseData.locusLevel;
    }

    public List> getOptionalFields() {
        return baseData.optionalFields;
    }

    // Certian optional fields support multiple values (for example "tag"), so we need to return a list of them.
    public List> getOptionalField(final String key) {
        final List> optionalFields = new ArrayList<>();
        for (final OptionalField optionalField : baseData.optionalFields) {
            if ( optionalField.getName().equals(key) ) {
                optionalFields.add(optionalField);
            }
        }
        return optionalFields;
    }

    /**
     * Comparable interface implementation for {@link GencodeGtfFeature}.
     *
     * Order is determined by {@link GencodeGtfFeatureBaseData#featureOrderNumber}
     *
     * @param other {@link GencodeGtfFeature} to which to compare
     * @return -1 if this < other; 0 if this == other; 1 if this > other
     */
    @Override
    public int compareTo(final GencodeGtfFeature other) {
        Utils.nonNull(other);
        return (baseData.featureOrderNumber - other.baseData.featureOrderNumber);
    }

    @Override
    public boolean equals(final Object that) {
        if (that == null) {
            return false;
        }
        else if ( this == that ) {
            return true;
        }

        boolean isEqual = that instanceof GencodeGtfFeature;
        if (isEqual) {
            final GencodeGtfFeature thatFeature = (GencodeGtfFeature) that;
            isEqual = Objects.equals(baseData, thatFeature.baseData);

            if ( isEqual ) {
                isEqual = ucscGenomeVersion.equals( thatFeature.getUcscGenomeVersion() );
            }
        }

        return isEqual;
    }

    @Override
    public int hashCode() {
        return baseData != null ? baseData.hashCode() : 0;
    }

    /**
     * Checks if {@code other} is contained within this {@link GencodeGtfFeature}.
     * Comparison is made using {@link SimpleInterval#contains(Locatable)} ala {@link GencodeGtfFeatureBaseData#genomicPosition}
     * @param other {@link Locatable} of which to check the bounds.
     * @return true if {@code other} is contained within the bounds of this {@link GencodeGtfFeature}, false otherwise.
     */
    public boolean contains(final Locatable other) {
        return baseData.genomicPosition.contains(other);
    }

    /**
     * Checks if {@code other} overlaps with this {@link GencodeGtfFeature}.
     * Comparison is made using {@link SimpleInterval#overlaps(Locatable)} ala {@link GencodeGtfFeatureBaseData#genomicPosition}
     * @param other {@link Locatable}-derived class of which to check the bounds.
     * @return true if {@code other} overlaps the bounds of this {@link GencodeGtfFeature}, false otherwise.
     */
    public boolean overlaps(final Locatable other) {
        return baseData.genomicPosition.overlaps(other);
    }

    public void setFeatureOrderNumber(final int featureOrderNumber) {
        this.baseData.featureOrderNumber = featureOrderNumber;
    }

    // ================================================================================================

    static public class OptionalField {

        private String name;
        private T value;

        public OptionalField(final String name, final T value) {
            this.name = name;
            this.value = value;
        }

        public String getName() {
            return name;
        }

        public void setName(final String name) {
            this.name = name;
        }

        public T getValue() {
            return value;
        }

        public void setValue(final T value) {
            this.value = value;
        }

        @Override
        public String toString() {

            final StringBuilder sb = new StringBuilder();

            sb.append(name);
            sb.append(" ");

            // We need to do some formatting for the numbers / non-numbers in the field:
            final String valueString = value.toString();
            if ( NUMBER_PATTERN.matcher(valueString).matches() ) {
                sb.append(valueString);
                sb.append(";");
            }
            else {
                sb.append("\"");
                sb.append(valueString);
                sb.append("\";");
            }

            return sb.toString();
        }

        @Override
        public int hashCode() {
            int result = name != null ? name.hashCode() : 0;
            result = 31 * result + (value != null ? value.hashCode() : 0);
            return result;
        }

        @Override
        public boolean equals(final Object other) {

            if (other == null) {
                return false;
            }
            else if ( this == other ) {
                return true;
            }

            if ( !(other instanceof OptionalField) ) {
                return false;
            }

            final OptionalField otherOptionalField = (OptionalField) other;

            return (name.equals(otherOptionalField.name)) &&
                    (value.equals(otherOptionalField.value));
        }
    }

    // ================================================================================================



    // ================================================================================================

    /**
     * Keyword identifying the source of the feature, like a program
     * (e.g. Augustus or RepeatMasker) or an organization (like TAIR).
     *
     * For more information, see:
     *     https://www.gencodegenes.org/data_format.html
     *     https://en.wikipedia.org/wiki/General_feature_format
     */
    public enum AnnotationSource {
        ENSEMBL,
        HAVANA,
        ena // From ENSEMBLE GTFs
    }

    /**
     * Type of the feature represented in a single line of a GENCODE GTF File.
     *
     * For more information, see:
     *     https://www.gencodegenes.org/data_format.html
     *     https://en.wikipedia.org/wiki/General_feature_format
     */
    public enum FeatureType {
        GENE("gene"){
            public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
                return GencodeGtfGeneFeature.create(baseData);
            }
            public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
                return GencodeGtfGeneFeature.create(gtfFields, gtfFileType);
            }
        },
        TRANSCRIPT("transcript"){
            public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
                return GencodeGtfTranscriptFeature.create(baseData);
            }
            public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
                return GencodeGtfTranscriptFeature.create(gtfFields, gtfFileType);
            }
        },
        SELENOCYSTEINE("Selenocysteine"){
            public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
                return GencodeGtfSelenocysteineFeature.create(baseData);
            }
            public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
                return GencodeGtfSelenocysteineFeature.create(gtfFields, gtfFileType);
            }
        },
        EXON("exon"){
            public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
                return GencodeGtfExonFeature.create(baseData);
            }
            public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
                return GencodeGtfExonFeature.create(gtfFields, gtfFileType);
            }
        },
        CDS("CDS"){
            public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
                return GencodeGtfCDSFeature.create(baseData);
            }
            public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
                return GencodeGtfCDSFeature.create(gtfFields, gtfFileType);
            }
        },
        START_CODON("start_codon"){
            public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
                return GencodeGtfStartCodonFeature.create(baseData);
            }
            public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
                return GencodeGtfStartCodonFeature.create(gtfFields, gtfFileType);
            }
        },
        STOP_CODON("stop_codon"){
            public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
                return GencodeGtfStopCodonFeature.create(baseData);
            }
            public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
                return GencodeGtfStopCodonFeature.create(gtfFields, gtfFileType);
            }
        },
        UTR("UTR"){
            public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
                return GencodeGtfUTRFeature.create(baseData);
            }
            public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
                return GencodeGtfUTRFeature.create(gtfFields, gtfFileType);
            }
        };

        @SuppressWarnings("unchecked")
        private static final Map VALUE_MAP =
                Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v));

        private final String serialized;

        FeatureType(final String serializedValue) { serialized = serializedValue; }

        @Override
        public String toString() { return serialized; }

        public static FeatureType getEnum(final String s) {
            final String lowerS = s.toLowerCase();
            if ( VALUE_MAP.containsKey(lowerS) ){
                return VALUE_MAP.get(lowerS);
            }
            throw new IllegalArgumentException("Unexpected value: " + s);
        }

        /**
         * Create a {@link GencodeGtfFeature} of this type given {@code baseData}
         * @param baseData The data to use to create a {@link GencodeGtfFeature}
         * @return The {@link GencodeGtfFeature} represented by the given {@code baseData}
         */
        abstract public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData);

        /**
         * Create a {@link GencodeGtfFeature} of this type given {@code gtfFields}
         * @param gtfFields The data to use to create a {@link GencodeGtfFeature}
         * @param gtfFileType A {@link String} containing the file type of the GTF data that created this {@link GencodeGtfFeature}.
         * @return The {@link GencodeGtfFeature} represented by the given {@code gtfFields}
         */
        abstract public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType);
    }

    /**
     * Whether the first base of the CDS segment is the first (frame 0), second (frame 1) or third (frame 2) \
     * in the codon of the ORF.
     *
     * For more information, see:
     *     https://www.gencodegenes.org/data_format.html
     *     https://en.wikipedia.org/wiki/General_feature_format
     */
    public enum GenomicPhase {
        ZERO("0"),
        ONE ("1"),
        TWO ("2"),
        DOT (".");

        @SuppressWarnings("unchecked")
        private static final Map VALUE_MAP =
                Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v));

        private final String serialized;

        GenomicPhase(final String serializedValue) {
            serialized = serializedValue;
        }

        @Override
        public String toString() {
            return serialized;
        }

        public static GenomicPhase getEnum(final String s) {
            final String lowerS = s.toLowerCase();
            if ( VALUE_MAP.containsKey(lowerS) ){
                return VALUE_MAP.get(lowerS);
            }
            throw new IllegalArgumentException("Unexpected value: " + s);
        }
    }


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy