All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.utils.codecs.gtf.EnsemblGtfCodec Maven / Gradle / Ivy

There is a newer version: 4.6.0.0
Show newest version
package org.broadinstitute.hellbender.utils.codecs.gtf;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.readers.LineIterator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Path;
import java.util.*;

/**
 * Codec to decode data in GTF format from ENSEMBL.
 * According to ENSEMBL, GTF files downloaded from them conform to GFF version 2 (http://gmod.org/wiki/GFF2).
 */
final public class EnsemblGtfCodec extends AbstractGtfCodec {

    private static final Logger logger = LogManager.getLogger(EnsemblGtfCodec.class);

    //==================================================================================================================
    // Public Static Members:

    public static String GTF_FILE_TYPE_STRING = "ENSEMBL";

    //==================================================================================================================
    // Private Static Members:

    private static String            VERSION_FIELD    = "genome-version";
    private static String            DEFAULT_VERSION  = "ENSEMBL_DEFAULT_VERSION";
    private static final Set COMMENT_PREFIXES = Collections.unmodifiableSet(new LinkedHashSet<>(Arrays.asList("#!", "##")));

    //==================================================================================================================
    // Private Members:

    private final        List header          = new ArrayList<>();
    private              int          currentLineNum  = 1;
    private              String       version         = null;

    //==================================================================================================================
    // Constructors:

    public EnsemblGtfCodec() {
        super();
    }

    //==================================================================================================================
    // Override Methods:

    @Override
    String getGtfFileType() {
        return GTF_FILE_TYPE_STRING;
    }

    @Override
    String getDefaultLineComment() {
        return COMMENT_PREFIXES.iterator().next();
    }

    @Override
    Set getAllLineComments() {
        return COMMENT_PREFIXES;
    }

    @Override
    int getCurrentLineNumber() {
        return currentLineNum;
    }

    @Override
    List getHeader() {
        return header;
    }

    @Override
    boolean passesFileNameCheck(final String inputFilePath) {
        try {
            final Path p = IOUtil.getPath(inputFilePath);

            return p.getFileName().toString().toLowerCase().endsWith("." + GTF_FILE_EXTENSION);
        }
        catch (final FileNotFoundException ex) {
            logger.warn("File does not exist! - " + inputFilePath + " - returning name check as failure.");
        }
        catch (final IOException ex) {
            logger.warn("Caught IOException on file: " + inputFilePath + " - returning name check as failure.");
        }

        return false;
    }

    @Override
    List readActualHeader(final LineIterator reader) {

        // Read in the header lines:
        ingestHeaderLines(reader);

        // Validate our header:
        validateHeader(header, true);

        // Set our line number to be the line of the first actual Feature:
        currentLineNum = header.size() + 1;

        // Set up our version number:
        populateVersionNumber();

        return header;
    }

    @Override
    boolean validateFeatureSubtype(final GencodeGtfFeature feature) {
        return validateEnsemblGtfFeature( feature );
    }

    @Override
    void incrementLineNumber() {
        ++currentLineNum;
    }

    @Override
    String getUcscVersionNumber() {
        return version;
    }

    @Override
    /**
     * {@inheritDoc}
     *
     * Because ENSEMBL GTF files are strictly a superset of GENCODE GTF files, we need to do some extra checks here to
     * make sure that this file can NOT be decoded by {@link GencodeGtfCodec} but can still be decoded by this
     * {@link EnsemblGtfCodec}.
     */
    public boolean canDecode(final String inputFilePath) {

        // Create a GencodeGtfCodec so we can see if it will decode the input file.
        final GencodeGtfCodec gencodeGtfCodec = new GencodeGtfCodec();
        if ( gencodeGtfCodec.canDecode(inputFilePath) ) {
            // Uh oh!  We can decode this as GENCODE.
            // So we should NOT decode this as ENSEMBL.
            return false;
        }

        return super.canDecode(inputFilePath);
    }

    //==================================================================================================================
    // Static Methods:

    /**
     * Validates a given {@link GencodeGtfFeature} against a given version of the ENSEMBL GTF file spec.
     * This method ensures that all required fields are defined, but does not interrogate their values.
     * @param feature A {@link GencodeGtfFeature} to validate.  MUST NOT BE {@code null}.
     * @return True if {@code feature} contains all required fields for the given GENCODE GTF version, {@code gtfVersion}
     */
    private static boolean validateEnsemblGtfFeature(final GencodeGtfFeature feature) {

        final GencodeGtfFeature.FeatureType featureType = feature.getFeatureType();

        if ( featureType != GencodeGtfFeature.FeatureType.GENE) {
            if (feature.getTranscriptId() == null) {
                return false;
            }
            if (feature.getTranscriptType() == null) {
                return false;
            }
            if (feature.getTranscriptName() == null) {
                return false;
            }
        }

        return true;
    }

    //==================================================================================================================
    // Instance Methods:

    private void populateVersionNumber() {

        // If `genome-version` was specified in the header, we should use that.
        // Otherwise we can return a placeholder.

        String ver = DEFAULT_VERSION;

        // Attempt to get the version from the header:
        for ( final String line : header ) {
            for ( final String comment : getAllLineComments() ) {
                if ( line.startsWith(comment + VERSION_FIELD) ) {
                    ver = line.replaceFirst(comment + VERSION_FIELD + "\\s*", "").trim();
                }
            }
        }

        version = ver;
    }

    /**
     * Check if the given header of a tentative ENSEMBL GTF file is, in fact, the header to such a file.
     * @param header Header lines to check for conformity to ENSEMBL GTF specifications.
     * @param throwIfInvalid If true, will throw a {@link UserException.MalformedFile} if the header is invalid.
     * @return true if the given {@code header} is that of a ENSEMBL GTF file; false otherwise.
     */
    @VisibleForTesting
    boolean validateHeader(final List header, final boolean throwIfInvalid) {
        // As it turns out, the ENSEMBL GTF header is pretty loosy-goosy.
        // No fields are required, and therefore it could actually be empty.

        // Rather than attempting to validate the file, here we just
        // assert that all header lines begin with a comment (they should already).
        int lineNum = 1;
        for (final String line : header) {

            if ( !isLineCommented(line) ) {
                if ( throwIfInvalid ) {
                    throw new UserException.MalformedFile("ENSEMBL GTF Header line " + lineNum + " is not commented: " + line);
                }
                else {
                    return false;
                }
            }

            ++lineNum;
        }

        return true;
    }

    //==================================================================================================================
    // Helper Data Types:

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy