org.broadinstitute.hellbender.utils.codecs.refseq.RefSeqCodec Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
There is a newer version: 4.6.0.0
package org.broadinstitute.hellbender.utils.codecs.refseq;

import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.Feature;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.readers.LineIterator;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.ArrayList;

/**
 * Allows for reading in RefSeq information
 * TODO this header needs to be rewritten
 * 
 * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop,
 * strandedness of transcription. 
 * 
 *
 * 
 * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the documentation guide here
 * https://gatkforums.broadinstitute.org/gatk/discussion/1329/where-can-i-get-a-gene-list-in-refseq-format
 * 
 *
 * You will need to consult individual walkers for the argument name
 *
 * File format example
 * If you want to define your own file for use, the format is (tab delimited):
 * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete)
 * and exon frames, for example:
 *  * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
 * 
 * for more information see here
 * 
 *     
 * 
 *
 * @since 2010
 */
public class RefSeqCodec extends AsciiFeatureCodec{

    // codec file extension
    protected static final String FILE_EXT = "refseq";
    public static final String COMMENT_LINE_CHARACTER = "#";
    public static final String LINE_DELIMITER = "\t";
    public static final int MINIMUM_LINE_FIELD_COUNT = 16;
    private boolean zero_coding_length_user_warned = false;

    // Indexes for interpreting lines:
    public static final int TRANSCRIPT_ID_INDEX = 1;
    public static final int CONTIG_INDEX = 2;
    public static final int STRAND_INDEX = 3;
    public static final int INTERVAL_LEFT_BOUND_INDEX = 4;
    public static final int INTERVAL_RIGHT_BOUND_INDEX = 5;
    public static final int CODING_START_INDEX = 6;
    public static final int CODING_STOP_INDEX = 7;
    public static final int EXON_STARTS_INDEX = 9;
    public static final int EXON_STOPS_INDEX = 10;
    public static final int GENE_NAME_INDEX = 12;
    public static final int EXON_FRAMES_INDEX = 15;


    public RefSeqCodec() {
        super(RefSeqFeature.class);
    }

    @Override
    public Feature decodeLoc(final LineIterator lineIterator) {
        final String line = lineIterator.next();
        if (line.startsWith(COMMENT_LINE_CHARACTER)){
            return null;
        }
        final String fields[] = line.split(LINE_DELIMITER);
        if (fields.length < MINIMUM_LINE_FIELD_COUNT){
            throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length);
        }
        final String contig_name = fields[CONTIG_INDEX];
        try {
            return new RefSeqFeature(new SimpleInterval(contig_name, Integer.parseInt(fields[INTERVAL_LEFT_BOUND_INDEX])+1, Integer.parseInt(fields[INTERVAL_RIGHT_BOUND_INDEX])));
        //TODO maybe except for malformed simple intervals? Genome locs had that
        } catch ( NumberFormatException e ) {
            throw new UserException.MalformedFile("Could not parse location from line: " + line);
        }
    }

    /** Fills this object from a text line in RefSeq (UCSC) text dump file */
    @Override
    public RefSeqFeature decode(String line) {
        if (line.startsWith(COMMENT_LINE_CHARACTER)) {
            return null;
        }
        String fields[] = line.split(LINE_DELIMITER);

        // we reference postion 15 in the split array below, make sure we have at least that many columns
        if (fields.length < MINIMUM_LINE_FIELD_COUNT) {
            throw new TribbleException("RefSeq (decode) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length);
        }
        String contig_name = fields[CONTIG_INDEX];
        RefSeqFeature feature = new RefSeqFeature(new SimpleInterval(contig_name, Integer.parseInt(fields[INTERVAL_LEFT_BOUND_INDEX])+1, Integer.parseInt(fields[INTERVAL_RIGHT_BOUND_INDEX])));

        feature.setTranscript_id(fields[TRANSCRIPT_ID_INDEX]);
        if ( fields[STRAND_INDEX].length()==1 && fields[STRAND_INDEX].charAt(0)=='+') {
            feature.setStrand(1);

        } else if ( fields[STRAND_INDEX].length()==1 && fields[STRAND_INDEX].charAt(0)=='-') {
            feature.setStrand(-1);

        } else {
            throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[STRAND_INDEX] + " for line=" + line);
        }

        int coding_start = Integer.parseInt(fields[CODING_START_INDEX])+1;
        int coding_stop = Integer.parseInt(fields[CODING_STOP_INDEX]);

        if ( coding_start > coding_stop ) {
            if ( ! zero_coding_length_user_warned ) {
                Utils.warnUser("RefSeq file contains transcripts with zero coding length. "+
                        "Such transcripts will be ignored (this warning is printed only once)");
                zero_coding_length_user_warned = true;
            }
            return null;
        }

        feature.setTranscript_interval(new SimpleInterval(contig_name, Integer.parseInt(fields[INTERVAL_LEFT_BOUND_INDEX])+1, Integer.parseInt(fields[INTERVAL_RIGHT_BOUND_INDEX])));
        feature.setTranscript_coding_interval(new SimpleInterval(contig_name, coding_start, coding_stop));
        feature.setGene_name(fields[GENE_NAME_INDEX]);
        String[] exon_starts = fields[EXON_STARTS_INDEX].split(",");
        String[] exon_stops = fields[EXON_STOPS_INDEX].split(",");
        String[] eframes = fields[EXON_FRAMES_INDEX].split(",");

        if ( exon_starts.length != exon_stops.length ) {
            throw new UserException.MalformedFile("Data format error: numbers of exon start and stop positions differ for line=" + line);
        }

        if ( exon_starts.length != eframes.length ) {
            throw new UserException.MalformedFile("Data format error: numbers of exons and exon frameshifts differ for line=" + line);
        }

        ArrayList exons = new ArrayList<>(exon_starts.length);
        ArrayList exon_frames = new ArrayList(eframes.length);

        for ( int i = 0 ; i < exon_starts.length  ; i++ ) {
            // NOTE, we add 1 here to account for represention issues in the exon counts
            exons.add(new SimpleInterval(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
            exon_frames.add(Integer.decode(eframes[i]));
        }

        feature.setExons(exons);
        feature.setExon_frames(exon_frames);
        return feature;
    }

    /**
     * Can the file be decoded?
     * @param path path the file to test for parsability with this codec
     * @return true if the path has the correct file extension, false otherwise
     */
    @Override
    public boolean canDecode(final String path) { return path.endsWith("." + FILE_EXT); }

    @Override
    public Object readActualHeader(LineIterator lineIterator) {
        // No header for this format
        return null;
    }
}