picard.annotation.RefFlatReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of picard Show documentation
picard
There is a newer version: 3.2.0
/*
 * The MIT License
 *
 * Copyright (c) 2011 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package picard.annotation;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.OverlapDetector;
import picard.annotation.Gene.Transcript;
import picard.annotation.Gene.Transcript.Exon;
import picard.util.TabbedTextFileWithHeaderParser;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Loads gene annotations from a refFlat file into an OverlapDetector.  Discards annotations that are not
 * internally consistent, e.g. transcripts on different chromosomes or different strands.
 */
public class RefFlatReader {
    private static final Log LOG = Log.getInstance(RefFlatReader.class);
    // These are in the order that columns appear in refFlat format.
    public enum RefFlatColumns{GENE_NAME, TRANSCRIPT_NAME, CHROMOSOME, STRAND, TX_START, TX_END, CDS_START, CDS_END,
        EXON_COUNT, EXON_STARTS, EXON_ENDS}

    private static final String[] RefFlatColumnLabels = new String[RefFlatColumns.values().length];

    static {
        for (int i = 0; i < RefFlatColumnLabels.length; ++i) {
            RefFlatColumnLabels[i] = RefFlatColumns.values()[i].name();
        }
    }

    private final File refFlatFile;
    private final SAMSequenceDictionary sequenceDictionary;

    RefFlatReader(final File refFlatFile, final SAMSequenceDictionary sequenceDictionary) {
        this.refFlatFile = refFlatFile;
        this.sequenceDictionary = sequenceDictionary;
    }

    static OverlapDetector load(final File refFlatFile, final SAMSequenceDictionary sequenceDictionary) {
        return new RefFlatReader(refFlatFile, sequenceDictionary).load();
    }

    OverlapDetector load() {
        final OverlapDetector overlapDetector = new OverlapDetector(0, 0);

        final int expectedColumns = RefFlatColumns.values().length;
        final TabbedTextFileWithHeaderParser parser = new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels);
        final Map> refFlatLinesByGene =
                new HashMap>();

        for (final TabbedTextFileWithHeaderParser.Row row : parser) {
            final int lineNumber = parser.getCurrentLineNumber(); // getCurrentLineNumber returns the number of the next line
            if (row.getFields().length != expectedColumns) {
                throw new AnnotationException("Wrong number of fields in refFlat file " + refFlatFile + " at line " +
                        lineNumber);
            }
            final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
            final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
            final String transcriptDescription = geneName + ":" + transcriptName;
            final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name());
            if (!isSequenceRecognized(chromosome)) {
                LOG.debug("Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome);
            } else {
                List transcriptLines = refFlatLinesByGene.get(geneName);
                if (transcriptLines == null) {
                    transcriptLines = new ArrayList();
                    refFlatLinesByGene.put(geneName, transcriptLines);
                }
                transcriptLines.add(row);
            }
        }

        int longestInterval = 0;
        int numIntervalsOver1MB = 0;

        for (final List transcriptLines : refFlatLinesByGene.values()) {
            try {
                final Gene gene = makeGeneFromRefFlatLines(transcriptLines);
                overlapDetector.addLhs(gene, gene);
                if (gene.length() > longestInterval) longestInterval = gene.length();
                if (gene.length() > 1000000) ++numIntervalsOver1MB;
            } catch (AnnotationException e) {
                LOG.debug(e.getMessage() + " -- skipping");
            }
        }
        LOG.debug("Longest gene: " + longestInterval + "; number of genes > 1MB: " + numIntervalsOver1MB);
        return overlapDetector;
    }

    private boolean isSequenceRecognized(final String sequence) {
        return (sequenceDictionary.getSequence(sequence) != null);
    }


    private Gene makeGeneFromRefFlatLines(final List transcriptLines) {
        final String geneName = transcriptLines.get(0).getField(RefFlatColumns.GENE_NAME.name());
        final String strandStr = transcriptLines.get(0).getField(RefFlatColumns.STRAND.name());
        final boolean negative = strandStr.equals("-");
        final String chromosome = transcriptLines.get(0).getField(RefFlatColumns.CHROMOSOME.name());

        // Figure out the extend of the gene
        int start = Integer.MAX_VALUE;
        int end = Integer.MIN_VALUE;
        for (final TabbedTextFileWithHeaderParser.Row row: transcriptLines) {
            start = Math.min(start, row.getIntegerField(RefFlatColumns.TX_START.name()) + 1);
            end   = Math.max(end,   row.getIntegerField(RefFlatColumns.TX_END.name()));
        }

        final Gene gene = new Gene(chromosome, start, end, negative, geneName);

        for (final TabbedTextFileWithHeaderParser.Row row: transcriptLines) {
            if (!strandStr.equals(row.getField(RefFlatColumns.STRAND.name()))) {
                throw new AnnotationException("Strand disagreement in refFlat file for gene " + geneName);
            }
            if (!chromosome.equals(row.getField(RefFlatColumns.CHROMOSOME.name()))) {
                throw new AnnotationException("Chromosome disagreement(" + chromosome + " != " + row.getField(RefFlatColumns.CHROMOSOME.name()) +
                                                      ") in refFlat file for gene " + geneName);
            }

            // This adds it to the Gene also
            final Transcript tx = makeTranscriptFromRefFlatLine(gene, row);
        }

        return gene;
    }

    /**
     * Conversion from 0-based half-open to 1-based inclusive intervals is done here.
     */
    private Gene.Transcript makeTranscriptFromRefFlatLine(final Gene gene, final TabbedTextFileWithHeaderParser.Row row) {
        final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
        final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
        final String transcriptDescription = geneName + ":" + transcriptName;
        final int exonCount = Integer.parseInt(row.getField(RefFlatColumns.EXON_COUNT.name()));
        final String[] exonStarts = row.getField(RefFlatColumns.EXON_STARTS.name()).split(",");
        final String[] exonEnds = row.getField(RefFlatColumns.EXON_ENDS.name()).split(",");

        if (exonCount != exonStarts.length) {
            throw new AnnotationException("Number of exon starts does not agree with number of exons for " + transcriptDescription);
        }
        if (exonCount != exonEnds.length) {
            throw new AnnotationException("Number of exon ends does not agree with number of exons for " + transcriptDescription);
        }

        final int transcriptionStart = row.getIntegerField(RefFlatColumns.TX_START.name()) + 1;
        final int transcriptionEnd = row.getIntegerField(RefFlatColumns.TX_END.name());
        final int codingStart = row.getIntegerField(RefFlatColumns.CDS_START.name()) + 1;
        final int codingEnd = row.getIntegerField(RefFlatColumns.CDS_END.name());

        final Transcript tx = gene.addTranscript(transcriptName, transcriptionStart, transcriptionEnd, codingStart, codingEnd, exonCount);

        for (int i = 0; i < exonCount; ++i) {
            final Exon e = tx.addExon(Integer.parseInt(exonStarts[i]) + 1, Integer.parseInt(exonEnds[i]));

            if (e.start > e.end) {
                throw new AnnotationException("Exon has 0 or negative extent for " + transcriptDescription);
            }
            if (i > 0 && tx.exons[i-1].end >= tx.exons[i].start) {
                throw new AnnotationException("Exons overlap for " + transcriptDescription);
            }
        }

        return tx;
    }
}