org.broadinstitute.hellbender.tools.walkers.conversion.GtfToBed Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.tools.walkers.conversion;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.*;
import htsjdk.tribble.Feature;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.WorkflowProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.util.*;

/**
 * Convert Gencode GTF files to BED format with options for gene and transcript level processing.
 * This tool allows for the extraction of gene and transcript information from Gencode GTF files and
 * outputs the data in BED format.
 *
 *
 * The conversion process includes sorting entries
 * by karyotype, providing flexibility in the selection of either gene or transcript level
 * data, and an option to only use basic tags. It ensures that the BED output is sorted and formatted correctly for subsequent use.
 * Note that it has been tested for both human and mouse Gencode GTFs. 
 *
 * Usage examples
 * Example commands to run GtfToBed for typical scenarios:
 *
 * (i) Convert GTF to BED with gene level data
 * This mode extracts and converts gene data from the input GTF file to BED format:
 *
 *  *     java -jar GtfToBed.jar \
 *     -gtf-path input.gtf \
 *     -sequence-dictionary dictionary.dict \
= *    -output output.bed \
 * 
 *
 * (ii) Convert GTF to BED with transcript level data
 * This mode extracts and converts transcript data from the input GTF file to BED format:
 *
 *  *     java -jar GtfToBed.jar \
 *     -gtf-path input.gtf \
 *     -sequence-dictionary dictionary.dict \
 *     -sort-by-transcript \
 *     -output output.bed \
 * 
 *
 * (iii) Convert GTF to BED with transcript level data, filtering for only those with the basic tag
 * This mode extracts and converts basic transcript data from the input GTF file to BED format:
 *
 *  *     java -jar GtfToBed.jar \
 *     -gtf-path input.gtf \
 *     -sequence-dictionary dictionary.dict \
 *     -sort-by-transcript \
 *     -use-basic-transcript \
 *     -output output.bed \
 * 
 *
 * (iiii)  Convert GTF to BED with transcript level data using a reference FASTA instead of DICT file.
 * This mode extracts and converts transcript data from the input GTF file to BED format using a FASTA file:
 *
 *  *     java -jar GtfToBed.jar \
 *     -gtf-path input.gtf \
 *     -reference ref.fa \
 *     -sort-by-transcript \
 *     -output output.bed \
 * 
 */

@CommandLineProgramProperties(
        summary = "Converts Gencode GTF files to Bed file format with each row of bed file being either a gene or a transcript.",
        oneLineSummary = "Gencode GTF to BED",
        programGroup = ShortVariantDiscoveryProgramGroup.class
)

@DocumentedFeature
@WorkflowProperties
public class GtfToBed extends FeatureWalker {
    public static final String SORT_BY_TRANSCRIPT_LONG_NAME = "sort-by-transcript";
    public static final String USE_BASIC_TRANSCRIPT_LONG_NAME = "use-basic-transcript";
    public static final String INPUT_LONG_NAME = "gtf-path";
    protected final Logger logger = LogManager.getLogger(this.getClass());

    @Argument(fullName = INPUT_LONG_NAME, doc = "Path to Gencode GTF file")
    public GATKPath inputFile;

    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME , doc = "Output BED file")
    public GATKPath outputFile;

    @Argument(fullName = SORT_BY_TRANSCRIPT_LONG_NAME, doc = "Make each row of BED file sorted by transcript", optional = true)
    public boolean sortByTranscript = false;

    @Argument(fullName = USE_BASIC_TRANSCRIPT_LONG_NAME, doc = "Only use basic transcripts")
    public boolean sortByBasic = false;

    //stores either gene or transcript ID and summary information about the feature
    private final Map featureInfoMap = new HashMap<>();

    //Sequence Dictionary
    private SAMSequenceDictionary sequenceDictionary = null;

    @Override
    protected boolean isAcceptableFeatureType(Class featureType) {
        return featureType.isAssignableFrom(GencodeGtfFeature.class);
    }

    // runs per line of gtf file
    @Override
    public void apply(GencodeGtfFeature feature, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {
        // list of all features of the gene
        List geneFeatures = feature.getAllFeatures();

        // process each gtf feature in the list of gene features
        for (GencodeGtfFeature gtfFeature : geneFeatures) {
            // the basic tag is in optional fields
            List> optionalFields = getOptionalFields(gtfFeature);

            // if the gtf feature is a Gene
            if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.GENE) {
                processGeneFeature(gtfFeature);
            }

            //  check if the gtf feature is a transcript. If user only wants basic transcripts check that it has the basic tag
            else if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.TRANSCRIPT) {
                if (sortByBasic) {
                    for (GencodeGtfFeature.OptionalField field : optionalFields) {
                        if ("basic".equals(field.getValue())) {
                            processTranscriptFeature(gtfFeature);
                        }
                    }
                } else {
                    processTranscriptFeature(gtfFeature);
                }
            }
        }
    }

    // gets the tag out of the list of optional fields
    private List> getOptionalFields(GencodeGtfFeature gtfFeature) {
        List> optionalFields = null;
        try {
            optionalFields = gtfFeature.getOptionalField("tag");
        } catch (Exception e) {
            logger.error("Could not retrieve optional fields: ", e);
        }
        return optionalFields;
    }

    // stores the gene ID and Interval info in hashmap
    private void processGeneFeature(GencodeGtfFeature gtfFeature) {
        final int geneStart = gtfFeature.getStart();
        final int geneEnd = gtfFeature.getEnd();
        final Interval interval = new Interval(gtfFeature.getContig(), geneStart, geneEnd);

        // put the interval, type as gene, and the name of gene
        final GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.GENE, gtfFeature.getGeneName());

        // store in hashmap with key as geneId
        featureInfoMap.put(gtfFeature.getGeneId(), gtfInfo);
    }

    // stores the transcript ID and Interval info in hashmap
    private void processTranscriptFeature(GencodeGtfFeature gtfFeature) {
        //get interval and put the interval, type as transcript, and the name of the gene it's in
        final Interval interval = new Interval(gtfFeature.getContig(), gtfFeature.getStart(), gtfFeature.getEnd());
        final GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.TRANSCRIPT, gtfFeature.getGeneName());

        //store in hashmap with key as transcriptId
        featureInfoMap.put(gtfFeature.getTranscriptId(), gtfInfo);

        //update start/end of corresponding gene if needed
        updateGeneStart(gtfFeature);
        updateGeneEnd(gtfFeature);
    }

    // update the gene interval start position based on the transcript
    private void updateGeneStart(GencodeGtfFeature gtfFeature) {
        // get the start value of the gene
        int geneStart = featureInfoMap.get(gtfFeature.getGeneId()).getStart();

        // if the transcript start is less than the gene start
        if (gtfFeature.getStart() < geneStart) {
            // set the gene start to be the transcript start
            geneStart = gtfFeature.getStart();
            updateGeneInterval(gtfFeature, geneStart, featureInfoMap.get(gtfFeature.getGeneId()).getEnd());
        }
    }

    // update the gene interval end position based on the transcript
    private void updateGeneEnd(GencodeGtfFeature gtfFeature) {
        // get the end value of the gene
        int geneEnd = featureInfoMap.get(gtfFeature.getGeneId()).getEnd();

        // if the transcript end is greater than the gene end
        if (gtfFeature.getEnd() > geneEnd) {
            // set the gene end to be the transcript end
            geneEnd = gtfFeature.getEnd();
            updateGeneInterval(gtfFeature, featureInfoMap.get(gtfFeature.getGeneId()).getStart(), geneEnd);
        }
    }

    // updates an interval of the gene if it needs to be changed
    private void updateGeneInterval(GencodeGtfFeature gtfFeature, int geneStart, int geneEnd) {
        Interval geneInterval = new Interval(gtfFeature.getContig(), geneStart, geneEnd);
        GtfInfo gtfGeneInfo = new GtfInfo(geneInterval, GtfInfo.Type.GENE, gtfFeature.getGeneName());
        featureInfoMap.put(gtfFeature.getGeneId(), gtfGeneInfo);
    }

    @Override
    public void onTraversalStart() {
        sequenceDictionary = getBestAvailableSequenceDictionary();
        if(sequenceDictionary == null){
            throw new UserException("Sequence Dictionary must be specified (" + StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME + ").");
        }
    }

    // runs immediately after it has gone through each line of gtf (apply method)
    @Override
    public Object onTraversalSuccess() {
        // create linked hash map to store sorted values of idToInfo
        LinkedHashMap karyotypeIdToInfo = getSortedMap(sequenceDictionary);

        // if user wants to sort by transcript only use transcripts else only use genes
        GtfInfo.Type selectedType = sortByTranscript ? GtfInfo.Type.TRANSCRIPT : GtfInfo.Type.GENE;
        writeToBed(selectedType, karyotypeIdToInfo);

        return null;
    }

    //Compare GtfInfo objects positionally by contig and start position. If transcripts have the same contig and start, compare by TranscriptId
    public static class GtfInfoComparator implements Comparator> {

        private final SAMSequenceDictionary dictionary;

        GtfInfoComparator(SAMSequenceDictionary dictionary) {
            this.dictionary = dictionary;
        }

        // compare two entries of a map where key = geneId or transcriptId and value = gtfInfo object
        @Override
        public int compare(Map.Entry e1, Map.Entry e2) {
            final Interval e1Interval = e1.getValue().getInterval();
            final Interval e2Interval = e2.getValue().getInterval();

            Utils.nonNull(dictionary.getSequence(e1Interval.getContig()), "could not get sequence for " + e1Interval.getContig());
            Utils.nonNull(dictionary.getSequence(e2Interval.getContig()), "could not get sequence for " + e2Interval.getContig());

            //compare by contig, then start, then by key
            return Comparator
                    .comparingInt((Map.Entry e) ->
                            dictionary.getSequence(e.getValue().getInterval().getContig()).getSequenceIndex())
                    .thenComparingInt(e -> e.getValue().getInterval().getStart())
                    .thenComparing(Map.Entry::getKey)
                    .compare(e1,e2);


        }
    }

    // sorts the map containing the features based on contig and start position
    private LinkedHashMap getSortedMap(SAMSequenceDictionary sequenceDictionary) {
        // create a list that has the keys and values of idToInfo and sort the list using GtfInfoComparator
        List> entries = new ArrayList<>(featureInfoMap.entrySet());
        entries.sort(new GtfInfoComparator(sequenceDictionary));

        // put each (sorted) entry in the list into a linked hashmap
        LinkedHashMap karyotypeIdToInfo = new LinkedHashMap<>();
        for (Map.Entry entry : entries) {
            karyotypeIdToInfo.put(entry.getKey(), entry.getValue());
        }

        return karyotypeIdToInfo;
    }

    // writes to bed file
    private void writeToBed(GtfInfo.Type type, Map sortedMap) {
        try (final OutputStream writer = Files.newOutputStream(outputFile.toPath())) {
            for (Map.Entry entry : sortedMap.entrySet()) {
                if (entry.getValue().getType() == type) {
                    String line = formatBedLine(entry, type);
                    writer.write((line + System.lineSeparator()).getBytes());
                }
            }
        } catch (IOException e) {
            throw new GATKException("Error writing to BED file", e);
        }
    }

    // formats each line of the bed file depending on whether user has selected gene or transcript
    private String formatBedLine(Map.Entry entry, GtfInfo.Type type) {
        GtfInfo info = entry.getValue();
        String line = info.getInterval().getContig() + "\t" +
                info.getInterval().getStart() + "\t" +
                info.getInterval().getEnd() + "\t" +
                info.getGeneName();

        if (type == GtfInfo.Type.TRANSCRIPT) {
            line += "," + entry.getKey();
        }

        return line;
    }

    @Override
    public GATKPath getDrivingFeaturePath() {
        return inputFile;
    }
}