All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.conversion.GtfToBed Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.conversion;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.*;
import htsjdk.tribble.Feature;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.WorkflowProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.util.*;

/**
 * 

Convert Gencode GTF files to BED format with options for gene and transcript level processing. * This tool allows for the extraction of gene and transcript information from Gencode GTF files and * outputs the data in BED format.

* * *

The conversion process includes sorting entries * by karyotype, providing flexibility in the selection of either gene or transcript level * data, and an option to only use basic tags. It ensures that the BED output is sorted and formatted correctly for subsequent use. * Note that it has been tested for both human and mouse Gencode GTFs.

* *

Usage examples

*

Example commands to run GtfToBed for typical scenarios:

* *

(i) Convert GTF to BED with gene level data

*

This mode extracts and converts gene data from the input GTF file to BED format:

* *
 *     java -jar GtfToBed.jar \
 *     -gtf-path input.gtf \
 *     -sequence-dictionary dictionary.dict \
= *    -output output.bed \
 * 
* *

(ii) Convert GTF to BED with transcript level data

*

This mode extracts and converts transcript data from the input GTF file to BED format:

* *
 *     java -jar GtfToBed.jar \
 *     -gtf-path input.gtf \
 *     -sequence-dictionary dictionary.dict \
 *     -sort-by-transcript \
 *     -output output.bed \
 * 
* *

(iii) Convert GTF to BED with transcript level data, filtering for only those with the basic tag

*

This mode extracts and converts basic transcript data from the input GTF file to BED format:

* *
 *     java -jar GtfToBed.jar \
 *     -gtf-path input.gtf \
 *     -sequence-dictionary dictionary.dict \
 *     -sort-by-transcript \
 *     -use-basic-transcript \
 *     -output output.bed \
 * 
* *

(iiii)

Convert GTF to BED with transcript level data using a reference FASTA instead of DICT file. *

This mode extracts and converts transcript data from the input GTF file to BED format using a FASTA file:

* *
 *     java -jar GtfToBed.jar \
 *     -gtf-path input.gtf \
 *     -reference ref.fa \
 *     -sort-by-transcript \
 *     -output output.bed \
 * 
*/ @CommandLineProgramProperties( summary = "Converts Gencode GTF files to Bed file format with each row of bed file being either a gene or a transcript.", oneLineSummary = "Gencode GTF to BED", programGroup = ShortVariantDiscoveryProgramGroup.class ) @DocumentedFeature @WorkflowProperties public class GtfToBed extends FeatureWalker { public static final String SORT_BY_TRANSCRIPT_LONG_NAME = "sort-by-transcript"; public static final String USE_BASIC_TRANSCRIPT_LONG_NAME = "use-basic-transcript"; public static final String INPUT_LONG_NAME = "gtf-path"; protected final Logger logger = LogManager.getLogger(this.getClass()); @Argument(fullName = INPUT_LONG_NAME, doc = "Path to Gencode GTF file") public GATKPath inputFile; @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME , doc = "Output BED file") public GATKPath outputFile; @Argument(fullName = SORT_BY_TRANSCRIPT_LONG_NAME, doc = "Make each row of BED file sorted by transcript", optional = true) public boolean sortByTranscript = false; @Argument(fullName = USE_BASIC_TRANSCRIPT_LONG_NAME, doc = "Only use basic transcripts") public boolean sortByBasic = false; //stores either gene or transcript ID and summary information about the feature private final Map featureInfoMap = new HashMap<>(); //Sequence Dictionary private SAMSequenceDictionary sequenceDictionary = null; @Override protected boolean isAcceptableFeatureType(Class featureType) { return featureType.isAssignableFrom(GencodeGtfFeature.class); } // runs per line of gtf file @Override public void apply(GencodeGtfFeature feature, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { // list of all features of the gene List geneFeatures = feature.getAllFeatures(); // process each gtf feature in the list of gene features for (GencodeGtfFeature gtfFeature : geneFeatures) { // the basic tag is in optional fields List> optionalFields = getOptionalFields(gtfFeature); // if the gtf feature is a Gene if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.GENE) { processGeneFeature(gtfFeature); } // check if the gtf feature is a transcript. If user only wants basic transcripts check that it has the basic tag else if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.TRANSCRIPT) { if (sortByBasic) { for (GencodeGtfFeature.OptionalField field : optionalFields) { if ("basic".equals(field.getValue())) { processTranscriptFeature(gtfFeature); } } } else { processTranscriptFeature(gtfFeature); } } } } // gets the tag out of the list of optional fields private List> getOptionalFields(GencodeGtfFeature gtfFeature) { List> optionalFields = null; try { optionalFields = gtfFeature.getOptionalField("tag"); } catch (Exception e) { logger.error("Could not retrieve optional fields: ", e); } return optionalFields; } // stores the gene ID and Interval info in hashmap private void processGeneFeature(GencodeGtfFeature gtfFeature) { final int geneStart = gtfFeature.getStart(); final int geneEnd = gtfFeature.getEnd(); final Interval interval = new Interval(gtfFeature.getContig(), geneStart, geneEnd); // put the interval, type as gene, and the name of gene final GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.GENE, gtfFeature.getGeneName()); // store in hashmap with key as geneId featureInfoMap.put(gtfFeature.getGeneId(), gtfInfo); } // stores the transcript ID and Interval info in hashmap private void processTranscriptFeature(GencodeGtfFeature gtfFeature) { //get interval and put the interval, type as transcript, and the name of the gene it's in final Interval interval = new Interval(gtfFeature.getContig(), gtfFeature.getStart(), gtfFeature.getEnd()); final GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.TRANSCRIPT, gtfFeature.getGeneName()); //store in hashmap with key as transcriptId featureInfoMap.put(gtfFeature.getTranscriptId(), gtfInfo); //update start/end of corresponding gene if needed updateGeneStart(gtfFeature); updateGeneEnd(gtfFeature); } // update the gene interval start position based on the transcript private void updateGeneStart(GencodeGtfFeature gtfFeature) { // get the start value of the gene int geneStart = featureInfoMap.get(gtfFeature.getGeneId()).getStart(); // if the transcript start is less than the gene start if (gtfFeature.getStart() < geneStart) { // set the gene start to be the transcript start geneStart = gtfFeature.getStart(); updateGeneInterval(gtfFeature, geneStart, featureInfoMap.get(gtfFeature.getGeneId()).getEnd()); } } // update the gene interval end position based on the transcript private void updateGeneEnd(GencodeGtfFeature gtfFeature) { // get the end value of the gene int geneEnd = featureInfoMap.get(gtfFeature.getGeneId()).getEnd(); // if the transcript end is greater than the gene end if (gtfFeature.getEnd() > geneEnd) { // set the gene end to be the transcript end geneEnd = gtfFeature.getEnd(); updateGeneInterval(gtfFeature, featureInfoMap.get(gtfFeature.getGeneId()).getStart(), geneEnd); } } // updates an interval of the gene if it needs to be changed private void updateGeneInterval(GencodeGtfFeature gtfFeature, int geneStart, int geneEnd) { Interval geneInterval = new Interval(gtfFeature.getContig(), geneStart, geneEnd); GtfInfo gtfGeneInfo = new GtfInfo(geneInterval, GtfInfo.Type.GENE, gtfFeature.getGeneName()); featureInfoMap.put(gtfFeature.getGeneId(), gtfGeneInfo); } @Override public void onTraversalStart() { sequenceDictionary = getBestAvailableSequenceDictionary(); if(sequenceDictionary == null){ throw new UserException("Sequence Dictionary must be specified (" + StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME + ")."); } } // runs immediately after it has gone through each line of gtf (apply method) @Override public Object onTraversalSuccess() { // create linked hash map to store sorted values of idToInfo LinkedHashMap karyotypeIdToInfo = getSortedMap(sequenceDictionary); // if user wants to sort by transcript only use transcripts else only use genes GtfInfo.Type selectedType = sortByTranscript ? GtfInfo.Type.TRANSCRIPT : GtfInfo.Type.GENE; writeToBed(selectedType, karyotypeIdToInfo); return null; } //Compare GtfInfo objects positionally by contig and start position. If transcripts have the same contig and start, compare by TranscriptId public static class GtfInfoComparator implements Comparator> { private final SAMSequenceDictionary dictionary; GtfInfoComparator(SAMSequenceDictionary dictionary) { this.dictionary = dictionary; } // compare two entries of a map where key = geneId or transcriptId and value = gtfInfo object @Override public int compare(Map.Entry e1, Map.Entry e2) { final Interval e1Interval = e1.getValue().getInterval(); final Interval e2Interval = e2.getValue().getInterval(); Utils.nonNull(dictionary.getSequence(e1Interval.getContig()), "could not get sequence for " + e1Interval.getContig()); Utils.nonNull(dictionary.getSequence(e2Interval.getContig()), "could not get sequence for " + e2Interval.getContig()); //compare by contig, then start, then by key return Comparator .comparingInt((Map.Entry e) -> dictionary.getSequence(e.getValue().getInterval().getContig()).getSequenceIndex()) .thenComparingInt(e -> e.getValue().getInterval().getStart()) .thenComparing(Map.Entry::getKey) .compare(e1,e2); } } // sorts the map containing the features based on contig and start position private LinkedHashMap getSortedMap(SAMSequenceDictionary sequenceDictionary) { // create a list that has the keys and values of idToInfo and sort the list using GtfInfoComparator List> entries = new ArrayList<>(featureInfoMap.entrySet()); entries.sort(new GtfInfoComparator(sequenceDictionary)); // put each (sorted) entry in the list into a linked hashmap LinkedHashMap karyotypeIdToInfo = new LinkedHashMap<>(); for (Map.Entry entry : entries) { karyotypeIdToInfo.put(entry.getKey(), entry.getValue()); } return karyotypeIdToInfo; } // writes to bed file private void writeToBed(GtfInfo.Type type, Map sortedMap) { try (final OutputStream writer = Files.newOutputStream(outputFile.toPath())) { for (Map.Entry entry : sortedMap.entrySet()) { if (entry.getValue().getType() == type) { String line = formatBedLine(entry, type); writer.write((line + System.lineSeparator()).getBytes()); } } } catch (IOException e) { throw new GATKException("Error writing to BED file", e); } } // formats each line of the bed file depending on whether user has selected gene or transcript private String formatBedLine(Map.Entry entry, GtfInfo.Type type) { GtfInfo info = entry.getValue(); String line = info.getInterval().getContig() + "\t" + info.getInterval().getStart() + "\t" + info.getInterval().getEnd() + "\t" + info.getGeneName(); if (type == GtfInfo.Type.TRANSCRIPT) { line += "," + entry.getKey(); } return line; } @Override public GATKPath getDrivingFeaturePath() { return inputFile; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy