All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.db_loading.CreateGeneIntervalsFromConservedGFFPlugin Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
/**
 * 
 */
package net.maizegenetics.pangenome.db_loading;

import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.swing.ImageIcon;

import org.apache.log4j.Logger;

import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;

import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Utils;

/**
 * Creates bed file with chromosome and coordinates to be used to create PHG reference ranges.
 * Gene names are included with each reference range.  The gene name column
 * is ignored when loading the bed file - it is there for when we want to associate
 * reference ranges with the genes on which they were based.
 * 
 * @author lcj34
 *
 */
public class CreateGeneIntervalsFromConservedGFFPlugin extends AbstractPlugin{
    private static final Logger myLogger = Logger.getLogger(CreateGeneIntervalsFromConservedGFFPlugin.class);

    private PluginParameter myGffFile = new PluginParameter.Builder("gffFile", null, String.class).guiName("Gene File").required(true).inFile()
            .description("Tab delimited .txt file containing gene-only GFF data from reference GFF file for all desired chromosomes, ").build();
    private PluginParameter myOutputFile = new PluginParameter.Builder("outputFile", null, String.class).guiName("Output File").required(true).outFile()
            .description("Full path of file to be written").build();
    private PluginParameter myConservedGeneFile = new PluginParameter.Builder("conservedGeneFile", null, String.class).guiName("Conserved Gene File").required(true).inFile()
            .description("File containing list of conserved genes to include as anchor regions ").build();
 
    public CreateGeneIntervalsFromConservedGFFPlugin() {
        super(null, false);
    }

    public CreateGeneIntervalsFromConservedGFFPlugin(Frame parentFrame) {
        super(parentFrame, false);
    }

    public CreateGeneIntervalsFromConservedGFFPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }
    
    public static void main(String[] args) {
        GeneratePluginCode.generate(CreateGeneIntervalsFromConservedGFFPlugin.class);        
    }
    
    @Override
    public DataSet processData(DataSet input) {

        myLogger.info(" CreateGeneIntervalsFromConservedGFFPlugin using gene file: " + gffFile() + ", create ref GenomeSequence");
        // Position has both chrom and physical position
        RangeMap geneRange = TreeRangeMap.create();

       // String anchorFileJustGenes = outputDir() + "anchorCoordinates_conservedGenes_allchrs.bed";
        String anchorFileJustGenes = outputFile();
        List conservedGeneList = new ArrayList<>();
        try (BufferedReader gffbr = Utils.getBufferedReader(gffFile());
             BufferedReader conservedbr = Utils.getBufferedReader(conservedGeneFile());
             BufferedWriter genesAnchorbw = Utils.getBufferedWriter(anchorFileJustGenes)){
            // Process all chrom gene input files
            
            String line = null;
            while ((line = conservedbr.readLine()) != null) {
                conservedGeneList.add(line);
            }
            // This would re-sort incorrectly - string of start puts 10 before 9
            //Collections.sort(conservedGeneList); // gff file already in order
            
            String geneline; 
            while ((geneline = gffbr.readLine()) != null) {
                String[] geneTokens = geneline.split("\\t");
                String chrom = geneTokens[0];

                // Gene column looks as below.  Grab just the gene name
                //ID=gene:Zm00001d027231;biotype=protein_coding;gene_id=Zm00001d027231;logic_name=maker_gene
                String description = geneTokens[8];
                String genename = description.split(";")[0].split(":")[1];
                
                if (!conservedGeneList.contains(genename)) continue; // only process conserved genes
                Chromosome curChrom = Chromosome.instance(chrom);
                Position startPos = new GeneralPosition.Builder(curChrom,Integer.parseInt(geneTokens[3])).build();
                Position endPos = new GeneralPosition.Builder(curChrom,Integer.parseInt(geneTokens[4])).build();
                addRange(geneRange, Range.closed(startPos, endPos),genename);
            } 
                               
            // Step 4:  write the bed files                
            myLogger.info("Begin writing  files");
            
            // DON'T WRITE - header, is giving me problems
            String anchorFileHeader = "#Chr\tGeneStart\tGeneEnd\tGeneNames\n";
            //genesAnchorbw.write(anchorFileHeader);
            
            writeFiles(geneRange, genesAnchorbw);
            
        } catch (Exception exc) {
            exc.printStackTrace();
        }
        myLogger.info("\n\nFinished all chrom files!");
        
        return null;
    }
    
    private static void addRange(RangeMap geneRange, Range range, String gene) {
        List, String>> overlaps = new ArrayList<>(
                geneRange.subRangeMap(range).asMapOfRanges().entrySet());
        //if overlaps has length, merge ranges together
        if (overlaps.size() != 0) {
            
            Map.Entry, String> overlappingEntry = geneRange.getEntry(overlaps.get(0).getKey().lowerEndpoint());
            //then use the combined range and assign the call
            String newGene = overlappingEntry.getValue() + "-" + gene;
  
            // Update overlappingEntry value with new merged gene value.
            // 2nd put is to ensure new entry is merged with the new value
            geneRange.put(overlappingEntry.getKey(),newGene); 
            geneRange.putCoalescing(range, newGene);
        }
        else {
            geneRange.put(range, gene);
        }
    }
    
    private static void writeFiles(RangeMap geneRangeMap,BufferedWriter genesAnchorbw){
        
        try {           
            // These lists should be of the same size and must be printed in sequential order 
            List> geneList = new ArrayList>(geneRangeMap.asMapOfRanges().keySet());
 
            myLogger.info("writeFiles:  size of geneList: " + geneList.size() );
 
            for (int idx = 0; idx < geneList.size(); idx++) {         

                Range geneRange = geneList.get(idx);
                int gstart = geneRange.lowerEndpoint().getPosition()-1; // bed file is 0-based, gff was 1-based
                int gend = geneRange.upperEndpoint().getPosition(); // bed files is inclusive/exclusive, gff was inclusive/inclusive
                String chrom = geneRange.lowerEndpoint().getChromosome().getName();
                String gene = geneRangeMap.get(geneRange.lowerEndpoint());
               
                StringBuilder anchorSB = new StringBuilder();               
                // anchors just-genes
                // NOTE:  genes are written twice because the load script is still
                // written for a db table that contains gene start/end.  This needs
                // to change.
                // NOTE _ we do NOT need genes written twice
                anchorSB.setLength(0);
                anchorSB.append(chrom).append("\t")
                .append(gstart).append("\t").append(gend).append("\t")
                .append(gene).append("\n");
                genesAnchorbw.write(anchorSB.toString());               
            }
        } catch (Exception exc) {
            exc.printStackTrace();
        }       
    }
    
    @Override
    public ImageIcon getIcon() {
        // TODO Auto-generated method stub
        return null;
    }

    @Override
    public String getButtonName() {
        // TODO Auto-generated method stub
        return null;
    }

    @Override
    public String getToolTipText() {
        // TODO Auto-generated method stub
        return null;
    }

    /**
     * Tab delimited .txt file containing gene-only GFF data
     * from reference GFF file for all desired chromosomes,
     * 
     *
     * @return Gene File
     */
    public String gffFile() {
        return myGffFile.value();
    }

    /**
     * Set Gene File. Tab delimited .txt file containing gene-only
     * GFF data from reference GFF file for all desired chromosomes,
     * 
     *
     * @param value Gene File
     *
     * @return this plugin
     */
    public CreateGeneIntervalsFromConservedGFFPlugin gffFile(String value) {
        myGffFile = new PluginParameter<>(myGffFile, value);
        return this;
    }

    /**
     * Name of output file to be written
     *
     * @return Output File
     */
    public String outputFile() {
        return myOutputFile.value();
    }

    /**
     * Set Output File. Name of file to be written
     *
     * @param value Output Directory
     *
     * @return this plugin
     */
    public CreateGeneIntervalsFromConservedGFFPlugin outputFile(String value) {
        myOutputFile = new PluginParameter<>(myOutputFile, value);
        return this;
    }

    /**
     * File containing list of conserved genes to include
     * as anchor regions 
     *
     * @return Conserved Gene File
     */
    public String conservedGeneFile() {
        return myConservedGeneFile.value();
    }

    /**
     * Set Conserved Gene File. File containing list of conserved
     * genes to include as anchor regions 
     *
     * @param value Conserved Gene File
     *
     * @return this plugin
     */
    public CreateGeneIntervalsFromConservedGFFPlugin conservedGeneFile(String value) {
        myConservedGeneFile = new PluginParameter<>(myConservedGeneFile, value);
        return this;
    }


}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy