All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.db_loading.CreateIntervalsFileFromGffPlugin Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.db_loading;

import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import javax.swing.ImageIcon;

import org.apache.log4j.Logger;

import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;

import net.maizegenetics.analysis.gbs.v2.TagExportToFastqPlugin;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.CheckSum;
import net.maizegenetics.util.Utils;

/**
 * This class creates the interval files needed for running GATK haplotype caller,
 * and the csv files needed for loading reference sequence into the database.
 * 
 * Two sets of files are created:  one set has coordinates based just on the ref
 * gene coordinates.  The other is gene coordinates plus user-specified flanking regions
 * 
 * Algorithm:
 *   1.  read gff file, grab gene coordinates 
 *   2.  For each Chromosome:  merge genes that overlap, toss genes that are embedded within another gene
 *       Store list as mergedGeneList.
 *   3.  Using the mergedGeneList in 3, create 2nd per-chrom coordinate lists that includes flanking regions
 *   4.  Write files:
 *         interval format (chrom:start-end): a. mergedGeneList; b. mergedGEneList with flanking
 *         csv format (chr,anchorstart,anchorend,geneStart,geneEnd,geneName)
 *            a.  mergedGeneList; b. mergedGEneList with flanking
 *         debug files:  List of merged, list of embedded files written for informational purposes
 *         
 *  NOTE:  the csv files contain the name of all genes contained in an anchor.  This data is not
 *         stored in the DB.  IT is included because the biologists have at times asked for it
 *         and this is a good place for it to be stored and retrieved.
 *  
 *  INPUT:
 *    1. refFile:  String: path to reference genome.  needed to find size of chromosomes for
 *                         adding flanking regions to last chrom entry.
 *    2. geneFile: String: path to single file containing all chrom gene data in GFF format; or
 *                         path to directory containing per-chrom files with gene data in GFF format.
 *                         These data files must consist of GFF gene data alone, not the full gff.
 *    3. outputBase: String: directory, including trailing "/", where output files will be written.
 *    4. numFlanking: int:  number of flanking bps to add on each end of the anchors.
 *    
 *  OUTPUT:
 *    1.  intervals file based on gene coordinates.
 *    2.  intervals file based on gene coordinates + numflanking bps
 *    3.  csv file based on gene coordinates
 *    4.  csv file based on gene coordinates + numflanking bps
 *    
 * @author lcj34
 *
 */
public class CreateIntervalsFileFromGffPlugin extends AbstractPlugin {
    private static final Logger myLogger = Logger.getLogger(CreateIntervalsFileFromGffPlugin.class);

    private PluginParameter myRefFile = new PluginParameter.Builder("refFile", null, String.class).guiName("Ref Genome File").required(true).inFile()
            .description("Fasta file containing reference genome").build();
    private PluginParameter myGeneFile = new PluginParameter.Builder("geneFile", null, String.class).guiName("Gene File").required(true).inFile()
            .description("Tab delimited .txt file containing gene-only GFF data from reference GFF file for all desired chromosomes, ").build();
    private PluginParameter myOutputDir = new PluginParameter.Builder("outputDir", null, String.class).guiName("Output Directory").required(true).outFile()
            .description("Directory where output files will be written").build();
    private PluginParameter myNumFlanking = new PluginParameter.Builder("numFlanking", 1000, Integer.class).guiName("Number of Flanking BPs")
            .description("Number of flanking basepairs to add at each end of the gene sequence").build();


    public CreateIntervalsFileFromGffPlugin() {
        super(null, false);
    }

    public CreateIntervalsFileFromGffPlugin(Frame parentFrame) {
        super(parentFrame, false);
    }

    public CreateIntervalsFileFromGffPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }
    
    @Override
    public DataSet processData(DataSet input) {

        System.out.println(" CreateIntervalsFileFromGff using gene file: " + geneFile() + ", create ref GenomeSequence");
        GenomeSequence myRefSequence = GenomeSequenceBuilder.instance(refFile());
        // Position has both chrom and physical position
        RangeMap geneRange = TreeRangeMap.create();
        RangeMap flankingRange = TreeRangeMap.create();
        try {
            // Process all chrom gene input files
            BufferedReader genebr = Utils.getBufferedReader(geneFile()); 
            String geneline; 
            while ((geneline = genebr.readLine()) != null) {
                String[] geneTokens = geneline.split("\\t");
                String chrom = geneTokens[0];

                // Gene column looks as below.  Grab just the gene name
                //ID=gene:Zm00001d027231;biotype=protein_coding;gene_id=Zm00001d027231;logic_name=maker_gene
                String description = geneTokens[8];
                String genename = description.split(";")[0].split(":")[1];
                
                Chromosome curChrom = new Chromosome(chrom);
                Position startPos = new GeneralPosition.Builder(curChrom,Integer.parseInt(geneTokens[3])).build();
                Position endPos = new GeneralPosition.Builder(curChrom,Integer.parseInt(geneTokens[4])).build();
                addRange(geneRange, Range.closed(startPos, endPos),genename);
            } 
            genebr.close();
            
            // create anchors with flanking          
            System.out.println("Call createFlankingList");
            flankingRange = createFlankingList( geneRange,numFlanking(), myRefSequence);
            if (flankingRange == null) {
                System.out.println("CreateFlankingList failed - exit");
                return null;
            }
                   
            // Step 4:  write the CSV/Intervals files                
            System.out.println("Begin writing  files");
                        
            // Files to write.
            String anchorFileString = outputDir() + "anchorCoordinates_withFlanking_allchrs.csv";
            BufferedWriter flankingAnchorbw = Utils.getBufferedWriter(anchorFileString);
            
            String anchorFileJustGenes = outputDir() + "anchorCoordinates_justGenes_allchrs.csv";
            BufferedWriter genesAnchorbw = Utils.getBufferedWriter(anchorFileJustGenes);
          
            String intervalsFlankingString = outputDir() + "anchorCoordinates_withFlanking_allchrs.intervals";
            BufferedWriter flankingIntervalsbw = Utils.getBufferedWriter(intervalsFlankingString);
            
            String intervalsJustGenes = outputDir() + "anchorCoordinates_justGenes_allchrs.intervals";
            BufferedWriter genesIntervalsbw = Utils.getBufferedWriter(intervalsJustGenes);
            
            // add header only once, then call to write files
            String anchorFileHeader = "Chr,StartPos,EndPos,GeneStart,GeneEnd,GeneName\n";
            flankingAnchorbw.write(anchorFileHeader);
            genesAnchorbw.write(anchorFileHeader);   
            
            writeFiles(geneRange, flankingRange, flankingAnchorbw, genesAnchorbw, 
                        flankingIntervalsbw, genesIntervalsbw);

            flankingAnchorbw.close();
            genesAnchorbw.close();
            flankingIntervalsbw.close();
            genesIntervalsbw.close();  
            
            // Step 5:  write plugin parameters file
            printParametersFile( outputDir(),  refFile(),  geneFile(),  numFlanking());
        } catch (Exception exc) {
            exc.printStackTrace();
        }
        System.out.println("\n\nFInished all chrom files!");
        
        return null;
    }
    
    private static void addRange(RangeMap geneRange, Range range, String gene) {
        List, String>> overlaps = new ArrayList<>(
                geneRange.subRangeMap(range).asMapOfRanges().entrySet());
        //if overlaps has length, merge ranges together
        if (overlaps.size() != 0) {
            
            Map.Entry, String> overlappingEntry = geneRange.getEntry(overlaps.get(0).getKey().lowerEndpoint());
            //then use the combined range and assign the call
            String newGene = overlappingEntry.getValue() + "-" + gene;
  
            // Update overlappingEntry value with new merged gene value.
            // 2nd put is to ensure new entry is merged with the new value
            geneRange.put(overlappingEntry.getKey(),newGene); 
            geneRange.putCoalescing(range, newGene);
        }
        else {
            geneRange.put(range, gene);
        }
    }
  
    private static Position findFlankingStartPos(RangeMap geneRange,
            Range data, int numFlanking, int chromLen) {
        int flankCheck = numFlanking*2;
        Chromosome chrom = data.lowerEndpoint().getChromosome();
        int curStart = data.lowerEndpoint().getPosition(); 
        int lowerflank = data.lowerEndpoint().getPosition() - flankCheck;
        Position flankStart2000 = new GeneralPosition.Builder(chrom,lowerflank).build();
        Position flankUpToLower = new GeneralPosition.Builder(chrom,curStart-1).build();
        Range startCheck = Range.closed(flankStart2000,flankUpToLower);

        // Want to add 1000bp (or user defined number) flanking on either side of each entry in geneRange map,
        // then add the new range positions to the flankingRange map to be returned.
        // Before adding the 1000 bps, need to verify
        //  1.  there are 1000 bps between the start of the chrom and the start of this entry (or start = 1)
        //  2.  the distance between this entry and the current entry is >= 2000 (1000 on each end)
        //  3.  the distance between this entry and the chrom len is at least 1000 (otherwise chromlen = end)
        List, String>> overlapsStart = new ArrayList<>(
                geneRange.subRangeMap(startCheck).asMapOfRanges().entrySet());
        Position newLowerPos = null;
                 
        if (overlapsStart.size() > 0) {
            // the last overlap should be the closest in position to the current range start
            int prevEnd = overlapsStart.get(overlapsStart.size()-1).getKey().upperEndpoint().getPosition() ;
            int newFlankNum = curStart-prevEnd;
            int newLowerInt = (curStart - newFlankNum/2) + 1;// want start to be 1 past ;
            newLowerPos = new GeneralPosition.Builder(chrom,newLowerInt).build();
        } else {
            int newLowerInt = curStart - numFlanking < 1 ? 1 : curStart - numFlanking;
            newLowerPos = new GeneralPosition.Builder(chrom,newLowerInt).build();
        }
        return newLowerPos;
    }

    private static Position findFlankingEndPos(RangeMap geneRange,
            Range data, int numFlanking, int chromLen) {
        Chromosome chrom = data.lowerEndpoint().getChromosome();
        int flankCheck = numFlanking*2;
        int curEnd = data.upperEndpoint().getPosition();
        int upperflank = data.upperEndpoint().getPosition() + flankCheck;
        Position curEndPlusOne = new GeneralPosition.Builder(chrom,curEnd+1).build();
        Position flankEnd2000 = new GeneralPosition.Builder(chrom,upperflank).build();
        Range endCheck = Range.closed(curEndPlusOne,flankEnd2000);
        List, String>> overlapsEnd = new ArrayList<>(
                geneRange.subRangeMap(endCheck).asMapOfRanges().entrySet());
        Position newUpperPos = null;
        
        if (overlapsEnd.size() > 0) {
            // the first overlap should be the closest in position to the current range end
            int nextStart = overlapsEnd.get(0).getKey().lowerEndpoint().getPosition() ;
            int newFlankNum = nextStart-curEnd; 
            int newUpperInt = (curEnd + newFlankNum/2); // don't add 1 here, gets added at start
            newUpperPos = new GeneralPosition.Builder(chrom,newUpperInt).build();
        } else {
            int newUpperInt = curEnd + numFlanking > chromLen ? chromLen : curEnd + numFlanking;
            newUpperPos = new GeneralPosition.Builder(chrom,newUpperInt).build();
        }
        return newUpperPos;
    }
    
    private  static RangeMap createFlankingList(RangeMap geneRange,
            int numFlanking, GenomeSequence myRefSequence) {
        // there are no embedded or overlapped entries in the geneRange map.
        // The goal is to add 1000bps flanking to start/end of each entry.
        // there may not be 2000bps between each gene entry.  If not, split the difference
        // between:  half to geneA end, half to geneB start.  This is done in
        // findFlankingStartPos() and findFlankingEndPos()
        RangeMap flankingRange = TreeRangeMap.create();

        try {
            geneRange.asMapOfRanges().entrySet().stream().forEach(range -> {
                Range data = range.getKey();
                Chromosome chrom = data.lowerEndpoint().getChromosome();
                int chromLen = myRefSequence.chromosomeSize(chrom);               
                // Find new start/end positions with specified nubmer of flanking, add to map
                Position newLowerPos = findFlankingStartPos( geneRange, data,numFlanking,chromLen);     
                Position newUpperPos = findFlankingEndPos( geneRange, data,numFlanking,chromLen);      
                flankingRange.put(Range.closed(newLowerPos, newUpperPos), range.getValue());
            });
            
            if (geneRange.asMapOfRanges().size() != flankingRange.asMapOfRanges().size()) {
                System.out.println("ERROR - mergedGeneList size " + geneRange.asMapOfRanges().size() 
                  + " does NOT equal lastGeneList size " + flankingRange.asMapOfRanges().size());
                return null;
            }
            return flankingRange;
        } catch (Exception exc) {
            exc.printStackTrace();
            return null;
        }
    }
    
    // This method writes 4 files:  anchors file in csv format including gene names for both anchors-with-flanking
    // and genes-only; and the same 2 files in "intervals" format for use with GATK haplotype caller.
    // Intervals file (for haplotype caller) needs the format:
    //  8:12921-13181  (chrom:startpos-endpos)
    // CSV file (for loading db) needs format:
    //  chr,anchorstart,anchorend,genestart,geneend,geneName
    private static void writeFiles(RangeMap geneRangeMap, RangeMap anchorRangeMap,
            BufferedWriter flankingAnchorbw, BufferedWriter genesAnchorbw,
            BufferedWriter flankingIntervalsbw, BufferedWriter genesIntervalsbw){
        
        try {           
            // These lists should be of the same size and must be printed in sequential order 
            List> geneList = new ArrayList>(geneRangeMap.asMapOfRanges().keySet());
            List> anchorList = new ArrayList>(anchorRangeMap.asMapOfRanges().keySet());

            System.out.println("writeFiles:  size of geneList: " + geneList.size() 
              + ", size of anchorList: " + anchorList.size());
            for (int idx = 0; idx < geneList.size(); idx++) {
                Range anchorRange = anchorList.get(idx);               
                int astart = anchorRange.lowerEndpoint().getPosition();
                int aend = anchorRange.upperEndpoint().getPosition();                

                Range geneRange = geneList.get(idx);
                int gstart = geneRange.lowerEndpoint().getPosition();
                int gend = geneRange.upperEndpoint().getPosition();
                String chrom = geneRange.lowerEndpoint().getChromosome().getName();
                String gene = geneRangeMap.get(geneRange.lowerEndpoint());
                
                // anchors with flanking
                StringBuilder anchorSB = new StringBuilder();
                anchorSB.append(chrom).append(",").append(astart).append(",").append(aend).append(",")
                    .append(gstart).append(",").append(gend).append(",")
                    .append(gene).append("\n");
                flankingAnchorbw.write(anchorSB.toString());
                
                // anchors just-genes
                anchorSB.setLength(0);
                anchorSB.append(chrom).append(",").append(gstart).append(",").append(gend).append(",")
                .append(gstart).append(",").append(gend).append(",")
                .append(gene).append("\n");
                genesAnchorbw.write(anchorSB.toString());

                // interval file - with flanking
                StringBuilder intervalSB = new StringBuilder();
                intervalSB.append(chrom).append(":").append(astart).append("-").append(aend).append("\n");               
                flankingIntervalsbw.write(intervalSB.toString());
                
                // interval file - just genes
                intervalSB.setLength(0);
                intervalSB.append(chrom).append(":").append(gstart).append("-").append(gend).append("\n");               
                genesIntervalsbw.write(intervalSB.toString());
            }
        } catch (Exception exc) {
            exc.printStackTrace();
        }       
    }
    
    private static void printParametersFile(String outputDir, String refFile, String geneFile, int numFlanking) {
        // Create the parameters file as per JIRA PHG_6
        String paramFile = outputDir + "CreateIntervalsFileFromGff_parameter.txt";
        String geneCSV = outputDir + "anchorCoordinates_justGenes_allchrs.csv";
        String geneIntervals = outputDir + "anchorCoordinates_justGenes_allchrs.intervals";
        String flankingCSV = outputDir + "anchorCoordinates_withFlanking_allchrs.csv";
        String flankingIntervals = outputDir + "anchorCoordinates_withFlanking_allchrs.intervals";
        try {
            BufferedWriter bw = Utils.getBufferedWriter(paramFile);
            String headers = "PluginName\tDateExecuted\tParameterName\tParameter_IO\tParameterType\tValue\tSHA-1\n";
            bw.write(headers);

            DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss");
            LocalDateTime now = LocalDateTime.now();
            String formattedTime = dtf.format(now);
            String nameDate = "CreateIntervalsFileFromGff\t" + formattedTime + "\t";

            String refCheckSum = CheckSum.getProtocolChecksum(refFile, "SHA-1");
            StringBuilder paramSB = new StringBuilder().append(nameDate).append("refFile\tinput\tString\t")
                    .append(refFile).append("\t").append(refCheckSum).append("\n");
            bw.write(paramSB.toString());
            paramSB.setLength(0);
            String geneCheckSum = CheckSum.getProtocolChecksum(geneFile, "SHA-1");
            paramSB.append(nameDate).append("geneFile\tinput\tString\t").append(geneFile).append("\t")
                   .append(geneCheckSum).append("\n");
            bw.write(paramSB.toString());
            paramSB.setLength(0);
            paramSB.append(nameDate).append("numFlanking\tinput\tint\t").append(numFlanking).append("\tNA\n");
            bw.write(paramSB.toString());
            paramSB.setLength(0);
            
            // List the output files. These files are outputDir + name
            String geneCSVcheckSum = CheckSum.getProtocolChecksum(geneCSV, "SHA-1");
            paramSB.append(nameDate).append("outputDir\toutput\tString\t").append(geneCSV).append("\t")
                   .append(geneCSVcheckSum).append("\n");
            bw.write(paramSB.toString());
            paramSB.setLength(0);
            
            String geneIntervalsCheckSum = CheckSum.getProtocolChecksum(geneIntervals, "SHA-1");
            paramSB.append(nameDate).append("outputDir\toutput\tString\t").append(geneIntervals).append("\t")
                   .append(geneIntervalsCheckSum).append("\n");
            bw.write(paramSB.toString());
            paramSB.setLength(0);
            
            String flankingCSVCheckSum = CheckSum.getProtocolChecksum(flankingCSV, "SHA-1");
            paramSB.append(nameDate).append("outputDir\toutput\tString\t").append(flankingCSV).append("\t")
                   .append(flankingCSVCheckSum).append("\n");
            bw.write(paramSB.toString());
            paramSB.setLength(0);
            
            String flankingIntervalsCheckSum = CheckSum.getProtocolChecksum(flankingIntervals, "SHA-1");
            paramSB.append(nameDate).append("outputDir\toutput\tString\t").append(flankingIntervals).append("\t")
                   .append(flankingIntervalsCheckSum).append("\n");
            bw.write(paramSB.toString());
            paramSB.setLength(0);

            bw.close();
        } catch (Exception exc) {
            exc.printStackTrace();
        }
    }
    
    /**
     * @param args
     */
//    public static void main(String[] args) {       
// 
//        String refFile = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/Zea_mays.AGPv4.dna.toplevel.fa";
//        String outputBase = "/Users/lcj34/notes_files/repgen/wgs_pipeline/anchorsFromV4gff/allGeneAnchorFiles/july2017_phg_intervals_RangeMaps/";
//        //String geneFile = "/Volumes/Samsung_T1/wgs_pipeline/v4_gff_files/allGeneBedFiles";
//        String geneFile = "/Volumes/Samsung_T1/wgs_pipeline/v4_gff_files/gffv4Rel34_all10chroms_gene.txt";
//        int numFlanking = 1000; // number flanking bps to add on each end
//        
//        mainProcessMergeOverlapsAddGapDifference(refFile,geneFile,outputBase,numFlanking);
//    }
    // The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
//     public static void main(String[] args) {
//         GeneratePluginCode.generate(CreateIntervalsFileFromGff.class);
//     }

    /**
     * Convenience method to run plugin with one return object.
     */
    // TODO: Replace  with specific type.
    public String runPlugin(DataSet input) {
        return (String) performFunction(input).getData(0).getData();
    }

    @Override
    public ImageIcon getIcon() {
        // TODO Auto-generated method stub
        return null;
    }

    @Override
    public String getButtonName() {
        // TODO Auto-generated method stub
        return null;
    }

    @Override
    public String getToolTipText() {
        // TODO Auto-generated method stub
        return null;
    }
    
    /**
     * Fasta file containing reference genome
     *
     * @return Ref Genome File
     */
    public String refFile() {
        return myRefFile.value();
    }

    /**
     * Set Ref Genome File. Fasta file containing reference
     * genome
     *
     * @param value Ref Genome File
     *
     * @return this plugin
     */
    public CreateIntervalsFileFromGffPlugin refFile(String value) {
        myRefFile = new PluginParameter<>(myRefFile, value);
        return this;
    }

    /**
     * Tab delimited .txt file containing gene-only GFF data
     * from reference GFF file, 
     *
     * @return Gene File
     */
    public String geneFile() {
        return myGeneFile.value();
    }

    /**
     * Set Gene File. Tab delimited .txt file containing gene-only
     * GFF data from reference GFF file, 
     *
     * @param value Gene File
     *
     * @return this plugin
     */
    public CreateIntervalsFileFromGffPlugin geneFile(String value) {
        myGeneFile = new PluginParameter<>(myGeneFile, value);
        return this;
    }

    /**
     * Directory where output files will be written
     *
     * @return Output Directory
     */
    public String outputDir() {
        return myOutputDir.value();
    }

    /**
     * Set Output Directory. Directory where output files
     * will be written
     *
     * @param value Output Directory
     *
     * @return this plugin
     */
    public CreateIntervalsFileFromGffPlugin outputDir(String value) {
        myOutputDir = new PluginParameter<>(myOutputDir, value);
        return this;
    }

    /**
     * Number of flanking basepairs to add at each end of
     * the gene sequence
     *
     * @return Number of Flanking BPs
     */
    public Integer numFlanking() {
        return myNumFlanking.value();
    }

    /**
     * Set Number of Flanking BPs. Number of flanking basepairs
     * to add at each end of the gene sequence
     *
     * @param value Number of Flanking BPs
     *
     * @return this plugin
     */
    public CreateIntervalsFileFromGffPlugin numFlanking(Integer value) {
        myNumFlanking = new PluginParameter<>(myNumFlanking, value);
        return this;
    }

}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy