net.maizegenetics.pangenome.db_loading.CreateAnchorFilesFromGeneGFF Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
There is a newer version: 1.10
/**
 * 
 */
package net.maizegenetics.pangenome.db_loading;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Utils;

/**
 * Based on WGS_whatever.CreateAnchorsFromGeneGff.java
 * 
 * This method creates two fasta files of anchor coordinates:
 * The first has exact gene coordinates to be used when blasting
 * B73 reference genes against a particular assembly, e.g. Sorghum.
 * 
 * The second has genes with 1000bp flanking added.
 * 
 * Method:
 * 
 * Takes a gff file sorted to contain only gene data  
 * If any gene is embedded within another, it is tossed.
 * If any gene start position overlaps the previous gene end position,
 * merge the gene.
 * 
 * After mergin genes, add 1000bp flanking on either end.  If there is less
 * than 2000 bps between genes, spli the difference
 * 
 * Can take 1 file or a directory of files.  The file names are assume to look
 * like this:
 *    gffv4Rel34_chr5_gene.txt
 * This matters to the parsing of the chromosome name (ie "5" above)
 * 
 * OUTPUT Notes:
 * 1.  Two csv files of chr,start,end are created for each chromosome.  The first
 *     has just merged/tossed gene coordinates, the second has these genes with 1000bp flanking added.
 * 2.  A fasta file for each chromosome is created:  contains all the anchors
 *     for that chromosome.  This was created when the thought was that Zack needed
 *     it for comparison.  THis way he didn't have to go through a db.
 * 3.  The fasta files are not used in LoadAnchorsToPHGdb.java.  It uses the csv file
 *     created which contains that chr,start,end.  They ARE used when blasting genes
 *     against the assemblies.
 * 4.  AFter all the chromosomes have been processed, concat them into 1 file
 *     for use with LoadANchorsToPHGdb.java via this command line:
 *     cat anchorsFile_chr1_MergedPlus1000orGapDiffcoordinates.csv anchorsFile_chr2_MergedPlus1000orGapDiffcoordinates.csv  ... > anchorsFile_all10_MergedPlus1000orGapDiffcoordinates.csv  
 *     - Then edit out the header lines except for the first one.
 *     
 * @author lcj34
 *
 */
public class CreateAnchorFilesFromGeneGFF {
    static GenomeSequence myRefSequence = null;
    
    // THis method creates 2 sets of files that should have the same number of entries,
    // but different lenght for the sequences.
    // BOth the genes-only (which is merged/tossed genes) and the genes-with_flanking (the
    // merged/tossed genes with 1000bps added to each side) are created in this method.  They
    // need to be created together as the former is used for blasting assemblies, the latter
    // for storing to the db.  If they aren't created together, we get incompatible number
    // of genes.
    public static void mainProcessMergeOverlapsAddGapDifference(String geneFile, String outputBase, String chr, int numFlanking) {
        
        Chromosome chrom = new Chromosome(chr);
        BufferedReader genebr = Utils.getBufferedReader(geneFile);
        
        String mergedGenes = outputBase + "chr" + chr + "_mergedGenes.txt";
        BufferedWriter mergedGenesbw = Utils.getBufferedWriter(mergedGenes);
        String embeddedGenes = outputBase + "chr" + chr + "_embeddedGenes.txt";
        BufferedWriter embeddedbw = Utils.getBufferedWriter(embeddedGenes);
 
        try {
            String geneline;
            int geneCount = 0;
            int mergedGeneCount = 0;
            int skippedGeneCount = 0;
            System.out.println("\nStart processing genes");
            int chromLen = myRefSequence.chromosomeSize(chrom);
            List geneGFFList = new ArrayList();
            
            // Step 1.  List of genes from gff file
            while ((geneline = genebr.readLine()) != null) {
                geneCount++;
                String[] geneTokens = geneline.split("\\t");
                String description = geneTokens[8];
                int semicolonLoc = description.indexOf(";");
                String genename = description.substring(0, semicolonLoc); //strip off after gene
                int colonLoc = description.indexOf(":");
                genename = genename.substring(colonLoc+1); // strip off beginning "ID=gene:", leavning just gene name
                // fill in map
                GeneGFFData gffData = new GeneGFFData(Integer.parseInt(geneTokens[3]),
                        Integer.parseInt(geneTokens[4]), genename);
                geneGFFList.add(gffData);
            }
            Collections.sort(geneGFFList); // sort by start position (may already be sorted in gene file)
            genebr.close();
            
            // Step 2.  Merge any genes that have overlapped or are completely contained in another file
            List mergedGeneList = new ArrayList();
            GeneGFFData data1 = geneGFFList.get(0);

            GeneGFFData data2 = null;
            // create the list of merged genes with start/end positions and names
            for (int idx = 1; idx < geneGFFList.size(); idx++) {
 
                data2 = geneGFFList.get(idx);
                // This  creates a list of merged genes. 
                if (data2.start() > data1.start() && data2.end() <= data1.end()) {
                    // these genes overlap.   skip the second gene
                    skippedGeneCount++;
                    String overlaps = chr + " " + data1.toString() + " and " + data2.toString() + "\n";
                    embeddedbw.write(overlaps);
                    continue; // skip it
                }
                if (data2.start() <= (data1.end()) ) {
                    // merge the 2, update index properly
                    String mergeLine = data1.name() + " start-end: " + data1.start() + "-" + data1.end() + ", " + data2.name() + " start-end " + data2.start() 
                         + "-" + data2.end() + "\n";
                    mergedGenesbw.write(mergeLine);
                    String name = data1.name() + "-" + data2.name();
                    data1 = new GeneGFFData(data1.start(), data2.end(), name);
                    mergedGeneCount++;
                    // don't yet add to the list - wait until there is no next
                } else {
                    // put data 1 into new list                     
                    mergedGeneList.add(data1);
                    data1 = data2; // set up to compared 2 with the next dataset
                }
            }
            // Handle the last one.  It should already be set up.
            mergedGeneList.add(data1);
            System.out.println("AFter merging overlapped genes: mergeGeneList size " + mergedGeneList.size() + ", mergedGeneCount: " + mergedGeneCount);
            //Collections.sort(mergedGeneList); // should already be sorted.
            
            
            // Step 3: check for flanking overlap, split the difference
            data1 = mergedGeneList.get(0);
            data2 = null;
            int start = data1.start() - 1000 > 0 ? data1.start()-1000 : 1;
            data1 = new GeneGFFData(start,data1.end(),data1.name());
            List lastGeneList = new ArrayList();
            // create the list of genes with gap-difference added 
            for (int idx = 1; idx < mergedGeneList.size(); idx++) {
                data2 = mergedGeneList.get(idx);
                // This  creates a final list of merged genes with start/end positions that include the 1000 bp flanking region
                if ((data2.start() -numFlanking) < (data1.end() + numFlanking) ) {
                    // find the gap interval, split the difference
                    int gapInterval = data2.start() - data1.end();
                    int d2Start = (data2.start() - gapInterval/2) +1;
                    data2 = new GeneGFFData(d2Start,data2.end(),data2.name());
                    int d1end = data1.end() + gapInterval/2;
                    data1 = new GeneGFFData(data1.start(),d1end,data1.name());
                    // add data1 to list
                    lastGeneList.add(data1);
                    data1 = data2;
                } else {
                    // put data 1 into new list 
                    int d1end = data1.end() +1000 < chromLen ? data1.end() + 1000 : chromLen;
                    data1 = new GeneGFFData(data1.start(),d1end, data1.name())  ;                 
                    lastGeneList.add(data1);
                    int d2start = data2.start() - numFlanking;
                    data2 = new GeneGFFData(d2start,data2.end(), data2.name());
                    data1 = data2; // set up to compared 2 with the next dataset
                }
            }
            // add last gene
            int d1end = data1.end() +numFlanking < chromLen ? data1.end()+ numFlanking : chromLen;
            data1 = new GeneGFFData(data1.start(), d1end, data1.name())  ;                   
            lastGeneList.add(data1);
            System.out.println("lastGEneList size: " + lastGeneList.size());
            
            // There should be the same number of entries in each map.  If not,
            // we have an error and need obort.
            if (mergedGeneList.size() != lastGeneList.size()) {
                System.out.println("ERROR - mergedGeneList size " + mergedGeneList.size() 
                  + " does NOT equal lastGeneList size " + lastGeneList.size());
                return;
            }
            
            System.out.println("MergedGeneList size matches lastGeneList size - continue!");
            
            String anchorFileString = outputBase + "anchorsFile_MergedPlus1000orGapDiffcoordinate_chr" + chr + ".csv";
            BufferedWriter flankingAnchorbw = Utils.getBufferedWriter(anchorFileString);
            
            String anchorFileJustGenes = outputBase + "anchorsFile_MergedTossed_justGenes_chr" + chr + ".csv";
            BufferedWriter genesAnchorbw = Utils.getBufferedWriter(anchorFileJustGenes);
            
            String anchorFileHeader = "Chr,StartPos,EndPos,GeneStart,GeneEnd\n";
            flankingAnchorbw.write(anchorFileHeader);
            genesAnchorbw.write(anchorFileHeader);
            
            String genePlus1000Fasta = outputBase + "genes_Plus1000orGapDiffFromRef_chr" + chr+ ".fa";
            BufferedWriter flankingFastabw = Utils.getBufferedWriter(genePlus1000Fasta);
            
            String geneFasta = outputBase + "genes_justGenes_chr" + chr + ".fa";
            BufferedWriter genesFastabw = Utils.getBufferedWriter(geneFasta);
                    
            // Step 4.  Write the files
            int totalCount = 0;
            
            // write the genes-only file
            for (int idx = 0; idx < mergedGeneList.size(); idx++) {
                // create fasta lines
                GeneGFFData gffdata = mergedGeneList.get(idx);
                byte[] genebytes = myRefSequence.chromosomeSequence(chrom,gffdata.start(),gffdata.end());
                String geneString = NucleotideAlignmentConstants.nucleotideBytetoString(genebytes);
                
                //write fasta - start/end writeen twice to signify chrom start/end then
                // gene start/end.  They are the same for this file (no flanking added)
                StringBuilder seqSB = new StringBuilder();
                seqSB.append(">").append(chr).append(":").append(Integer.toString(gffdata.start()))
                     .append(":").append(Integer.toString(gffdata.end())).append(":")
                     .append(Integer.toString(gffdata.start()))
                     .append(":").append(Integer.toString(gffdata.end()))
                     .append(" ").append(gffdata.name()).append("\n")
                     .append(geneString).append("\n");
                
                genesFastabw.write(seqSB.toString());                
                // write to anchors file.  THis file will be used in LoadAnchorsToDB.java 
                StringBuilder anchorSB = new StringBuilder();
                
                // The gene start and gene end are the same as the chrom start/end position
                // as this set has not added the flanking bps.
                anchorSB.append(chr).append(",").append(Integer.toString(gffdata.start()))
                        .append(",").append(Integer.toString(gffdata.end())).append(",")
                        .append(Integer.toString(gffdata.start())).append(",")
                        .append(Integer.toString(gffdata.end())).append("\n");
                genesAnchorbw.write(anchorSB.toString());
            }
            
         // write the genes-with-flanking file
            for (int idx = 0; idx < lastGeneList.size(); idx++) {
                totalCount++;
                // create fasta lines
                GeneGFFData gffdata = lastGeneList.get(idx);
                GeneGFFData geneData = mergedGeneList.get(idx); // for gene start/end positions
                byte[] genebytes = myRefSequence.chromosomeSequence(chrom,gffdata.start(),gffdata.end());
                String geneString = NucleotideAlignmentConstants.nucleotideBytetoString(genebytes);
                
                // write fasta - do I need the genestart/geneEnd data added??  I added it
                StringBuilder seqSB = new StringBuilder();
                seqSB.append(">").append(chr).append(":").append(Integer.toString(gffdata.start()))
                     .append(":").append(Integer.toString(gffdata.end()))
                     .append(":").append(Integer.toString(geneData.start())).append(":")
                     .append(Integer.toString(geneData.end())).append(" ")
                     .append(gffdata.name()).append("\n")
                     .append(geneString).append("\n");
                flankingFastabw.write(seqSB.toString());
                
                // write to anchors file.  THis file will be used in LoadAnchorsToDB.java 
                //String anchorDataString = chr + "," + gffdata.start() + "," + gffdata.end() + "\n";
                StringBuilder csvSB = new StringBuilder();
                csvSB.append(chr).append(",").append(Integer.toString(gffdata.start())).append(",")
                     .append(Integer.toString(gffdata.end())).append(",")
                     .append(Integer.toString(geneData.start())).append(",")
                     .append(Integer.toString(geneData.end())).append("\n");
                flankingAnchorbw.write(csvSB.toString());
            }
         
            System.out.println("Gene plus 1000, merge overlaps, split Gaps fasta written to " + genePlus1000Fasta);
            System.out.println("Just merged Gene file written to " + geneFasta);
            System.out.println("Finished chrom " + chr + ",num total fasta lines: " + totalCount + ", number genes processed: " + geneCount
                    + ", number of genes merged: " + mergedGeneCount + ", number of embedded/skipped genes: " + skippedGeneCount);
 
            genesFastabw.close();
            genesAnchorbw.close();
            
            flankingFastabw.close();
            flankingAnchorbw.close();
            mergedGenesbw.close();
            embeddedbw.close();
        } catch (Exception exc) {
            exc.printStackTrace();
        }       
    }
    
    // THis version makes initial anchors from just the genes.
    public static void mainProcessDataJustGenes(String geneFile, String outputBase, String chr) {
        
        Chromosome chrom = new Chromosome(chr);
        BufferedReader genebr = Utils.getBufferedReader(geneFile);
        String geneFasta = outputBase + "genesChr" + chr + "_genesExactCoordinatesFromRef.fa";
        BufferedWriter geneBW = Utils.getBufferedWriter(geneFasta);
        
        String anchorFileString = outputBase + "anchorsFile_chr" + chr + "_genesExactCoordinates.csv";
        BufferedWriter anchorFilebw = Utils.getBufferedWriter(anchorFileString);
 
        try {
            String geneline;
            int geneCount = 0;
 
            System.out.println("\nStart processing genes");
            int chromLen = myRefSequence.chromosomeSize(chrom);
            List geneGFFList = new ArrayList();
            while ((geneline = genebr.readLine()) != null) {
                geneCount++;
                String[] geneTokens = geneline.split("\\t");
                String description = geneTokens[8];
                int semicolonLoc = description.indexOf(";");
                String genename = description.substring(0, semicolonLoc); //strip off after gene
                int colonLoc = description.indexOf(":");
                genename = genename.substring(colonLoc+1); // strip off beginning "ID=gene:", leavning just gene name
                // fill in map
                GeneGFFData gffData = new GeneGFFData(Integer.parseInt(geneTokens[3]),
                        Integer.parseInt(geneTokens[4]), genename);
                geneGFFList.add(gffData);
            }
            Collections.sort(geneGFFList); // sort by start position (may already be sorted in gene file)
            genebr.close();
            
            String anchorFileHeader = "Chr,StartPos,EndPos\n";
            anchorFilebw.write(anchorFileHeader);
                    
            int totalCount = 0;
            // write the files
            for (int idx = 0; idx < geneGFFList.size(); idx++) {
                totalCount++;
                // create fasta lines
                GeneGFFData gffdata = geneGFFList.get(idx);
                int start = gffdata.start(); // lcj - this was previously run with - 1000 - and created a wrong ilfe
                int end = gffdata.end() ;// lcj - this was previously run with +1000 - it was wrong, forgot to delete
                byte[] genebytes = myRefSequence.chromosomeSequence(chrom,start,end);
                String geneString = NucleotideAlignmentConstants.nucleotideBytetoString(genebytes);
                String tagSeq = ">" + gffdata.name() + ":" + chr + ":" + start + ":" + end + "\n" + geneString + "\n"; // write header and data with newlines
                geneBW.write(tagSeq);
                
                // write to anchors file.  THis file will be used in LoadAnchorsToDB.java 
                String anchorDataString = chr + "," + start + "," + end + "\n";
                anchorFilebw.write(anchorDataString);
            }
            
            System.out.println("Gene fasta written to " + geneFasta);
            System.out.println("Finished chrom " + chr + ",num total fasta lines: " + totalCount + ", number genes processed: " + geneCount);
 
            geneBW.close();
            anchorFilebw.close();
        } catch (Exception exc) {
            exc.printStackTrace();
        }       
    }

    /**
     * @param args
     */
    public static void main(String[] args) {

        List infiles;
        String refFile = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/Zea_mays.AGPv4.dna.toplevel.fa";
//        String geneFile = "/Volumes/Samsung_T1/wgs_pipeline/v4_gff_files/chr10/gffv4Rel34_chr10_gene.txt";
//        String outputBase = "/Users/lcj34/notes_files/repgen/wgs_pipeline/anchorsFromV4gff/chr10/";
//        String chr = "10";
        
        //String outputBase = "/Users/lcj34/notes_files/repgen/wgs_pipeline/debug/";
        //String outputBase = "/Users/lcj34/notes_files/repgen/wgs_pipeline/anchorsFromV4gff/allGeneAnchorFiles/may25_doubleFiles/";
        String outputBase = "/Users/lcj34/notes_files/repgen/wgs_pipeline/anchorsFromV4gff/allGeneAnchorFiles/july2017_phg_intervalsOldMethod/";
        String geneFile = "/Volumes/Samsung_T1/wgs_pipeline/v4_gff_files/allGeneBedFiles";
        int numFlanking = 1000; // number flanking bps to add on each end
        //String geneFile = "/Volumes/Samsung_T1/wgs_pipeline/v4_gff_files/chr10/gffv4Rel34_chr10_gene.txt"; // just 1 gene for testing
        // create list of directory files
        File dirList = new File(geneFile);
        if (!(dirList.exists())) {
            throw new IllegalStateException("Input file or directory not found !!") ;
        }
 
        if (dirList.isDirectory()) {
            String inputFileGlob="glob:*{.txt}";
            infiles= DirectoryCrawler.listPaths(inputFileGlob, Paths.get(geneFile).toAbsolutePath());
            if (infiles.size() < 1) {
                throw new IllegalStateException("no .txt files found in input directory !!");
            }
        } else {
            infiles=new ArrayList<>();
            Path filePath= Paths.get(geneFile).toAbsolutePath(); 
            infiles.add(filePath);
        }
        Collections.sort(infiles); // not really needed
        
        System.out.println("Size of infiles: " + infiles.size());
        
        // create reference
        System.out.println("BEGIN createAnchorFilesFromGeneGFF - create reference");
        myRefSequence = GenomeSequenceBuilder.instance(refFile);
        
        // Loops through all chromosomes
        for  (Path filepath: infiles) {
            // file looks like:  gffv4Rel34_chr2_gene.txt
            String fileName = filepath.getFileName().toString();
            String fileNameSubstr = fileName.substring(fileName.indexOf("_")+1);
            
            String chrName = fileNameSubstr.substring(0,fileNameSubstr.indexOf("_"));// get second one
            chrName = chrName.toUpperCase();
            
            chrName = chrName.replace("CHR", ""); // keep number string, minus  leading "chr" 
            System.out.println("\nCreateAnchorFilesFromGeneGFF:  processing chromsome " + chrName);
            //mainProcessDataJustGenes(filepath.toString(),outputBase,chrName);
            mainProcessMergeOverlapsAddGapDifference(filepath.toString(),outputBase,chrName,numFlanking);
            
        }
        System.out.println("\n\nFInished all chrom files!");
 
    }
}