net.maizegenetics.pangenome.db_loading.CreateAnchorFilesFromGeneGFF Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.db_loading;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Utils;
/**
* Based on WGS_whatever.CreateAnchorsFromGeneGff.java
*
* This method creates two fasta files of anchor coordinates:
* The first has exact gene coordinates to be used when blasting
* B73 reference genes against a particular assembly, e.g. Sorghum.
*
* The second has genes with 1000bp flanking added.
*
* Method:
*
* Takes a gff file sorted to contain only gene data
* If any gene is embedded within another, it is tossed.
* If any gene start position overlaps the previous gene end position,
* merge the gene.
*
* After mergin genes, add 1000bp flanking on either end. If there is less
* than 2000 bps between genes, spli the difference
*
* Can take 1 file or a directory of files. The file names are assume to look
* like this:
* gffv4Rel34_chr5_gene.txt
* This matters to the parsing of the chromosome name (ie "5" above)
*
* OUTPUT Notes:
* 1. Two csv files of chr,start,end are created for each chromosome. The first
* has just merged/tossed gene coordinates, the second has these genes with 1000bp flanking added.
* 2. A fasta file for each chromosome is created: contains all the anchors
* for that chromosome. This was created when the thought was that Zack needed
* it for comparison. THis way he didn't have to go through a db.
* 3. The fasta files are not used in LoadAnchorsToPHGdb.java. It uses the csv file
* created which contains that chr,start,end. They ARE used when blasting genes
* against the assemblies.
* 4. AFter all the chromosomes have been processed, concat them into 1 file
* for use with LoadANchorsToPHGdb.java via this command line:
* cat anchorsFile_chr1_MergedPlus1000orGapDiffcoordinates.csv anchorsFile_chr2_MergedPlus1000orGapDiffcoordinates.csv ... > anchorsFile_all10_MergedPlus1000orGapDiffcoordinates.csv
* - Then edit out the header lines except for the first one.
*
* @author lcj34
*
*/
public class CreateAnchorFilesFromGeneGFF {
static GenomeSequence myRefSequence = null;
// THis method creates 2 sets of files that should have the same number of entries,
// but different lenght for the sequences.
// BOth the genes-only (which is merged/tossed genes) and the genes-with_flanking (the
// merged/tossed genes with 1000bps added to each side) are created in this method. They
// need to be created together as the former is used for blasting assemblies, the latter
// for storing to the db. If they aren't created together, we get incompatible number
// of genes.
public static void mainProcessMergeOverlapsAddGapDifference(String geneFile, String outputBase, String chr, int numFlanking) {
Chromosome chrom = new Chromosome(chr);
BufferedReader genebr = Utils.getBufferedReader(geneFile);
String mergedGenes = outputBase + "chr" + chr + "_mergedGenes.txt";
BufferedWriter mergedGenesbw = Utils.getBufferedWriter(mergedGenes);
String embeddedGenes = outputBase + "chr" + chr + "_embeddedGenes.txt";
BufferedWriter embeddedbw = Utils.getBufferedWriter(embeddedGenes);
try {
String geneline;
int geneCount = 0;
int mergedGeneCount = 0;
int skippedGeneCount = 0;
System.out.println("\nStart processing genes");
int chromLen = myRefSequence.chromosomeSize(chrom);
List geneGFFList = new ArrayList();
// Step 1. List of genes from gff file
while ((geneline = genebr.readLine()) != null) {
geneCount++;
String[] geneTokens = geneline.split("\\t");
String description = geneTokens[8];
int semicolonLoc = description.indexOf(";");
String genename = description.substring(0, semicolonLoc); //strip off after gene
int colonLoc = description.indexOf(":");
genename = genename.substring(colonLoc+1); // strip off beginning "ID=gene:", leavning just gene name
// fill in map
GeneGFFData gffData = new GeneGFFData(Integer.parseInt(geneTokens[3]),
Integer.parseInt(geneTokens[4]), genename);
geneGFFList.add(gffData);
}
Collections.sort(geneGFFList); // sort by start position (may already be sorted in gene file)
genebr.close();
// Step 2. Merge any genes that have overlapped or are completely contained in another file
List mergedGeneList = new ArrayList();
GeneGFFData data1 = geneGFFList.get(0);
GeneGFFData data2 = null;
// create the list of merged genes with start/end positions and names
for (int idx = 1; idx < geneGFFList.size(); idx++) {
data2 = geneGFFList.get(idx);
// This creates a list of merged genes.
if (data2.start() > data1.start() && data2.end() <= data1.end()) {
// these genes overlap. skip the second gene
skippedGeneCount++;
String overlaps = chr + " " + data1.toString() + " and " + data2.toString() + "\n";
embeddedbw.write(overlaps);
continue; // skip it
}
if (data2.start() <= (data1.end()) ) {
// merge the 2, update index properly
String mergeLine = data1.name() + " start-end: " + data1.start() + "-" + data1.end() + ", " + data2.name() + " start-end " + data2.start()
+ "-" + data2.end() + "\n";
mergedGenesbw.write(mergeLine);
String name = data1.name() + "-" + data2.name();
data1 = new GeneGFFData(data1.start(), data2.end(), name);
mergedGeneCount++;
// don't yet add to the list - wait until there is no next
} else {
// put data 1 into new list
mergedGeneList.add(data1);
data1 = data2; // set up to compared 2 with the next dataset
}
}
// Handle the last one. It should already be set up.
mergedGeneList.add(data1);
System.out.println("AFter merging overlapped genes: mergeGeneList size " + mergedGeneList.size() + ", mergedGeneCount: " + mergedGeneCount);
//Collections.sort(mergedGeneList); // should already be sorted.
// Step 3: check for flanking overlap, split the difference
data1 = mergedGeneList.get(0);
data2 = null;
int start = data1.start() - 1000 > 0 ? data1.start()-1000 : 1;
data1 = new GeneGFFData(start,data1.end(),data1.name());
List lastGeneList = new ArrayList();
// create the list of genes with gap-difference added
for (int idx = 1; idx < mergedGeneList.size(); idx++) {
data2 = mergedGeneList.get(idx);
// This creates a final list of merged genes with start/end positions that include the 1000 bp flanking region
if ((data2.start() -numFlanking) < (data1.end() + numFlanking) ) {
// find the gap interval, split the difference
int gapInterval = data2.start() - data1.end();
int d2Start = (data2.start() - gapInterval/2) +1;
data2 = new GeneGFFData(d2Start,data2.end(),data2.name());
int d1end = data1.end() + gapInterval/2;
data1 = new GeneGFFData(data1.start(),d1end,data1.name());
// add data1 to list
lastGeneList.add(data1);
data1 = data2;
} else {
// put data 1 into new list
int d1end = data1.end() +1000 < chromLen ? data1.end() + 1000 : chromLen;
data1 = new GeneGFFData(data1.start(),d1end, data1.name()) ;
lastGeneList.add(data1);
int d2start = data2.start() - numFlanking;
data2 = new GeneGFFData(d2start,data2.end(), data2.name());
data1 = data2; // set up to compared 2 with the next dataset
}
}
// add last gene
int d1end = data1.end() +numFlanking < chromLen ? data1.end()+ numFlanking : chromLen;
data1 = new GeneGFFData(data1.start(), d1end, data1.name()) ;
lastGeneList.add(data1);
System.out.println("lastGEneList size: " + lastGeneList.size());
// There should be the same number of entries in each map. If not,
// we have an error and need obort.
if (mergedGeneList.size() != lastGeneList.size()) {
System.out.println("ERROR - mergedGeneList size " + mergedGeneList.size()
+ " does NOT equal lastGeneList size " + lastGeneList.size());
return;
}
System.out.println("MergedGeneList size matches lastGeneList size - continue!");
String anchorFileString = outputBase + "anchorsFile_MergedPlus1000orGapDiffcoordinate_chr" + chr + ".csv";
BufferedWriter flankingAnchorbw = Utils.getBufferedWriter(anchorFileString);
String anchorFileJustGenes = outputBase + "anchorsFile_MergedTossed_justGenes_chr" + chr + ".csv";
BufferedWriter genesAnchorbw = Utils.getBufferedWriter(anchorFileJustGenes);
String anchorFileHeader = "Chr,StartPos,EndPos,GeneStart,GeneEnd\n";
flankingAnchorbw.write(anchorFileHeader);
genesAnchorbw.write(anchorFileHeader);
String genePlus1000Fasta = outputBase + "genes_Plus1000orGapDiffFromRef_chr" + chr+ ".fa";
BufferedWriter flankingFastabw = Utils.getBufferedWriter(genePlus1000Fasta);
String geneFasta = outputBase + "genes_justGenes_chr" + chr + ".fa";
BufferedWriter genesFastabw = Utils.getBufferedWriter(geneFasta);
// Step 4. Write the files
int totalCount = 0;
// write the genes-only file
for (int idx = 0; idx < mergedGeneList.size(); idx++) {
// create fasta lines
GeneGFFData gffdata = mergedGeneList.get(idx);
byte[] genebytes = myRefSequence.chromosomeSequence(chrom,gffdata.start(),gffdata.end());
String geneString = NucleotideAlignmentConstants.nucleotideBytetoString(genebytes);
//write fasta - start/end writeen twice to signify chrom start/end then
// gene start/end. They are the same for this file (no flanking added)
StringBuilder seqSB = new StringBuilder();
seqSB.append(">").append(chr).append(":").append(Integer.toString(gffdata.start()))
.append(":").append(Integer.toString(gffdata.end())).append(":")
.append(Integer.toString(gffdata.start()))
.append(":").append(Integer.toString(gffdata.end()))
.append(" ").append(gffdata.name()).append("\n")
.append(geneString).append("\n");
genesFastabw.write(seqSB.toString());
// write to anchors file. THis file will be used in LoadAnchorsToDB.java
StringBuilder anchorSB = new StringBuilder();
// The gene start and gene end are the same as the chrom start/end position
// as this set has not added the flanking bps.
anchorSB.append(chr).append(",").append(Integer.toString(gffdata.start()))
.append(",").append(Integer.toString(gffdata.end())).append(",")
.append(Integer.toString(gffdata.start())).append(",")
.append(Integer.toString(gffdata.end())).append("\n");
genesAnchorbw.write(anchorSB.toString());
}
// write the genes-with-flanking file
for (int idx = 0; idx < lastGeneList.size(); idx++) {
totalCount++;
// create fasta lines
GeneGFFData gffdata = lastGeneList.get(idx);
GeneGFFData geneData = mergedGeneList.get(idx); // for gene start/end positions
byte[] genebytes = myRefSequence.chromosomeSequence(chrom,gffdata.start(),gffdata.end());
String geneString = NucleotideAlignmentConstants.nucleotideBytetoString(genebytes);
// write fasta - do I need the genestart/geneEnd data added?? I added it
StringBuilder seqSB = new StringBuilder();
seqSB.append(">").append(chr).append(":").append(Integer.toString(gffdata.start()))
.append(":").append(Integer.toString(gffdata.end()))
.append(":").append(Integer.toString(geneData.start())).append(":")
.append(Integer.toString(geneData.end())).append(" ")
.append(gffdata.name()).append("\n")
.append(geneString).append("\n");
flankingFastabw.write(seqSB.toString());
// write to anchors file. THis file will be used in LoadAnchorsToDB.java
//String anchorDataString = chr + "," + gffdata.start() + "," + gffdata.end() + "\n";
StringBuilder csvSB = new StringBuilder();
csvSB.append(chr).append(",").append(Integer.toString(gffdata.start())).append(",")
.append(Integer.toString(gffdata.end())).append(",")
.append(Integer.toString(geneData.start())).append(",")
.append(Integer.toString(geneData.end())).append("\n");
flankingAnchorbw.write(csvSB.toString());
}
System.out.println("Gene plus 1000, merge overlaps, split Gaps fasta written to " + genePlus1000Fasta);
System.out.println("Just merged Gene file written to " + geneFasta);
System.out.println("Finished chrom " + chr + ",num total fasta lines: " + totalCount + ", number genes processed: " + geneCount
+ ", number of genes merged: " + mergedGeneCount + ", number of embedded/skipped genes: " + skippedGeneCount);
genesFastabw.close();
genesAnchorbw.close();
flankingFastabw.close();
flankingAnchorbw.close();
mergedGenesbw.close();
embeddedbw.close();
} catch (Exception exc) {
exc.printStackTrace();
}
}
// THis version makes initial anchors from just the genes.
public static void mainProcessDataJustGenes(String geneFile, String outputBase, String chr) {
Chromosome chrom = new Chromosome(chr);
BufferedReader genebr = Utils.getBufferedReader(geneFile);
String geneFasta = outputBase + "genesChr" + chr + "_genesExactCoordinatesFromRef.fa";
BufferedWriter geneBW = Utils.getBufferedWriter(geneFasta);
String anchorFileString = outputBase + "anchorsFile_chr" + chr + "_genesExactCoordinates.csv";
BufferedWriter anchorFilebw = Utils.getBufferedWriter(anchorFileString);
try {
String geneline;
int geneCount = 0;
System.out.println("\nStart processing genes");
int chromLen = myRefSequence.chromosomeSize(chrom);
List geneGFFList = new ArrayList();
while ((geneline = genebr.readLine()) != null) {
geneCount++;
String[] geneTokens = geneline.split("\\t");
String description = geneTokens[8];
int semicolonLoc = description.indexOf(";");
String genename = description.substring(0, semicolonLoc); //strip off after gene
int colonLoc = description.indexOf(":");
genename = genename.substring(colonLoc+1); // strip off beginning "ID=gene:", leavning just gene name
// fill in map
GeneGFFData gffData = new GeneGFFData(Integer.parseInt(geneTokens[3]),
Integer.parseInt(geneTokens[4]), genename);
geneGFFList.add(gffData);
}
Collections.sort(geneGFFList); // sort by start position (may already be sorted in gene file)
genebr.close();
String anchorFileHeader = "Chr,StartPos,EndPos\n";
anchorFilebw.write(anchorFileHeader);
int totalCount = 0;
// write the files
for (int idx = 0; idx < geneGFFList.size(); idx++) {
totalCount++;
// create fasta lines
GeneGFFData gffdata = geneGFFList.get(idx);
int start = gffdata.start(); // lcj - this was previously run with - 1000 - and created a wrong ilfe
int end = gffdata.end() ;// lcj - this was previously run with +1000 - it was wrong, forgot to delete
byte[] genebytes = myRefSequence.chromosomeSequence(chrom,start,end);
String geneString = NucleotideAlignmentConstants.nucleotideBytetoString(genebytes);
String tagSeq = ">" + gffdata.name() + ":" + chr + ":" + start + ":" + end + "\n" + geneString + "\n"; // write header and data with newlines
geneBW.write(tagSeq);
// write to anchors file. THis file will be used in LoadAnchorsToDB.java
String anchorDataString = chr + "," + start + "," + end + "\n";
anchorFilebw.write(anchorDataString);
}
System.out.println("Gene fasta written to " + geneFasta);
System.out.println("Finished chrom " + chr + ",num total fasta lines: " + totalCount + ", number genes processed: " + geneCount);
geneBW.close();
anchorFilebw.close();
} catch (Exception exc) {
exc.printStackTrace();
}
}
/**
* @param args
*/
public static void main(String[] args) {
List infiles;
String refFile = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/Zea_mays.AGPv4.dna.toplevel.fa";
// String geneFile = "/Volumes/Samsung_T1/wgs_pipeline/v4_gff_files/chr10/gffv4Rel34_chr10_gene.txt";
// String outputBase = "/Users/lcj34/notes_files/repgen/wgs_pipeline/anchorsFromV4gff/chr10/";
// String chr = "10";
//String outputBase = "/Users/lcj34/notes_files/repgen/wgs_pipeline/debug/";
//String outputBase = "/Users/lcj34/notes_files/repgen/wgs_pipeline/anchorsFromV4gff/allGeneAnchorFiles/may25_doubleFiles/";
String outputBase = "/Users/lcj34/notes_files/repgen/wgs_pipeline/anchorsFromV4gff/allGeneAnchorFiles/july2017_phg_intervalsOldMethod/";
String geneFile = "/Volumes/Samsung_T1/wgs_pipeline/v4_gff_files/allGeneBedFiles";
int numFlanking = 1000; // number flanking bps to add on each end
//String geneFile = "/Volumes/Samsung_T1/wgs_pipeline/v4_gff_files/chr10/gffv4Rel34_chr10_gene.txt"; // just 1 gene for testing
// create list of directory files
File dirList = new File(geneFile);
if (!(dirList.exists())) {
throw new IllegalStateException("Input file or directory not found !!") ;
}
if (dirList.isDirectory()) {
String inputFileGlob="glob:*{.txt}";
infiles= DirectoryCrawler.listPaths(inputFileGlob, Paths.get(geneFile).toAbsolutePath());
if (infiles.size() < 1) {
throw new IllegalStateException("no .txt files found in input directory !!");
}
} else {
infiles=new ArrayList<>();
Path filePath= Paths.get(geneFile).toAbsolutePath();
infiles.add(filePath);
}
Collections.sort(infiles); // not really needed
System.out.println("Size of infiles: " + infiles.size());
// create reference
System.out.println("BEGIN createAnchorFilesFromGeneGFF - create reference");
myRefSequence = GenomeSequenceBuilder.instance(refFile);
// Loops through all chromosomes
for (Path filepath: infiles) {
// file looks like: gffv4Rel34_chr2_gene.txt
String fileName = filepath.getFileName().toString();
String fileNameSubstr = fileName.substring(fileName.indexOf("_")+1);
String chrName = fileNameSubstr.substring(0,fileNameSubstr.indexOf("_"));// get second one
chrName = chrName.toUpperCase();
chrName = chrName.replace("CHR", ""); // keep number string, minus leading "chr"
System.out.println("\nCreateAnchorFilesFromGeneGFF: processing chromsome " + chrName);
//mainProcessDataJustGenes(filepath.toString(),outputBase,chrName);
mainProcessMergeOverlapsAddGapDifference(filepath.toString(),outputBase,chrName,numFlanking);
}
System.out.println("\n\nFInished all chrom files!");
}
}