net.maizegenetics.pangenome.db_loading.CreateIntervalsFileFromGffPlugin Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.db_loading;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import javax.swing.ImageIcon;
import org.apache.log4j.Logger;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;
import net.maizegenetics.analysis.gbs.v2.TagExportToFastqPlugin;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.CheckSum;
import net.maizegenetics.util.Utils;
/**
* This class creates the interval files needed for running GATK haplotype caller,
* and the csv files needed for loading reference sequence into the database.
*
* Two sets of files are created: one set has coordinates based just on the ref
* gene coordinates. The other is gene coordinates plus user-specified flanking regions
*
* Algorithm:
* 1. read gff file, grab gene coordinates
* 2. For each Chromosome: merge genes that overlap, toss genes that are embedded within another gene
* Store list as mergedGeneList.
* 3. Using the mergedGeneList in 3, create 2nd per-chrom coordinate lists that includes flanking regions
* 4. Write files:
* interval format (chrom:start-end): a. mergedGeneList; b. mergedGEneList with flanking
* csv format (chr,anchorstart,anchorend,geneStart,geneEnd,geneName)
* a. mergedGeneList; b. mergedGEneList with flanking
* debug files: List of merged, list of embedded files written for informational purposes
*
* NOTE: the csv files contain the name of all genes contained in an anchor. This data is not
* stored in the DB. IT is included because the biologists have at times asked for it
* and this is a good place for it to be stored and retrieved.
*
* INPUT:
* 1. refFile: String: path to reference genome. needed to find size of chromosomes for
* adding flanking regions to last chrom entry.
* 2. geneFile: String: path to single file containing all chrom gene data in GFF format; or
* path to directory containing per-chrom files with gene data in GFF format.
* These data files must consist of GFF gene data alone, not the full gff.
* 3. outputBase: String: directory, including trailing "/", where output files will be written.
* 4. numFlanking: int: number of flanking bps to add on each end of the anchors.
*
* OUTPUT:
* 1. intervals file based on gene coordinates.
* 2. intervals file based on gene coordinates + numflanking bps
* 3. csv file based on gene coordinates
* 4. csv file based on gene coordinates + numflanking bps
*
* @author lcj34
*
*/
public class CreateIntervalsFileFromGffPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(CreateIntervalsFileFromGffPlugin.class);
private PluginParameter myRefFile = new PluginParameter.Builder("refFile", null, String.class).guiName("Ref Genome File").required(true).inFile()
.description("Fasta file containing reference genome").build();
private PluginParameter myGeneFile = new PluginParameter.Builder("geneFile", null, String.class).guiName("Gene File").required(true).inFile()
.description("Tab delimited .txt file containing gene-only GFF data from reference GFF file for all desired chromosomes, ").build();
private PluginParameter myOutputDir = new PluginParameter.Builder("outputDir", null, String.class).guiName("Output Directory").required(true).outFile()
.description("Directory where output files will be written").build();
private PluginParameter myNumFlanking = new PluginParameter.Builder("numFlanking", 1000, Integer.class).guiName("Number of Flanking BPs")
.description("Number of flanking basepairs to add at each end of the gene sequence").build();
public CreateIntervalsFileFromGffPlugin() {
super(null, false);
}
public CreateIntervalsFileFromGffPlugin(Frame parentFrame) {
super(parentFrame, false);
}
public CreateIntervalsFileFromGffPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
public DataSet processData(DataSet input) {
System.out.println(" CreateIntervalsFileFromGff using gene file: " + geneFile() + ", create ref GenomeSequence");
GenomeSequence myRefSequence = GenomeSequenceBuilder.instance(refFile());
// Position has both chrom and physical position
RangeMap geneRange = TreeRangeMap.create();
RangeMap flankingRange = TreeRangeMap.create();
try {
// Process all chrom gene input files
BufferedReader genebr = Utils.getBufferedReader(geneFile());
String geneline;
while ((geneline = genebr.readLine()) != null) {
String[] geneTokens = geneline.split("\\t");
String chrom = geneTokens[0];
// Gene column looks as below. Grab just the gene name
//ID=gene:Zm00001d027231;biotype=protein_coding;gene_id=Zm00001d027231;logic_name=maker_gene
String description = geneTokens[8];
String genename = description.split(";")[0].split(":")[1];
Chromosome curChrom = new Chromosome(chrom);
Position startPos = new GeneralPosition.Builder(curChrom,Integer.parseInt(geneTokens[3])).build();
Position endPos = new GeneralPosition.Builder(curChrom,Integer.parseInt(geneTokens[4])).build();
addRange(geneRange, Range.closed(startPos, endPos),genename);
}
genebr.close();
// create anchors with flanking
System.out.println("Call createFlankingList");
flankingRange = createFlankingList( geneRange,numFlanking(), myRefSequence);
if (flankingRange == null) {
System.out.println("CreateFlankingList failed - exit");
return null;
}
// Step 4: write the CSV/Intervals files
System.out.println("Begin writing files");
// Files to write.
String anchorFileString = outputDir() + "anchorCoordinates_withFlanking_allchrs.csv";
BufferedWriter flankingAnchorbw = Utils.getBufferedWriter(anchorFileString);
String anchorFileJustGenes = outputDir() + "anchorCoordinates_justGenes_allchrs.csv";
BufferedWriter genesAnchorbw = Utils.getBufferedWriter(anchorFileJustGenes);
String intervalsFlankingString = outputDir() + "anchorCoordinates_withFlanking_allchrs.intervals";
BufferedWriter flankingIntervalsbw = Utils.getBufferedWriter(intervalsFlankingString);
String intervalsJustGenes = outputDir() + "anchorCoordinates_justGenes_allchrs.intervals";
BufferedWriter genesIntervalsbw = Utils.getBufferedWriter(intervalsJustGenes);
// add header only once, then call to write files
String anchorFileHeader = "Chr,StartPos,EndPos,GeneStart,GeneEnd,GeneName\n";
flankingAnchorbw.write(anchorFileHeader);
genesAnchorbw.write(anchorFileHeader);
writeFiles(geneRange, flankingRange, flankingAnchorbw, genesAnchorbw,
flankingIntervalsbw, genesIntervalsbw);
flankingAnchorbw.close();
genesAnchorbw.close();
flankingIntervalsbw.close();
genesIntervalsbw.close();
// Step 5: write plugin parameters file
printParametersFile( outputDir(), refFile(), geneFile(), numFlanking());
} catch (Exception exc) {
exc.printStackTrace();
}
System.out.println("\n\nFInished all chrom files!");
return null;
}
private static void addRange(RangeMap geneRange, Range range, String gene) {
List, String>> overlaps = new ArrayList<>(
geneRange.subRangeMap(range).asMapOfRanges().entrySet());
//if overlaps has length, merge ranges together
if (overlaps.size() != 0) {
Map.Entry, String> overlappingEntry = geneRange.getEntry(overlaps.get(0).getKey().lowerEndpoint());
//then use the combined range and assign the call
String newGene = overlappingEntry.getValue() + "-" + gene;
// Update overlappingEntry value with new merged gene value.
// 2nd put is to ensure new entry is merged with the new value
geneRange.put(overlappingEntry.getKey(),newGene);
geneRange.putCoalescing(range, newGene);
}
else {
geneRange.put(range, gene);
}
}
private static Position findFlankingStartPos(RangeMap geneRange,
Range data, int numFlanking, int chromLen) {
int flankCheck = numFlanking*2;
Chromosome chrom = data.lowerEndpoint().getChromosome();
int curStart = data.lowerEndpoint().getPosition();
int lowerflank = data.lowerEndpoint().getPosition() - flankCheck;
Position flankStart2000 = new GeneralPosition.Builder(chrom,lowerflank).build();
Position flankUpToLower = new GeneralPosition.Builder(chrom,curStart-1).build();
Range startCheck = Range.closed(flankStart2000,flankUpToLower);
// Want to add 1000bp (or user defined number) flanking on either side of each entry in geneRange map,
// then add the new range positions to the flankingRange map to be returned.
// Before adding the 1000 bps, need to verify
// 1. there are 1000 bps between the start of the chrom and the start of this entry (or start = 1)
// 2. the distance between this entry and the current entry is >= 2000 (1000 on each end)
// 3. the distance between this entry and the chrom len is at least 1000 (otherwise chromlen = end)
List, String>> overlapsStart = new ArrayList<>(
geneRange.subRangeMap(startCheck).asMapOfRanges().entrySet());
Position newLowerPos = null;
if (overlapsStart.size() > 0) {
// the last overlap should be the closest in position to the current range start
int prevEnd = overlapsStart.get(overlapsStart.size()-1).getKey().upperEndpoint().getPosition() ;
int newFlankNum = curStart-prevEnd;
int newLowerInt = (curStart - newFlankNum/2) + 1;// want start to be 1 past ;
newLowerPos = new GeneralPosition.Builder(chrom,newLowerInt).build();
} else {
int newLowerInt = curStart - numFlanking < 1 ? 1 : curStart - numFlanking;
newLowerPos = new GeneralPosition.Builder(chrom,newLowerInt).build();
}
return newLowerPos;
}
private static Position findFlankingEndPos(RangeMap geneRange,
Range data, int numFlanking, int chromLen) {
Chromosome chrom = data.lowerEndpoint().getChromosome();
int flankCheck = numFlanking*2;
int curEnd = data.upperEndpoint().getPosition();
int upperflank = data.upperEndpoint().getPosition() + flankCheck;
Position curEndPlusOne = new GeneralPosition.Builder(chrom,curEnd+1).build();
Position flankEnd2000 = new GeneralPosition.Builder(chrom,upperflank).build();
Range endCheck = Range.closed(curEndPlusOne,flankEnd2000);
List, String>> overlapsEnd = new ArrayList<>(
geneRange.subRangeMap(endCheck).asMapOfRanges().entrySet());
Position newUpperPos = null;
if (overlapsEnd.size() > 0) {
// the first overlap should be the closest in position to the current range end
int nextStart = overlapsEnd.get(0).getKey().lowerEndpoint().getPosition() ;
int newFlankNum = nextStart-curEnd;
int newUpperInt = (curEnd + newFlankNum/2); // don't add 1 here, gets added at start
newUpperPos = new GeneralPosition.Builder(chrom,newUpperInt).build();
} else {
int newUpperInt = curEnd + numFlanking > chromLen ? chromLen : curEnd + numFlanking;
newUpperPos = new GeneralPosition.Builder(chrom,newUpperInt).build();
}
return newUpperPos;
}
private static RangeMap createFlankingList(RangeMap geneRange,
int numFlanking, GenomeSequence myRefSequence) {
// there are no embedded or overlapped entries in the geneRange map.
// The goal is to add 1000bps flanking to start/end of each entry.
// there may not be 2000bps between each gene entry. If not, split the difference
// between: half to geneA end, half to geneB start. This is done in
// findFlankingStartPos() and findFlankingEndPos()
RangeMap flankingRange = TreeRangeMap.create();
try {
geneRange.asMapOfRanges().entrySet().stream().forEach(range -> {
Range data = range.getKey();
Chromosome chrom = data.lowerEndpoint().getChromosome();
int chromLen = myRefSequence.chromosomeSize(chrom);
// Find new start/end positions with specified nubmer of flanking, add to map
Position newLowerPos = findFlankingStartPos( geneRange, data,numFlanking,chromLen);
Position newUpperPos = findFlankingEndPos( geneRange, data,numFlanking,chromLen);
flankingRange.put(Range.closed(newLowerPos, newUpperPos), range.getValue());
});
if (geneRange.asMapOfRanges().size() != flankingRange.asMapOfRanges().size()) {
System.out.println("ERROR - mergedGeneList size " + geneRange.asMapOfRanges().size()
+ " does NOT equal lastGeneList size " + flankingRange.asMapOfRanges().size());
return null;
}
return flankingRange;
} catch (Exception exc) {
exc.printStackTrace();
return null;
}
}
// This method writes 4 files: anchors file in csv format including gene names for both anchors-with-flanking
// and genes-only; and the same 2 files in "intervals" format for use with GATK haplotype caller.
// Intervals file (for haplotype caller) needs the format:
// 8:12921-13181 (chrom:startpos-endpos)
// CSV file (for loading db) needs format:
// chr,anchorstart,anchorend,genestart,geneend,geneName
private static void writeFiles(RangeMap geneRangeMap, RangeMap anchorRangeMap,
BufferedWriter flankingAnchorbw, BufferedWriter genesAnchorbw,
BufferedWriter flankingIntervalsbw, BufferedWriter genesIntervalsbw){
try {
// These lists should be of the same size and must be printed in sequential order
List> geneList = new ArrayList>(geneRangeMap.asMapOfRanges().keySet());
List> anchorList = new ArrayList>(anchorRangeMap.asMapOfRanges().keySet());
System.out.println("writeFiles: size of geneList: " + geneList.size()
+ ", size of anchorList: " + anchorList.size());
for (int idx = 0; idx < geneList.size(); idx++) {
Range anchorRange = anchorList.get(idx);
int astart = anchorRange.lowerEndpoint().getPosition();
int aend = anchorRange.upperEndpoint().getPosition();
Range geneRange = geneList.get(idx);
int gstart = geneRange.lowerEndpoint().getPosition();
int gend = geneRange.upperEndpoint().getPosition();
String chrom = geneRange.lowerEndpoint().getChromosome().getName();
String gene = geneRangeMap.get(geneRange.lowerEndpoint());
// anchors with flanking
StringBuilder anchorSB = new StringBuilder();
anchorSB.append(chrom).append(",").append(astart).append(",").append(aend).append(",")
.append(gstart).append(",").append(gend).append(",")
.append(gene).append("\n");
flankingAnchorbw.write(anchorSB.toString());
// anchors just-genes
anchorSB.setLength(0);
anchorSB.append(chrom).append(",").append(gstart).append(",").append(gend).append(",")
.append(gstart).append(",").append(gend).append(",")
.append(gene).append("\n");
genesAnchorbw.write(anchorSB.toString());
// interval file - with flanking
StringBuilder intervalSB = new StringBuilder();
intervalSB.append(chrom).append(":").append(astart).append("-").append(aend).append("\n");
flankingIntervalsbw.write(intervalSB.toString());
// interval file - just genes
intervalSB.setLength(0);
intervalSB.append(chrom).append(":").append(gstart).append("-").append(gend).append("\n");
genesIntervalsbw.write(intervalSB.toString());
}
} catch (Exception exc) {
exc.printStackTrace();
}
}
private static void printParametersFile(String outputDir, String refFile, String geneFile, int numFlanking) {
// Create the parameters file as per JIRA PHG_6
String paramFile = outputDir + "CreateIntervalsFileFromGff_parameter.txt";
String geneCSV = outputDir + "anchorCoordinates_justGenes_allchrs.csv";
String geneIntervals = outputDir + "anchorCoordinates_justGenes_allchrs.intervals";
String flankingCSV = outputDir + "anchorCoordinates_withFlanking_allchrs.csv";
String flankingIntervals = outputDir + "anchorCoordinates_withFlanking_allchrs.intervals";
try {
BufferedWriter bw = Utils.getBufferedWriter(paramFile);
String headers = "PluginName\tDateExecuted\tParameterName\tParameter_IO\tParameterType\tValue\tSHA-1\n";
bw.write(headers);
DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss");
LocalDateTime now = LocalDateTime.now();
String formattedTime = dtf.format(now);
String nameDate = "CreateIntervalsFileFromGff\t" + formattedTime + "\t";
String refCheckSum = CheckSum.getProtocolChecksum(refFile, "SHA-1");
StringBuilder paramSB = new StringBuilder().append(nameDate).append("refFile\tinput\tString\t")
.append(refFile).append("\t").append(refCheckSum).append("\n");
bw.write(paramSB.toString());
paramSB.setLength(0);
String geneCheckSum = CheckSum.getProtocolChecksum(geneFile, "SHA-1");
paramSB.append(nameDate).append("geneFile\tinput\tString\t").append(geneFile).append("\t")
.append(geneCheckSum).append("\n");
bw.write(paramSB.toString());
paramSB.setLength(0);
paramSB.append(nameDate).append("numFlanking\tinput\tint\t").append(numFlanking).append("\tNA\n");
bw.write(paramSB.toString());
paramSB.setLength(0);
// List the output files. These files are outputDir + name
String geneCSVcheckSum = CheckSum.getProtocolChecksum(geneCSV, "SHA-1");
paramSB.append(nameDate).append("outputDir\toutput\tString\t").append(geneCSV).append("\t")
.append(geneCSVcheckSum).append("\n");
bw.write(paramSB.toString());
paramSB.setLength(0);
String geneIntervalsCheckSum = CheckSum.getProtocolChecksum(geneIntervals, "SHA-1");
paramSB.append(nameDate).append("outputDir\toutput\tString\t").append(geneIntervals).append("\t")
.append(geneIntervalsCheckSum).append("\n");
bw.write(paramSB.toString());
paramSB.setLength(0);
String flankingCSVCheckSum = CheckSum.getProtocolChecksum(flankingCSV, "SHA-1");
paramSB.append(nameDate).append("outputDir\toutput\tString\t").append(flankingCSV).append("\t")
.append(flankingCSVCheckSum).append("\n");
bw.write(paramSB.toString());
paramSB.setLength(0);
String flankingIntervalsCheckSum = CheckSum.getProtocolChecksum(flankingIntervals, "SHA-1");
paramSB.append(nameDate).append("outputDir\toutput\tString\t").append(flankingIntervals).append("\t")
.append(flankingIntervalsCheckSum).append("\n");
bw.write(paramSB.toString());
paramSB.setLength(0);
bw.close();
} catch (Exception exc) {
exc.printStackTrace();
}
}
/**
* @param args
*/
// public static void main(String[] args) {
//
// String refFile = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/Zea_mays.AGPv4.dna.toplevel.fa";
// String outputBase = "/Users/lcj34/notes_files/repgen/wgs_pipeline/anchorsFromV4gff/allGeneAnchorFiles/july2017_phg_intervals_RangeMaps/";
// //String geneFile = "/Volumes/Samsung_T1/wgs_pipeline/v4_gff_files/allGeneBedFiles";
// String geneFile = "/Volumes/Samsung_T1/wgs_pipeline/v4_gff_files/gffv4Rel34_all10chroms_gene.txt";
// int numFlanking = 1000; // number flanking bps to add on each end
//
// mainProcessMergeOverlapsAddGapDifference(refFile,geneFile,outputBase,numFlanking);
// }
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(CreateIntervalsFileFromGff.class);
// }
/**
* Convenience method to run plugin with one return object.
*/
// TODO: Replace with specific type.
public String runPlugin(DataSet input) {
return (String) performFunction(input).getData(0).getData();
}
@Override
public ImageIcon getIcon() {
// TODO Auto-generated method stub
return null;
}
@Override
public String getButtonName() {
// TODO Auto-generated method stub
return null;
}
@Override
public String getToolTipText() {
// TODO Auto-generated method stub
return null;
}
/**
* Fasta file containing reference genome
*
* @return Ref Genome File
*/
public String refFile() {
return myRefFile.value();
}
/**
* Set Ref Genome File. Fasta file containing reference
* genome
*
* @param value Ref Genome File
*
* @return this plugin
*/
public CreateIntervalsFileFromGffPlugin refFile(String value) {
myRefFile = new PluginParameter<>(myRefFile, value);
return this;
}
/**
* Tab delimited .txt file containing gene-only GFF data
* from reference GFF file,
*
* @return Gene File
*/
public String geneFile() {
return myGeneFile.value();
}
/**
* Set Gene File. Tab delimited .txt file containing gene-only
* GFF data from reference GFF file,
*
* @param value Gene File
*
* @return this plugin
*/
public CreateIntervalsFileFromGffPlugin geneFile(String value) {
myGeneFile = new PluginParameter<>(myGeneFile, value);
return this;
}
/**
* Directory where output files will be written
*
* @return Output Directory
*/
public String outputDir() {
return myOutputDir.value();
}
/**
* Set Output Directory. Directory where output files
* will be written
*
* @param value Output Directory
*
* @return this plugin
*/
public CreateIntervalsFileFromGffPlugin outputDir(String value) {
myOutputDir = new PluginParameter<>(myOutputDir, value);
return this;
}
/**
* Number of flanking basepairs to add at each end of
* the gene sequence
*
* @return Number of Flanking BPs
*/
public Integer numFlanking() {
return myNumFlanking.value();
}
/**
* Set Number of Flanking BPs. Number of flanking basepairs
* to add at each end of the gene sequence
*
* @param value Number of Flanking BPs
*
* @return this plugin
*/
public CreateIntervalsFileFromGffPlugin numFlanking(Integer value) {
myNumFlanking = new PluginParameter<>(myNumFlanking, value);
return this;
}
}