net.maizegenetics.pangenome.db_loading.CreateGeneIntervalsFromConservedGFFPlugin Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.db_loading;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.swing.ImageIcon;
import org.apache.log4j.Logger;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Utils;
/**
* Creates bed file with chromosome and coordinates to be used to create PHG reference ranges.
* Gene names are included with each reference range. The gene name column
* is ignored when loading the bed file - it is there for when we want to associate
* reference ranges with the genes on which they were based.
*
* @author lcj34
*
*/
public class CreateGeneIntervalsFromConservedGFFPlugin extends AbstractPlugin{
private static final Logger myLogger = Logger.getLogger(CreateGeneIntervalsFromConservedGFFPlugin.class);
private PluginParameter myGffFile = new PluginParameter.Builder("gffFile", null, String.class).guiName("Gene File").required(true).inFile()
.description("Tab delimited .txt file containing gene-only GFF data from reference GFF file for all desired chromosomes, ").build();
private PluginParameter myOutputFile = new PluginParameter.Builder("outputFile", null, String.class).guiName("Output File").required(true).outFile()
.description("Full path of file to be written").build();
private PluginParameter myConservedGeneFile = new PluginParameter.Builder("conservedGeneFile", null, String.class).guiName("Conserved Gene File").required(true).inFile()
.description("File containing list of conserved genes to include as anchor regions ").build();
public CreateGeneIntervalsFromConservedGFFPlugin() {
super(null, false);
}
public CreateGeneIntervalsFromConservedGFFPlugin(Frame parentFrame) {
super(parentFrame, false);
}
public CreateGeneIntervalsFromConservedGFFPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
public static void main(String[] args) {
GeneratePluginCode.generate(CreateGeneIntervalsFromConservedGFFPlugin.class);
}
@Override
public DataSet processData(DataSet input) {
myLogger.info(" CreateGeneIntervalsFromConservedGFFPlugin using gene file: " + gffFile() + ", create ref GenomeSequence");
// Position has both chrom and physical position
RangeMap geneRange = TreeRangeMap.create();
// String anchorFileJustGenes = outputDir() + "anchorCoordinates_conservedGenes_allchrs.bed";
String anchorFileJustGenes = outputFile();
List conservedGeneList = new ArrayList<>();
try (BufferedReader gffbr = Utils.getBufferedReader(gffFile());
BufferedReader conservedbr = Utils.getBufferedReader(conservedGeneFile());
BufferedWriter genesAnchorbw = Utils.getBufferedWriter(anchorFileJustGenes)){
// Process all chrom gene input files
String line = null;
while ((line = conservedbr.readLine()) != null) {
conservedGeneList.add(line);
}
// This would re-sort incorrectly - string of start puts 10 before 9
//Collections.sort(conservedGeneList); // gff file already in order
String geneline;
while ((geneline = gffbr.readLine()) != null) {
String[] geneTokens = geneline.split("\\t");
String chrom = geneTokens[0];
// Gene column looks as below. Grab just the gene name
//ID=gene:Zm00001d027231;biotype=protein_coding;gene_id=Zm00001d027231;logic_name=maker_gene
String description = geneTokens[8];
String genename = description.split(";")[0].split(":")[1];
if (!conservedGeneList.contains(genename)) continue; // only process conserved genes
Chromosome curChrom = Chromosome.instance(chrom);
Position startPos = new GeneralPosition.Builder(curChrom,Integer.parseInt(geneTokens[3])).build();
Position endPos = new GeneralPosition.Builder(curChrom,Integer.parseInt(geneTokens[4])).build();
addRange(geneRange, Range.closed(startPos, endPos),genename);
}
// Step 4: write the bed files
myLogger.info("Begin writing files");
// DON'T WRITE - header, is giving me problems
String anchorFileHeader = "#Chr\tGeneStart\tGeneEnd\tGeneNames\n";
//genesAnchorbw.write(anchorFileHeader);
writeFiles(geneRange, genesAnchorbw);
} catch (Exception exc) {
exc.printStackTrace();
}
myLogger.info("\n\nFinished all chrom files!");
return null;
}
private static void addRange(RangeMap geneRange, Range range, String gene) {
List, String>> overlaps = new ArrayList<>(
geneRange.subRangeMap(range).asMapOfRanges().entrySet());
//if overlaps has length, merge ranges together
if (overlaps.size() != 0) {
Map.Entry, String> overlappingEntry = geneRange.getEntry(overlaps.get(0).getKey().lowerEndpoint());
//then use the combined range and assign the call
String newGene = overlappingEntry.getValue() + "-" + gene;
// Update overlappingEntry value with new merged gene value.
// 2nd put is to ensure new entry is merged with the new value
geneRange.put(overlappingEntry.getKey(),newGene);
geneRange.putCoalescing(range, newGene);
}
else {
geneRange.put(range, gene);
}
}
private static void writeFiles(RangeMap geneRangeMap,BufferedWriter genesAnchorbw){
try {
// These lists should be of the same size and must be printed in sequential order
List> geneList = new ArrayList>(geneRangeMap.asMapOfRanges().keySet());
myLogger.info("writeFiles: size of geneList: " + geneList.size() );
for (int idx = 0; idx < geneList.size(); idx++) {
Range geneRange = geneList.get(idx);
int gstart = geneRange.lowerEndpoint().getPosition()-1; // bed file is 0-based, gff was 1-based
int gend = geneRange.upperEndpoint().getPosition(); // bed files is inclusive/exclusive, gff was inclusive/inclusive
String chrom = geneRange.lowerEndpoint().getChromosome().getName();
String gene = geneRangeMap.get(geneRange.lowerEndpoint());
StringBuilder anchorSB = new StringBuilder();
// anchors just-genes
// NOTE: genes are written twice because the load script is still
// written for a db table that contains gene start/end. This needs
// to change.
// NOTE _ we do NOT need genes written twice
anchorSB.setLength(0);
anchorSB.append(chrom).append("\t")
.append(gstart).append("\t").append(gend).append("\t")
.append(gene).append("\n");
genesAnchorbw.write(anchorSB.toString());
}
} catch (Exception exc) {
exc.printStackTrace();
}
}
@Override
public ImageIcon getIcon() {
// TODO Auto-generated method stub
return null;
}
@Override
public String getButtonName() {
// TODO Auto-generated method stub
return null;
}
@Override
public String getToolTipText() {
// TODO Auto-generated method stub
return null;
}
/**
* Tab delimited .txt file containing gene-only GFF data
* from reference GFF file for all desired chromosomes,
*
*
* @return Gene File
*/
public String gffFile() {
return myGffFile.value();
}
/**
* Set Gene File. Tab delimited .txt file containing gene-only
* GFF data from reference GFF file for all desired chromosomes,
*
*
* @param value Gene File
*
* @return this plugin
*/
public CreateGeneIntervalsFromConservedGFFPlugin gffFile(String value) {
myGffFile = new PluginParameter<>(myGffFile, value);
return this;
}
/**
* Name of output file to be written
*
* @return Output File
*/
public String outputFile() {
return myOutputFile.value();
}
/**
* Set Output File. Name of file to be written
*
* @param value Output Directory
*
* @return this plugin
*/
public CreateGeneIntervalsFromConservedGFFPlugin outputFile(String value) {
myOutputFile = new PluginParameter<>(myOutputFile, value);
return this;
}
/**
* File containing list of conserved genes to include
* as anchor regions
*
* @return Conserved Gene File
*/
public String conservedGeneFile() {
return myConservedGeneFile.value();
}
/**
* Set Conserved Gene File. File containing list of conserved
* genes to include as anchor regions
*
* @param value Conserved Gene File
*
* @return this plugin
*/
public CreateGeneIntervalsFromConservedGFFPlugin conservedGeneFile(String value) {
myConservedGeneFile = new PluginParameter<>(myConservedGeneFile, value);
return this;
}
}