
/**
*
*/
package net.maizegenetics.analysis.gobii;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import javax.swing.ImageIcon;
import org.apache.log4j.Logger;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Utils;
/**
* This is defined in TAS-1098
*
* This plugin takes a haplotype breakpoint file and creates intermediate files
* for the hap_breakpoint and breakpoint_set tables. These tables, together with the
* projection_align table described below, are currently proprietary tables created by
* the Buckler lab to be added to the GOBII postgres DB.
*
* The tables are created from the create_hapBrkptTables.sql on cbsudc01 in directory
* /workdir/lcj34/postgresFiles/gobii_ifl_filesToLoad/gobii_hapbreakpoints
*
* The tables to be created have these entries:
* hap_breakpoint Table:
* hap_breakpoint_id int
* taxa(GID) int
* position_range int4range (start/stop stored as an integer range)
* donor1 (GID) int
* donor2 (GID) int
* breakpoint_set_id int (maps to breakpoint_set table)
*
* breakpoint_set Table:
* breakpoint_set_id int
* name text
* method text (method used to create the breakpoint file, e.g. FILLIN)
* mapset_id int
*
* projection_align Table:
* projection_align_id int
* name text
* het_resolution text
* breakpoint_set_id int
* dataset_id int (gives us the donor file)
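*
* For reference, the intermediate files written by this plugin (built in processData
* below) carry these tab-delimited header lines:
* hap_breakpoint file: taxa chr position_range donor1 donor2 status breakpoint_set_name
* breakpoint_set file: name method mapset_name source_dataset_name status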
*
* The thought is that GOBII IFL scripts will operate successfully on them. It will
* require the tables to be previously created in the DB, and mapping files to be
* created and stored on the db server that can be used to process these files
* for bulk loading.
*
* This plugin will take the set name and use it to populate the breakpoint_set table.
* The index for each table needs to be created the way GOBII does it, with auto-increment
* sequences that live in pg_catalog.
*
* Restrictions: The breakfile format must be as defined by Ed. The first line
* must contain 2 tab-delimited values, the first indicating the number of donors
* and the second indicating the number of "blocks" to process. All lines beginning
* with a "#" are considered comment lines.
*
* There are 2 mapping files required: one for donors, and one for taxa. In the Ames
* example, the donor mapping file is the same file used to curate the WGS dataset 5,
* which is named ZeaWGS_hmp321_raw_AGPv3. The taxa mapping file is the file Cinta
* provides with each subset, e.g. for Ames in Cornell box.
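*
* As an illustration (names and GID values here are hypothetical), both mapping files
* are tab-delimited with a header row. The donor file is scanned for the TaxaColumn and
* GID headers, while the taxa file is scanned for NAME and GID (see the createNameGidMap
* calls in processData):
* TaxaColumn   GID
* PHG84:LIB0123   42000124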
*
* If the TaxaColumn in the mapping file contains a libraryID (e.g. name:libraryID)
* the code removes it, leaving taxa set to just the "name" portion. This is because
* Kelly's pa.txt.gz files do NOT have the library portion, though some of Cinta's
* files do.
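*
* For example (hypothetical entry): a TaxaColumn value of "B73:LIB0456" is keyed in
* the map as just "B73", so it matches the bare "B73" found in a pa.txt.gz file.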
*
* The breakpoint file has donors at top: Donors are Feb-48, PHG84, PHG83, etc.
* Their GIDs come from the WGS mapping file Cinta gave for curating that large vcf set.
* 1210 2711
* #Donor Haplotypes
* 0 Feb-48
* 1 PHG84
* 2 PHG83
*
* And taxa breakpoint blocks at the bottom:
* "taxa" is the first column. It gets it GID from the germplasminformation_Ames_20160810.txt
* file Cinta provided in Cornell Box with the Ames data: Below, 12E and 37 are taxa.
* #Block are defined chr:startPos:endPos:donor1:donor2 (-1 means no hypothesis)
* 12E 1:299497909:299752426:958:1070 1:299777120:300064510:79:958 1:300064521:300335541:1050:1202 1:300351838:300818858:162:7
* 37 2:233507851:234268676:1015:1015 2:234268677:234350614:820:820 2:234350639:234411604:950:950 2:234411637:234464003:191:1
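*
* As a worked example (the GIDs and set name below are hypothetical lookups from the
* mapping files): the first block above for taxa 12E, 1:299497909:299752426:958:1070,
* becomes one tab-delimited row in the hap_breakpoint intermediate file:
* 42001234  1  [299497909,299752426)  42000958  42001070  1  ames_fillin_breakpoints
* where 958 and 1070 index into the donor list read from the top of the file.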
*
*
* The third table, the projection_align table, will be populated with
* another plugin. Users will create their own projection alignment analyses
* for the breakpoint sets of their choice.
*
* -------------
* Sept 14, 2016: Trying Cinta's "name2" field, as the taxa are not matching
* from the taxa column of the taxa mapping file. Donors are working.
*
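* A minimal usage sketch (all paths and names below are hypothetical), following the
* standard TASSEL fluent plugin pattern:
*
* new HapBreakpoints_IFLFilePlugin()
*     .breakFile("/path/to/ames.pa.txt.gz")
*     .setName("ames_fillin_breakpoints")
*     .mapset("AGPV3")
*     .src_dataset("ZeaWGS_hmp321_raw_AGPv3")
*     .method("FILLIN")
*     .donorMapFile("/path/to/donor_mapping.txt")
*     .taxaMapFile("/path/to/taxa_mapping.txt")
*     .outputDir("/path/to/outdir/")
*     .performFunction(null);
*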
* @author lcj34
*
*/
public class HapBreakpoints_IFLFilePlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(HapBreakpoints_IFLFilePlugin.class);
// private PluginParameter<String> dbConfigFile= new PluginParameter.Builder<>("dbConfigFile",null,String.class).guiName("dbConfigFile").required(true)
// .description("DB config file containing connection information to the GOBII instance").build();
private PluginParameter<String> breakFile= new PluginParameter.Builder<>("breakFile",null,String.class).guiName("Breakpoint File").required(true)
.description("Full path to a single file containing breakpoint blocks for projection alignment OR to a directory containing breakpoint files.\nFiles must end with *.pa.txt or *.pa.txt.gz").build();
private PluginParameter<String> setName= new PluginParameter.Builder<>("setName",null,String.class).guiName("Breakpoint Set Name").required(true)
.description("Name to be given to this set of breakpoints. This name will be stored in the breakpointSet table.").build();
private PluginParameter<String> mapset = new PluginParameter.Builder<>("mapset",null,String.class).guiName("mapset").required(true)
.description("Name of the mapset to which these breakpoints refer. Must match an existing name in the mapset table, e.g. AGPV3").build();
private PluginParameter<String> src_dataset = new PluginParameter.Builder<>("src_dataset",null,String.class).guiName("Source Data Set").required(true)
.description("Name of the dataset from which these breakpoints were created. Must match an existing name in the dataset table").build();
private PluginParameter<String> outputDir= new PluginParameter.Builder<>("outputDir",null,String.class).guiName("Path of output directory").required(true)
.description("Full path name of directory to which output files will be written, must end with a /").build();
private PluginParameter<String> method= new PluginParameter.Builder<>("method",null,String.class).guiName("Breakpoint Method").required(true)
.description("Method used to create the breakpoints, e.g. FILLIN, beagle, etc").build();
private PluginParameter<String> donorMapFile = new PluginParameter.Builder<>("donorMapFile",null,String.class).guiName("mappingFile").required(true)
.description("tab-delimited File containing a TaxaColumn name to be used to map breakFile donors with a GID. \nThis may be the same file that was used for adding germplasm and marker data.").build();
private PluginParameter<String> taxaMapFile = new PluginParameter.Builder<>("taxaMapFile",null,String.class).guiName("mappingFile").required(true)
.description("tab-delimited File containing a TaxaColumn name to be used to map breakFile taxa with a GID. \nThis may be a different file than was used for adding germplasm and marker data.").build();
List<Path> infiles;
boolean printTaxa = false;
public HapBreakpoints_IFLFilePlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
public HapBreakpoints_IFLFilePlugin() {
super(null, false);
}
@Override
protected void postProcessParameters() {
File out= new File(outputDir.value());
if (!out.exists()) out.mkdirs();
// create list of directory files
File dirList = new File(breakFile());
if (!(dirList.exists())) {
throw new IllegalStateException("Input file or directory not found !! " + breakFile()) ;
}
if (dirList.isDirectory()) {
System.out.println("LCJ - breakfile is a directory, creating list of files ...");
String inputFileGlob="glob:*{pa.txt,pa.txt.gz}";
infiles= DirectoryCrawler.listPaths(inputFileGlob, Paths.get(breakFile.value()).toAbsolutePath());
if (infiles.size() < 1) {
throw new IllegalStateException("no .pa.txt or .pa.txt.gz files found in input directory !!");
}
} else {
infiles=new ArrayList<>();
Path filePath= Paths.get(breakFile()).toAbsolutePath();
infiles.add(filePath);
}
Collections.sort(infiles);
System.out.println("LCJ - size of infiles: " + infiles.size());
}
//This takes the mapping file, returns a hashmap of taxaname/GID - both strings
// For now, I will strip off anything from a colon and beyond in the TaxaColumn.
// Cinta added both taxa and libraryID, thinking that is what the vcf/hmp file
// would have. Some do. Others don't. Kelly's PA files do NOT have the library ID
// appended, so no taxa is found. If there is much variation between files,
// then whether to expect the libraryID will need to be a parameter to the plugin.
public HashMap<String, String> createNameGidMap(String mappingFile, String taxaField) {
HashMap<String, String> taxaMapArray = new HashMap<>();
BufferedReader taxaMapFilebr = Utils.getBufferedReader(mappingFile, 1 << 22);
String taxaMapLine;
try {
int taxaIdx=-1, gidIdx=-1;
taxaMapLine = taxaMapFilebr.readLine();
if (taxaMapLine != null) {
// find header columns. We just want taxaColumn and GID
String [] headers = taxaMapLine.split("\\t");
int idx = 0;
for (String header : headers) {
if (header.trim().toUpperCase().equals(taxaField)) {
taxaIdx = idx;
} else if (header.trim().toUpperCase().equals("GID")) {
gidIdx = idx;
}
idx++;
}
if (taxaIdx == -1 || gidIdx == -1) {
System.out.println("LCJ - createNameGIDMap returning NULL!! taxaIdx: " + taxaIdx + " gidIdx: " + gidIdx);
return null; // error - missing needed column
}
}
// Now read file and get values
while((taxaMapLine = taxaMapFilebr.readLine() )!= null) {
// this is the mapping file - read and pull taxaname/gid, store in map
String[] data = taxaMapLine.split("\\t");
String taxaname = data[taxaIdx].trim();
int colonIndex = taxaname.indexOf(":");
if (colonIndex > 0) { // remove libraryId if it was appended
taxaname = taxaname.substring(0, colonIndex);
}
String gid = data[gidIdx].trim();
if (printTaxa)
System.out.println(" createNameGIDMap, adding taxa:" + taxaname + " with gid " + gid);
taxaMapArray.put(taxaname, gid);
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println("LCJ - size of taxaMapARray: " + taxaMapArray.entrySet().size());
return taxaMapArray;
}
@Override
public DataSet processData(DataSet input) {
// DB connection no longer needed: name-to-id resolution is left to the IFL scripts (code kept for reference)
try {
// Connection dbConnection = GOBIIDbUtils.connectToDB(dbConfigFile());
// if (dbConnection == null) {
// throw new IllegalStateException("PreProcessGOBIIMappingFilePlugin: Problem connecting to database.");
// }
// // get the dataset id;
// StringBuilder sb = new StringBuilder();
// sb.append("select mapset_id from mapset where name = '");
// sb.append(mapset());
// sb.append("';");
// ResultSet rs = dbConnection.createStatement().executeQuery(sb.toString());
// int mapsetId = -1;
// while (rs.next()) {
// mapsetId = rs.getInt("mapset_id");
// }
// if (mapsetId < 0) {
// System.out.println("Could not find mapsetId from mapset name " + mapset() + " please check name and try again !!");
// return null;
// }
// Read the taxa mapping file, create array of names/maps
System.out.println("LCJ - calling createNameGidMap for taxaMapFile");
HashMap<String, String> taxaNameGidMap = createNameGidMap(taxaMapFile(),"NAME");
if (taxaNameGidMap == null) {
//if (taxaNameGidMap != null) { // RETURN CODE ABOVE - this is just for debugging, printing names
System.out.println("ERROR - could not create taxaname-gid mapping");
return null;
}
//printTaxa = true;
System.out.println("\nLCJ - calling createNameGIDMap for donorMapFile");
HashMap<String, String> donorNameGidMap = createNameGidMap(donorMapFile(),"TAXACOLUMN");
if (donorNameGidMap == null) {
System.out.println("ERROR - could not create donor name-gid mapping");
return null;
}
System.out.println("HapBreakpoints:processData begin\n");
String hapbrkFile = outputDir() + setName() + ".hapbreakpoints";
String breakSetFile = outputDir() + setName() + ".breakpoint_set";
BufferedWriter writerhap = Utils.getBufferedWriter(hapbrkFile);
BufferedWriter writerbrkset = Utils.getBufferedWriter(breakSetFile);
BufferedWriter taxaNotFoundbw = Utils.getBufferedWriter("/Users/lcj34/notes_files/gobiiANDBms/gobii_loading/gobii_ifl_files/gobii_hapbreakpoints/taxaNotFound.txt");
BufferedWriter taxaFoundbw = Utils.getBufferedWriter("/Users/lcj34/notes_files/gobiiANDBms/gobii_loading/gobii_ifl_files/gobii_hapbreakpoints/taxaFound.txt");
// Create header lines for each intermediate file
// Create string builders for the 3 files - we'll append data, then write
StringBuilder hapSB = new StringBuilder();
StringBuilder brksetSB = new StringBuilder();
// write header lines: NOTE: breakpoint_set_name is for IFL to grab breakpoint_set_id value
// this needs to appear in a hap_breakpoint.nmap file
// also need to define the duplicate files
hapSB.append("taxa\tchr\tposition_range\tdonor1\tdonor2\tstatus\tbreakpoint_set_name\n");
writerhap.write(hapSB.toString());
hapSB.setLength(0);
// breakpoint_set table intermediate file. NOTE: mapset_name should be used by IFL to grab mapset_id
// this means we don't really need to connect to DB - let IFL do it.
// This also means we don't need the dbconfig parameter. Remove it once verified good without it.
brksetSB.append("name\tmethod\tmapset_name\tsource_dataset_name\tstatus\n"); // header line
brksetSB.append(setName()); // begin data line
brksetSB.append("\t");
brksetSB.append(method());
brksetSB.append("\t");
brksetSB.append(mapset());
brksetSB.append("\t");
brksetSB.append(src_dataset());
brksetSB.append("\t1\n"); // status and newline
writerbrkset.write(brksetSB.toString()); // file has header and just this one line
writerbrkset.close();
BufferedReader breakerbr = null;
String[] donorList = null;
int totalTaxaFound = 0;
int totalTaxaNOTFound = 0;
System.out.println("processData: number of infiles: " + infiles.size());
for (Path filepath: infiles) {
String curFile = filepath.toString();
System.out.println("LCJ - processing file: " + curFile);
breakerbr = Utils.getBufferedReader(curFile, 1 << 22);
String mline = breakerbr.readLine(); // initial line containing num donors and num blocks
int numDonors = Integer.parseInt(mline.substring(0, mline.indexOf("\t")).trim()); // get number of donors from file header line
int numBlocks = Integer.parseInt(mline.substring(mline.indexOf("\t")+1).trim());
// Create the list of donor GIDs. It should be the same list for all files,
// but to be safe the donor list is built from each file, as each file carries its own.
// Process: Read each successive line, up to the number of donor lines
// the file header said were there. Grab the value after the tab (the donor name)
// - search for this name in the donorHashMap. Store the donor GID
// into the donorList array. So the number in the "block" for the donor
// is the index into the array, and this gives you the GID.
System.out.println(" ... creatingDonorLists ..., number of donors is " + numDonors);
donorList = new String[numDonors];
boolean allFound = true;
int donorCount = 0;
while (donorCount < numDonors) {
mline = breakerbr.readLine();
if (mline.startsWith("#")) continue; // skip comment line
String[] donorInfo = mline.split("\\t");
String donorGid = donorNameGidMap.get(donorInfo[1].trim());
if (donorGid == null) {
System.out.println("LCJ - no donorGId found in donorMap for donor " + donorInfo[1]);
donorList[donorCount] = "49999999"; // dummy to get things working
allFound = false;
} else {
donorList[donorCount] = donorGid;
}
donorCount++;
}
if (!allFound) return null;
System.out.println("Processing the breakpoint file ...");
// Continue processing breakpoint file. We processed all the donors, now process each block line
while ( (mline=breakerbr.readLine() )!= null) {
hapSB.setLength(0); // reset line to 0
if (mline.startsWith("#")) continue; // skip comment lines
// Each line contains a taxa, followed by a "block"
// Blocks are defined as chr:startPos:endPos:donor1:donor2 (-1 means no hypothesis)
// taxa and blocks are tab separated
String[] tokens = mline.split("\t");
String gidForTaxa = "-1";
String taxa = tokens[0].trim();
if (taxaNameGidMap.containsKey(taxa)) {
gidForTaxa = taxaNameGidMap.get(taxa);
}
if (gidForTaxa.equals("-1")) {
totalTaxaNOTFound++;
taxaNotFoundbw.write(taxa);
taxaNotFoundbw.write("\n");
// System.out.println("ERROR: gidForTaxa map does not contain taxa " + taxa + " from block list: " + mline);
//return null; // LCJ - want to return this error - for now I want to see if we can process anything
gidForTaxa = "49999999"; // add a dummy for now - FIX THIS WITH REAL TAXA
//continue; keep going - I want to see what I can process
}
totalTaxaFound++;
taxaFoundbw.write(taxa);
taxaFoundbw.write("\n");
//System.out.println("SUCCESS: found taxa " + taxa );
// Process blocks within each line
// Starting blockIdx at 1 so we pass over the taxa in 0, which was already processed
for (int blockIdx = 1;blockIdx < tokens.length; blockIdx++) {
hapSB.append(gidForTaxa); // add taxa gid
hapSB.append("\t");
// process all the blocks in this line
String block = tokens[blockIdx];
String[] blockTokens = block.split(":");
hapSB.append(blockTokens[0]); // add chromosome
hapSB.append("\t");
String posRange = "[" + blockTokens[1] + "," + blockTokens[2] + ")\t";
hapSB.append(posRange); // includes the tab
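// half-open range [start,stop) is the canonical form postgres uses for int4range values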
int donor1 = Integer.parseInt(blockTokens[3]);
if (donor1 >= 0 && donor1 < donorList.length) {
hapSB.append(donorList[donor1]);
} else {
System.out.println("LCJ - donor1 in block is out of range: " + block);
hapSB.append("-1"); // -1 for not found
}
hapSB.append("\t");
int donor2 = Integer.parseInt(blockTokens[4]);
if (donor2 >= 0 && donor2 < donorList.length) {
hapSB.append(donorList[donor2]);
} else {
System.out.println("LCJ - donor2 in block is out of range: " + block);
hapSB.append("-1"); // will be obvious is isn't a GID, which start at 42000000
}
hapSB.append("\t1\t"); // adding tabs and default status code
hapSB.append(setName()); // from input parameter
hapSB.append("\n");
}
// write string to file
writerhap.write(hapSB.toString());
}
}
if (breakerbr != null) breakerbr.close();
writerhap.close(); // writerbrkset closed above
taxaFoundbw.close();
taxaNotFoundbw.close();
System.out.println("LCJ - end of plugin, total taxa found within blocks: " + totalTaxaFound
+ " total taxa NOT found: " + totalTaxaNOTFound);
} catch (Exception exc) {
System.out.println("processData: caught exception processing or writing files");
exc.printStackTrace();
}
return null;
}
@Override
public ImageIcon getIcon() {
// TODO Auto-generated method stub
return null;
}
@Override
public String getButtonName() {
// TODO Auto-generated method stub
return null;
}
@Override
public String getToolTipText() {
// TODO Auto-generated method stub
return null;
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
public static void main(String[] args) {
GeneratePluginCode.generate(HapBreakpoints_IFLFilePlugin.class);
}
/**
* Convenience method to run plugin with one return object.
*/
// TODO: Replace <Type> with specific type.
// public <Type> runPlugin(DataSet input) {
// return (<Type>) performFunction(input).getData(0).getData();
// }
// /**
// * DB connection config file
// *
// * @return dbConfigFile
// */
// public String dbConfigFile() {
// return dbConfigFile.value();
// }
//
// /**
// * Set dbConfigFile. DB connection config file
// *
// * @param value dbConfigFile
// *
// * @return this plugin
// */
// public HapBreakpoints_IFLFilePlugin dbConfigFile(String value) {
// dbConfigFile = new PluginParameter<>(dbConfigFile, value);
// return this;
// }
/**
* Full path to file containing breakpoint blocks for
* projection alignment
*
* @return Breakpoint File
*/
public String breakFile() {
return breakFile.value();
}
/**
* Set Breakpoint File. Full path to file containing breakpoint
* blocks for projection alignment
*
* @param value Breakpoint File
*
* @return this plugin
*/
public HapBreakpoints_IFLFilePlugin breakFile(String value) {
breakFile = new PluginParameter<>(breakFile, value);
return this;
}
/**
* Name to be given to this set of breakpoints. This
* name will be stored in the breakpointSet table.
*
* @return Breakpoint Set Name
*/
public String setName() {
return setName.value();
}
/**
* Set Breakpoint Set Name. Name to be given to this set
* of breakpoints. This name will be stored in the breakpointSet
* table.
*
* @param value Breakpoint Set Name
*
* @return this plugin
*/
public HapBreakpoints_IFLFilePlugin setName(String value) {
setName = new PluginParameter<>(setName, value);
return this;
}
/**
* Name of the mapset to which these breakpoints refer.
* Must match an existing name in the mapset table, e.g.
* AGPV3
*
* @return mapset
*/
public String mapset() {
return mapset.value();
}
/**
* Set mapset. Name of the mapset to which these breakpoints
* refer. Must match an existing name in the mapset table,
* e.g. AGPV3
*
* @param value mapset
*
* @return this plugin
*/
public HapBreakpoints_IFLFilePlugin mapset(String value) {
mapset = new PluginParameter<>(mapset, value);
return this;
}
/**
* Name of the dataset from which these breakpoints were created.
* Must match an existing name in the dataset table.
*
* @return dataset
*/
public String src_dataset() {
return src_dataset.value();
}
/**
* Set src_dataset. Name of the dataset from which these breakpoints
* were created. Must match an existing name in the dataset table.
*
* @param value src_dataset
*
* @return this plugin
*/
public HapBreakpoints_IFLFilePlugin src_dataset(String value) {
src_dataset = new PluginParameter<>(src_dataset, value);
return this;
}
/**
* Full path name of directory to which output files will
* be written, must end with a /
*
* @return Path of output directory
*/
public String outputDir() {
return outputDir.value();
}
/**
* Set Path of output directory. Full path name of directory
* to which output files will be written, must end with
* a /
*
* @param value Path of output directory
*
* @return this plugin
*/
public HapBreakpoints_IFLFilePlugin outputDir(String value) {
outputDir = new PluginParameter<>(outputDir, value);
return this;
}
/**
* Method used to create the breakpoints, e.g. FILLIN,
* beagle, etc
*
* @return Breakpoint Method
*/
public String method() {
return method.value();
}
/**
* Set Breakpoint Method. Method used to create the breakpoints,
* e.g. FILLIN, beagle, etc
*
* @param value Breakpoint Method
*
* @return this plugin
*/
public HapBreakpoints_IFLFilePlugin method(String value) {
method = new PluginParameter<>(method, value);
return this;
}
/**
* tab-delimited File containing a TaxaColumn name to
* be used to map breakFile donors with a GID.
* This may be the same file that was used for adding
* germplasm and marker data.
*
* @return mappingFile
*/
public String donorMapFile() {
return donorMapFile.value();
}
/**
* Set mappingFile. tab-delimited File containing a TaxaColumn
* name to be used to map breakFile donors with a GID.
*
* This may be the same file that was used for adding
* germplasm and marker data.
*
* @param value mappingFile
*
* @return this plugin
*/
public HapBreakpoints_IFLFilePlugin donorMapFile(String value) {
donorMapFile = new PluginParameter<>(donorMapFile, value);
return this;
}
/**
* tab-delimited File containing a TaxaColumn name to
* be used to map breakFile taxa with a GID.
* This may be a different file than was used for adding
* germplasm and marker data.
*
* @return mappingFile
*/
public String taxaMapFile() {
return taxaMapFile.value();
}
/**
* Set mappingFile. tab-delimited File containing a TaxaColumn
* name to be used to map breakFile taxa with a GID.
* This may be a different file than was used for adding
* germplasm and marker data.
*
* @param value mappingFile
*
* @return this plugin
*/
public HapBreakpoints_IFLFilePlugin taxaMapFile(String value) {
taxaMapFile = new PluginParameter<>(taxaMapFile, value);
return this;
}
}