net.maizegenetics.analysis.gobii.PreProcessGOBIIMappingFilePlugin Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tassel Show documentation
Show all versions of tassel Show documentation
TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage
disequilibrium.
The newest version!
/**
*
*/
package net.maizegenetics.analysis.gobii;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import javax.swing.ImageIcon;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Utils;
/**
* This plugin should be run prior to creating the intermediate files for marker and
* dnarun.
*
* This plugin has 3 purposes. Using the mapping file created for the dataset:
*
* 1. Identify duplicate/missing germplasm/dnasample entries, create intermediate
* files for the germplasm and dnasample tables, load any missing entries. Duplicates are skipped.
* 2. Identify duplicate libraryPrepIds and write the list of duplicate libraryPrepIds
* to a file.
* 3. Provide mapping data to load new marker/dnarun related tables. Create intermediate
* files, load via GOBII IFL scripts
*
* For the first 2 purposes, the database must be queried. Missing
* entries are defined as below:
* germplasm table: From the db,Get list of distinct MGIDs (they should all be distinct). use this
* list to compare to MGIDs in the file. For any MGIDs that don't appear, create a
* line in the *.germplasm intermediate file used to add values.
* dnasample table: From the db, get a list of dnasample names. These names are a string comprised
* of these components: GID:plate:well. From the input file, for each entry, create
* a concatenated string of GID:plate:well. Compare to the list from the db. For any names
* that don't appear, create a line in the *.dnasample intermediate file for loading.
* This file needs the "name" field to be a concatenation of GID:plate:well as this will be
* unique and GOBII dnasample.dupmap looks at only the name field. Code can be MGID if
* we need that stored (which I think we do). It takes "external code" column instead of
* germplasm_id as that maps to the external_code field in the germplasm table when GOBII IFL
* looks to find the germplasm_id from DB.
* This file also needs project_name, which comes from the mapping file.
* dnarun table: From the db, get a list of all dnarun.name fields. These should be
* distinct library prep ids. Compare to libraryPrepIds from the mapping file. If there
* are duplicates, write them to a file to show the biologist.
*
* NOTES: GOBII uses dnasample.name and dnasample.num to determine duplicates
* BL is not populating dnasample.num. "num" has been removed from the dnasample.dupmap
* file when running this. For some reason, with it present, but all values "null", the
* script believed the values were different and I ended up duplicating all dnasamples when
* sending the file through the GOBII scripts. When I removed this line, the scripts only
* checked the "name" field and project id and it worked.
*
* For step 3: The intermediate files are created by the MarkerDNARunMGID_fromHMPIFIFIlePLugin.java.
* Note the dnasample and germplasm entries must be loaded to the db before loading the marker/
* dnarun intermediate files or the necessary db ids will not be found.
*
* @author lcj34
*
*/
public class PreProcessGOBIIMappingFilePlugin extends AbstractPlugin {

    private static final Logger myLogger = LogManager.getLogger(PreProcessGOBIIMappingFilePlugin.class);

    /** Header line written at the top of the germplasm intermediate file. */
    private static final String GERMPLASM_HEADER =
            "name\texternal_code\tspecies_name\ttype_name\tcreated_by\tcreated_date\tmodified_by\tmodified_date\tstatus\tcode\n";

    /**
     * Header line written at the top of the dnasample intermediate file.
     * All columns must be present; in order to ignore "num" as a duplicate column,
     * the dnasample.dupmap file was changed: the "num" entry was removed.
     */
    private static final String DNASAMPLE_HEADER =
            "name\tcode\tplatename\tnum\twell_row\twell_col\tproject_name\texternal_code\tcreated_by\tcreated_date\tmodified_by\tmodified_date\tstatus\n";

    private PluginParameter<String> dbConfigFile = new PluginParameter.Builder<>("dbConfigFile", null, String.class).guiName("dbConfigFile").required(true)
            .description("DB connection config file").build();
    private PluginParameter<String> datasetName = new PluginParameter.Builder<>("datasetName", null, String.class).guiName("dataset name").required(true)
            .description("Name of existing database dataset. Will be used to pull dataset_id from the db. This id is incorporated into the output file names.").build();
    private PluginParameter<String> mappingFile = new PluginParameter.Builder<>("mappingFile", null, String.class).guiName("mappingFile").required(true)
            .description("tab-delimited File containing columns: taxaColumn, name, source,MGID, GID,libraryID, plate_code, well, species, type, project").build();
    private PluginParameter<String> outputDir = new PluginParameter.Builder<>("outputDir", null, String.class).guiName("Path of output directory").required(true)
            .description("Full path name of output directory, must end with a /").build();

    /**
     * Creates the plugin for use from a GUI.
     *
     * @param parentFrame parent frame, passed through to {@code AbstractPlugin}
     * @param isInteractive interactive flag, passed through to {@code AbstractPlugin}
     */
    public PreProcessGOBIIMappingFilePlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }

    /** Creates a non-interactive plugin with no parent frame. */
    public PreProcessGOBIIMappingFilePlugin() {
        super(null, false);
    }

    /**
     * Creates the germplasm and dnasample IFL intermediate files for any data in the
     * mapping file that does not already exist in the germplasm or dnasample table, and
     * writes a file listing the mapping-file rows whose library prep id already exists
     * as a dnarun "name".
     *
     * The user is responsible for checking the duplicate list and taking appropriate
     * action: library prep ids should be unique and should only occur once in the db.
     * Open question carried over from the original author: is there ever a reason to run
     * the same sample/libraryPrepId multiple times for different analyses?
     *
     * Any failure is reported on the console and logged; it is not rethrown.
     *
     * @param input not used
     * @return always {@code null}; the results are the files written to the output directory
     */
    @Override
    public DataSet processData(DataSet input) {
        // The connection is closed when this block exits. NOTE(review): this assumes
        // GOBIIDbUtils.connectToDB hands back a connection owned by the caller - confirm.
        try (Connection dbConnection = GOBIIDbUtils.connectToDB(dbConfigFile())) {
            if (dbConnection == null) {
                throw new IllegalStateException("PreProcessGOBIIMappingFilePlugin: Problem connecting to database.");
            }

            int datasetId = findDatasetId(dbConnection, datasetName());
            if (datasetId < 0) {
                System.out.println("Could not find datasetId from datasetName " + datasetName() + " please check name and try again !!");
                return null;
            }
            System.out.println("LCJ - processing dataset number " + datasetId);

            String germplasmFile = outputDir() + "DS_" + datasetId + ".germplasm";
            String dnasampleFile = outputDir() + "DS_" + datasetId + ".dnasample";
            String dupLibIDFile = outputDir() + "DS_" + datasetId + ".dup_libraryPrepIds";

            // Pull everything the mapping file is compared against BEFORE touching any output
            // file, so a failed query does not leave truncated output behind.
            // germplasm: nothing is stored in name; GID is stored in external_code.
            // NOTE: if the germplasm table is empty (e.g. before the first dataset is loaded)
            // then every GID from the mapping file is added.
            Set<String> existingGermplasm = queryColumn(dbConnection, "select external_code from germplasm;", "external_code");
            // dnasample: name is currently GID:plate:well (or the SampleName column when given).
            Set<String> existingDNASample = queryColumn(dbConnection, "select name from dnasample;", "name");
            // dnarun: name holds the library prep id.
            Set<String> existingLib = queryColumn(dbConnection, "select name from dnarun;", "name");

            int dnaNotAdded = 0;
            int germplasmNotAdded = 0;
            int totalLines = 0;

            try (BufferedWriter writergp = Utils.getBufferedWriter(germplasmFile);
                 BufferedWriter writerdna = Utils.getBufferedWriter(dnasampleFile);
                 BufferedWriter writerlib = Utils.getBufferedWriter(dupLibIDFile);
                 BufferedReader mappingBR = Utils.getBufferedReader(mappingFile())) {

                writergp.write(GERMPLASM_HEADER);
                writerdna.write(DNASAMPLE_HEADER);

                String mline = mappingBR.readLine(); // header line
                if (mline == null) {
                    System.out.println("Mappingfile is missing header line !!!");
                    return null;
                }
                // readLine() strips the line terminator, so it has to be put back explicitly.
                writerlib.write(mline);
                writerlib.write("\n");
                System.out.println("\nPreprocessGObii: getting header columns from mline: " + mline);

                // Header names are matched case-insensitively and locale-independently.
                String[] headers = mline.split("\\t", -1);
                int taxaIdx = headerIndex(headers, "TaxaColumn");
                int nameIdx = headerIndex(headers, "name");
                int sourceIdx = headerIndex(headers, "source");
                int mgidIdx = headerIndex(headers, "MGID");
                int gidIdx = headerIndex(headers, "GID");
                int libIdx = headerIndex(headers, "libraryID");
                int plateIdx = headerIndex(headers, "plate_code");
                int wellIdx = headerIndex(headers, "well");
                int speciesIdx = headerIndex(headers, "species");
                int typeIdx = headerIndex(headers, "type");
                int projectIdx = headerIndex(headers, "project");
                int sampleNameIdx = headerIndex(headers, "SampleName");

                if (taxaIdx == -1) {
                    // No taxa column at all: the first line is not a header.
                    System.out.println("Mappingfile is missing header line !!!");
                    return null;
                }
                if (nameIdx == -1 || sourceIdx == -1 || mgidIdx == -1 || gidIdx == -1 || libIdx == -1
                        || plateIdx == -1 || wellIdx == -1 || speciesIdx == -1 || typeIdx == -1
                        || projectIdx == -1 || sampleNameIdx == -1) {
                    System.out.println("\nMappingfile is missing required header line.  Expecting columns: TaxaColumn, name, source, MGID, GID, libraryID, plate_code, well, species, type, project, SampleName\n");
                    return null;
                }

                System.out.println("PreprocessGobii: processing mapping file: " + mappingFile());

                // GIDs already written to the germplasm file during this run - don't want dups!
                Set<String> addedGermplasm = new HashSet<>();

                // Single pass over the mapping file: check germplasm, dnasample and library
                // prep id for every row.
                while ((mline = mappingBR.readLine()) != null) {
                    if (mline.trim().isEmpty()) {
                        continue; // e.g. trailing blank line left by a spreadsheet export
                    }
                    // limit -1 keeps trailing empty columns so column positions stay stable
                    String[] data = mline.split("\\t", -1);
                    totalLines++;

                    String name = field(data, nameIdx);
                    String gid = field(data, gidIdx);
                    String plate = field(data, plateIdx);
                    String well = field(data, wellIdx);
                    // Only rows with an assigned GID belong in the intermediate files.
                    boolean hasGid = !gid.isEmpty();

                    // germplasm: name (first column) is left blank, external_code (second) is the GID
                    if (hasGid && !existingGermplasm.contains(gid) && addedGermplasm.add(gid)) {
                        writergp.write("\t" + gid + "\t" + field(data, speciesIdx) + "\t" + field(data, typeIdx) + "\t\t\t\t\t1\t0\n");
                    } else {
                        System.out.println("LCJ - not adding " + name + " to germplasm file");
                        germplasmNotAdded++;
                    }

                    // Prefer the SampleName column; when it is blank fall back to GID:plate:well.
                    String sampleName = field(data, sampleNameIdx);
                    if (sampleName.isEmpty()) {
                        sampleName = gid + ":" + plate + ":" + well;
                    }

                    if (hasGid && !existingDNASample.contains(sampleName)) {
                        // Column order is given by DNASAMPLE_HEADER; "dummycode" is stored for code.
                        // The well field may be blank; otherwise the first character is the row
                        // and the remainder is the column.
                        String wellRow = well;
                        String wellCol = "";
                        if (!well.isEmpty()) {
                            wellRow = well.substring(0, 1);
                            wellCol = well.substring(1);
                        }
                        writerdna.write(sampleName + "\tdummycode\t" + plate + "\t\t" + wellRow + "\t" + wellCol + "\t"
                                + field(data, projectIdx) + "\t" + gid + "\t" + "6\t\t\t\t1\n");
                    } else {
                        System.out.println("LCJ - not adding " + name + " to dnasample file");
                        dnaNotAdded++;
                    }

                    if (existingLib.contains(field(data, libIdx))) {
                        // Found a duplicate library prep ID - record the whole mapping-file row
                        writerlib.write(mline);
                        writerlib.write("\n");
                    }
                }
            }

            System.out.println("\nFiles written to " + germplasmFile + " and " + dnasampleFile);
            System.out.println("Total mapping file lines: " + totalLines + " Not added to germplasm:"
                    + germplasmNotAdded + ", not added to dnasample:" + dnaNotAdded + "\n");
        } catch (Exception exc) {
            // Best effort: report the problem and fall through to return null.
            System.out.println("PreProcessGOBIIMappingFilePlugin: caught exception processing or writing files");
            myLogger.error("PreProcessGOBIIMappingFilePlugin: caught exception processing or writing files", exc);
            exc.printStackTrace();
        }
        return null;
    }

    /**
     * Looks up the dataset_id of the dataset with the given name.
     *
     * @param dbConnection open database connection
     * @param name dataset name; bound as a statement parameter, never spliced into the SQL text
     * @return the dataset_id of the last matching row, or -1 when no row matches
     * @throws SQLException if the query fails
     */
    private static int findDatasetId(Connection dbConnection, String name) throws SQLException {
        int datasetId = -1;
        try (PreparedStatement statement = dbConnection.prepareStatement("select dataset_id from dataset where name = ?")) {
            statement.setString(1, name);
            try (ResultSet rs = statement.executeQuery()) {
                while (rs.next()) {
                    datasetId = rs.getInt("dataset_id");
                }
            }
        }
        return datasetId;
    }

    /**
     * Runs a fixed query and collects one string column of its result into a set.
     *
     * @param dbConnection open database connection
     * @param query SQL text to execute; must be a constant, not user input
     * @param column name of the result column to collect
     * @return the distinct values of that column
     * @throws SQLException if the query fails
     */
    private static Set<String> queryColumn(Connection dbConnection, String query, String column) throws SQLException {
        myLogger.info("processData: query statement: " + query);
        System.out.println("PreProcessGOBIIMappingFilePlugin: execute query: " + query);
        Set<String> values = new HashSet<>();
        try (Statement statement = dbConnection.createStatement();
             ResultSet rs = statement.executeQuery(query)) {
            while (rs.next()) {
                values.add(rs.getString(column));
            }
        }
        return values;
    }

    /**
     * Finds the position of a column in the header line, ignoring case and surrounding whitespace.
     *
     * @param headers header fields of the mapping file
     * @param name column name to look for
     * @return index of the last header equal to {@code name}, or -1 when there is none
     */
    private static int headerIndex(String[] headers, String name) {
        int found = -1;
        for (int i = 0; i < headers.length; i++) {
            if (headers[i].trim().equalsIgnoreCase(name)) {
                found = i;
            }
        }
        return found;
    }

    /**
     * Returns the trimmed value of one field of a mapping-file row.
     *
     * @param data fields of the row
     * @param idx field position
     * @return the trimmed field, or "" when the row has no field at that position
     */
    private static String field(String[] data, int idx) {
        return (idx >= 0 && idx < data.length) ? data[idx].trim() : "";
    }

    /**
     * @return always {@code null}; this plugin supplies no icon
     */
    @Override
    public ImageIcon getIcon() {
        return null;
    }

    /**
     * @return always {@code null}; this plugin supplies no button name
     */
    @Override
    public String getButtonName() {
        return null;
    }

    /**
     * @return always {@code null}; this plugin supplies no tool tip text
     */
    @Override
    public String getToolTipText() {
        return null;
    }

    // The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
    // public static void main(String[] args) {
    //     GeneratePluginCode.generate(PreProcessGOBIIMappingFilePlugin.class);
    // }

    /**
     * DB connection config file
     *
     * @return dbConfigFile
     */
    public String dbConfigFile() {
        return dbConfigFile.value();
    }

    /**
     * Set dbConfigFile. DB connection config file
     *
     * @param value dbConfigFile
     *
     * @return this plugin
     */
    public PreProcessGOBIIMappingFilePlugin dbConfigFile(String value) {
        dbConfigFile = new PluginParameter<>(dbConfigFile, value);
        return this;
    }

    /**
     * Name of existing database dataset. Used to pull the
     * dataset_id from the db; this id is incorporated into
     * the output file names.
     *
     * @return dataset name
     */
    public String datasetName() {
        return datasetName.value();
    }

    /**
     * Set dataset name. Name of existing database dataset.
     * Used to pull the dataset_id from the db; this id is
     * incorporated into the output file names.
     *
     * @param value dataset name
     *
     * @return this plugin
     */
    public PreProcessGOBIIMappingFilePlugin datasetName(String value) {
        datasetName = new PluginParameter<>(datasetName, value);
        return this;
    }

    /**
     * tab-delimited File whose header line must contain the
     * columns: TaxaColumn, name, source, MGID, GID, libraryID,
     * plate_code, well, species, type, project, SampleName
     *
     * @return mappingFile
     */
    public String mappingFile() {
        return mappingFile.value();
    }

    /**
     * Set mappingFile. tab-delimited File whose header line
     * must contain the columns: TaxaColumn, name, source, MGID,
     * GID, libraryID, plate_code, well, species, type, project,
     * SampleName
     *
     * @param value mappingFile
     *
     * @return this plugin
     */
    public PreProcessGOBIIMappingFilePlugin mappingFile(String value) {
        mappingFile = new PluginParameter<>(mappingFile, value);
        return this;
    }

    /**
     * Full path name of output directory, must end with a
     * / because output file names are appended to it directly
     *
     * @return Path of output directory
     */
    public String outputDir() {
        return outputDir.value();
    }

    /**
     * Set Path of output directory. Full path name of output
     * directory, must end with a /
     *
     * @param value Path of output directory
     *
     * @return this plugin
     */
    public PreProcessGOBIIMappingFilePlugin outputDir(String value) {
        outputDir = new PluginParameter<>(outputDir, value);
        return this;
    }
}