// net.maizegenetics.analysis.gobii.UpdateMarkerAndDNA_idxes Maven / Gradle / Ivy
/**
*
*/
package net.maizegenetics.analysis.gobii;
import java.io.BufferedWriter;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import net.maizegenetics.util.Utils;
/**
* Once I have the datasets fixed, this class should not be needed.
*
* What it does: Initially the marker_idx and dnarun_idx columns of the dataset_marker
* and dataset_dnarun tables respectively were not populated. They are now needed and
* are populated. Kevin Palis created a couple scripts to handle populating these fields
* in tables when they were missing. These scripts live with the gobii_ifl_scripts
* on CBSU, and are called update_marker_idx.py and update_dnarun_idx.py. For GOBII,
* they are in the repository at the same level as the gobii_ifl.py scripts.
*
* The file below creates an intermediate file that will be worked on by the preprocess_ifile.py
* script. You can also run the gobii_ifl.py script instead if you uncomment the "return" statement
* that occurs after the preprocess_ifile.py script has been called.
*
* Here is the order:
* 1. Run this class to create the needed files (DS_X.mh5i and DS_X.sh5i)
* 2. sftp these files to cbsudc01.tc.cornell.edu into the /workdir/lcj34/postgresFiles/update_idxes_files dir
* 3. Run the file through gobii_ifl.scripts (change the script to return after the preprocess_ifl.py step !!)
* python gobii_ifl.py -c postgresql://lcj34:@localhost:5432/gobii_maize2 -i /workdir/lcj34/postgresFiles/update_idxes_files/DS_5.sh5i -o /tmp/ -v
* 4. Run the /tmp/ppd_* file created in step 3 through the update_dnarun_idx.py or update_marker_idx.py script
* python update_dnarun_idx.py "postgresql://lcj34:@cbsudc01.tc.cornell.edu/gobii_maize2" /tmp/ppd_DS_5.sh5i 5
* 5. Verify the db has values for dataset_marker.marker_idx and dataset_dnarun.dnarun_idx for
* the specified dataset_id.
* 6. Change the gobii_ifl.py script to re-comment the "return" after the preprocess_ifl call
*
* @author lcj34
*
*/
public class UpdateMarkerAndDNA_idxes {
    private static final Logger myLogger = LogManager.getLogger(UpdateMarkerAndDNA_idxes.class);

    // SQL templates. The dataset id is bound as a parameter: the original code
    // concatenated it into the query string (an int cannot inject, but prepared
    // statements are the JDBC idiom and keep the statement plan reusable).
    private static final String DNARUN_QUERY =
            "select name from dnarun,dataset_dnarun"
            + " where dataset_dnarun.dnarun_id=dnarun.dnarun_id and dataset_dnarun.dataset_id=?"
            + " order by dnarun.dnarun_id";
    private static final String MARKER_QUERY =
            "select name from marker, dataset_marker"
            + " where marker.marker_id=dataset_marker.marker_id and dataset_marker.dataset_id=?"
            + " order by marker.marker_id";

    /**
     * Queries the GOBII database for the dnarun and marker names belonging to the
     * given dataset and writes the two intermediate files (DS_&lt;id&gt;.sh5i and
     * DS_&lt;id&gt;.mh5i) that are later consumed by the gobii_ifl / preprocess_ifile.py
     * scripts (see class Javadoc for the full workflow).
     *
     * @param configFile   path to the DB configuration file understood by GOBIIDbUtils
     * @param outputDir    directory (including trailing separator) to write the files into
     * @param datasetID    dataset whose dnaruns and markers are exported
     * @param platformID   platform id written to every data line of the marker (.mh5i) file
     * @param experimentID experiment id written to every data line of the dnarun (.sh5i) file
     */
    public static void createIdxValues(String configFile, String outputDir, int datasetID, int platformID, int experimentID) {
        String dnarunFile = outputDir + "DS_" + datasetID + ".sh5i";
        String markerFile = outputDir + "DS_" + datasetID + ".mh5i";
        // try-with-resources closes the connection and both writers on every path;
        // the original leaked all of them when an exception was thrown mid-run.
        try (Connection dbConnection = GOBIIDbUtils.connectToDB(configFile);
             BufferedWriter writerRunID = Utils.getBufferedWriter(dnarunFile);
             BufferedWriter writerMarkerID = Utils.getBufferedWriter(markerFile)) {
            if (dbConnection == null) {
                throw new IllegalStateException("UpdateMarkerAndDNA_idxes: Problem connecting to database.");
            }
            // Required for cursor-based result streaming (setFetchSize) on Postgres;
            // with autocommit on, the driver materializes the whole result set.
            dbConnection.setAutoCommit(false);

            long time = System.nanoTime();
            exportNames(dbConnection, writerRunID, DNARUN_QUERY, datasetID,
                    "dnarun_name\td_name\texperiment_id", experimentID);
            System.out.printf("TotalTime for dnarun_name query %g sec%n", (double) (System.nanoTime() - time) / 1e9);

            time = System.nanoTime();
            exportNames(dbConnection, writerMarkerID, MARKER_QUERY, datasetID,
                    "marker_name\tm_name\tplatform_id", platformID);
            System.out.printf("TotalTime for marker_name query: %g sec%n", (double) (System.nanoTime() - time) / 1e9);

            // Success message only when both files were actually written; the
            // original printed it even after a caught exception.
            System.out.println("\nFiles written to " + dnarunFile + " and " + markerFile);
        } catch (Exception exc) {
            System.out.println("UpdateMarkerAndDNA_idxes: caught exception processing writing files");
            myLogger.error("createIdxValues failed for dataset " + datasetID, exc);
        }
    }

    /**
     * Runs one name-export query and writes a 3-column tab-delimited file:
     * header line, then for each row "name\tname\ttrailingID".
     *
     * @param conn       open DB connection (autocommit already disabled by the caller)
     * @param writer     destination for the intermediate file
     * @param query      one of the *_QUERY templates, with a single '?' for the dataset id
     * @param datasetID  value bound to the query's dataset_id parameter
     * @param header     tab-delimited header line (written without trailing newline here)
     * @param trailingID experiment or platform id written as the third column of every row
     * @throws SQLException on any database error
     * @throws IOException  on any file-write error
     */
    private static void exportNames(Connection conn, BufferedWriter writer, String query,
            int datasetID, String header, int trailingID) throws SQLException, IOException {
        myLogger.info("processData: query statement: " + query);
        System.out.println("UpdateMarkerAndDNA_idxes: execute query: " + query);
        try (PreparedStatement st = conn.prepareStatement(query)) {
            st.setInt(1, datasetID); // assumes dataset_id is an integer column - TODO confirm against schema
            st.setFetchSize(100000); // stream results in batches rather than loading all rows at once
            try (ResultSet rs = st.executeQuery()) {
                writer.write(header);
                writer.write("\n");
                while (rs.next()) {
                    String name = rs.getString("name");
                    // The name is written twice: the first column is converted by the
                    // IFL preprocessing scripts, the second is kept verbatim.
                    writer.write(name);
                    writer.write("\t");
                    writer.write(name);
                    writer.write("\t");
                    writer.write(Integer.toString(trailingID));
                    writer.write("\n");
                }
            }
        }
    }

    /**
     * Hard-coded driver for a one-off run; edit the paths/ids below as needed.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String configFile = "/Users/lcj34/notes_files/gobiiANDBms/gobii_loading/dbConfigFile_maize2.txt";
        //String datasetName = "ZeaGBSv27impV5_20160209_AGPv2_282";
        String outputDir = "/Users/lcj34/notes_files/gobiiANDBms/gobii_loading/update_idxes/";
        int datasetID = 5;
        int platformID = 3; // needed for marker file
        int experimentID = 4; // needed for dnarun file
        createIdxValues(configFile,outputDir,datasetID,platformID, experimentID); // first do the marker file
    }
}