Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
de.julielab.xmlData.dataBase.ThreadedColumnsToRetrieveIterator Maven / Gradle / Ivy
Go to download
A utility for managing documents stored in a PostgreSQL database. The documents are imported into a
PostgreSQL DB as full texts with the goal to be able to retrieve the documents by their PubMedID efficiently.
For more sophisticated tasks, a user configuration file can be delivered which can take control of the table
schema to use, the PostgreSQL schema to use and the actual database server to connect to as well as the concrete
database.
package de.julielab.xmlData.dataBase;
import de.julielab.xml.JulieXMLConstants;
import de.julielab.xml.JulieXMLTools;
import de.julielab.xmlData.config.FieldConfig;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Exchanger;
/**
* An iterator that returns documents stored in the database identified by the
* primary keys delivered in a list upon creation of the iterator. The returned
* data corresponds to the columns marked to be retrieved in the table schema also given at iterator creation.
* The iterator employs two threads to retrieve new documents from the database
* while other documents, fetched before, can be processed concurrently. The
* idea is that the database can work in parallel to the program working with
* the retrieved documents.
*
* @author hellrich
*/
public class ThreadedColumnsToRetrieveIterator extends DBCThreadedIterator {
private final static Logger LOG = LoggerFactory.getLogger(ThreadedColumnsToRetrieveIterator.class);
private DataBaseConnector dbc;
private static int arrayResToListThreadCounter;
private static int arrayFromDBThreadCounter;
// To query by PMID, uses two other threads
public ThreadedColumnsToRetrieveIterator(DataBaseConnector dbc, CoStoSysConnection conn, List ids, String table, String schemaName) {
this.dbc = dbc;
String[] tables = new String[1];
String[] schemaNames = new String[1];
tables[0] = table;
schemaNames[0] = schemaName;
backgroundThread = new ArrayResToListThread(conn, listExchanger, ids, tables, null, schemaNames);
update();
}
public ThreadedColumnsToRetrieveIterator(DataBaseConnector dbc, List ids, String table, String schemaName) {
this(dbc, null, ids, table, schemaName);
}
public ThreadedColumnsToRetrieveIterator(DataBaseConnector dbc, List ids, String table, String whereClause, String schemaName) {
this(dbc, null, ids, table, whereClause, schemaName);
}
// To query by PMID, uses two other threads
public ThreadedColumnsToRetrieveIterator(DataBaseConnector dbc, CoStoSysConnection conn, List ids, String table, String whereClause, String schemaName) {
this.dbc = dbc;
String[] tables = new String[1];
String[] schemaNames = new String[1];
tables[0] = table;
schemaNames[0] = schemaName;
backgroundThread = new ArrayResToListThread(conn, listExchanger, ids, tables, whereClause, schemaNames);
update();
}
/**
* Retrieves data from the database over multiple tables. All tables will be joined on the given IDs.
* The columns to be retrieved for each table is determined by its table schema. For this purpose, the
* tables
and schemaName
arrays are required to be parallel.
*
* @param dbc A DataBaseConnector instance
* @param conn An active database connection
* @param ids A list of primary keys identifying the items to retrieve.
* @param tables The tables from which the items should be retrieved that are identified by ids
.
* @param schemaNames A parallel array to tables
thas specifies the table schema name of each table.
* @return The joined data from the requested tables.
*/
public ThreadedColumnsToRetrieveIterator(DataBaseConnector dbc, CoStoSysConnection conn, List ids, String[] tables, String[] schemaNames) {
this.dbc = dbc;
backgroundThread = new ArrayResToListThread(conn, listExchanger, ids, tables, null, schemaNames);
update();
}
public ThreadedColumnsToRetrieveIterator(DataBaseConnector dbc, List ids, String[] tables, String[] schemaNames) {
this(dbc, null, ids, tables, schemaNames);
}
/*
* (non-Javadoc)
*
* @see de.julielab.xmlData.dataBase.DBCThreadedIterator#destroy()
*/
@Override
public void close() {
((ArrayResToListThread) backgroundThread).end();
}
@Override
public void join() throws InterruptedException {
((ArrayResToListThread) backgroundThread).join();
}
/**
* This class converts a ResultSet , retrieved from the database, into a
* list, that is then returned.
* This class is a Thread and serves as an intermediate layer between
* the program that uses the resulting list and another thread that is doing the
* actual database querying. This class has to {@link Exchanger}: One
* Exchanger for the result list that is returned to the caller.
* Another Exchanger is used to retrieve database results in the form
* of {@link ResultSet} instances from the thread querying the database. In
* between, this class converts the ResultSet to a List .
*
* @author hellrich
*/
private class ArrayResToListThread extends Thread implements ConnectionClosable {
private final Logger log = LoggerFactory.getLogger(ArrayResToListThread.class);
private final ArrayFromDBThread arrayFromDBThread;
private Exchanger> listExchanger;
private Exchanger resExchanger = new Exchanger();
private ResultSet currentRes;
private ArrayList currentList;
private String[] schemaName;
private boolean joined = false;
private volatile boolean end = false;
ArrayResToListThread(CoStoSysConnection conn, Exchanger> listExchanger, List keyList, String[] tables,
String whereClause, String[] schemaName) {
this.listExchanger = listExchanger;
this.schemaName = schemaName;
if (tables.length > 1) {
this.joined = true;
}
// start the thread that is actually querying the database
arrayFromDBThread = new ArrayFromDBThread(conn, resExchanger, keyList, tables, whereClause, schemaName);
try {
// retrieve the first result without yet running the thread;
// when we have the result, we begin to create the result list
// out of the first retrieved ResultSet and return the list,
// then get the next results and so on...
currentRes = resExchanger.exchange(null);
} catch (InterruptedException e) {
e.printStackTrace();
}
setName("ArrayRestoListThread-" + ++arrayResToListThreadCounter);
setDaemon(true);
start();
}
@SuppressWarnings("unchecked")
public void run() {
Pair>> numColumnsAndFields = dbc.getNumColumnsAndFields(joined, schemaName);
int numColumns = numColumnsAndFields.getLeft();
List> fields = numColumnsAndFields.getRight();
int i = 0;
byte[][] retrievedData = null;
try {
while (currentRes != null && !end) {
currentList = new ArrayList<>();
// convert the current database ResultSet into a list.
while (currentRes.next()) {
retrievedData = new byte[numColumns][];
for (i = 0; i < retrievedData.length; i++) {
retrievedData[i] = currentRes.getBytes(i + 1);
if (Boolean.parseBoolean(fields.get(i).get(JulieXMLConstants.GZIP))
&& retrievedData[i] != null)
retrievedData[i] = JulieXMLTools.unGzipData(retrievedData[i]);
}
currentList.add(retrievedData);
} // end ResultSet to List conversion
// Offer the created result list to the calling program;
// as soon as the result has been given away, we
// continue fetching more documents from the database
// below, allowing the calling program to process the
// current result and already fetching the next
if (!currentList.isEmpty())
listExchanger.exchange(currentList);
// Get the next ResultSet from the database
currentRes = resExchanger.exchange(null);
}
listExchanger.exchange(null); // stop signal
} catch (InterruptedException | SQLException | IOException e) {
log.error(
"Exception occured while reading " + "data from result set, index {}. "
+ "Corresponding field in schema definition is: {}. Read data was: \"{}\"",
new Object[]{i + 1, fields.get(i), new String(retrievedData[i])});
e.printStackTrace();
} catch (NullPointerException e) {
log.debug("NPE on: Index {}, field {}, data {}",
new Object[]{i, fields.get(i), retrievedData != null ? retrievedData[i] : null});
throw e;
}
log.debug("ArrayResToListThread has finished");
}
/**
* Must be called when the thread is no longer required. Otherwise, it will
* continue querying the database.
*/
public void end() {
arrayFromDBThread.end();
end = true;
}
@Override
public void closeConnection() {
arrayFromDBThread.closeConnection();
}
}
/**
* This class is last Thread in the
* Iterator - ResultSet to List converter - ResultSet from database retriever
* chain and thus is the class doing the database querying.
* Upon creation, this class starts itself as a demon Thread . It
* queries {@link DataBaseConnector#queryBatchSize} IDs and offers the
* ResultSet in an {@link Exchanger} for the intermediate
* Thread .
*
* @author hellrich
*/
private class ArrayFromDBThread extends Thread implements ConnectionClosable {
private final Logger log = LoggerFactory.getLogger(ArrayFromDBThread.class);
private final boolean externalConnectionGiven;
private Iterator keyIter;
private Exchanger resExchanger;
private StringBuilder queryBuilder;
private ResultSet currentRes;
private String selectFrom;
private CoStoSysConnection conn;
private String whereClause = null;
private FieldConfig fieldConfig;
private volatile boolean end;
private boolean joined = false;
private String dataTable;
private String dataSchema;
public ArrayFromDBThread(CoStoSysConnection conn, Exchanger resExchanger, List keyList, String[] table,
String whereClause, String[] schemaName) {
externalConnectionGiven = conn != null;
this.conn = conn;
this.resExchanger = resExchanger;
keyIter = keyList.iterator();
this.queryBuilder = new StringBuilder();
this.whereClause = whereClause;
this.dataTable = table[0];
this.dataSchema = schemaName[0];
if (table.length > 1 && schemaName.length > 1) {
this.joined = true;
}
buildSelectFrom(table, schemaName);
setName("ArrayFromDBThread-"+ ++arrayFromDBThreadCounter);
setDaemon(true);
start();
}
/**
* Create the basic SQL query structure used to query documents from the
* database.
*
* @param table
* @param schemaName
*/
private void buildSelectFrom(String[] table, String[] schemaName) {
// Build SELECT if there is only one table.
if (!joined) {
fieldConfig = dbc.getFieldConfiguration(dataSchema);
selectFrom = "SELECT " + StringUtils.join(fieldConfig.getColumnsToRetrieve(), ",") + " FROM "
+ dataTable + " WHERE ";
// Build SELECT if multiple tables will be joined.
// This will be in the form
// 'SELECT dataTable.pmid, otherTable1.data, otherTable2.data
// FROM dataTable
// LEFT JOIN otherTable1 ON dataTable.pmid=otherTable1.pmid
// LEFT JOIN otherTable2 ON dataTable.pmid=othertable2.pmid
// WHERE (dataTable.pmid=1) OR (dataTable.pmid=2) OR ...'
} else {
String[] primaryKey = null;
ArrayList select = new ArrayList();
ArrayList leftJoin = new ArrayList();
for (int i = 0; i < table.length; i++) {
fieldConfig = dbc.getFieldConfiguration(schemaName[i]);
String[] columnsToRetrieve = fieldConfig.getColumnsToRetrieve();
for (int j = 0; j < columnsToRetrieve.length; j++) {
String column = table[i] + "." + columnsToRetrieve[j];
select.add(column);
}
if (i == 0) {
// Get the names of the primary keys once, since they
// should be identical for all tables.
primaryKey = fieldConfig.getPrimaryKey();
} else {
String primaryKeyMatch = "";
for (int j = 0; j < primaryKey.length; j++) {
primaryKeyMatch = table[0] + "." + primaryKey[j] + "=" + table[i] + "." + primaryKey[j];
if (!(j == primaryKey.length - 1))
primaryKeyMatch = primaryKeyMatch + " AND ";
}
String join = "LEFT JOIN " + table[i] + " ON " + primaryKeyMatch;
leftJoin.add(join);
}
}
selectFrom = "SELECT " + StringUtils.join(select, ",") + " FROM " + table[0] + " "
+ StringUtils.join(leftJoin, " ") + " WHERE ";
log.trace("Querying data via SQL: {}", selectFrom);
}
}
/**
* Fetches results as long as there are unprocessed documents in the given
* subset table.
*/
public void run() {
if (!externalConnectionGiven)
this.conn = dbc.obtainOrReserveConnection();
else
this.conn.incrementUsageNumber();
try {
while (keyIter.hasNext() && !end) {
currentRes = getFromDB();
resExchanger.exchange(currentRes);
}
resExchanger.exchange(null); // Indicates end
} catch (InterruptedException e) {
e.printStackTrace();
} finally {
closeConnection();
}
log.debug("ArrayFromDBThread has finished");
}
/**
* Builds the final SQL query specifying the exact primary keys for retrieval,
* performs the actual query and returns the respective ResultSet
*
* @return
*/
private ResultSet getFromDB() {
ResultSet res = null;
String sql = null;
try {
queryBuilder.delete(0, queryBuilder.capacity());
Statement stmt = conn.createStatement();
List pkConjunction = new ArrayList();
for (int i = 0; keyIter.hasNext() && i < dbc.getQueryBatchSize(); ++i) {
// get the next row of primary key values, e.g.
//
// pmid | systemID
// --------------------
// 1564 | FSU <--- this is stored in "keys"
Object[] keys = keyIter.next();
String[] nameValuePairs;
nameValuePairs = new String[keys.length];
// build an array of pairs like
// ["pmid = 1563", "systemID = FSU"]
if (!joined) {
for (int j = 0; j < keys.length; ++j) {
String fieldName = fieldConfig.getPrimaryKey()[j];
nameValuePairs[j] = String.format("%s = '%s'", fieldName, keys[j]);
}
} else {
for (int j = 0; j < keys.length; ++j) {
String fieldName = fieldConfig.getPrimaryKey()[j];
nameValuePairs[j] = String.format("%s = '%s'", dataTable + "." + fieldName, keys[j]);
}
}
// make a conjunction of the name value pairs:
// "(pmid = 1563 AND systemID = FSU)"
pkConjunction.add("(" + StringUtils.join(nameValuePairs, " AND ") + ")");
}
queryBuilder.append(selectFrom);
queryBuilder.append("(");
queryBuilder.append(StringUtils.join(pkConjunction, " OR "));
queryBuilder.append(")");
if (whereClause != null)
queryBuilder.append(" AND " + whereClause);
sql = queryBuilder.toString();
LOG.trace("Fetching data with command \"{}\"", sql);
res = stmt.executeQuery(sql);
} catch (SQLException e) {
e.printStackTrace();
LOG.error("SQL: " + sql);
}
return res;
}
public void end() {
end = true;
closeConnection();
}
@Override
public void closeConnection() {
conn.close();
}
}
}