/**
* A utility for managing documents stored in a PostgreSQL database. The documents are imported into a
* PostgreSQL DB as full texts so that they can be retrieved efficiently by their PubMedID.
* For more sophisticated tasks, a user configuration file can be supplied which controls the table
* schema to use, the PostgreSQL schema to use and the actual database server to connect to as well as
* the concrete database.
*
* Copyright (c) 2010, JULIE Lab.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Common Public License v1.0
*
* Author: hellrich
*
* Current version: 1.0
* Since version: 1.0
*
* Creation date: 19.11.2010
*/
package de.julielab.xmlData.dataBase;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import com.zaxxer.hikari.HikariPoolMXBean;
import de.julielab.hiddenConfig.HiddenConfig;
import de.julielab.xml.JulieXMLConstants;
import de.julielab.xml.JulieXMLTools;
import de.julielab.xmlData.Constants;
import de.julielab.xmlData.cli.TableNotFoundException;
import de.julielab.xmlData.config.ConfigReader;
import de.julielab.xmlData.config.DBConfig;
import de.julielab.xmlData.config.FieldConfig;
import de.julielab.xmlData.config.FieldConfigurationManager;
import de.julielab.xmlData.dataBase.util.TableSchemaMismatchException;
import de.julielab.xmlData.dataBase.util.UnobtainableConnectionException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.management.JMX;
import javax.management.MBeanServer;
import javax.management.MalformedObjectNameException;
import javax.management.ObjectName;
import javax.sql.DataSource;
import java.io.*;
import java.lang.management.ManagementFactory;
import java.sql.*;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
* This class creates a connection with a database and allows for convenient
* queries and commands.
* Database layout and returned columns are specified by a configuration file.
* The class was developed for a PostgreSQL back-end; using another database
* server may require modifications.
* Queries use up to three threads for higher performance, and a connection pool
* is used to improve performance when multiple instances are deployed simultaneously.
*
* See <a href="https://github.com/brettwooldridge/HikariCP">HikariCP</a> for more
* information about the connection pooling.
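*
* A minimal usage sketch (configuration file and table names are examples, not part of this API):
* <pre>{@code
* DataBaseConnector dbc = new DataBaseConnector("costosys.xml");
* List<Object[]> ids = dbc.retrieveAndMark("myschema._subset", "MyReader", "localhost", "1234");
* dbc.markAsProcessed("myschema._subset", ids);
* }</pre>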
*
* @author hellrich, faessler
*/
public class DataBaseConnector {
public static final String DEFAULT_PIPELINE_STATE = "";
/**
* Used as a hack for the not-yet-published EMNLP-Paper. In the meantime, a more
* sophisticated system has been implemented (EF, 18.01.2012)
*/
@Deprecated
public static final int META_IN_ARRAY = 2;
/**
* This is the definition of subset tables except the primary key.
*/
public static final LinkedHashMap<String, String> subsetColumns;
/**
* Size of the batches used for data retrieval from the database, value is
* optimized for xml-clobs in postgres on 2010 hardware.
*/
private static final int DEFAULT_QUERY_BATCH_SIZE = 1000;
/**
* Size of the byte buffer used for reading xml into vtd (xml parser)
*/
private final static int BUFFER_SIZE = 1000;
private static final String DEFAULT_FIELD = "xml";
private static final String DEFAULT_TABLE = Constants.DEFAULT_DATA_TABLE_NAME;
private static final int commitBatchSize = 10000;
private static final int RETRIEVE_MARK_LIMIT = 1000;
private static final int ID_SUBLIST_SIZE = 1000;
private static final Map<String, HikariDataSource> pools = new ConcurrentHashMap<>();
/**
* A set of field definitions read from a configuration XML file. Contains the
* name of each field as well as a source for the field's value.
*/
// private FieldConfig fieldConfig;
// For import
private static Logger LOG = LoggerFactory.getLogger(DataBaseConnector.class);
private static Thread commitThread = null;
static {
subsetColumns = new LinkedHashMap<>();
subsetColumns.put(Constants.LOG, "text");
subsetColumns.put(Constants.IS_PROCESSED, "boolean DEFAULT false");
subsetColumns.put(Constants.IN_PROCESS, "boolean DEFAULT false");
subsetColumns.put(Constants.LAST_COMPONENT, "text DEFAULT '" + DEFAULT_PIPELINE_STATE + "'");
subsetColumns.put(Constants.HAS_ERRORS, "boolean DEFAULT false");
subsetColumns.put(Constants.PID, "character varying(10)");
subsetColumns.put(Constants.HOST_NAME, "character varying(100)");
subsetColumns.put(Constants.PROCESSING_TIMESTAMP, "timestamp without time zone");
}
/**
* Sometimes it is necessary to manage multiple data tables with different field
* schemas. fieldConfigs contains all field schema names in the configuration,
* mapped to the corresponding FieldConfig instance.
*/
private FieldConfigurationManager fieldConfigs;
private DBConfig dbConfig;
private String activeDataSchema;
private String activeDataTable;
private String activeTableSchema;
private byte[] effectiveConfiguration;
private int queryBatchSize = DEFAULT_QUERY_BATCH_SIZE;
private String dbURL;
private String user;
private String password;
private DataSource dataSource;
private ConfigReader config;
/**************************************************************************
*************************** Constructors ********************************
**************************************************************************/
public DataBaseConnector(String configPath) throws FileNotFoundException {
this(findConfigurationFile(configPath));
}
/**
* This class creates a connection with a database and allows for convenient
* queries and commands.
*
* @param configStream used to read the configuration for this connector instance
*/
public DataBaseConnector(InputStream configStream) {
config = new ConfigReader(configStream);
dbConfig = config.getDatabaseConfig();
this.dbURL = dbConfig.getUrl();
this.fieldConfigs = config.getFieldConfigs();
this.activeDataSchema = config.getActiveDataSchema();
this.activeDataTable = this.activeDataSchema + "." + config.getActiveDataTable();
this.activeTableSchema = config.getActiveSchemaName();
this.effectiveConfiguration = config.getMergedConfigData();
if (!StringUtils.isBlank(dbConfig.getActiveDatabase()) && (StringUtils.isBlank(user) || StringUtils.isBlank(password))) {
HiddenConfig hc = new HiddenConfig();
this.user = hc.getUsername(dbConfig.getActiveDatabase());
this.password = hc.getPassword(dbConfig.getActiveDatabase());
LOG.info("Connecting to " + this.dbURL + " as " + this.user);
} else {
LOG.warn(
"No active database configured in configuration file or configuration file is empty or does not exist.");
}
}
/**
* This class creates a connection with a database and allows for convenient
* queries and commands.
*
* @param configStream used to read the configuration for this connector instance
* @param queryBatchSize background threads are utilized to speed up queries, this
* parameter determines the number of pre-fetched entries
*/
public DataBaseConnector(InputStream configStream, int queryBatchSize) {
this(configStream);
this.queryBatchSize = queryBatchSize;
}
/**
* This class creates a connection with a database and allows for convenient
* queries and commands.
*
* @param dbUrl the url of the database
* @param user the username for the db
* @param password the password for the username
* @param pgSchema the PostgreSQL schema to set as active
* @param fieldDefinition InputStream containing data of a configuration file
*/
public DataBaseConnector(String dbUrl, String user, String password, String pgSchema, InputStream fieldDefinition) {
this(dbUrl, user, password, pgSchema, DEFAULT_QUERY_BATCH_SIZE, fieldDefinition);
}
public DataBaseConnector(String serverName, String dbName, String user, String password, String pgSchema,
InputStream fieldDefinition) {
this(serverName, dbName, user, password, pgSchema, DEFAULT_QUERY_BATCH_SIZE, fieldDefinition);
}
/**
* This class creates a connection with a database and allows for convenient
* queries and commands.
*
* @param dbUrl the url of the database
* @param user the username for the db
* @param password the password for the username
* @param pgSchema the PostgreSQL schema to set as active
* @param queryBatchSize background threads are utilized to speed up queries, this
* parameter determines the number of pre-fetched entries
* @param configStream used to read the configuration for this connector instance
*/
public DataBaseConnector(String dbUrl, String user, String password, String pgSchema, int queryBatchSize,
InputStream configStream) {
this(configStream, queryBatchSize);
// Manually entered values have priority.
setCredentials(dbUrl, user, password, pgSchema);
}
public DataBaseConnector(String serverName, String dbName, String user, String password, String pgSchema,
int queryBatchSize, InputStream configStream) {
this(configStream, queryBatchSize);
// Manually entered values have priority.
String dbUrl = null;
if (dbName != null && serverName != null)
dbUrl = "jdbc:postgresql://" + serverName + ":5432/" + dbName;
else {
if (dbName != null)
dbUrl = dbConfig.getUrl().replaceFirst("/[^/]+$", "/" + dbName);
if (serverName != null)
dbUrl = dbConfig.getUrl().replaceFirst("(.*//)[^/:]+(.*)", "$1" + serverName + "$2");
}
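// For illustration (hypothetical values): serverName "dbhost" and dbName "docs" yield the URL
// "jdbc:postgresql://dbhost:5432/docs"; if only one of the two is given, the corresponding part
// of the URL from the configuration file is replaced.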
setCredentials(dbUrl, user, password, pgSchema);
}
/**
* This class creates a connection with a database and allows for convenient
* queries and commands.
*
* @param dbUrl the url of the database
* @param user the username for the db
* @param password the password for the username
*/
public DataBaseConnector(String dbUrl, String user, String password) {
this(dbUrl, user, password, null, DEFAULT_QUERY_BATCH_SIZE, null);
}
private static InputStream findConfigurationFile(String configPath) throws FileNotFoundException {
LOG.debug("Loading DatabaseConnector configuration file from path \"{}\"", configPath);
File dbcConfigFile = new File(configPath);
InputStream is;
if (dbcConfigFile.exists()) {
LOG.debug("Found database configuration at file {}", dbcConfigFile);
is = new FileInputStream(configPath);
} else {
String cpResource = configPath.startsWith("/") ? configPath : "/" + configPath;
LOG.debug("The database configuration file could not be found as a file at {}. Trying to lookup configuration as a classpath resource at {}", dbcConfigFile, cpResource);
is = DataBaseConnector.class.getResourceAsStream(cpResource);
if (is != null)
LOG.debug("Found database configuration file as classpath resource at {}", cpResource);
}
if (is == null) {
throw new IllegalArgumentException("DatabaseConnector configuration " + configPath + " could not be found as file or a classpath resource.");
}
return is;
}
public ConfigReader getConfig() {
return config;
}
/**
* Overrides the connection settings from the configuration file with the given values; null values are ignored.
*
* @param dbUrl the JDBC URL of the database
* @param user the database user name
* @param password the password for the user
* @param pgSchema the PostgreSQL schema to set as active
*/
private void setCredentials(String dbUrl, String user, String password, String pgSchema) {
if (dbUrl != null)
this.dbURL = dbUrl;
if (user != null)
this.user = user;
if (password != null)
this.password = password;
if (pgSchema != null)
setActivePGSchema(pgSchema);
if ((dbUrl != null) || (user != null) || (password != null) || (pgSchema != null))
LOG.info("Connecting to " + this.dbURL + " as " + this.user + " in Postgres Schema " + pgSchema);
}
public void setHost(String host) {
if (host != null) {
dbURL = dbURL.replaceFirst("(.*//)[^/:]+(.*)", "$1" + host + "$2");
LOG.debug("Setting database host to {}. DB URL is now {}", host, dbURL);
}
}
public void setPort(String port) {
setPort(Integer.parseInt(port));
}
public void setPort(Integer port) {
if (port != null) {
this.dbURL = dbURL.replaceFirst(":[0-9]+", ":" + port);
LOG.debug("Setting database port to {}. DB URL is now {}", port, dbURL);
}
}
public void setUser(String user) {
this.user = user;
LOG.debug("Setting database user for {} to {}", this.dbURL, user);
}
public void setPassword(String password) {
this.password = password;
LOG.debug("Changing database password.");
}
/**
* @return A Connection to the database.
*/
public Connection getConn() {
Connection conn = null;
if (null == dataSource) {
LOG.debug("Setting up connection pool data source");
HikariConfig hikariConfig = new HikariConfig();
hikariConfig.setPoolName("costosys-" + System.nanoTime());
hikariConfig.setJdbcUrl(dbURL);
hikariConfig.setUsername(user);
hikariConfig.setPassword(password);
hikariConfig.setConnectionTestQuery("SELECT TRUE");
hikariConfig.setMaximumPoolSize(dbConfig.getMaxConnections());
// required to be able to get the number of idle connections, see below
hikariConfig.setRegisterMbeans(true);
HikariDataSource ds = pools.compute(dbURL, (url, source) -> source == null ? new HikariDataSource(hikariConfig) : source);
if (ds.isClosed()) {
ds = new HikariDataSource(hikariConfig);
}
pools.put(dbURL, ds);
dataSource = ds;
}
try {
LOG.trace("Waiting for SQL connection to become free...");
if (LOG.isTraceEnabled()) {
// from https://github.com/brettwooldridge/HikariCP/wiki/MBean-(JMX)-Monitoring-and-Management
MBeanServer mBeanServer = ManagementFactory.getPlatformMBeanServer();
try {
String poolNameStr = ((HikariDataSource) dataSource).getPoolName();
ObjectName poolName = new ObjectName("com.zaxxer.hikari:type=Pool (" + poolNameStr + ")");
HikariPoolMXBean poolProxy = JMX.newMXBeanProxy(mBeanServer, poolName, HikariPoolMXBean.class);
int totalConnections = poolProxy.getTotalConnections();
int idleConnections = poolProxy.getIdleConnections();
int activeConnections = poolProxy.getActiveConnections();
int threadsAwaitingConnection = poolProxy.getThreadsAwaitingConnection();
LOG.trace("Pool {} has {} total connections", poolName, totalConnections);
LOG.trace("Pool {} has {} idle connections left", poolName, idleConnections);
LOG.trace("Pool {} has {} active connections", poolName, activeConnections);
LOG.trace("Pool {} has {} threads awaiting a connection", poolName, threadsAwaitingConnection);
} catch (MalformedObjectNameException e) {
e.printStackTrace();
}
}
conn = dataSource.getConnection();
// conn = DriverManager.getConnection(fullURI);
LOG.trace("SQL connection obtained.");
Statement stm = conn.createStatement();
if (!schemaExists(dbConfig.getActivePGSchema(), conn))
createSchema(dbConfig.getActivePGSchema(), conn);
if (!schemaExists(dbConfig.getActiveDataPGSchema(), conn))
createSchema(dbConfig.getActiveDataPGSchema(), conn);
stm.execute(String.format("SET search_path TO %s", dbConfig.getActivePGSchema()));
stm.close();
} catch (SQLException e) {
LOG.error("Could not connect with " + dbURL);
throw new UnobtainableConnectionException("No database connection could be obtained from the connection " +
"pool. This can have one of two causes: Firstly, the application might just use all connections " +
"concurrently. Then, a higher number of maximum active database connections in the CoStoSys " +
"configuration might help. This " +
"number is currently set to " + config.getDatabaseConfig().getMaxConnections() + ". The other " +
"possibility are programming errors where connections are retrieved but not closed. Closing " +
"connections means to return them to the pool. It must always be made sure that connections are " +
"closed when they are no longer required. If database iterators are used. i.e. subclasses of " +
"DBCIterator, make sure to fully read the iterators. Otherwise, they might keep a permanent " +
"connection to the database while waiting to be consumed.", e);
}
return conn;
}
/**
* @return the activeDataTable
*/
public String getActiveDataTable() {
return activeDataTable;
}
/**
*
* Returns the effective XML configuration as a byte[].
*
*
* The effective configuration consists of the default configuration and the
* given user configuration as well (merged by the ConfigReader in the
* constructor).
*
*
* @return the effectiveConfiguration
*/
public byte[] getEffectiveConfiguration() {
return effectiveConfiguration;
}
public String getActiveDataPGSchema() {
return activeDataSchema;
}
public String getActivePGSchema() {
return dbConfig.getActivePGSchema();
}
public void setActivePGSchema(String pgSchema) {
dbConfig.setActivePGSchema(pgSchema);
}
public String getActiveTableSchema() {
return activeTableSchema;
}
public void setActiveTableSchema(String schemaName) {
this.activeTableSchema = schemaName;
}
public FieldConfig getActiveTableFieldConfiguration() {
return fieldConfigs.get(activeTableSchema);
}
/**
*
* Retrieves from a subset table <code>limit</code> primary keys whose rows are
* not marked to be in process or finished being processed and sets the rows of
* the retrieved primary keys as being "in process".
*
*
* The table is locked during this transaction. Locking and marking ensure that
* every primary key will be returned exactly once. Remember to remove the marks
* if you want to use the subset again ;)
*
*
* @param subsetTableName - name of a table, conforming to the subset standard
* @param readerComponent - name of the calling component; will be saved in the subset table
* @param hostName - will be saved in the subset table
* @param pid - will be saved in the subset table
* @return An ArrayList of pmids which have not yet been processed
*/
public List<Object[]> retrieveAndMark(String subsetTableName, String readerComponent, String hostName, String pid)
        throws TableSchemaMismatchException {
    // Delegates to the full overload using the active table schema, the default batch size and default ordering.
    return retrieveAndMark(subsetTableName, activeTableSchema, readerComponent, hostName, pid, RETRIEVE_MARK_LIMIT, null);
}
/**
* Retrieves up to <code>limit</code> primary keys from a subset table whose rows are neither marked as
* being in process nor as finished and sets the rows of the retrieved primary keys to "in process".
*
* The table is locked during this transaction. Locking and marking ensure that
* every primary key will be returned exactly once. Remember to remove the marks
* if you want to use the subset again ;)
*
*
* @param subsetTableName - name of a table, conforming to the subset standard
* @param schemaName - name of the table schema defining the primary key
* @param readerComponent - name of the calling component; will be saved in the subset table
* @param hostName - will be saved in the subset table
* @param pid - will be saved in the subset table
* @param limit - batchsize for marking/retrieving
* @param order - determines an ordering. Default order (which may change over
* time) when this parameter is null or empty.
* @return An ArrayList of primary keys which have not yet been processed.
*/
public List<Object[]> retrieveAndMark(String subsetTableName, String schemaName, String readerComponent,
String hostName, String pid, int limit, String order) throws TableSchemaMismatchException {
checkTableDefinition(subsetTableName, schemaName);
List<Object[]> ids = new ArrayList<>(limit);
String sql = null;
Connection conn = null;
boolean idsRetrieved = false;
while (!idsRetrieved) {
try {
FieldConfig fieldConfig = fieldConfigs.get(schemaName);
conn = getConn();
conn.setAutoCommit(false);
Statement st = conn.createStatement();
String orderCommand = order == null ? "" : order;
if (!orderCommand.equals("") && !orderCommand.trim().toUpperCase().startsWith("ORDER BY"))
orderCommand = "ORDER BY " + orderCommand;
String joinStatement = Stream.of(fieldConfig.getPrimaryKey()).map(pk -> {
return "t." + pk + "=subquery." + pk;
}).collect(Collectors.joining(" AND "));
String returnColumns = Stream.of(fieldConfig.getPrimaryKey()).map(pk -> {
return "t." + pk;
}).collect(Collectors.joining(","));
// following
// http://dba.stackexchange.com/questions/69471/postgres-update-limit-1
sql = "UPDATE " + subsetTableName + " AS t SET " + Constants.IN_PROCESS + " = TRUE, "
+ Constants.LAST_COMPONENT + " = '" + readerComponent + "', " + Constants.HOST_NAME + " = \'"
+ hostName + "\', " + Constants.PID + " = \'" + pid + "\'," + Constants.PROCESSING_TIMESTAMP
+ " = 'now' FROM (SELECT " + fieldConfig.getPrimaryKeyString() + " FROM " + subsetTableName
+ " WHERE " + Constants.IN_PROCESS + " = FALSE AND "
// eigentlich wollen wir anstelle von FOR UPDATE sogar:
// FOR UPDATE SKIP LOCKED in PostgreSQL 9.5 <---!!
+ Constants.IS_PROCESSED + " = FALSE " + orderCommand + " LIMIT " + limit
+ " FOR UPDATE) AS subquery WHERE " + joinStatement + " RETURNING " + returnColumns;
try (ResultSet res = st.executeQuery(sql)) {
String[] pks = fieldConfig.getPrimaryKey();
while (res.next()) {
Object[] values = new String[pks.length];
for (int i = 0; i < pks.length; i++) {
values[i] = res.getObject(i + 1);
}
ids.add(values);
}
idsRetrieved = true;
}
conn.commit();
} catch (SQLException e) {
// It is possible to run into deadlocks with the above query. Then, one process
// will be canceled and we get an exception. If so, just log it and try again.
if (!e.getMessage().contains("deadlock detected") && (e.getNextException() == null
|| !e.getNextException().getMessage().contains("deadlock detected"))) {
LOG.error(
"Error while retrieving document IDs and marking them to be in process. Sent SQL command: {}.",
sql, e);
SQLException nextException = e.getNextException();
if (null != nextException)
LOG.error("Next exception: {}", nextException);
// this is not the deadlock error; break the loop
break;
} else {
LOG.debug(
"Database deadlock has been detected while trying to retrieve document IDs and marking them to be processed. Tying again.");
// We need to close the current, failed, transaction and start a new one for the
// new try.
try {
conn.commit();
} catch (SQLException e1) {
e1.printStackTrace();
}
}
} finally {
try {
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
if (LOG.isTraceEnabled()) {
LOG.trace("The following IDs were retrieved from table {}: {}", subsetTableName, ids.stream().map(Arrays::toString).collect(Collectors.joining("; ")));
}
return ids;
}
/**
* @param subsetTableName name of the subset table
* @return the number of unprocessed rows
* @see #countUnprocessed(String, String)
*/
public int countUnprocessed(String subsetTableName) {
return countUnprocessed(subsetTableName, activeTableSchema);
}
/**
* Counts the unprocessed rows in a subset table
*
* @param subsetTableName - name of the subset table
* @return the number of unprocessed rows
*/
public int countUnprocessed(String subsetTableName, String schemaName) {
FieldConfig fieldConfig = fieldConfigs.get(schemaName);
int rows = 0;
Connection conn = getConn();
try {
ResultSet res = conn.createStatement().executeQuery(
// as we are just looking for any unprocessed documents it
// is
// sufficient - even in the case of multiple primary key
// elements - to use the name of the first element
// in this command
"SELECT count(" + fieldConfig.getPrimaryKey()[0] + ")" + " FROM " + subsetTableName + " WHERE "
+ Constants.PROCESSED + " = FALSE;");
if (res.next())
rows = res.getInt(1);
} catch (SQLException e) {
e.printStackTrace();
} finally {
try {
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
return rows;
}
public int countRowsOfDataTable(String tableName, String whereCondition) {
return countRowsOfDataTable(tableName, whereCondition, activeTableSchema);
}
public int countRowsOfDataTable(String tableName, String whereCondition, String schemaName) {
FieldConfig fieldConfig = fieldConfigs.get(schemaName);
int rows = 0;
Connection conn = getConn();
try {
if (whereCondition != null) {
whereCondition = whereCondition.trim();
if (!whereCondition.toUpperCase().startsWith("WHERE"))
whereCondition = " WHERE " + whereCondition;
else
whereCondition = " " + whereCondition;
} else
whereCondition = "";
ResultSet res = conn.createStatement().executeQuery(
"SELECT count(" + fieldConfig.getPrimaryKeyString() + ")" + " FROM " + tableName + whereCondition);
if (res.next())
rows = res.getInt(1);
} catch (SQLException e) {
e.printStackTrace();
} finally {
try {
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
return rows;
}
public boolean hasUnfetchedRows(String tableName) {
return hasUnfetchedRows(tableName, activeTableSchema);
}
/**************************************************************************
******************************** Utility **********************************
***************************************************************************/
public boolean hasUnfetchedRows(String tableName, String schemaName) {
FieldConfig fieldConfig = fieldConfigs.get(schemaName);
Connection conn = getConn();
try {
ResultSet res = conn.createStatement()
.executeQuery("SELECT " + fieldConfig.getPrimaryKeyString() + " FROM " + tableName + " WHERE "
+ Constants.IN_PROCESS + " = FALSE AND " + Constants.IS_PROCESSED + " = FALSE LIMIT 1");
return res.next();
} catch (SQLException e) {
e.printStackTrace();
} finally {
try {
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
return false;
}
/**
* Deletes entries from a table
*
* @param table name of the table
* @param ids primary key arrays defining the entries to delete
* @see #deleteFromTableSimplePK(String, List)
*/
public void deleteFromTable(String table, List<Object[]> ids) {
String sql = "DELETE FROM " + table + " WHERE ";
modifyTable(sql, ids);
}
/**
* Deletes entries from a table where the primary key of this table must consist
* of exactly one column. For deletion from tables which contain a
* multi-column-primary-key see {@link #deleteFromTable(String, List)}.
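*
* A usage sketch (table name and IDs are illustrative):
* <pre>{@code
* dbc.deleteFromTableSimplePK("myschema._data", Arrays.asList("10074500", "10074846"));
* }</pre>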
*
* @param table name of the table
* @param ids primary key arrays defining the entries to delete
* @see #deleteFromTable(String, List)
*/
public <T> void deleteFromTableSimplePK(String table, List<T> ids) {
String sql = "DELETE FROM " + table + " WHERE ";
// Convert the given list to a list of object arrays, so it fits to
// 'modifyTable'.
List<Object[]> objectIds = new ArrayList<>(ids.size());
for (T id : ids)
objectIds.add(new Object[]{id});
modifyTable(sql, objectIds);
}
/**
* Modifies a subset table, marking entries as processed.
*
* @param table name of the subset table
* @param ids primary key arrays defining the entries to mark
*/
public void markAsProcessed(String table, List<Object[]> ids) {
String sql = "UPDATE " + table + " SET " + Constants.PROCESSED + " = TRUE WHERE ";
modifyTable(sql, ids);
}
/**
*
* Executes a given SQL command (must end with "WHERE "!) and extends the
* WHERE-clause with the primary keys, set to the values in ids.
*
*
* Assumes that the form of the primary keys matches the definition given in the
* active table schema in the configuration.
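*
* For illustration (hypothetical table with the single primary key column "pmid"): passing
* sql = "DELETE FROM mytable WHERE " yields the prepared statement
* "DELETE FROM mytable WHERE pmid = ?", which is executed as a batch, once per element of ids.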
*
*
* @param sql a valid SQL command, ending with "WHERE "
* @param ids list of primary key arrays
* @see #modifyTable(String, List)
*/
public void modifyTable(String sql, List<Object[]> ids) {
modifyTable(sql, ids, activeTableSchema);
}
/**
*
* Executes a given SQL command (must end with "WHERE "!) and extends the
* WHERE-clause with the primary keys, set to the values in ids.
*
*
* @param sql a valid SQL command, ending with "WHERE "
* @param ids list of primary key arrays
* @param schemaName name of the schema which defines the primary keys
*/
public void modifyTable(String sql, List<Object[]> ids, String schemaName) {
FieldConfig fieldConfig = fieldConfigs.get(schemaName);
Connection conn = getConn();
String where = StringUtils.join(fieldConfig.expandPKNames("%s = ?"), " AND ");
String fullSQL = sql + where;
PreparedStatement ps = null;
try {
conn.setAutoCommit(false);
ps = conn.prepareStatement(fullSQL);
} catch (SQLException e) {
LOG.error("Couldn't prepare: " + fullSQL);
e.printStackTrace();
}
String[] pks = fieldConfig.getPrimaryKey();
for (Object[] id : ids) {
for (int i = 0; i < id.length; ++i) {
try {
setPreparedStatementParameterWithType(i + 1, ps, id[i], pks[i], fieldConfig);
} catch (SQLException e) {
e.printStackTrace();
}
}
try {
ps.addBatch();
} catch (SQLException e) {
e.printStackTrace();
}
}
try {
ps.executeBatch();
conn.commit();
} catch (SQLException e) {
e.printStackTrace();
} finally {
try {
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
/**
* just calls ps.setObject(position, value);
*
* @param position
* @param ps
* @param value
* @param fieldName
* @param fieldConfig
* @throws SQLException
*/
private void setPreparedStatementParameterWithType(int position, PreparedStatement ps, Object value,
String fieldName, FieldConfig fieldConfig) throws SQLException {
ps.setObject(position, value);
}
/**
* Returns the name of a table referenced by an SQL-foreign-key.
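*
* For illustration (hypothetical names): for a subset table "myschema._mysubset" that was created
* with a foreign key to the data table "myschema._data", this method returns "myschema._data".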
*
* @param referencingTable the name of the table for which the foreign keys shall be checked
* @return the name of the first referenced table or null if there
* is no referenced table (i.e. the passed table name denotes a data
* table).
* @throws IllegalArgumentException When referencingTable is null.
*/
public String getReferencedTable(String referencingTable) {
if (referencingTable == null)
throw new IllegalArgumentException("Name of referencing table may not be null.");
String referencedTable = null;
Connection conn = getConn();
try {
String pgSchema = dbConfig.getActivePGSchema();
String tableName = referencingTable;
if (referencingTable.contains(".")) {
pgSchema = referencingTable.replaceFirst("\\..*$", "");
tableName = referencingTable.substring(referencingTable.indexOf('.') + 1);
}
// Lowercasing of the table name since case matters but postgres
// does lowercase on table creation.
ResultSet imported = conn.getMetaData().getImportedKeys("", pgSchema, tableName.toLowerCase());
if (imported.next()) {
String pkTableSchema = imported.getString(2);
String pkTableName = imported.getString(3);
referencedTable = pkTableSchema != null ? pkTableSchema + "." + pkTableName : pkTableName;
}
} catch (SQLException e1) {
e1.printStackTrace();
} finally {
try {
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
return referencedTable;
}
/**
* Creates a PostgreSQL schema
*
* This private method is called during connection setup in {@link #getConn()}, thus
* it takes the Connection as a parameter instead of obtaining a
* Connection on its own.
*
*
* @param schemaName The name of the PostgreSQL schema to create.
* @param conn Connection to the database in which the schema
* schemaName is to be created.
*/
private void createSchema(String schemaName, Connection conn) {
String sqlStr = "CREATE SCHEMA " + schemaName;
try {
conn.createStatement().execute(sqlStr);
LOG.info("PostgreSQL schema \"{}\" does not exist, it is being created.", schemaName);
} catch (SQLException e) {
LOG.error(sqlStr);
e.printStackTrace();
}
}
/**
* Creates the PostgreSQL schema schemaName in the active database.
*
* @param schemaName The name of the PostgreSQL schema to create.
*/
public void createSchema(String schemaName) {
Connection conn = getConn();
createSchema(schemaName, conn);
try {
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
/**
* Creates a new table according to the field schema definition corresponding to
* the active schema name determined in the configuration.
*
* @param tableName the name of the new table
* @throws SQLException
*/
public void createTable(String tableName, String comment) throws SQLException {
createTable(tableName, activeTableSchema, comment);
}
/**
* Creates a new table according to the field schema definition corresponding to
* the name schemaName given in the configuration file.
*
* @param tableName the name of the new table
* @throws SQLException
*/
public void createTable(String tableName, String schemaName, String comment) throws SQLException {
FieldConfig fieldConfig = fieldConfigs.get(schemaName);
ArrayList<String> columns = getTableCreationColumns(tableName, fieldConfig);
createTable(tableName, columns, comment);
// additionally, restrict the primary key to be unique
// (it is not entirely clear why this is necessary, but a referencing
// table that references several columns requires those columns to
// carry a UNIQUE constraint.)
if (fieldConfig.getPrimaryKey().length > 0)
alterTable(String.format("ADD CONSTRAINT %s_unique UNIQUE (%s)", tableName.replace(".", ""),
fieldConfig.getPrimaryKeyString()), tableName);
}
/**
*
* Creates a new table according to the field schema definition corresponding to
* the name schemaName and with foreign key references to the
* primary key of referenceTableName.
*
*
* The primary key of the tables tableName and
* referenceTableName must be equal. The foreign key constraint is
* configured for ON DELETE CASCADE which means, when in the referenced
* table rows are deleted, there are also deleted in the table created by this
* method call.
*
*
* @param tableName The name of the new table.
* @param referenceTableName The table to be referenced by this table.
* @param schemaName The table schema determining the structure (especially the primary
* key) of the new table.
* @param comment A comment for the new table.
* @throws SQLException
*/
public void createTable(String tableName, String referenceTableName, String schemaName, String comment)
throws SQLException {
FieldConfig fieldConfig = fieldConfigs.get(schemaName);
ArrayList<String> columns = getTableCreationColumns(tableName, fieldConfig);
columns.add(String.format("CONSTRAINT %s_fkey FOREIGN KEY (%s) REFERENCES %s ON DELETE CASCADE",
tableName.replace(".", ""), fieldConfig.getPrimaryKeyString(), referenceTableName));
createTable(tableName, columns, comment);
// additionally, restrict the primary key to be unique
// (it is not entirely clear why this is necessary, but a referencing
// table that references several columns requires those columns to
// carry a UNIQUE constraint.)
if (fieldConfig.getPrimaryKey().length > 0)
alterTable(String.format("ADD CONSTRAINT %s_unique UNIQUE (%s)", tableName.replace(".", ""),
fieldConfig.getPrimaryKeyString()), tableName);
}
/**
* Creates the column definitions for a new table according to the table schema given by
* fieldConfig, for use with {@link #createTable(String, List, String)}.
*
* @param tableName
* @param fieldConfig
* @return
*/
private ArrayList<String> getTableCreationColumns(String tableName, FieldConfig fieldConfig) {
    ArrayList<String> columns = new ArrayList<>();
    for (Map<String, String> field : fieldConfig.getFields()) {
StringBuilder columnStrBuilder = new StringBuilder();
columnStrBuilder.append(field.get(JulieXMLConstants.NAME));
columnStrBuilder.append(" ");
columnStrBuilder.append(field.get(JulieXMLConstants.TYPE));
columns.add(columnStrBuilder.toString());
}
if (fieldConfig.getPrimaryKey().length > 0)
columns.add(String.format("CONSTRAINT %s_pkey PRIMARY KEY (%s)", tableName.replace(".", ""),
fieldConfig.getPrimaryKeyString()));
return columns;
}
/**
* Creates a new table with custom columns.
*
* @param tableName the name of the new table
* @param columns a list of Strings, each containing name, type and constraint of a
* column, e.g. "foo integer primary key" as required for a valid sql
* command.
* @throws SQLException
*/
private void createTable(String tableName, List<String> columns, String comment) throws SQLException {
Connection conn = getConn();
StringBuilder sb = new StringBuilder("CREATE TABLE " + tableName + " (");
for (String column : columns)
sb.append(", " + column);
sb.append(");");
String sqlString = sb.toString().replaceFirst(", ", "");
try {
Statement st = conn.createStatement();
st.execute(sqlString);
st.execute("COMMENT ON TABLE " + tableName + " IS \'" + comment + "\';");
} catch (SQLException e) {
System.err.println(sqlString);
e.printStackTrace();
throw e;
} finally {
try {
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
/**
*
* Does the same as {@link #createSubsetTable(String, String, Integer, String, String)}
* with the exception that the assumed table schema is that of the active schema
* defined in the configuration file.
*
*
* @param subsetTable name of the subset table
* @param supersetTable name of the referenced table
* @param maxNumberRefHops the maximum number of times a foreign key reference to a data
* table may be followed
* @param comment will be added to the table in the database, used to make tables
* reproducible
* @throws SQLException
*/
public void createSubsetTable(String subsetTable, String supersetTable, Integer maxNumberRefHops, String comment)
throws SQLException {
createSubsetTable(subsetTable, supersetTable, maxNumberRefHops, comment, activeTableSchema);
}
/**
*
* Does the same as {@link #createSubsetTable(String, String, Integer, String, String)}
* with the exception that the assumed table schema is that of the active schema
* defined in the configuration file and the first referenced data table is used as data table.
*
*
* @param subsetTable name of the subset table
* @param supersetTable name of the referenced table
* @param comment will be added to the table in the database, used to make tables
* reproducible
* @throws SQLException
*/
public void createSubsetTable(String subsetTable, String supersetTable, String comment) throws SQLException {
createSubsetTable(subsetTable, supersetTable, null, comment, activeTableSchema);
}
/**
*
* Creates an empty table referencing the primary key of the data table given by
* superSetTable or, if this is a subset table itself, the data
* table referenced by that table.
*
*
* To fill the empty subset table with data, use one of the
* init[...] methods offered by this class.
*
*
* Subset tables have a particular table schema. They define a foreign key to
* the primary key of the referenced data table. There are the following
* additional columns:
* <pre>
* Name                  Type
* is_in_process         boolean
* is_processed          boolean
* last_component        text
* log                   text
* has_errors            boolean
* pid                   character varying(10)
* host_name             character varying(100)
* processing_timestamp  timestamp without time zone
* </pre>
*
* The subset table can be used for processing, e.g. by UIMA CollectionReaders,
* which store information about the processing in it.
*
* The actual data is located in the referenced table.
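*
* A usage sketch (names are examples, not part of this API):
* <pre>{@code
* dbc.createSubsetTable("myschema._mysubset", "myschema._data", 1, "my experiment subset", "mytableschema");
* }</pre>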
*
* @param subsetTable name of the subset table
* @param supersetTable name of the referenced table
* @param posOfDataTable the position of the datatable that should be referenced; the 1st
* would be nearest data table, i.e. perhaps supersetTable
* itself. The 2nd would be the datatable referenced by the first
* data table on the reference path.
* @param schemaName name of the table schema to work with (determined in the
* configuration file)
* @param comment will be added to the table in the database, used to make tables
* reproducible
* @throws SQLException
*/
public void createSubsetTable(String subsetTable, String supersetTable, Integer posOfDataTable, String comment,
String schemaName) throws SQLException {
FieldConfig fieldConfig = fieldConfigs.get(schemaName);
String effectiveDataTable = getReferencedTable(supersetTable, posOfDataTable);
ArrayList<String> columns = new ArrayList<>();
List