/* Copyright 2017-2018 Norconex Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.norconex.committer.sql;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLClassLoader;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Properties;
import java.util.Set;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import org.apache.commons.configuration.HierarchicalConfiguration;
import org.apache.commons.configuration.XMLConfiguration;
import org.apache.commons.dbcp2.BasicDataSource;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.ResultSetHandler;
import org.apache.commons.dbutils.handlers.ScalarHandler;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.commons.text.StrSubstitutor;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import com.norconex.committer.core.AbstractCommitter;
import com.norconex.committer.core.AbstractMappedCommitter;
import com.norconex.committer.core.CommitterException;
import com.norconex.committer.core.IAddOperation;
import com.norconex.committer.core.ICommitOperation;
import com.norconex.committer.core.IDeleteOperation;
import com.norconex.commons.lang.encrypt.EncryptionKey;
import com.norconex.commons.lang.encrypt.EncryptionUtil;
import com.norconex.commons.lang.time.DurationParser;
import com.norconex.commons.lang.xml.EnhancedXMLStreamWriter;
/**
*
* Commit documents to a SQL database.
*
*
* Handling of missing table/fields
*
* By default, this Committer will throw an exception when trying to insert
* values into a non-existing database table or fields. It is recommended you
* make sure your database table exists and that the document fields being sent
* to the committer match your database fields.
*
*
* Alternatively, you can provide the necessary SQLs to create a new
* table as well as new fields as needed using
* {@link #setCreateTableSQL(String)} and {@link #setCreateFieldSQL(String)}
* respectively. Make sure to use the following placeholder variables
* as needed in the provided SQL(s) to have them automatically replaced by
* this Committer.
*
*
*
* - ${tableName}:
*       Your table name, to be replaced with the value supplied with
*       {@link #setTableName(String)}.
* - ${targetReferenceField}:
*       The field that will hold your document reference. This usually is
*       your table primary key. Default is {@value #DEFAULT_SQL_ID_FIELD} and
*       can be overwritten with {@link #setTargetReferenceField(String)}.
* - ${targetContentField}:
*       The field that will hold your document content (or "body").
*       Default is {@value #DEFAULT_SQL_CONTENT_FIELD} and can be
*       overwritten with {@link #setTargetContentField(String)}.
* - ${fieldName}:
*       A field name to be created if you provided an SQL for creating new
*       fields.
*
* Authentication
*
* For databases requiring authentication, the password can
* optionally be encrypted using {@link EncryptionUtil}
* (or command-line "encrypt.bat" or "encrypt.sh").
* In order for the password to be decrypted properly, you need
* to specify the encryption key used to encrypt it. The key can be stored
* in a few supported locations and a combination of passwordKey
* and passwordKeySource must be specified to properly
* locate the key. The supported sources are:
*
*
* - key:         passwordKey is the actual encryption key.
* - file:        passwordKey is the path to a file containing the encryption key.
* - environment: passwordKey is the name of an environment variable containing the key.
* - property:    passwordKey is the name of a JVM system property containing the key.
*
* XML configuration usage:
*
* <committer class="com.norconex.committer.sql.SQLCommitter">
* <!-- Mandatory settings -->
* <driverClass>
* (Class name of the JDBC driver to use.)
* </driverClass>
* <connectionUrl>
* (JDBC connection URL.)
* </connectionUrl>
* <tableName>
* (The target database table name where documents will be committed.)
* </tableName>
*
* <!-- Other settings -->
* <driverPath>
* (Path to JDBC driver. Not required if already in classpath.)
* </driverPath>
* <properties>
* <property key="(property name)">(Property value.)</property>
* <!-- You can have multiple property elements. -->
* </properties>
*
* <createTableSQL>
* <!--
* The CREATE statement used to create a table if it does not
* already exist. If you need fields of specific types,
* specify them here. The following variables are expected
* and will be replaced with the configuration options of the same name:
* ${tableName}, ${targetReferenceField} and ${targetContentField}.
* Example:
* -->
* CREATE TABLE ${tableName} (
* ${targetReferenceField} VARCHAR(32672) NOT NULL,
* ${targetContentField} CLOB,
* PRIMARY KEY ( ${targetReferenceField} ),
* title VARCHAR(256)
* )
* </createTableSQL>
* <createFieldSQL>
* <!--
* The ALTER statement used to create missing table fields.
* The ${tableName} variable will be replaced with
* the configuration option of the same name. The ${fieldName}
* variable will be replaced by newly encountered field names.
* Example:
* -->
* ALTER TABLE ${tableName} ADD ${fieldName} VARCHAR(32672)
* </createFieldSQL>
* <multiValuesJoiner>
* (One or more characters to join multi-value fields.
* Default is "|".)
* </multiValuesJoiner>
* <fixFieldNames>
* (Attempts to prevent insertion errors by converting characters that
* are not underscores or alphanumeric to underscores.
* Will also remove all non-alphabetic characters that begin
* a field name.)
* </fixFieldNames>
* <fixFieldValues>
* (Attempts to prevent insertion errors by truncating values
* that are larger than their defined maximum field length.)
* </fixFieldValues>
*
* <!-- Use the following if authentication is required. -->
* <username>(Optional user name)</username>
* <password>(Optional user password)</password>
* <!-- Use the following if password is encrypted. -->
* <passwordKey>(the encryption key or a reference to it)</passwordKey>
* <passwordKeySource>[key|file|environment|property]</passwordKeySource>
*
* <sourceReferenceField keep="[false|true]">
* (Optional name of field that contains the document reference, when
* the default document reference is not used.
* Once re-mapped, this metadata source field is
* deleted, unless "keep" is set to true.)
* </sourceReferenceField>
* <targetReferenceField>
* (Name of the database target field where to store the document unique
* identifier (sourceReferenceField). If not specified,
* default is "id". This is typically the table primary key.)
* </targetReferenceField>
* <sourceContentField keep="[false|true]">
* (If you wish to use a metadata field to act as the document
* "content", you can specify that field here. Default
* does not take a metadata field but rather the document content.
* Once re-mapped, the metadata source field is deleted,
* unless "keep" is set to true.)
* </sourceContentField>
* <targetContentField>
* (Target repository field name for a document content/body.
* Default is "content". Since document content can sometimes be
* quite large, a CLOB field is usually best advised.)
* </targetContentField>
* <commitBatchSize>
* (Max number of documents to send to the database at once.)
* </commitBatchSize>
* <queueDir>(optional path where to queue files)</queueDir>
* <queueSize>(max queue size before committing)</queueSize>
* <maxRetries>(max retries upon commit failures)</maxRetries>
* <maxRetryWait>(max delay in milliseconds between retries)</maxRetryWait>
* </committer>
*
*
* XML configuration entries expecting millisecond durations
* can be provided in human-readable format (English only), as per
* {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s").
*
*
* Usage example:
*
* The following example uses an H2 database and creates the table and fields
* as they are encountered, storing all new fields as VARCHAR, making sure
* those new fields are no longer than 5000 characters.
*
*
* <committer class="com.norconex.committer.sql.SQLCommitter">
* <driverPath>/path/to/driver/h2.jar</driverPath>
* <driverClass>org.h2.Driver</driverClass>
* <connectionUrl>jdbc:h2:file:///path/to/db/h2</connectionUrl>
* <tableName>test_table</tableName>
* <createTableSQL>
* CREATE TABLE ${tableName} (
* ${targetReferenceField} VARCHAR(32672) NOT NULL,
* ${targetContentField} CLOB,
* PRIMARY KEY ( ${targetReferenceField} )
* )
* </createTableSQL>
* <createFieldSQL>
* ALTER TABLE ${tableName} ADD ${fieldName} VARCHAR(5000)
* </createFieldSQL>
* <fixFieldValues>true</fixFieldValues>
* </committer>
*
*
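*
* As an alternative to XML configuration, the same setup can also be done
* programmatically with the setters of this class. The following is a
* minimal sketch using the same hypothetical H2 paths as the example above:
*
*   SQLCommitter committer = new SQLCommitter();
*   committer.setDriverPath("/path/to/driver/h2.jar");
*   committer.setDriverClass("org.h2.Driver");
*   committer.setConnectionUrl("jdbc:h2:file:///path/to/db/h2");
*   committer.setTableName("test_table");
*   committer.setCreateTableSQL(
*           "CREATE TABLE ${tableName} ("
*           + "${targetReferenceField} VARCHAR(32672) NOT NULL, "
*           + "${targetContentField} CLOB, "
*           + "PRIMARY KEY ( ${targetReferenceField} ))");
*   committer.setCreateFieldSQL(
*           "ALTER TABLE ${tableName} ADD ${fieldName} VARCHAR(5000)");
*   committer.setFixFieldValues(true);
*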
* @author Pascal Essiembre
*/
public class SQLCommitter extends AbstractMappedCommitter {
private static final Logger LOG = LogManager.getLogger(SQLCommitter.class);
/** Default SQL primary key field */
public static final String DEFAULT_SQL_ID_FIELD = "id";
/** Default SQL content field */
public static final String DEFAULT_SQL_CONTENT_FIELD = "content";
/** Default multi-value join string */
public static final String DEFAULT_MULTI_VALUES_JOINER = "|";
private static final String[] NO_REFLECT_FIELDS = new String[] {
"existingFields", "tableVerified", "datasource",
"password", "passwordKey"
};
private String driverPath;
private String driverClass;
private String connectionUrl;
private String username;
private String password;
private EncryptionKey passwordKey;
private final Properties properties = new Properties();
private String tableName;
private String createTableSQL;
private String createFieldSQL;
private boolean fixFieldNames;
private boolean fixFieldValues;
private String multiValuesJoiner = DEFAULT_MULTI_VALUES_JOINER;
// Caches fields that already exist (or that we created), so we do not
// check for their existence on every operation.
// key = field name; value = field size
private final Map<String, Integer> existingFields = new HashMap<>();
// If we could confirm whether the tableName exists
private boolean tableVerified;
private BasicDataSource datasource;
/**
* Constructor.
*/
public SQLCommitter() {
super();
setTargetReferenceField(DEFAULT_SQL_ID_FIELD);
setTargetContentField(DEFAULT_SQL_CONTENT_FIELD);
}
public String getDriverPath() {
return driverPath;
}
public void setDriverPath(String driverPath) {
this.driverPath = driverPath;
}
public String getDriverClass() {
return driverClass;
}
public void setDriverClass(String driverClass) {
this.driverClass = driverClass;
}
public String getConnectionUrl() {
return connectionUrl;
}
public void setConnectionUrl(String connectionUrl) {
this.connectionUrl = connectionUrl;
}
public String getUsername() {
return username;
}
public void setUsername(String username) {
this.username = username;
}
public String getPassword() {
return password;
}
public void setPassword(String password) {
this.password = password;
}
public EncryptionKey getPasswordKey() {
return passwordKey;
}
public void setPasswordKey(EncryptionKey passwordKey) {
this.passwordKey = passwordKey;
}
public Properties getProperties() {
return properties;
}
public String getTableName() {
return tableName;
}
public void setTableName(String tableName) {
this.tableName = tableName;
}
public String getCreateTableSQL() {
return createTableSQL;
}
public void setCreateTableSQL(String createTableSQL) {
this.createTableSQL = createTableSQL;
}
public String getCreateFieldSQL() {
return createFieldSQL;
}
public void setCreateFieldSQL(String createFieldSQL) {
this.createFieldSQL = createFieldSQL;
}
public String getMultiValuesJoiner() {
return multiValuesJoiner;
}
public void setMultiValuesJoiner(String multiValuesJoiner) {
this.multiValuesJoiner = multiValuesJoiner;
}
public boolean isFixFieldNames() {
return fixFieldNames;
}
public void setFixFieldNames(boolean fixFieldNames) {
this.fixFieldNames = fixFieldNames;
}
public boolean isFixFieldValues() {
return fixFieldValues;
}
public void setFixFieldValues(boolean fixFieldValues) {
this.fixFieldValues = fixFieldValues;
}
@Override
public void commit() {
super.commit();
closeIfDone();
}
//TODO The following is a workaround to not having
// a close() method (or equivalent) on the Committers yet.
// So we check that the caller is not itself, which means it should
// be the parent framework, which should, in theory, call this only
// once. This is safe to do as the worst case scenario is that a new
// client is re-created.
// Remove this method once proper init/close is added to Committers
private void closeIfDone() {
StackTraceElement[] els = Thread.currentThread().getStackTrace();
for (StackTraceElement el : els) {
if (AbstractCommitter.class.getName().equals(el.getClassName())
&& "commitIfReady".equals(el.getMethodName())) {
return;
}
}
close();
}
public synchronized void close() {
if (datasource != null) {
try {
datasource.close();
} catch (SQLException e) {
throw new CommitterException("Could not close datasource.", e);
}
}
}
@Override
protected void commitBatch(List<ICommitOperation> batch) {
LOG.info("Sending " + batch.size()
+ " commit operations to SQL database.");
try {
QueryRunner q = new QueryRunner(nullSafeDataSource());
ensureTable(q);
for (ICommitOperation op : batch) {
if (op instanceof IAddOperation) {
addOperation((IAddOperation) op, q);
} else if (op instanceof IDeleteOperation) {
deleteOperation((IDeleteOperation) op, q);
} else {
close();
throw new CommitterException("Unsupported operation:" + op);
}
}
LOG.info("Done sending commit operations to database.");
} catch (CommitterException e) {
close();
throw e;
} catch (Exception e) {
close();
throw new CommitterException(
"Could not commit batch to database.", e);
}
}
private void addOperation(IAddOperation add, QueryRunner q)
throws SQLException {
String docId = add.getMetadata().getString(getTargetReferenceField());
if (StringUtils.isBlank(docId)) {
docId = add.getReference();
}
List<String> fields = new ArrayList<>();
List<String> values = new ArrayList<>();
for (Entry<String, List<String>> entry : add.getMetadata().entrySet()) {
String field = entry.getKey();
String value = StringUtils.join(
entry.getValue(), getMultiValuesJoiner());
fields.add(fixFieldName(field));
values.add(value);
}
String sql = "INSERT INTO " + tableName + "("
+ StringUtils.join(fields, ",")
+ ") VALUES (" + StringUtils.repeat("?", ", ", values.size())
+ ")";
if (LOG.isTraceEnabled()) {
LOG.trace("SQL: " + sql);
}
sqlInsertDoc(q, sql, docId, fields, values);
}
private void deleteOperation(IDeleteOperation del, QueryRunner q)
throws SQLException {
runDelete(q, del.getReference());
}
private void sqlInsertDoc(QueryRunner q, String sql, String docId,
List<String> fields, List<String> values) throws SQLException {
ensureFields(q, fields);
Object[] args = new Object[values.size()];
int i = 0;
for (String value : values) {
args[i] = fixFieldValue(fields.get(i), value);
i++;
}
// If it already exists, delete it first.
if (recordExists(q, docId)) {
LOG.debug("Record exists. Deleting it first (" + docId + ").");
runDelete(q, docId);
}
q.update(sql, args);
}
private String fixFieldName(String fieldName) {
if (!fixFieldNames) {
return fieldName;
}
String newName = fieldName.replaceAll("\\W+", "_");
newName = newName.replaceFirst("^[\\d_]+", "");
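// For illustration: "dc:title" becomes "dc_title", and "123_count" becomes "count".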
if (LOG.isDebugEnabled() && !newName.equals(fieldName)) {
LOG.debug("Field name modified: " + fieldName + " -> " + newName);
}
return newName;
}
private String fixFieldValue(String fieldName, String value) {
if (!fixFieldValues) {
return value;
}
Integer size = existingFields.get(
StringUtils.lowerCase(fieldName, Locale.ENGLISH));
if (size == null) {
return value;
}
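// For illustration: with the Javadoc's VARCHAR(5000) example, a value longer
// than 5000 characters is cut down to its first 5000 characters.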
String newValue = StringUtils.truncate(value, size);
if (LOG.isDebugEnabled() && !newValue.equals(value)) {
LOG.debug("Value truncated: " + value + " -> " + newValue);
}
return newValue;
}
//--- Verifying/creating tables/fields -------------------------------------
private boolean tableExists(QueryRunner q) {
try {
// For table existence, we rely on whether the query fails rather
// than on its return value.
runExists(q, null);
return true;
} catch (SQLException e) {
return false;
}
}
private boolean recordExists(QueryRunner q, String docId)
throws SQLException {
return runExists(q, getTargetReferenceField() + " = ?", docId);
}
private boolean runExists(QueryRunner q, String where, Object... values)
throws SQLException {
String sql = "SELECT 1 FROM " + tableName;
if (StringUtils.isNotBlank(where)) {
sql += " WHERE " + where;
}
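// For illustration: a record check runs "SELECT 1 FROM test_table WHERE id = ?",
// while a table-existence check omits the WHERE clause entirely.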
LOG.debug(sql);
Number val = (Number) q.query(sql, new ScalarHandler<>(), values);
return val != null && val.longValue() == 1;
}
private void runDelete(QueryRunner q, String docId) throws SQLException {
String deleteSQL = "DELETE FROM " + tableName
+ " WHERE " + getTargetReferenceField() + " = ?";
LOG.trace(deleteSQL);
q.update(deleteSQL, docId);
}
private synchronized void ensureTable(QueryRunner q) throws SQLException {
// if table was verified or no CREATE statement specified,
// return right away.
if (tableVerified || StringUtils.isBlank(createTableSQL)) {
return;
}
LOG.info("Checking if table \"" + tableName + "\" exists...");
if (!tableExists(q)) {
LOG.info("Table \"" + tableName + "\" does not exist. "
+ "Attempting to create it...");
String sql = interpolate(getCreateTableSQL(), null);
LOG.debug(sql);
q.update(sql);
LOG.info("Table created.");
} else {
LOG.info("Table \"" + tableName + "\" exists.");
}
loadFieldsMetadata(q);
tableVerified = true;
}
private synchronized void ensureFields(QueryRunner q, List<String> fields)
throws SQLException {
// If there is no SQL to create fields, we assume they all exist.
if (StringUtils.isBlank(getCreateFieldSQL())) {
return;
}
Set<String> currentFields = existingFields.keySet();
boolean hasNew = false;
for (String field : fields) {
if (!currentFields.contains(
StringUtils.lowerCase(field, Locale.ENGLISH))) {
// Create field
createField(q, field);
hasNew = true;
}
}
// Update fields metadata
if (hasNew) {
loadFieldsMetadata(q);
}
}
private void createField(QueryRunner q, String field) throws SQLException {
try {
String sql = interpolate(getCreateFieldSQL(), field);
LOG.trace(sql);
q.update(sql);
LOG.info("New field \"" + field + "\" created.");
} catch (SQLException e) {
LOG.info("New field \"" + field + "\" could not be created.");
throw e;
}
}
private void loadFieldsMetadata(QueryRunner q) throws SQLException {
// Add existing field info
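// For illustration: a column declared as "title VARCHAR(256)" results in
// the entry "title" -> 256 in existingFields.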
q.query("SELECT * FROM " + tableName, new ResultSetHandler(){
@Override
public Void handle(ResultSet rs) throws SQLException {
ResultSetMetaData metadata = rs.getMetaData();
for (int i = 1; i <= metadata.getColumnCount(); i++) {
existingFields.put(StringUtils.lowerCase(
metadata.getColumnLabel(i), Locale.ENGLISH),
metadata.getColumnDisplaySize(i));
}
return null;
}
});
}
private String interpolate(String text, String fieldName) {
Map<String, String> vars = new HashMap<>();
vars.put("tableName", getTableName());
vars.put("targetReferenceField", getTargetReferenceField());
vars.put("targetContentField", getTargetContentField());
if (StringUtils.isNotBlank(fieldName)) {
vars.put("fieldName", fieldName);
}
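// For illustration, "CREATE TABLE ${tableName} (...)" becomes
// "CREATE TABLE test_table (...)" when the configured table name is "test_table".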
return StrSubstitutor.replace(text, vars);
}
private synchronized BasicDataSource nullSafeDataSource() {
if (datasource == null) {
if (StringUtils.isBlank(getDriverClass())) {
throw new CommitterException("No driver class specified.");
}
if (StringUtils.isBlank(getConnectionUrl())) {
throw new CommitterException("No connection URL specified.");
}
if (StringUtils.isBlank(getTableName())) {
throw new CommitterException("No table name specified.");
}
BasicDataSource ds = new BasicDataSource();
// if path is blank, we assume it is already in classpath
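// e.g., a driverPath such as "/path/to/driver/h2.jar" (as in the class
// Javadoc example) is loaded through its own URL class loader.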
if (StringUtils.isNotBlank(driverPath)) {
try {
ds.setDriverClassLoader(new URLClassLoader(
new URL[] { new File(driverPath).toURI().toURL() },
getClass().getClassLoader()));
} catch (MalformedURLException e) {
throw new CommitterException(
"Invalid driver path: " + driverPath, e);
}
}
ds.setDriverClassName(driverClass);
ds.setUrl(connectionUrl);
ds.setDefaultAutoCommit(true);
ds.setUsername(username);
ds.setPassword(EncryptionUtil.decrypt(
getPassword(), getPasswordKey()));
for (String key : properties.stringPropertyNames()) {
ds.addConnectionProperty(key, properties.getProperty(key));
}
datasource = ds;
}
return datasource;
}
@Override
protected void saveToXML(XMLStreamWriter writer) throws XMLStreamException {
EnhancedXMLStreamWriter w = new EnhancedXMLStreamWriter(writer);
w.writeElementString("driverPath", getDriverPath());
w.writeElementString("driverClass", getDriverClass());
w.writeElementString("connectionUrl", getConnectionUrl());
w.writeElementString("username", getUsername());
w.writeElementString("password", getPassword());
if (!properties.isEmpty()) {
w.writeStartElement("properties");
for (Entry