org.datacleaner.beans.writers.UpdateTableAnalyzer

/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.beans.writers;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import javax.inject.Inject;
import javax.inject.Named;

import org.apache.commons.lang.ArrayUtils;
import org.apache.metamodel.BatchUpdateScript;
import org.apache.metamodel.UpdateableDataContext;
import org.apache.metamodel.create.TableCreationBuilder;
import org.apache.metamodel.csv.CsvDataContext;
import org.apache.metamodel.insert.RowInsertionBuilder;
import org.apache.metamodel.query.FilterItem;
import org.apache.metamodel.query.OperatorType;
import org.apache.metamodel.query.SelectItem;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.Schema;
import org.apache.metamodel.schema.Table;
import org.apache.metamodel.update.RowUpdationBuilder;
import org.apache.metamodel.util.Action;
import org.apache.metamodel.util.FileHelper;
import org.apache.metamodel.util.Resource;
import org.datacleaner.api.Analyzer;
import org.datacleaner.api.Categorized;
import org.datacleaner.api.ColumnProperty;
import org.datacleaner.api.ComponentContext;
import org.datacleaner.api.Concurrent;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.ExecutionLogMessage;
import org.datacleaner.api.FileProperty;
import org.datacleaner.api.FileProperty.FileAccessMode;
import org.datacleaner.api.HasLabelAdvice;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.MappedProperty;
import org.datacleaner.api.Provided;
import org.datacleaner.api.SchemaProperty;
import org.datacleaner.api.TableProperty;
import org.datacleaner.api.Validate;
import org.datacleaner.components.categories.WriteSuperCategory;
import org.datacleaner.connection.CsvDatastore;
import org.datacleaner.connection.FileDatastore;
import org.datacleaner.connection.SchemaNavigator;
import org.datacleaner.connection.UpdateableDatastore;
import org.datacleaner.connection.UpdateableDatastoreConnection;
import org.datacleaner.data.MetaModelInputColumn;
import org.datacleaner.descriptors.FilterDescriptor;
import org.datacleaner.descriptors.TransformerDescriptor;
import org.datacleaner.desktop.api.PrecedingComponentConsumer;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.util.WriteBuffer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Named("Update table")
@Description("Update records in a table in a registered datastore. This component allows you to map the values "
        + "available in the flow with the columns of the target table, in order to update the values of these columns "
        + "in the datastore.\nTo understand the configuration of the Update table component, consider a typical SQL "
        + "update statement:\n
UPDATE table SET name = 'John Doe' WHERE id = 42
\nHere we see " + "that there is a condition (WHERE id=42) and a value to update (name should become 'John Doe'). This is what " + "the two inputs are referring to. But obviously you are not dealing with constant values like 'John Doe' or " + "'42'. You have a field in your DC job that you want to map to fields in your database.\nUsually the " + "'condition value' would be a mapping of the key that you have in your job towards the key that is in the " + "database. The 'values to update' property would include the columns that you wish to update based on the " + "values you have in your job.") @Categorized(superCategory = WriteSuperCategory.class) @Concurrent(true) public class UpdateTableAnalyzer implements Analyzer, Action>, HasLabelAdvice, PrecedingComponentConsumer { private static final String PROPERTY_NAME_VALUES = "Values"; private static final String PROPERTY_NAME_CONDITION_VALUES = "Condition values"; private static final File TEMP_DIR = FileHelper.getTempDir(); private static final String ERROR_MESSAGE_COLUMN_NAME = "update_table_error_message"; private static final Logger logger = LoggerFactory.getLogger(UpdateTableAnalyzer.class); @Inject @Configured(value = PROPERTY_NAME_VALUES, order = 1) @Description("Values to update in the table") InputColumn[] values; @Inject @Configured(order = 2) @Description("Names of columns in the target table, on which the values will be updated.") @ColumnProperty @MappedProperty(PROPERTY_NAME_VALUES) String[] columnNames; @Inject @Configured(value = PROPERTY_NAME_CONDITION_VALUES, order = 3) @Description("Values that make up the condition of the table update") InputColumn[] conditionValues; @Inject @Configured(order = 4) @Description("Names of columns in the target table, which form the conditions of the update.") @ColumnProperty @MappedProperty(PROPERTY_NAME_CONDITION_VALUES) String[] conditionColumnNames; @Inject @Configured(order = 5) @Description("Datastore to write to") UpdateableDatastore datastore; @Inject @Configured(order = 6, required = false) @Description("Schema name of target table") @SchemaProperty String schemaName; @Inject @Configured(order = 7, required = false) @Description("Table to target (update)") @TableProperty String tableName; @Inject @Configured(order = 8, value = "Buffer size") @Description( "How much data to buffer before committing batches of data. 
    WriteBufferSizeOption bufferSizeOption = WriteBufferSizeOption.MEDIUM;

    @Inject
    @Configured(value = "How to handle updation errors?", order = 9)
    ErrorHandlingOption errorHandlingOption = ErrorHandlingOption.STOP_JOB;

    @Inject
    @Configured(value = "Error log file location", required = false, order = 10)
    @Description("Directory or file path for saving erroneous records")
    @FileProperty(accessMode = FileAccessMode.SAVE, extension = ".csv")
    File errorLogFile = TEMP_DIR;

    @Inject
    @Configured(required = false, order = 11)
    @Description("Additional values to write to error log")
    InputColumn<?>[] additionalErrorLogValues;

    @Inject
    @Provided
    ComponentContext _componentContext;

    private Column[] _targetColumns;
    private Column[] _targetConditionColumns;
    private WriteBuffer _writeBuffer;
    private AtomicInteger _updatedRowCount;
    private AtomicInteger _errorRowCount;
    private CsvDataContext _errorDataContext;

    @Validate
    public void validate() {
        if (values.length != columnNames.length) {
            throw new IllegalStateException("Values and column names should have equal length");
        }
        if (conditionValues.length != conditionColumnNames.length) {
            throw new IllegalStateException("Condition values and condition column names should have equal length");
        }
    }

    @Initialize
    public void init() throws IllegalArgumentException {
        if (logger.isDebugEnabled()) {
            logger.debug("At init() time, InputColumns are: {}", Arrays.toString(values));
        }

        _errorRowCount = new AtomicInteger();
        _updatedRowCount = new AtomicInteger();
        if (errorHandlingOption == ErrorHandlingOption.SAVE_TO_FILE) {
            _errorDataContext = createErrorDataContext();
        }

        final int bufferSize = bufferSizeOption.calculateBufferSize(values.length);
        logger.info("Row buffer size set to {}", bufferSize);

        _writeBuffer = new WriteBuffer(bufferSize, this);

        try (UpdateableDatastoreConnection con = datastore.openConnection()) {
            final SchemaNavigator schemaNavigator = con.getSchemaNavigator();

            final List<String> columnsNotFound = new ArrayList<>();

            _targetColumns = schemaNavigator.convertToColumns(schemaName, tableName, columnNames);
            for (int i = 0; i < _targetColumns.length; i++) {
                if (_targetColumns[i] == null) {
                    columnsNotFound.add(columnNames[i]);
                }
            }

            _targetConditionColumns = schemaNavigator.convertToColumns(schemaName, tableName, conditionColumnNames);
            for (int i = 0; i < _targetConditionColumns.length; i++) {
                if (_targetConditionColumns[i] == null) {
                    columnsNotFound.add(conditionColumnNames[i]);
                }
            }

            if (!columnsNotFound.isEmpty()) {
                throw new IllegalArgumentException("Could not find column(s): " + columnsNotFound);
            }
        }
    }

    @Override
    public String getSuggestedLabel() {
        if (datastore == null || tableName == null) {
            return null;
        }
        return datastore.getName() + " - " + tableName;
    }

    private void validateCsvHeaders(final CsvDataContext dc) {
        final Schema schema = dc.getDefaultSchema();
        if (schema.getTableCount() == 0) {
            // nothing to worry about, we will create the table ourselves
            return;
        }
        final Table table = schema.getTables()[0];

        // verify that table names correspond to what we need!
        for (final String columnName : columnNames) {
            final Column column = table.getColumnByName(columnName);
            if (column == null) {
                throw new IllegalStateException("Error log file does not have required column header: " + columnName);
            }
        }
        for (final String columnName : conditionColumnNames) {
            final Column column = table.getColumnByName(columnName);
            if (column == null) {
                throw new IllegalStateException("Error log file does not have required column header: " + columnName);
            }
        }
        if (additionalErrorLogValues != null) {
            for (final InputColumn<?> inputColumn : additionalErrorLogValues) {
                final String columnName = translateAdditionalErrorLogColumnName(inputColumn.getName());
                final Column column = table.getColumnByName(columnName);
                if (column == null) {
                    throw new IllegalStateException(
                            "Error log file does not have required column header: " + columnName);
                }
            }
        }
        final Column column = table.getColumnByName(ERROR_MESSAGE_COLUMN_NAME);
        if (column == null) {
            throw new IllegalStateException(
                    "Error log file does not have required column: " + ERROR_MESSAGE_COLUMN_NAME);
        }
    }

    private String translateAdditionalErrorLogColumnName(final String columnName) {
        if (ArrayUtils.contains(columnNames, columnName)) {
            return translateAdditionalErrorLogColumnName(columnName + "_add");
        }
        return columnName;
    }

    private CsvDataContext createErrorDataContext() {
        final File file;

        if (errorLogFile == null || TEMP_DIR.equals(errorLogFile)) {
            try {
                file = File.createTempFile("updation_error", ".csv");
            } catch (final IOException e) {
                throw new IllegalStateException("Could not create new temp file", e);
            }
        } else if (errorLogFile.isDirectory()) {
            file = new File(errorLogFile, "updation_error_log.csv");
        } else {
            file = errorLogFile;
        }

        final CsvDataContext dc = new CsvDataContext(file);

        final Schema schema = dc.getDefaultSchema();

        if (file.exists() && file.length() > 0) {
            validateCsvHeaders(dc);
        } else {
            // create table if no table exists.
            dc.executeUpdate(cb -> {
                TableCreationBuilder tableBuilder = cb.createTable(schema, "error_table");
                for (final String columnName : columnNames) {
                    tableBuilder = tableBuilder.withColumn(columnName);
                }
                for (final String columnName : conditionColumnNames) {
                    tableBuilder = tableBuilder.withColumn(columnName);
                }

                if (additionalErrorLogValues != null) {
                    for (final InputColumn<?> inputColumn : additionalErrorLogValues) {
                        final String columnName = translateAdditionalErrorLogColumnName(inputColumn.getName());
                        tableBuilder = tableBuilder.withColumn(columnName);
                    }
                }

                tableBuilder = tableBuilder.withColumn(ERROR_MESSAGE_COLUMN_NAME);

                tableBuilder.execute();
            });
        }

        return dc;
    }

    @Override
    public void run(final InputRow row, final int distinctCount) {
        if (logger.isDebugEnabled()) {
            logger.debug("At run() time, InputColumns are: {}", Arrays.toString(values));
        }

        final Object[] rowData;
        if (additionalErrorLogValues == null) {
            rowData = new Object[values.length + conditionColumnNames.length];
        } else {
            rowData = new Object[values.length + conditionColumnNames.length + additionalErrorLogValues.length];
        }

        for (int i = 0; i < values.length; i++) {
            rowData[i] = row.getValue(values[i]);
        }
        for (int i = 0; i < conditionValues.length; i++) {
            rowData[i + values.length] = row.getValue(conditionValues[i]);
        }

        if (additionalErrorLogValues != null) {
            for (int i = 0; i < additionalErrorLogValues.length; i++) {
                final Object value = row.getValue(additionalErrorLogValues[i]);
                rowData[values.length + conditionColumnNames.length + i] = value;
            }
        }

        try {
            // perform conversion in a separate loop, since it might crash and the
            // error data will be more complete if the first loop finished.
            for (int i = 0; i < values.length; i++) {
                rowData[i] = TypeConverter.convertType(rowData[i], _targetColumns[i]);
                if (logger.isDebugEnabled()) {
                    logger.debug("Value for {} set to: {}", columnNames[i], rowData[i]);
                }
            }
            for (int i = 0; i < conditionValues.length; i++) {
                final int index = i + values.length;
                rowData[index] = TypeConverter.convertType(rowData[index], _targetConditionColumns[i]);
                if (logger.isDebugEnabled()) {
                    logger.debug("Value for {} set to: {}", conditionColumnNames[i], rowData[index]);
                }
            }
        } catch (final RuntimeException e) {
            for (int i = 0; i < distinctCount; i++) {
                errorOccurred(rowData, e);
            }
            return;
        }

        if (logger.isDebugEnabled()) {
            logger.debug("Adding row data to buffer: {}", Arrays.toString(rowData));
        }

        for (int i = 0; i < distinctCount; i++) {
            _writeBuffer.addToBuffer(rowData);
        }
    }

    @Override
    public WriteDataResult getResult() {
        _writeBuffer.flushBuffer();

        final int updatedRowCount = _updatedRowCount.get();

        final FileDatastore errorDatastore;
        if (_errorDataContext != null) {
            final Resource resource = _errorDataContext.getResource();
            errorDatastore = new CsvDatastore(resource.getName(), resource);
        } else {
            errorDatastore = null;
        }

        return new WriteDataResultImpl(0, updatedRowCount, datastore, schemaName, tableName, _errorRowCount.get(),
                errorDatastore);
    }

    /**
     * Method invoked when flushing the buffer
     */
    @Override
    public void run(final Iterable<Object[]> buffer) throws Exception {
        try (UpdateableDatastoreConnection con = datastore.openConnection()) {
            final Column[] updateColumns =
                    con.getSchemaNavigator().convertToColumns(schemaName, tableName, columnNames);
            final Column[] whereColumns =
                    con.getSchemaNavigator().convertToColumns(schemaName, tableName, conditionColumnNames);

            if (logger.isDebugEnabled()) {
                logger.debug("Updating columns: {}", Arrays.toString(updateColumns));
            }

            final UpdateableDataContext dc = con.getUpdateableDataContext();
            dc.executeUpdate((BatchUpdateScript) callback -> {
                int updateCount = 0;
                for (final Object[] rowData : buffer) {
                    RowUpdationBuilder updationBuilder = callback.update(updateColumns[0].getTable());
                    for (int i = 0; i < updateColumns.length; i++) {
                        final Object value = rowData[i];
                        updationBuilder = updationBuilder.value(updateColumns[i], value);
                    }

                    for (int i = 0; i < whereColumns.length; i++) {
                        final Object value = rowData[i + updateColumns.length];
                        final Column whereColumn = whereColumns[i];
                        final FilterItem filterItem =
                                new FilterItem(new SelectItem(whereColumn), OperatorType.EQUALS_TO, value);

                        updationBuilder = updationBuilder.where(filterItem);
                    }

                    if (logger.isDebugEnabled()) {
                        logger.debug("Updating: {}", Arrays.toString(rowData));
                    }

                    try {
                        updationBuilder.execute();
                        updateCount++;
                        _updatedRowCount.incrementAndGet();
                    } catch (final RuntimeException e) {
                        errorOccurred(rowData, e);
                    }
                }
                if (updateCount > 0) {
                    _componentContext.publishMessage(new ExecutionLogMessage(updateCount + " updates executed"));
                }
            });
        }
    }

    protected void errorOccurred(final Object[] rowData, final RuntimeException e) {
        _errorRowCount.incrementAndGet();
        if (errorHandlingOption == ErrorHandlingOption.STOP_JOB) {
            throw e;
        } else {
            logger.warn("Error occurred while updating record. Writing to error stream", e);
            _errorDataContext.executeUpdate(cb -> {
                RowInsertionBuilder insertBuilder =
                        cb.insertInto(_errorDataContext.getDefaultSchema().getTables()[0]);
                for (int i = 0; i < columnNames.length; i++) {
                    insertBuilder = insertBuilder.value(columnNames[i], rowData[i]);
                }

                // condition values are stored in rowData right after the update values
                for (int i = 0; i < conditionColumnNames.length; i++) {
                    insertBuilder = insertBuilder.value(conditionColumnNames[i], rowData[columnNames.length + i]);
                }

                if (additionalErrorLogValues != null) {
                    for (int i = 0; i < additionalErrorLogValues.length; i++) {
                        final String columnName =
                                translateAdditionalErrorLogColumnName(additionalErrorLogValues[i].getName());
                        // additional error log values follow the update and condition values in rowData
                        final Object value = rowData[columnNames.length + conditionColumnNames.length + i];
                        insertBuilder = insertBuilder.value(columnName, value);
                    }
                }

                insertBuilder = insertBuilder.value(ERROR_MESSAGE_COLUMN_NAME, e.getMessage());
                insertBuilder.execute();
            });
        }
    }

    @Override
    public void configureForTransformedData(final AnalysisJobBuilder analysisJobBuilder,
            final TransformerDescriptor<?> descriptor) {
        final List<Table> tables = analysisJobBuilder.getSourceTables();
        if (tables.size() == 1) {
            final List<MetaModelInputColumn> sourceColumns = analysisJobBuilder.getSourceColumnsOfTable(tables.get(0));
            final List<InputColumn<?>> primaryKeys = new ArrayList<>();
            for (final MetaModelInputColumn inputColumn : sourceColumns) {
                if (inputColumn.getPhysicalColumn().isPrimaryKey()) {
                    primaryKeys.add(inputColumn);
                }
            }

            if (!primaryKeys.isEmpty()) {
                conditionValues = primaryKeys.toArray(new InputColumn[primaryKeys.size()]);
            }
        }
    }

    @Override
    public void configureForFilterOutcome(final AnalysisJobBuilder analysisJobBuilder,
            final FilterDescriptor<?, ?> descriptor, final String categoryName) {
    }
}
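
The component's @Description compares its configuration to a plain SQL UPDATE statement; internally the analyzer builds each update with MetaModel's RowUpdationBuilder and an explicit FilterItem per condition column. As a minimal sketch of that analogy, the standalone example below issues the same kind of update directly against a MetaModel UpdateableDataContext, using the fluent where-clause convenience API instead of FilterItem. The data context, the "PERSONS" table and the "name"/"id" columns are hypothetical placeholders, not part of this class.

import org.apache.metamodel.UpdateableDataContext;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.Table;

public class UpdateTableSketch {

    // Illustrative only: roughly the equivalent of
    // "UPDATE table SET name = 'John Doe' WHERE id = 42", outside of a DataCleaner job.
    public static void updateExample(final UpdateableDataContext dataContext) {
        final Table table = dataContext.getDefaultSchema().getTableByName("PERSONS");
        final Column nameColumn = table.getColumnByName("name"); // corresponds to "values to update"
        final Column idColumn = table.getColumnByName("id");     // corresponds to "condition values"

        dataContext.executeUpdate(callback -> callback.update(table)
                .value(nameColumn, "John Doe")
                .where(idColumn).eq(42)
                .execute());
    }
}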