All downloads are FREE. The search and download functionality uses the official Maven repository.

org.datacleaner.components.remote.RemoteTransformer Maven / Gradle / Ivy

/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.components.remote;

import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.metamodel.schema.ColumnTypeImpl;
import org.apache.metamodel.util.EqualsBuilder;
import org.datacleaner.Version;
import org.datacleaner.api.Close;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.OutputColumns;
import org.datacleaner.api.Validate;
import org.datacleaner.configuration.RemoteServerData;
import org.datacleaner.job.concurrent.PreviousErrorsExistException;
import org.datacleaner.restclient.ComponentConfiguration;
import org.datacleaner.restclient.ComponentRESTClient;
import org.datacleaner.restclient.ComponentsRestClientUtils;
import org.datacleaner.restclient.CreateInput;
import org.datacleaner.restclient.ProcessStatelessInput;
import org.datacleaner.restclient.ProcessStatelessOutput;
import org.datacleaner.restclient.RESTClientException;
import org.datacleaner.restclient.Serializator;
import org.datacleaner.util.batch.BatchRowCollectingTransformer;
import org.datacleaner.util.batch.BatchSink;
import org.datacleaner.util.batch.BatchSource;
import org.datacleaner.util.convert.StringConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.module.jsonSchema.JsonSchema;
import com.fasterxml.jackson.module.jsonSchema.types.ArraySchema;
import com.fasterxml.jackson.module.jsonSchema.types.ValueTypeSchema;

/**
 * Transformer that is actually a proxy to a remote transformer sitting at DataCleaner Monitor server.
 * Instances of this transformer can be created only by
 * {@link org.datacleaner.descriptors.RemoteTransformerDescriptorImpl} component descriptors.
 *
 * @Since 9/1/15
 */
public class RemoteTransformer extends BatchRowCollectingTransformer {

    private static final Logger logger = LoggerFactory.getLogger(RemoteTransformer.class);
    private static final ObjectMapper mapper = Serializator.getJacksonObjectMapper();

    private final RemoteServerData serverData;
    private final AtomicBoolean failed = new AtomicBoolean(false);
    private String componentDisplayName;
    private ComponentRESTClient client;
    private final SingleValueErrorAwareCache cachedOutputColumns =
            new SingleValueErrorAwareCache() {
                @Override
                protected OutputColumns fetch(final CreateInput input) throws Exception {
                    return getOutputColumnsInternal(input);
                }
            };
    private Map configuredProperties = new TreeMap<>();

    public RemoteTransformer(final RemoteServerData serverData, final String componentDisplayName) {
        this.serverData = serverData;
        this.componentDisplayName = componentDisplayName;
    }

    @Initialize
    public void initClient() throws RemoteComponentException {
        try {
            logger.debug("Initializing '{}' @{}", componentDisplayName, this.hashCode());
            client = new ComponentRESTClient(serverData.getUrl(), serverData.getUsername(), serverData.getPassword(),
                    Version.getVersion());
        } catch (final Exception e) {
            throw new RemoteComponentException(
                    "Remote component '" + componentDisplayName + "' is temporarily unavailable. \n" + e.getMessage());
        }
    }

    @Close
    public void closeClient() {
        logger.debug("closing '{}' @{}", componentDisplayName, this.hashCode());
        client = null;
    }

    @Validate
    public void validate() throws Exception {
        final CreateInput createInput = new CreateInput();
        createInput.configuration = getConfiguration(getUsedInputColumns());
        try {
            cachedOutputColumns.getCachedValue(createInput);
        } catch (final RESTClientException e) {
            if (e.getCode() == 422) {
                // Validation failed - simplify the error message
                throw new RuntimeException(e.getReason());
            }
        }
    }

    @Override
    public OutputColumns getOutputColumns() {
        final CreateInput createInput = new CreateInput();
        createInput.configuration = getConfiguration(getUsedInputColumns());
        try {
            return cachedOutputColumns.getCachedValue(createInput);
        } catch (final Exception e) {
            logger.debug("Error retrieving columns of transformer '" + componentDisplayName + "': " + e.toString());
            return OutputColumns.NO_OUTPUT_COLUMNS;
        }
    }

    private boolean isOutputColumnEnumeration(final JsonSchema schema) {
        if (schema == null) {
            return false;
        }
        final boolean isArray = schema.isArraySchema();
        final JsonSchema baseSchema;
        if (isArray) {
            baseSchema = ((ArraySchema) schema).getItems().asSingleItems().getSchema();
        } else {
            baseSchema = schema;
        }

        if (baseSchema instanceof ValueTypeSchema) {
            final Set enums = ((ValueTypeSchema) baseSchema).getEnums();
            if (enums != null && !enums.isEmpty()) {
                return true;
            }
        }
        return false;
    }

    private ComponentConfiguration getConfiguration(final List> inputColumns) {
        final ComponentConfiguration configuration = new ComponentConfiguration();
        for (final Map.Entry propertyE : configuredProperties.entrySet()) {
            configuration.getProperties().put(propertyE.getKey(), mapper.valueToTree(propertyE.getValue()));
        }

        for (final InputColumn col : inputColumns) {
            configuration.getColumns().add(ComponentsRestClientUtils
                    .createInputColumnSpecification(col.getName(), col.getDataType(),
                            ColumnTypeImpl.convertColumnType(col.getDataType()).getName(), mapper.getNodeFactory()));
        }
        return configuration;
    }

    private List> getUsedInputColumns() {
        final ArrayList> columns = new ArrayList<>();
        for (final Object propValue : configuredProperties.values()) {
            if (propValue instanceof InputColumn) {
                columns.add((InputColumn) propValue);
            } else if (propValue instanceof InputColumn[]) {
                for (final InputColumn col : ((InputColumn[]) propValue)) {
                    columns.add(col);
                }
            } else if (propValue instanceof Collection) {
                for (final Object value : ((Collection) propValue)) {
                    if (value instanceof InputColumn) {
                        columns.add((InputColumn) value);
                    } else {
                        // don't iterate the rest if the first item is not an input column.
                        break;
                    }
                }
            }
            // TODO: are maps possible?
        }
        return columns;
    }

    private void convertOutputRows(final JsonNode rowSets, final BatchSink> sink,
            final int sinkSize) {
        final OutputColumns outCols = getOutputColumns();
        if (rowSets == null || rowSets.size() < 1) {
            throw new RuntimeException("Expected exactly 1 row in response");
        }

        int rowI = 0;
        for (final JsonNode rowSet : rowSets) {
            if (rowI >= sinkSize) {
                throw new RuntimeException("Expected " + sinkSize + " rows, but got more");
            }

            final List outRowSet = new ArrayList<>();

            for (final JsonNode row : rowSet) {
                final List values = new ArrayList<>();
                int i = 0;
                for (final JsonNode value : row) {
                    // TODO: should JsonNode be the default?
                    Class cl = String.class;
                    if (i < outCols.getColumnCount()) {
                        cl = outCols.getColumnType(i);
                    }
                    values.add(convertOutputValue(value, cl));
                    i++;
                }
                outRowSet.add(values.toArray(new Object[values.size()]));
            }
            sink.setOutput(rowI, outRowSet);
            rowI++;
        }
        if (rowI < sinkSize) {
            throw new RuntimeException("Expected " + sinkSize + " rows, but got only " + rowI);
        }
    }

    private Object convertOutputValue(final JsonNode value, final Class cl) {
        try {
            if (cl == JsonNode.class) {
                return value;
            }
            if (cl == File.class) {
                return StringConverter.simpleInstance().deserialize(value.asText(), cl);
            }
            return mapper.readValue(value.traverse(), cl);
        } catch (final Exception e) {
            throw new RuntimeException("Cannot convert table value of type '" + cl + "': " + value.toString(), e);
        }
    }

    public void setPropertyValue(final String propertyName, final Object value) {
        if (EqualsBuilder.equals(value, configuredProperties.get(propertyName))) {
            return;
        }
        logger.debug("Setting '{}'.'{}' = {}", componentDisplayName, propertyName, value);

        if (value == null) {
            configuredProperties.remove(propertyName);
        } else {
            configuredProperties.put(propertyName, value);
        }
    }

    public Object getPropertyValue(final String propertyName) {
        return configuredProperties.get(propertyName);
    }

    @Override
    public void map(final BatchSource source, final BatchSink> sink) {
        final List> cols = getUsedInputColumns();
        final int size = source.size();
        final Object[] rows = new Object[size];
        for (int i = 0; i < size; i++) {
            final InputRow inputRow = source.getInput(i);
            final Object[] values = new Object[cols.size()];
            for (int j = 0; j < cols.size(); j++) {
                values[j] = inputRow.getValue(cols.get(j));
            }
            rows[i] = values;
        }

        final ProcessStatelessInput input = new ProcessStatelessInput();
        input.configuration = getConfiguration(cols);
        input.data = mapper.valueToTree(rows);

        logger.debug("Processing remotely {} rows", size);

        if (client == null) {
            if (failed.get()) {
                throw new PreviousErrorsExistException();
            }
            throw new RuntimeException("Remote transformer's connection has already been closed. ");
        }
        final ProcessStatelessOutput out;
        try {
            out = client.processStateless(componentDisplayName, input);
        } catch (final RuntimeException e) {
            final boolean alreadyFailed = failed.getAndSet(true);
            if (!alreadyFailed) {
                throw new RuntimeException("Remote transformer failed: " + e.getMessage(), e);
            } else {
                throw new PreviousErrorsExistException();
            }
        }
        convertOutputRows(out.rows, sink, size);
    }

    private OutputColumns getOutputColumnsInternal(final CreateInput createInput) throws Exception {
        logger.debug("Getting output columns from server");
        boolean wasInit = false;
        if (client == null) {
            wasInit = true;
            initClient();
        }
        try {
            final org.datacleaner.restclient.OutputColumns columnsSpec =
                    client.getOutputColumns(componentDisplayName, createInput);

            final OutputColumns outCols = new OutputColumns(columnsSpec.getColumns().size(), Object.class);
            int i = 0;
            for (final org.datacleaner.restclient.OutputColumns.OutputColumn colSpec : columnsSpec.getColumns()) {
                outCols.setColumnName(i, colSpec.name);
                try {
                    outCols.setColumnType(i, Class.forName(colSpec.type));
                } catch (final ClassNotFoundException e) {
                    final Class type;
                    if (isOutputColumnEnumeration(colSpec.schema)) {
                        type = String.class;
                    } else {
                        // For unknown types we specify "Object" as a class
                        // This causes that Jackson will deserialize it not as a JsonNode,
                        // but simple Java Maps, Lists, Strings etc.
                        // We NEED it, because we need the values to be Serializable.
                        // This is because some analyzer results contain the values
                        // and analyzer results must be serializable (e.g. to save it in DC monitor, or
                        // to send it over wire when doing ditributed computing etc.)
                        type = Object.class;
                    }
                    outCols.setColumnType(i, type);
                }
                i++;
            }
            return outCols;
        } finally {
            if (wasInit) {
                closeClient();
            }
        }
    }
}