/**
* DataCleaner (community edition)
* Copyright (C) 2014 Free Software Foundation, Inc.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.cli;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Supplier;
import org.apache.commons.vfs2.FileObject;
import org.apache.commons.vfs2.FileSystemException;
import org.apache.metamodel.DataContext;
import org.apache.metamodel.schema.Schema;
import org.apache.metamodel.schema.Table;
import org.apache.metamodel.util.FileHelper;
import org.apache.metamodel.util.ImmutableRef;
import org.apache.metamodel.util.LazyRef;
import org.apache.metamodel.util.Resource;
import org.datacleaner.configuration.ConfigurationReaderInterceptor;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.DataCleanerConfigurationImpl;
import org.datacleaner.configuration.JaxbConfigurationReader;
import org.datacleaner.connection.Datastore;
import org.datacleaner.connection.DatastoreConnection;
import org.datacleaner.descriptors.AnalyzerDescriptor;
import org.datacleaner.descriptors.ComponentDescriptor;
import org.datacleaner.descriptors.ConfiguredPropertyDescriptor;
import org.datacleaner.descriptors.FilterDescriptor;
import org.datacleaner.descriptors.TransformerDescriptor;
import org.datacleaner.job.JaxbJobReader;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.job.runner.AnalysisResultFuture;
import org.datacleaner.job.runner.AnalysisRunner;
import org.datacleaner.job.runner.AnalysisRunnerImpl;
import org.datacleaner.result.AnalysisResultWriter;
import org.datacleaner.spark.SparkRunner;
import org.datacleaner.user.DesktopConfigurationReaderInterceptor;
import org.datacleaner.util.VFSUtils;
import org.datacleaner.util.convert.ResourceConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Contains the execution logic to run a job from the command line.
*/
public final class CliRunner implements Closeable {
private static final Logger logger = LoggerFactory.getLogger(CliRunner.class);
private final CliArguments _arguments;
private final Supplier<OutputStream> _outputStreamRef;
private final Supplier<Writer> _writerRef;
private final boolean _closeOut;
/**
* Alternative constructor that specifically specifies the output writer.
* Should only be used for testing, since normally the CliArguments should be
* used to decide which output writer to use.
*
* @param arguments
* @param writer
* @param outputStream
*/
protected CliRunner(final CliArguments arguments, final Writer writer, final OutputStream outputStream) {
_arguments = arguments;
if (outputStream == null) {
final String outputFilePath = arguments.getOutputFile();
if (outputFilePath == null) {
_outputStreamRef = null;
_writerRef = new LazyRef<Writer>() {
@Override
protected Writer fetch() {
return new PrintWriter(System.out);
}
};
} else {
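// In Spark mode no local writer/stream is prepared; the output is produced
// by the SparkRunner invoked from runJob(...).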
if (_arguments.getRunType() == CliRunType.SPARK) {
_writerRef = null;
_outputStreamRef = null;
} else {
final FileObject outputFile;
try {
outputFile = VFSUtils.getFileSystemManager().resolveFile(outputFilePath);
} catch (final FileSystemException e) {
throw new IllegalStateException(e);
}
_writerRef = new LazyRef<Writer>() {
@Override
protected Writer fetch() {
try {
final OutputStream out = outputFile.getContent().getOutputStream();
return new OutputStreamWriter(out, FileHelper.DEFAULT_ENCODING);
} catch (UnsupportedEncodingException | FileSystemException e) {
throw new IllegalStateException(e);
}
}
};
_outputStreamRef = new LazyRef<OutputStream>() {
@Override
protected OutputStream fetch() {
try {
return outputFile.getContent().getOutputStream();
} catch (final FileSystemException e) {
throw new IllegalStateException(e);
}
}
};
}
}
_closeOut = true;
} else {
_writerRef = new ImmutableRef<>(writer);
_outputStreamRef = new ImmutableRef<>(outputStream);
_closeOut = false;
}
}
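/**
* Creates a CliRunner that resolves its output targets from the given
* arguments. A minimal usage sketch (assuming an already populated
* CliArguments instance; note that {@link #run()} declares Throwable, so the
* calling code must handle or propagate it):
*
* <pre>
* try (CliRunner runner = new CliRunner(arguments)) {
*     runner.run();
* }
* </pre>
*/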
public CliRunner(final CliArguments arguments) {
this(arguments, null, null);
}
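/**
* Reads and parses the configuration file referenced by the arguments and
* then delegates to {@link #run(DataCleanerConfiguration)}.
*/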
public void run() throws Throwable {
final String configurationFilePath = _arguments.getConfigurationFile();
final Resource configurationFile = resolveResource(configurationFilePath);
final Resource propertiesResource;
if (_arguments.getPropertiesFile() != null) {
propertiesResource = resolveResource(_arguments.getPropertiesFile());
} else {
propertiesResource = null;
}
final ConfigurationReaderInterceptor configurationReaderInterceptor =
new DesktopConfigurationReaderInterceptor(new File("."), propertiesResource);
final InputStream inputStream = configurationFile.read();
try {
run(new JaxbConfigurationReader(configurationReaderInterceptor).create(inputStream));
} finally {
FileHelper.safeClose(inputStream);
}
}
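// Resolves a path string to a MetaModel Resource using DataCleaner's
// ResourceConverter.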
private Resource resolveResource(final String path) {
return new ResourceConverter(new DataCleanerConfigurationImpl()).fromString(Resource.class, path);
}
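/**
* Runs the CLI action against an already loaded configuration: executes the
* job when --job-file is given, otherwise prints the listing selected by
* --list-type. Exceptions are reported on stderr, and the configuration's
* task runner is shut down in all cases.
*/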
public void run(final DataCleanerConfiguration configuration) throws Throwable {
final String jobFilePath = _arguments.getJobFile();
final CliListType listType = _arguments.getListType();
try {
if (jobFilePath != null) {
runJob(configuration);
} else if (listType != null) {
switch (listType) {
case ANALYZERS:
printAnalyzers(configuration);
break;
case TRANSFORMERS:
printTransformers(configuration);
break;
case FILTERS:
printFilters(configuration);
break;
case DATASTORES:
printDatastores(configuration);
break;
case SCHEMAS:
printSchemas(configuration);
break;
case TABLES:
printTables(configuration);
break;
case COLUMNS:
printColumns(configuration);
break;
default:
throw new IllegalArgumentException("Unknown list type: " + listType);
}
} else {
throw new IllegalArgumentException(
"Neither --job-file nor --list-type is specified. Try running with -usage to see usage help.");
}
} catch (final Exception e) {
logger.error("Exception thrown in {}", e, this);
System.err.println("Error:");
e.printStackTrace(System.err);
} finally {
if (configuration != null) {
configuration.getEnvironment().getTaskRunner().shutdown();
}
}
}
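// Lists the columns of a table. Requires a datastore name and a table name
// in the arguments; falls back to the default schema when none is given.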
private void printColumns(final DataCleanerConfiguration configuration) {
final String datastoreName = _arguments.getDatastoreName();
final String tableName = _arguments.getTableName();
final String schemaName = _arguments.getSchemaName();
if (datastoreName == null) {
System.err.println("You need to specify the datastore name!");
} else if (tableName == null) {
System.err.println("You need to specify a table name!");
} else {
final Datastore ds = configuration.getDatastoreCatalog().getDatastore(datastoreName);
if (ds == null) {
System.err.println("No such datastore: " + datastoreName);
} else {
final DatastoreConnection con = ds.openConnection();
final DataContext dc = con.getDataContext();
final Schema schema;
if (schemaName == null) {
schema = dc.getDefaultSchema();
} else {
schema = dc.getSchemaByName(schemaName);
}
if (schema == null) {
System.err.println("No such schema: " + schemaName);
} else {
final Table table = schema.getTableByName(tableName);
if (table == null) {
write("No such table: " + tableName);
} else {
final List<String> columnNames = table.getColumnNames();
write("Columns:");
write("--------");
for (final String columnName : columnNames) {
write(columnName);
}
}
}
con.close();
}
}
}
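// Lists the tables of a schema in the given datastore, using the default
// schema when no schema name is specified.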
private void printTables(final DataCleanerConfiguration configuration) {
final String datastoreName = _arguments.getDatastoreName();
final String schemaName = _arguments.getSchemaName();
if (datastoreName == null) {
System.err.println("You need to specify the datastore name!");
} else {
final Datastore ds = configuration.getDatastoreCatalog().getDatastore(datastoreName);
if (ds == null) {
System.err.println("No such datastore: " + datastoreName);
} else {
final DatastoreConnection con = ds.openConnection();
final DataContext dc = con.getDataContext();
final Schema schema;
if (schemaName == null) {
schema = dc.getDefaultSchema();
} else {
schema = dc.getSchemaByName(schemaName);
}
if (schema == null) {
System.err.println("No such schema: " + schemaName);
} else {
final List<String> tableNames = schema.getTableNames();
if (tableNames == null || tableNames.isEmpty()) {
System.err.println("No tables in schema!");
} else {
write("Tables:");
write("-------");
for (final String tableName : tableNames) {
write(tableName);
}
}
}
con.close();
}
}
}
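// Lists the schema names of the given datastore.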
private void printSchemas(final DataCleanerConfiguration configuration) {
final String datastoreName = _arguments.getDatastoreName();
if (datastoreName == null) {
System.err.println("You need to specify the datastore name!");
} else {
final Datastore ds = configuration.getDatastoreCatalog().getDatastore(datastoreName);
if (ds == null) {
System.err.println("No such datastore: " + datastoreName);
} else {
final DatastoreConnection con = ds.openConnection();
final List<String> schemaNames = con.getDataContext().getSchemaNames();
if (schemaNames == null || schemaNames.isEmpty()) {
write("No schemas in datastore!");
} else {
write("Schemas:");
write("--------");
for (final String schemaName : schemaNames) {
write(schemaName);
}
}
con.close();
}
}
}
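// Lists the names of all datastores in the configuration's datastore catalog.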
private void printDatastores(final DataCleanerConfiguration configuration) {
final String[] datastoreNames = configuration.getDatastoreCatalog().getDatastoreNames();
if (datastoreNames == null || datastoreNames.length == 0) {
write("No datastores configured!");
} else {
write("Datastores:");
write("-----------");
for (final String datastoreName : datastoreNames) {
write(datastoreName);
}
}
}
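/**
* Executes the job file. With run type SPARK the job is delegated to a
* SparkRunner; otherwise it is read via JaxbJobReader (applying variable
* overrides and an optional datastore override from the arguments) and run
* locally. On success the result is written in the requested output format;
* on failure the errors are printed and the first one is rethrown.
*/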
protected void runJob(final DataCleanerConfiguration configuration) throws Throwable {
if (_arguments.getRunType() == CliRunType.SPARK) {
final SparkRunner sparkRunner = new SparkRunner(_arguments.getConfigurationFile(), _arguments.getJobFile(),
_arguments.getOutputFile());
sparkRunner.runJob();
} else {
final JaxbJobReader jobReader = new JaxbJobReader(configuration);
final String jobFilePath = _arguments.getJobFile();
final Resource jobResource = resolveResource(jobFilePath);
final Map<String, String> variableOverrides = _arguments.getVariableOverrides();
final AnalysisJobBuilder analysisJobBuilder;
final InputStream inputStream = jobResource.read();
try {
if (_arguments.getDatastoreName() != null) {
final Datastore datastore =
configuration.getDatastoreCatalog().getDatastore(_arguments.getDatastoreName());
if (datastore == null) {
throw new IllegalArgumentException("No such datastore: " + _arguments.getDatastoreName());
}
analysisJobBuilder = jobReader.create(inputStream, variableOverrides, datastore);
} else {
analysisJobBuilder = jobReader.create(inputStream, variableOverrides);
}
} finally {
FileHelper.safeClose(inputStream);
}
final AnalysisRunner runner = new AnalysisRunnerImpl(configuration, new CliProgressAnalysisListener());
final AnalysisResultFuture resultFuture = runner.run(analysisJobBuilder.toAnalysisJob());
resultFuture.await();
if (resultFuture.isSuccessful()) {
final CliOutputType outputType = _arguments.getOutputType();
final AnalysisResultWriter writer = outputType.createWriter();
writer.write(resultFuture, configuration, _writerRef, _outputStreamRef);
} else {
write("ERROR!");
write("------");
final List<Throwable> errors = resultFuture.getErrors();
write(errors.size() + " error(s) occurred while executing the job:");
for (final Throwable throwable : errors) {
write("------");
final StringWriter stringWriter = new StringWriter();
throwable.printStackTrace(new PrintWriter(stringWriter));
write(stringWriter.toString());
}
throw errors.get(0);
}
}
}
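// The following three methods print the registered analyzer, transformer and
// filter descriptors using a common detail format (see printBeanDescriptors
// below).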
protected void printAnalyzers(final DataCleanerConfiguration configuration) {
final Collection<AnalyzerDescriptor<?>> descriptors =
configuration.getEnvironment().getDescriptorProvider().getAnalyzerDescriptors();
if (descriptors == null || descriptors.isEmpty()) {
write("No analyzers configured!");
} else {
write("Analyzers:");
write("----------");
printBeanDescriptors(descriptors);
}
}
private void printTransformers(final DataCleanerConfiguration configuration) {
final Collection<TransformerDescriptor<?>> descriptors =
configuration.getEnvironment().getDescriptorProvider().getTransformerDescriptors();
if (descriptors == null || descriptors.isEmpty()) {
write("No transformers configured!");
} else {
write("Transformers:");
write("-------------");
printBeanDescriptors(descriptors);
}
}
private void printFilters(final DataCleanerConfiguration configuration) {
final Collection<FilterDescriptor<?, ?>> descriptors =
configuration.getEnvironment().getDescriptorProvider().getFilterDescriptors();
if (descriptors == null || descriptors.isEmpty()) {
write("No filters configured!");
} else {
write("Filters:");
write("--------");
printBeanDescriptors(descriptors);
}
}
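/**
* Prints the details of each component descriptor: display name, how input
* columns are consumed (single, array or multiple named inputs), the
* non-input configured properties and, for filters, the outcome categories.
*/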
protected void printBeanDescriptors(final Collection<? extends ComponentDescriptor<?>> descriptors) {
logger.debug("Printing {} descriptors", descriptors.size());
for (final ComponentDescriptor> descriptor : descriptors) {
write("name: " + descriptor.getDisplayName());
final Set<ConfiguredPropertyDescriptor> propertiesForInput = descriptor.getConfiguredPropertiesForInput();
if (propertiesForInput.size() == 1) {
final ConfiguredPropertyDescriptor propertyForInput = propertiesForInput.iterator().next();
if (propertyForInput != null) {
if (propertyForInput.isArray()) {
write(" - Consumes multiple input columns (type: " + propertyForInput.getTypeArgument(0)
.getSimpleName() + ")");
} else {
write(" - Consumes a single input column (type: " + propertyForInput.getTypeArgument(0)
.getSimpleName() + ")");
}
}
} else {
write(" - Consumes " + propertiesForInput.size() + " named inputs");
for (final ConfiguredPropertyDescriptor propertyForInput : propertiesForInput) {
if (propertyForInput.isArray()) {
write(" Input columns: " + propertyForInput.getName() + " (type: " + propertyForInput
.getTypeArgument(0).getSimpleName() + ")");
} else {
write(" Input column: " + propertyForInput.getName() + " (type: " + propertyForInput
.getTypeArgument(0).getSimpleName() + ")");
}
}
}
final Set<ConfiguredPropertyDescriptor> properties = descriptor.getConfiguredProperties();
for (final ConfiguredPropertyDescriptor property : properties) {
if (!property.isInputColumn()) {
write(" - Property: name=" + property.getName() + ", type=" + property.getBaseType().getSimpleName()
+ ", required=" + property.isRequired());
}
}
if (descriptor instanceof FilterDescriptor<?, ?>) {
final Set<String> categoryNames = ((FilterDescriptor<?, ?>) descriptor).getOutcomeCategoryNames();
for (final String categoryName : categoryNames) {
write(" - Outcome: " + categoryName);
}
}
}
}
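// Writes a single line to the (lazily created) output writer.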
private void write(final String str) {
try {
_writerRef.get().write(str + "\n");
} catch (final IOException e) {
throw new IllegalStateException(e);
}
}
@Override
public void close() {
if (_closeOut) {
close(_writerRef);
close(_outputStreamRef);
}
}
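// Closes the referenced writer/stream. A LazyRef is only closed if its value
// was actually fetched, so closing the runner does not create an otherwise
// untouched output file.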
private void close(final Supplier<?> ref) {
if (ref != null) {
if (ref instanceof LazyRef) {
final LazyRef<?> lazyRef = (LazyRef<?>) ref;
if (lazyRef.isFetched()) {
FileHelper.safeClose(ref.get());
}
} else {
FileHelper.safeClose(ref.get());
}
}
}
}