All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.datacleaner.spark.SparkConfigurationReaderInterceptor Maven / Gradle / Ivy

There is a newer version: 6.0.0
Show newest version
/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.spark;

import java.net.URI;
import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import java.util.regex.Matcher;

import org.apache.metamodel.util.Resource;
import org.datacleaner.api.RenderingFormat;
import org.datacleaner.configuration.ConfigurationReaderInterceptor;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.DataCleanerEnvironment;
import org.datacleaner.configuration.DataCleanerEnvironmentImpl;
import org.datacleaner.configuration.DefaultConfigurationReaderInterceptor;
import org.datacleaner.descriptors.ClasspathScanDescriptorProvider;
import org.datacleaner.descriptors.DescriptorProvider;
import org.datacleaner.job.concurrent.SingleThreadedTaskRunner;
import org.datacleaner.job.concurrent.TaskRunner;
import org.datacleaner.panels.ComponentBuilderPresenterRenderingFormat;
import org.datacleaner.result.renderer.SwingRenderingFormat;
import org.datacleaner.spark.utils.HdfsHelper;
import org.datacleaner.storage.InMemoryStorageProvider;
import org.datacleaner.storage.StorageProvider;
import org.datacleaner.util.convert.HadoopResourceBuilder;

/**
 * {@link ConfigurationReaderInterceptor} for conf.xml in a Spark environment
 */
public class SparkConfigurationReaderInterceptor extends DefaultConfigurationReaderInterceptor {

    private static final TaskRunner TASK_RUNNER = new SingleThreadedTaskRunner();
    private static final Collection>> EXCLUDED_RENDERER_FORMATS =
            Arrays.asList(SwingRenderingFormat.class, ComponentBuilderPresenterRenderingFormat.class);
    private static final DescriptorProvider DESCRIPTOR_PROVIDER =
            new ClasspathScanDescriptorProvider(TASK_RUNNER, EXCLUDED_RENDERER_FORMATS)
                    .scanPackage("org.datacleaner", true).scanPackage("com.hi", true).scanPackage("com.neopost", true);
    private static final StorageProvider STORAGE_PROVIDER = new InMemoryStorageProvider(500, 20);

    private static final DataCleanerEnvironment BASE_ENVIRONMENT =
            new DataCleanerEnvironmentImpl().withTaskRunner(TASK_RUNNER).withDescriptorProvider(DESCRIPTOR_PROVIDER)
                    .withStorageProvider(STORAGE_PROVIDER);

    private final HdfsHelper _hdfsHelper;

    public SparkConfigurationReaderInterceptor(final Map customProperties) {
        super(customProperties, BASE_ENVIRONMENT);
        _hdfsHelper = HdfsHelper.createHelper();
    }

    @Override
    public Resource createResource(String resourceUrl, final DataCleanerConfiguration tempConfiguration) {
        final Matcher matcher = HadoopResourceBuilder.RESOURCE_SCHEME_PATTERN.matcher(resourceUrl);
        if (matcher.find()) {
            resourceUrl = matcher.group(1) + "://" + matcher.group(3);
        }
        final URI uri = URI.create(resourceUrl);
        return _hdfsHelper.getResourceToUse(uri);
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy