/*
 * Copyright (c) 2008-2024, Hazelcast, Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.jet.impl;

import com.hazelcast.client.impl.ClientEngine;
import com.hazelcast.client.impl.ClientEngineImpl;
import com.hazelcast.client.impl.protocol.ClientExceptionFactory;
import com.hazelcast.cluster.Address;
import com.hazelcast.cluster.ClusterState;
import com.hazelcast.config.Config;
import com.hazelcast.config.MapConfig;
import com.hazelcast.config.MergePolicyConfig;
import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.instance.impl.HazelcastBootstrap;
import com.hazelcast.instance.impl.Node;
import com.hazelcast.instance.impl.NodeState;
import com.hazelcast.internal.cluster.ClusterStateListener;
import com.hazelcast.internal.metrics.impl.MetricsService;
import com.hazelcast.internal.nio.Packet;
import com.hazelcast.internal.partition.InternalPartitionService;
import com.hazelcast.internal.serialization.InternalSerializationService;
import com.hazelcast.internal.services.ManagedService;
import com.hazelcast.internal.services.MembershipAwareService;
import com.hazelcast.internal.services.MembershipServiceEvent;
import com.hazelcast.internal.util.InvocationUtil;
import com.hazelcast.jet.JetException;
import com.hazelcast.jet.JetService;
import com.hazelcast.jet.config.JetConfig;
import com.hazelcast.jet.config.JobConfig;
import com.hazelcast.jet.core.JobNotFoundException;
import com.hazelcast.jet.impl.execution.TaskletExecutionService;
import com.hazelcast.jet.impl.metrics.JobMetricsPublisher;
import com.hazelcast.jet.impl.operation.NotifyMemberShutdownOperation;
import com.hazelcast.jet.impl.operation.PrepareForPassiveClusterOperation;
import com.hazelcast.jet.impl.serialization.DelegatingSerializationService;
import com.hazelcast.jet.impl.submitjob.memberside.JobMetaDataParameterObject;
import com.hazelcast.jet.impl.submitjob.memberside.JobMultiPartParameterObject;
import com.hazelcast.jet.impl.submitjob.memberside.JobUploadStatus;
import com.hazelcast.jet.impl.submitjob.memberside.JobUploadStore;
import com.hazelcast.jet.impl.submitjob.memberside.validator.JarOnClientValidator;
import com.hazelcast.jet.impl.submitjob.memberside.validator.JarOnMemberValidator;
import com.hazelcast.jet.impl.util.ExceptionUtil;
import com.hazelcast.logging.ILogger;
import com.hazelcast.spi.impl.NodeEngine;
import com.hazelcast.spi.impl.NodeEngineImpl;
import com.hazelcast.spi.impl.operationservice.LiveOperations;
import com.hazelcast.spi.impl.operationservice.LiveOperationsTracker;
import com.hazelcast.spi.impl.operationservice.Operation;
import com.hazelcast.spi.merge.DiscardMergePolicy;
import com.hazelcast.spi.merge.LatestUpdateMergePolicy;
import com.hazelcast.spi.properties.HazelcastProperties;

import java.io.IOException;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import java.util.function.Supplier;

import static com.hazelcast.cluster.ClusterState.PASSIVE;
import static com.hazelcast.config.MapConfig.DISABLED_TTL_SECONDS;
import static com.hazelcast.internal.util.ExceptionUtil.sneakyThrow;
import static com.hazelcast.jet.impl.JobRepository.INTERNAL_JET_OBJECTS_PREFIX;
import static com.hazelcast.jet.impl.JobRepository.JOB_METRICS_MAP_NAME;
import static com.hazelcast.jet.impl.JobRepository.JOB_RESULTS_MAP_NAME;
import static com.hazelcast.jet.impl.util.ExceptionUtil.rethrow;
import static com.hazelcast.jet.impl.util.Util.memoizeConcurrent;
import static com.hazelcast.spi.properties.ClusterProperty.JOB_RESULTS_TTL_SECONDS;
import static java.util.concurrent.TimeUnit.SECONDS;

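/**
 * Member-side service backing Jet: wires together job coordination, job
 * execution, Jet networking and the jar-upload facility, and reacts to
 * membership and cluster-state events.
 */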
public class JetServiceBackend implements ManagedService, MembershipAwareService, ClusterStateListener,
        LiveOperationsTracker, Consumer<Packet> {

    public static final String SERVICE_NAME = "hz:impl:jetService";
    public static final String SQL_ARGUMENTS_KEY_NAME = "__sql.arguments";
    public static final String SQL_CATALOG_MAP_NAME = "__sql.catalog";
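    // Upper bound for concurrently running async operations (see numConcurrentAsyncOps below)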
    public static final int MAX_PARALLEL_ASYNC_OPS = 1000;

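    // Delay, in seconds, before retrying the shutdown notification to the master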
    private static final int NOTIFY_MEMBER_SHUTDOWN_DELAY = 5;
    private static final int SHUTDOWN_JOBS_MAX_WAIT_SECONDS = 10;

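    // Period, in seconds, of the background scan that removes expired jar uploads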
    private static final int JOB_UPLOAD_STORE_PERIOD = 30;

    private NodeEngineImpl nodeEngine;
    private final ILogger logger;
    private final LiveOperationRegistry liveOperationRegistry;
    private final AtomicReference<CompletableFuture<Void>> shutdownFuture = new AtomicReference<>();
    private final JetConfig jetConfig;

    private JetService jet;
    private Networking networking;
    private TaskletExecutionService taskletExecutionService;
    private JobRepository jobRepository;
    private JobCoordinationService jobCoordinationService;
    private JobClassLoaderService jobClassLoaderService;
    private JobExecutionService jobExecutionService;
    private final AtomicInteger numConcurrentAsyncOps = new AtomicInteger();
    private final Supplier<int[]> sharedPartitionKeys = memoizeConcurrent(this::computeSharedPartitionKeys);
    private final JobUploadStore jobUploadStore = new JobUploadStore();
    private ScheduledFuture<?> jobUploadStoreCheckerFuture;

    public JetServiceBackend(Node node) {
        this.logger = node.getLogger(getClass());
        this.liveOperationRegistry = new LiveOperationRegistry();
        this.jetConfig = node.getConfig().getJetConfig();
    }

    // ManagedService
    @Override
    public void init(NodeEngine engine, Properties hzProperties) {
        this.nodeEngine = (NodeEngineImpl) engine;
        this.jet = new JetInstanceImpl(nodeEngine.getNode().hazelcastInstance, jetConfig);
        jobRepository = new JobRepository(engine.getHazelcastInstance());
        taskletExecutionService = new TaskletExecutionService(
                nodeEngine, jetConfig.getCooperativeThreadCount(), nodeEngine.getProperties()
        );
        jobCoordinationService = createJobCoordinationService();
        jobClassLoaderService = new JobClassLoaderService(nodeEngine, jobRepository);
        jobExecutionService = new JobExecutionService(nodeEngine, taskletExecutionService, jobClassLoaderService);

        MetricsService metricsService = nodeEngine.getService(MetricsService.SERVICE_NAME);
        metricsService.registerPublisher(nodeEngine ->
                new JobMetricsPublisher(jobExecutionService, nodeEngine.getLocalMember()));
        nodeEngine.getMetricsRegistry().registerDynamicMetricsProvider(jobCoordinationService);
        nodeEngine.getMetricsRegistry().registerDynamicMetricsProvider(jobExecutionService);
        networking = new Networking(engine, jobExecutionService, jetConfig.getFlowControlPeriodMs());

        ClientEngine clientEngine = engine.getService(ClientEngineImpl.SERVICE_NAME);
        ClientExceptionFactory clientExceptionFactory = clientEngine.getExceptionFactory();
        if (clientExceptionFactory != null) {
            ExceptionUtil.registerJetExceptions(clientExceptionFactory);
        } else {
            logger.fine("Jet exceptions are not registered to the ClientExceptionFactory" +
                        " since the ClientExceptionFactory is not accessible.");
        }
        logger.info("Setting number of cooperative threads and default parallelism to "
                    + jetConfig.getCooperativeThreadCount());

        // Run periodically to clean expired jar uploads
        this.jobUploadStoreCheckerFuture = nodeEngine.getExecutionService().scheduleWithRepetition(
                jobUploadStore::cleanExpiredUploads, 0, JOB_UPLOAD_STORE_PERIOD, SECONDS);
    }

    public void configureJetInternalObjects(Config config, HazelcastProperties properties) {
        JetConfig jetConfig = config.getJetConfig();
        MapConfig internalMapConfig = new MapConfig(INTERNAL_JET_OBJECTS_PREFIX + '*')
                .setBackupCount(jetConfig.getBackupCount())
                // we query creationTime of resources maps
                .setStatisticsEnabled(true);

        internalMapConfig.getMergePolicyConfig().setPolicy(DiscardMergePolicy.class.getName());

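        // The results and metrics maps inherit the internal settings above via the
        // MapConfig copy constructor and additionally get a TTL, so data about
        // finished jobs eventually expires.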
        MapConfig resultsMapConfig = new MapConfig(internalMapConfig)
                .setName(JOB_RESULTS_MAP_NAME)
                .setTimeToLiveSeconds(properties.getSeconds(JOB_RESULTS_TTL_SECONDS));

        MapConfig metricsMapConfig = new MapConfig(internalMapConfig)
                .setName(JOB_METRICS_MAP_NAME)
                .setTimeToLiveSeconds(properties.getSeconds(JOB_RESULTS_TTL_SECONDS));

        config.addMapConfig(internalMapConfig)
                .addMapConfig(resultsMapConfig)
                .addMapConfig(metricsMapConfig)
                .addMapConfig(createSqlCatalogConfig());
    }

    // visible for tests
    static MapConfig createSqlCatalogConfig() {
        // TODO HZ-1743 when implemented properly align this with the chosen
        //  approach that HZ-1743 follows
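        // Design note: maximum backup count plus readBackupData keeps the SQL
        // catalog highly available and lets members read it from a local replica.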
        return new MapConfig(SQL_CATALOG_MAP_NAME)
                .setBackupCount(MapConfig.MAX_BACKUP_COUNT)
                .setAsyncBackupCount(MapConfig.MIN_BACKUP_COUNT)
                .setTimeToLiveSeconds(DISABLED_TTL_SECONDS)
                .setReadBackupData(true)
                .setMergePolicyConfig(new MergePolicyConfig().setPolicy(LatestUpdateMergePolicy.class.getName()))
                .setPerEntryStatsEnabled(true);
    }

    /**
     * Tells the master to gracefully terminate jobs on this member. Blocks
     * until all are down.
     */
    public void shutDownJobs() {
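        // Only the first caller installs the future and triggers the cluster-wide
        // notification; concurrent callers just await the same future below.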
        if (shutdownFuture.compareAndSet(null, new CompletableFuture<>())) {
            notifyAllMembersWeAreShuttingDown(shutdownFuture.get());
        }
        try {
            CompletableFuture<Void> future = shutdownFuture.get();
            future.get(SHUTDOWN_JOBS_MAX_WAIT_SECONDS, SECONDS);
            // Note that at this point there can still be executions running - those for light jobs
            // or those created automatically after a packet was received.
            // They are all non-fault-tolerant or contain only packets that will be dropped
            // when this member actually shuts down.
        } catch (Exception e) {
            logger.severe("Shutdown jobs timeout", e);
        }
    }

    private void notifyNonMasterMembersWeAreShuttingDown(Address masterAddress) {
        Supplier<Operation> operationSupplier = NotifyMemberShutdownOperation::new;
        var localAddress = nodeEngine.getThisAddress();
        InvocationUtil.invokeOnStableClusterParallel(
                nodeEngine,
                operationSupplier,
                0,
                member -> !member.getAddress().equals(masterAddress) && !member.getAddress().equals(localAddress)
        ).whenComplete((r, t) -> {
            if (t != null) {
                logger.warning(
                        "Some non-master members encountered errors during the notification process about the shutdown member "
                                + nodeEngine.getNode().getThisUuid(),
                        t
                );
            } else {
                logger.fine(
                        "All non-master members were informed about the shutdown of member "
                                + nodeEngine.getNode().getThisUuid()
                );
            }
        });
    }

    private void notifyAllMembersWeAreShuttingDown(CompletableFuture<Void> future) {
        var fixedMasterAddress = nodeEngine.getMasterAddress();
        // We do not guarantee delivery of the notification to non-master members,
        // so as not to delay the shutdown of this member. If the notification is
        // lost, some light jobs coordinated by those members may fail.
        notifyNonMasterMembersWeAreShuttingDown(fixedMasterAddress);
        // we guarantee delivery of information to the master
        notifyMasterWeAreShuttingDown(future, fixedMasterAddress);
    }

    private void notifyMasterWeAreShuttingDown(CompletableFuture<Void> future, Address masterAddress) {
        Operation op = new NotifyMemberShutdownOperation();
        nodeEngine.getOperationService()
                .invokeOnTarget(JetServiceBackend.SERVICE_NAME, op, masterAddress)
                .whenCompleteAsync((response, throwable) -> {
                    // if there is an error and the node is still ACTIVE, try again. If the node isn't ACTIVE, log & ignore.
                    NodeState nodeState = nodeEngine.getNode().getState();
                    if (throwable != null && nodeState == NodeState.ACTIVE) {
                        logger.warning("Failed to notify master member that this member is shutting down," +
                                " will retry in " + NOTIFY_MEMBER_SHUTDOWN_DELAY + " seconds", throwable);
                        // recursive call
                        nodeEngine.getExecutionService().schedule(
                                () -> notifyMasterWeAreShuttingDown(future, nodeEngine.getMasterAddress()),
                                NOTIFY_MEMBER_SHUTDOWN_DELAY,
                                SECONDS
                        );
                    } else {
                        if (throwable != null) {
                            logger.warning("Failed to notify master member that this member is shutting down," +
                                    " but this member is " + nodeState + ", so not retrying", throwable);
                        }
                        future.complete(null);
                    }
                });
    }

    @Override
    public void shutdown(boolean forceful) {
        // Cancel the periodic jar-upload cleanup task
        jobUploadStoreCheckerFuture.cancel(true);

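        // Shut down in dependency order: job executions first, then the tasklet
        // workers they run on, and the networking layer last.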
        jobExecutionService.shutdown();
        taskletExecutionService.shutdown();
        taskletExecutionService.awaitWorkerTermination();
        networking.shutdown();
    }

    @Override
    public void reset() {
        jobExecutionService.reset();
        jobCoordinationService.reset();
    }

    // Overridden in EE with EnterpriseJobCoordinationService
    JobCoordinationService createJobCoordinationService() {
        return new JobCoordinationService(nodeEngine, this, jetConfig, jobRepository);
    }

    public InternalSerializationService createSerializationService(Map<String, String> serializerConfigs) {
        return DelegatingSerializationService
                .from(getNodeEngine().getSerializationService(), serializerConfigs);
    }

    @SuppressWarnings("unused") // parameters are used from jet-enterprise
    public Operation createExportSnapshotOperation(long jobId, String name, boolean cancelJob) {
        throw new UnsupportedOperationException("You need Hazelcast Enterprise to use this feature");
    }

    public JetService getJet() {
        return this.jet;
    }

    public LiveOperationRegistry getLiveOperationRegistry() {
        return liveOperationRegistry;
    }

    public JobRepository getJobRepository() {
        return jobRepository;
    }

    public NodeEngineImpl getNodeEngine() {
        return nodeEngine;
    }

    public JetConfig getJetConfig() {
        return jetConfig;
    }

    public JobCoordinationService getJobCoordinationService() {
        return jobCoordinationService;
    }

    public JobClassLoaderService getJobClassLoaderService() {
        return jobClassLoaderService;
    }

    public JobExecutionService getJobExecutionService() {
        return jobExecutionService;
    }

    /**
     * Returns the job config or fails with {@link JobNotFoundException}
     * if the requested job is not found.
     */
    public JobConfig getJobConfig(long jobId, boolean isLightJob) {
        if (isLightJob) {
            return jobCoordinationService.getLightJobConfig(jobId);
        }

        JobRecord jobRecord = jobRepository.getJobRecord(jobId);
        if (jobRecord != null) {
            return jobRecord.getConfig();
        }

        JobResult jobResult = jobRepository.getJobResult(jobId);
        if (jobResult != null) {
            return jobResult.getJobConfig();
        }

        throw new JobNotFoundException(jobId);
    }

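    // Entry point for incoming Jet data packets; hands each packet to Networking.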
    @Override
    public void accept(Packet packet) {
        try {
            networking.handle(packet);
        } catch (IOException e) {
            throw sneakyThrow(e);
        }
    }

    @Override
    public void memberRemoved(MembershipServiceEvent event) {
        jobExecutionService.onMemberRemoved(event.getMember());
        jobCoordinationService.onMemberRemoved(event.getMember().getUuid());
    }

    @Override
    public void memberAdded(MembershipServiceEvent event) {
        jobCoordinationService.onMemberAdded(event.getMember());
    }

    @Override
    public void onClusterStateChange(ClusterState newState) {
        getJobCoordinationService().clusterChangeDone();
    }

    public AtomicInteger numConcurrentAsyncOps() {
        return numConcurrentAsyncOps;
    }

    @Override
    public void populate(LiveOperations liveOperations) {
        liveOperationRegistry.populate(liveOperations);
    }

    /**
     * Returns an array of pre-generated keys, one for each partition. At index
     * i there is a key that we know will go to partition i.
     */
    public int[] getSharedPartitionKeys() {
        return sharedPartitionKeys.get();
    }

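    // Probes candidate keys 1, 2, 3, ... and records the first key that hashes to
    // each partition, stopping once every partition has one; keys start at 1
    // because 0 marks an empty slot. Illustrative example (actual values depend on
    // the partition hash): with 3 partitions the result might be {5, 1, 2},
    // meaning key 5 is known to map to partition 0.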
    private int[] computeSharedPartitionKeys() {
        InternalPartitionService partitionService = nodeEngine.getPartitionService();
        int[] keys = new int[partitionService.getPartitionCount()];
        int remainingCount = partitionService.getPartitionCount();
        for (int i = 1; remainingCount > 0; i++) {
            int partitionId = partitionService.getPartitionId(i);
            if (keys[partitionId] == 0) {
                keys[partitionId] = i;
                remainingCount--;
            }
        }
        return keys;
    }

    public TaskletExecutionService getTaskletExecutionService() {
        return taskletExecutionService;
    }

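    // Before the cluster switches to PASSIVE, asks the master to prepare Jet via
    // PrepareForPassiveClusterOperation and blocks until that completes.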
    public void beforeClusterStateChange(ClusterState requestedState) {
        if (requestedState == PASSIVE) {
            try {
                nodeEngine.getOperationService().createInvocationBuilder(JetServiceBackend.SERVICE_NAME,
                                new PrepareForPassiveClusterOperation(), nodeEngine.getMasterAddress())
                        .invoke().get();
            } catch (InterruptedException | ExecutionException e) {
                throw rethrow(e);
            }
        }
    }

    public void startScanningForJobs() {
        jobCoordinationService.startScanningForJobs();
    }

    /**
     * Executes a jar that already resides on the member.
     */
    public void jarOnMember(JobMetaDataParameterObject jobMetaDataParameterObject) {
        // Performs validations before processing the request
        JarOnMemberValidator.validate(jobMetaDataParameterObject);

        executeJar(jobMetaDataParameterObject);
    }

    /**
     * Stores the metadata about a jar that is being uploaded from the client side.
     */
    public void jarOnClient(JobMetaDataParameterObject jobMetaDataParameterObject) {
        // Performs validations before processing the request
        checkResourceUploadEnabled();
        JarOnClientValidator.validate(jobMetaDataParameterObject);

        try {
            // Delegate processing to store
            jobUploadStore.processJobMetaData(jobMetaDataParameterObject);
        } catch (Exception exception) {
            // Upon exception, remove from the store
            jobUploadStore.removeBadSession(jobMetaDataParameterObject.getSessionId());

            throwJetExceptionFromJobMetaData(jobMetaDataParameterObject, exception);
        }
    }

    /**
     * Stores one part of an uploaded jar and, once all parts have arrived,
     * executes the jar.
     */
    public void storeJobMultiPart(JobMultiPartParameterObject jobMultiPartParameterObject) {
        try {
            JobMetaDataParameterObject partsComplete = jobUploadStore.processJobMultipart(jobMultiPartParameterObject);
            // If parts are complete
            if (partsComplete != null) {
                // Execute the jar
                executeJar(partsComplete);
            }
        } catch (Exception exception) {
            // Upon exception, remove from the store
            JobUploadStatus jobUploadStatus = jobUploadStore.removeBadSession(jobMultiPartParameterObject.getSessionId());

            // Check for null: a non-existing session id may have been given
            if (jobUploadStatus != null) {

                JobMetaDataParameterObject jobMetaDataParameterObject = jobUploadStatus.getJobMetaDataParameterObject();
                if (jobMetaDataParameterObject != null) {
                    throwJetExceptionFromJobMetaData(jobMetaDataParameterObject, exception);
                }
            } else {
                // Only throw a JetException
                wrapWithJetException(exception);
            }
        }
    }

    private void throwJetExceptionFromJobMetaData(JobMetaDataParameterObject jobMetaDataParameterObject, Exception exception) {
        // Enrich exception with metadata
        String exceptionString = jobMetaDataParameterObject.exceptionString();
        JetException jetExceptionWithMetaData = new JetException(exceptionString, exception);

        // Only throw a JetException
        wrapWithJetException(jetExceptionWithMetaData);
    }

    /**
     * If the exception is not a JetException (e.g. an IOException,
     * FileSystemException etc.), wraps it in a JetException; throws either way.
     */
    static void wrapWithJetException(Exception exception) {
        // Exception is not JetException
        if (!(exception instanceof JetException)) {
            // Get the root cause and wrap it with JetException
            throw rethrow(exception);
        } else {
            // Just throw the JetException as is
            sneakyThrow(exception);
        }
    }

    private void checkResourceUploadEnabled() {
        if (!jetConfig.isResourceUploadEnabled()) {
            throw new JetException("Resource upload is not enabled");
        }
    }

    /**
     * Run the given jar as Jet job. Triggered by both client and member side
     */
    public void executeJar(JobMetaDataParameterObject jobMetaDataParameterObject) {
        if (logger.isInfoEnabled()) {
            String message = String.format("Try executing jar file %s for session %s", jobMetaDataParameterObject.getJarPath(),
                    jobMetaDataParameterObject.getSessionId());
            logger.info(message);
        }

        checkResourceUploadEnabled();

        try {
            HazelcastBootstrap.executeJarOnMember(this::getHazelcastInstance,
                    jobMetaDataParameterObject.getJarPath().toString(),
                    jobMetaDataParameterObject.getSnapshotName(),
                    jobMetaDataParameterObject.getJobName(),
                    jobMetaDataParameterObject.getMainClass(),
                    jobMetaDataParameterObject.getJobParameters()
            );
            if (logger.isInfoEnabled()) {
                String message = String.format("executing jar file %s for session %s finished successfully",
                        jobMetaDataParameterObject.getJarPath(), jobMetaDataParameterObject.getSessionId());
                logger.info(message);
            }
        } catch (Exception exception) {
            logger.severe("caught exception when running the jar", exception);
            // Rethrow the exception back to the client to signal that the job did not run
            throwJetExceptionFromJobMetaData(jobMetaDataParameterObject, exception);
        } finally {
            JobUploadStatus.cleanup(jobMetaDataParameterObject);
        }
    }

    private HazelcastInstance getHazelcastInstance() {
        return getNodeEngine().getHazelcastInstance();
    }
}