All downloads are free. The search and download features use the official Maven repository.

com.staros.worker.WorkerManager Maven / Gradle / Ivy

There is a newer version: 3.4-rc2
Show newest version
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


package com.staros.worker;

import com.google.protobuf.Message;
import com.google.protobuf.util.JsonFormat;
import com.staros.exception.ExceptionCode;
import com.staros.exception.StarException;
import com.staros.journal.DummyJournalSystem;
import com.staros.journal.Journal;
import com.staros.journal.JournalSystem;
import com.staros.proto.WorkerFileMeta;
import com.staros.proto.WorkerInfo;
import com.staros.util.IdGenerator;
import com.staros.util.LockCloseable;
import com.staros.util.LogUtils;
import com.staros.util.Text;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

/**
 * Worker Manager handles all operations that are related
 * to worker, such as add worker and remove worker.
 *
 * <p>The in-memory maps are guarded by a single {@link ReentrantReadWriteLock}: mutating
 * operations take the write lock, read-only operations take the read lock. The lock objects
 * are also exposed through {@link #readLock()} and {@link #writeLock()}.
 */
public class WorkerManager {
    private static final Logger LOG = LogManager.getLogger(WorkerManager.class);

    private final Map<String, ServiceWorkerGroup> serviceWorkerGroups; // <serviceId, ServiceWorkerGroup>
    private final Map<Long, Worker> workers; // <workerId, Worker>

    private final ReentrantReadWriteLock lock;

    private final JournalSystem journalSystem;
    private final IdGenerator idGenerator;
    private final AtomicInteger roundRobinOffset; // help choose worker in round-robin way when schedule

    // FOR TEST
    public WorkerManager() {
        this(new DummyJournalSystem(), new IdGenerator(null));
    }

    /**
     * Create an empty worker manager.
     *
     * @param journalSystem journal system that add/remove/update worker operations are written to
     * @param idGenerator generator used to allocate worker ids and worker group ids
     */
    public WorkerManager(JournalSystem journalSystem, IdGenerator idGenerator) {
        // TODO: load from storage

        this.serviceWorkerGroups = new HashMap<>();
        this.workers = new HashMap<>();
        this.lock = new ReentrantReadWriteLock();

        this.journalSystem = journalSystem;
        this.idGenerator = idGenerator;
        this.roundRobinOffset = new AtomicInteger();
    }

    /**
     * Create a default worker group for service, called when a service is bootstrapped.
     * Any ServiceWorkerGroup previously registered under the same service id is replaced.
     *
     * @param serviceId service identity
     */
    public void createDefaultServiceWorkerGroup(String serviceId) {
        try (LockCloseable ignored = new LockCloseable(writeLock())) {
            serviceWorkerGroups.put(serviceId, new ServiceWorkerGroup(serviceId));
        }
    }

    /**
     * Create a worker group for service
     *
     * @param serviceId service identity, its ServiceWorkerGroup must already exist
     * @return id of the newly created worker group
     */
    public long createWorkerGroup(String serviceId) {
        try (LockCloseable ignored = new LockCloseable(writeLock())) {
            ServiceWorkerGroup serviceWorkerGroup = getServiceWorkerGroup(serviceId);
            long groupId = idGenerator.getNextId();
            serviceWorkerGroup.createGroup(groupId);
            return groupId;
        }
    }

    /**
     * Add a worker for worker group in service
     *
     * @param serviceId service identity
     * @param groupId target worker group id
     * @param ipPort worker address, must not be used by any registered worker
     * @return worker id
     *
     * @throws StarException
     *             NOT_EXIST if group id not exist, ALREADY_EXIST if the address is already registered
     */
    public long addWorker(String serviceId, long groupId, String ipPort) throws StarException {
        try (LockCloseable ignored = new LockCloseable(writeLock())) {
            // validate worker group
            ServiceWorkerGroup serviceWorkerGroup = getServiceWorkerGroup(serviceId);
            if (!serviceWorkerGroup.existGroup(groupId)) {
                throw new StarException(ExceptionCode.NOT_EXIST,
                        String.format("worker group %d not exist in service %s.", groupId, serviceId));
            }

            // validate worker address, an address is unique across all services and groups
            for (Worker existing : workers.values()) {
                if (existing.getIpPort().equals(ipPort)) {
                    throw new StarException(ExceptionCode.ALREADY_EXIST,
                            String.format("worker address %s already exist.", ipPort));
                }
            }

            long workerId = idGenerator.getNextId();
            Worker worker = new Worker(serviceId, groupId, workerId, ipPort);

            // write the journal first, in-memory state is only touched after the write returns
            Journal journal = Journal.logAddWorker(worker);
            journalSystem.write(journal);

            serviceWorkerGroup.addWorker(worker);

            workers.put(workerId, worker);

            LOG.info("worker {} added to group {} in service {}.", workerId, groupId, serviceId);

            try {
                worker.heartbeat();
            } catch (Exception e) {
                // it's ok, later heartbeat manager will take care of this
                LOG.warn("worker {} fail to handle first heartbeat, {}.", workerId, e.getMessage());
            }
            return workerId;
        }
    }

    /**
     * Remove a worker for worker group in service
     *
     * @param serviceId service identity
     * @param groupId id of the group the worker is expected to belong to
     * @param workerId id of the worker to remove
     *
     * @throws StarException
     *             NOT_EXIST if group id or worker id not exist,
     *             INVALID_ARGUMENT if the worker does not belong to the given service or group
     */
    public void removeWorker(String serviceId, long groupId, long workerId) throws StarException {
        try (LockCloseable ignored = new LockCloseable(writeLock())) {
            // validate worker group
            ServiceWorkerGroup serviceWorkerGroup = getServiceWorkerGroup(serviceId);
            if (!serviceWorkerGroup.existGroup(groupId)) {
                throw new StarException(ExceptionCode.NOT_EXIST,
                        String.format("worker group %d not exist in service %s.", groupId, serviceId));
            }

            Worker worker = workers.get(workerId);
            if (worker == null) {
                throw new StarException(ExceptionCode.NOT_EXIST,
                        String.format("worker %d not exist.", workerId));
            }

            if (!worker.getServiceId().equals(serviceId)) {
                // %d must receive the numeric worker id; passing the service id string here
                // would make String.format throw instead of reporting the real problem
                throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                        String.format("worker %d not belong to service %s.", workerId, serviceId));
            }

            if (worker.getGroupId() != groupId) {
                throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                        String.format("worker %d not belong to group %d.", workerId, groupId));
            }

            Journal journal = Journal.logRemoveWorker(serviceId, groupId, workerId);
            journalSystem.write(journal);

            serviceWorkerGroup.removeWorker(worker);

            workers.remove(workerId);

            LOG.info("worker {} removed from group {} in service {}.", workerId, groupId, serviceId);
        }
    }

    /**
     * Get worker info by id
     *
     * @param workerId worker id
     * @return protobuf representation of the worker
     * @throws StarException NOT_EXIST if no worker has this id
     */
    public WorkerInfo getWorkerInfo(long workerId) throws StarException {
        try (LockCloseable ignored = new LockCloseable(readLock())) {
            Worker worker = workers.get(workerId);
            if (worker == null) {
                throw new StarException(ExceptionCode.NOT_EXIST,
                        String.format("worker %d not exist.", workerId));
            }

            return worker.toProtobuf();
        }
    }

    /**
     * Get worker info by ip port
     *
     * @param ipPort worker address
     * @return protobuf representation of the first worker found with this address
     * @throws StarException NOT_EXIST if no worker has this address
     */
    public WorkerInfo getWorkerInfo(String ipPort) throws StarException {
        try (LockCloseable ignored = new LockCloseable(readLock())) {
            for (Worker worker : workers.values()) {
                if (worker.getIpPort().equals(ipPort)) {
                    return worker.toProtobuf();
                }
            }

            throw new StarException(
                ExceptionCode.NOT_EXIST, String.format("worker %s not exist.", ipPort));
        }
    }

    /**
     * Get service worker group with lock held, this call must succeed
     */
    private ServiceWorkerGroup getServiceWorkerGroup(String serviceId) {
        ServiceWorkerGroup serviceWorkerGroup = serviceWorkerGroups.get(serviceId);
        if (serviceWorkerGroup == null) {
            LogUtils.fatal(LOG, "ServiceWorkerGroup not exist for service {}!", serviceId);
        }

        return serviceWorkerGroup;
    }

    /**
     * Returns a default worker group of the service.
     * @param serviceId service identity
     * @return default WorkerGroup. the workerGroup with the smallest id returns or exception if no worker group available.
     * @throws StarException NOT_EXIST if the service doesn't have any workerGroup yet.
     */
    public WorkerGroup getDefaultWorkerGroup(String serviceId) throws StarException {
        try (LockCloseable ignored = new LockCloseable(readLock())) {
            ServiceWorkerGroup swg = getServiceWorkerGroup(serviceId);
            if (swg == null) {
                throw new StarException(ExceptionCode.NOT_EXIST, String.format("No workerGroup in service %s", serviceId));
            }
            return swg.getDefaultWorkerGroup();
        }
    }

    /**
     * Get a worker group of the service by group id.
     *
     * @param serviceId service identity, its ServiceWorkerGroup must already exist
     * @param groupId worker group id
     * @return whatever the ServiceWorkerGroup returns for this group id
     */
    public WorkerGroup getWorkerGroup(String serviceId, long groupId) {
        // getServiceWorkerGroup() must be called with the lock held; the read lock is reentrant
        // and may also be taken by a thread that already holds the write lock
        try (LockCloseable ignored = new LockCloseable(readLock())) {
            return getServiceWorkerGroup(serviceId).getWorkerGroup(groupId);
        }
    }

    /**
     * @return the read lock guarding this manager's state
     */
    public Lock readLock() {
        return lock.readLock();
    }

    /**
     * @return the write lock guarding this manager's state
     */
    public Lock writeLock() {
        return lock.writeLock();
    }

    /**
     * Get the worker object by id.
     *
     * @param workerId worker id
     * @return the worker, or null if no worker has this id
     */
    // TODO: the returned worker is not safe, need look into it later
    public Worker getWorker(long workerId) throws StarException {
        try (LockCloseable ignored = new LockCloseable(readLock())) {
            return workers.get(workerId);
        }
    }

    /**
     * Trigger a heartbeat on every registered worker while holding the write lock.
     * An exception thrown by one worker's heartbeat is not caught here, so it stops the loop.
     */
    public void sendHeartbeatForAll() {
        try (LockCloseable ignored = new LockCloseable(writeLock())) {
            for (Worker worker : workers.values()) {
                worker.heartbeat();
            }
        }
    }

    /**
     * process worker heartbeat, update worker's info
     *
     * @param serviceId service the worker is expected to belong to
     * @param workerId worker id
     * @param startTime forwarded to the worker's info update
     * @param numOfShards forwarded to the worker's info update
     * @param workerProperties forwarded to the worker's info update
     * @param lastSeenTime recorded as the worker's last seen time
     * @return the first element of the pair returned by the worker's info update
     * @throws StarException NOT_EXIST if the worker is unknown,
     *             INVALID_ARGUMENT if it does not belong to the given service
     */
    // NOTE(review): the type arguments of workerProperties appear to have been lost from this
    // copy of the source and cannot be recovered from here, so the raw type is kept - confirm.
    public boolean processWorkerHeartbeat(String serviceId, long workerId,
            long startTime, long numOfShards, Map workerProperties, long lastSeenTime) {

        try (LockCloseable ignored = new LockCloseable(writeLock())) {
            Worker worker = workers.get(workerId);
            if (worker == null) {
                throw new StarException(ExceptionCode.NOT_EXIST,
                        String.format("worker %d not exist.", workerId));
            }

            if (!worker.getServiceId().equals(serviceId)) {
                throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                        String.format("worker %d does not belong to service %s.", workerId, serviceId));
            }

            worker.updateLastSeenTime(lastSeenTime);

            // key: result handed back to the caller, value: whether the update must be journaled
            Pair<Boolean, Boolean> pair = worker.updateInfo(startTime, workerProperties, numOfShards);
            if (pair.getValue()) {
                // NOTE: if fails to persist, worker info is not reverted, it should be fine
                Journal journal = Journal.logUpdateWorker(serviceId, Collections.singletonList(worker));
                journalSystem.write(journal);
            }
            return pair.getKey();
        }
    }

    /**
     * Replay an add-worker operation: registers the worker in memory without writing a journal.
     *
     * @param serviceId service the worker is added to
     * @param worker worker to register
     */
    public void replayAddWorker(String serviceId, Worker worker) {
        try (LockCloseable ignored = new LockCloseable(writeLock())) {
            if (workers.get(worker.getWorkerId()) != null) {
                LogUtils.fatal(LOG, "worker {} already exist when replay add worker, should not happen!", worker.getWorkerId());
            }
            getServiceWorkerGroup(serviceId).addWorker(worker);
            workers.put(worker.getWorkerId(), worker);

            // TODO: state machine
        }
    }

    /**
     * Replay a remove-worker operation: unregisters the worker in memory without writing a journal.
     * A worker id that is not registered is logged and ignored.
     *
     * @param serviceId service the worker is expected to belong to
     * @param groupId group the worker is expected to belong to
     * @param workerId id of the worker to remove
     */
    public void replayRemoveWorker(String serviceId, long groupId, long workerId) {
        try (LockCloseable ignored = new LockCloseable(writeLock())) {
            Worker worker = workers.get(workerId);
            if (worker == null) {
                LOG.warn("worker {} not exist when replay remove worker, just ignore.", workerId);
                return;
            }
            // placeholders use the same {} style as every other LogUtils.fatal call in this class
            if (!worker.getServiceId().equals(serviceId)) {
                LogUtils.fatal(LOG, "worker {} mismatch service id, [expected:{}, found:{}], when replay remove worker, "
                        + "should not happen!", workerId, serviceId, worker.getServiceId());
            }
            if (worker.getGroupId() != groupId) {
                LogUtils.fatal(LOG, "worker {} mismatch group id, [expected:{}, found:{}], when replay remove worker, "
                        + "should not happen!", workerId, groupId, worker.getGroupId());
            }

            getServiceWorkerGroup(serviceId).removeWorker(worker);
            workers.remove(workerId);

            // TODO: state machine
        }
    }

    /**
     * Replay an update-worker operation: replaces the in-memory workers with the given ones
     * without writing a journal. All ids are checked before any worker is replaced.
     *
     * @param serviceId service the workers belong to
     * @param workers updated workers, each id is expected to be registered already
     */
    public void replayUpdateWorker(String serviceId, List<Worker> workers) {
        try (LockCloseable ignored = new LockCloseable(writeLock())) {
            for (Worker worker : workers) {
                if (this.workers.get(worker.getWorkerId()) == null) {
                    LogUtils.fatal(LOG, "worker {} not exist when replay update worker, should not happen!",
                            worker.getWorkerId());
                }
            }
            for (Worker worker : workers) {
                getServiceWorkerGroup(serviceId).updateWorker(worker);
                this.workers.put(worker.getWorkerId(), worker);
            }

            // TODO: state machine
        }
    }

    // FOR TEST
    public int getWorkerCount() {
        try (LockCloseable ignored = new LockCloseable(readLock())) {
            return workers.size();
        }
    }

    // FOR TEST
    public void cleanStarletAgentForAll() {
        try (LockCloseable ignored = new LockCloseable(writeLock())) {
            for (Worker worker : workers.values()) {
                worker.cleanStarletAgent();
            }
        }
    }

    /**
     * Serialize all workers into a single WorkerFileMeta message and write it to the stream.
     *
     * @param out destination stream
     * @throws IOException if writing fails
     */
    public void dumpMeta(DataOutputStream out) throws IOException {
        try (LockCloseable ignored = new LockCloseable(readLock())) {
            LOG.debug("start dump worker manager meta data to file.");

            WorkerFileMeta.Builder builder = WorkerFileMeta.newBuilder();

            for (Worker worker : workers.values()) {
                builder.addWorkerInfos(worker.toProtobuf());
            }

            WorkerFileMeta meta = builder.build();
            byte[] bytes = meta.toByteArray();
            Text.writeBytes(out, bytes);

            LOG.debug("end dump worker manager meta data to file.");
        }
    }

    /**
     * Read a WorkerFileMeta message from the stream and register every worker it contains.
     * The ServiceWorkerGroup of each worker's service must already exist.
     *
     * @param in source stream
     * @throws IOException if reading or parsing fails
     */
    public void loadMeta(DataInputStream in) throws IOException {
        try (LockCloseable ignored = new LockCloseable(writeLock())) {
            LOG.debug("start load worker manager meta data from file.");

            byte[] bytes = Text.readBytes(in);
            WorkerFileMeta meta = WorkerFileMeta.parseFrom(bytes);

            List<WorkerInfo> workerInfos = meta.getWorkerInfosList();
            for (WorkerInfo info : workerInfos) {
                Worker worker = Worker.fromProtobuf(info);
                getServiceWorkerGroup(worker.getServiceId()).addWorker(worker);
                workers.put(worker.getWorkerId(), worker);
            }

            LOG.debug("end load worker manager meta data from file.");
        }
    }

    /**
     * Write every worker as one JSON document followed by a newline.
     *
     * @param out destination stream
     * @throws IOException if printing or writing fails
     */
    public void dump(DataOutputStream out) throws IOException {
        try (LockCloseable ignored = new LockCloseable(readLock())) {
            JsonFormat.Printer printer = JsonFormat.printer();
            for (Worker worker : workers.values()) {
                String s = printer.print((Message) worker.toProtobuf()) + "\n";
                // DataOutputStream.writeBytes() keeps only the low byte of each char, which
                // corrupts non-ASCII text; encode explicitly instead
                out.write(s.getBytes(StandardCharsets.UTF_8));
            }
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy