// com.staros.worker.WorkerManager Maven / Gradle / Ivy
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.staros.worker;
import com.google.protobuf.Message;
import com.google.protobuf.util.JsonFormat;
import com.staros.exception.ExceptionCode;
import com.staros.exception.StarException;
import com.staros.journal.DummyJournalSystem;
import com.staros.journal.Journal;
import com.staros.journal.JournalSystem;
import com.staros.proto.WorkerFileMeta;
import com.staros.proto.WorkerInfo;
import com.staros.util.IdGenerator;
import com.staros.util.LockCloseable;
import com.staros.util.LogUtils;
import com.staros.util.Text;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
/**
* Worker Manager handles all operations that are related
* to worker, such as add worker and remove worker.
*/
public class WorkerManager {
private static final Logger LOG = LogManager.getLogger(WorkerManager.class);

// serviceId -> ServiceWorkerGroup
private final Map<String, ServiceWorkerGroup> serviceWorkerGroups;
// workerId -> Worker
private final Map<Long, Worker> workers;
// taken (read or write) by the methods below before they touch the two maps above
private final ReentrantReadWriteLock lock;
private final JournalSystem journalSystem;
private final IdGenerator idGenerator;
private final AtomicInteger roundRobinOffset; // help choose worker in round-robin way when schedule
/**
 * FOR TEST: builds a manager backed by a {@link DummyJournalSystem} and an
 * {@link IdGenerator} constructed with a {@code null} argument.
 */
public WorkerManager() {
this(new DummyJournalSystem(), new IdGenerator(null));
}
/**
 * Creates an empty worker manager: no service worker group and no worker is registered yet.
 *
 * @param journalSystem journal system kept for later journal writes
 * @param idGenerator generator kept for allocating group and worker ids
 */
public WorkerManager(JournalSystem journalSystem, IdGenerator idGenerator) {
    // collaborators handed in by the caller
    this.journalSystem = journalSystem;
    this.idGenerator = idGenerator;

    // fresh internal state
    this.lock = new ReentrantReadWriteLock();
    this.roundRobinOffset = new AtomicInteger();
    // TODO: load from storage
    this.workers = new HashMap<>();
    this.serviceWorkerGroups = new HashMap<>();
}
/**
 * Registers a fresh {@link ServiceWorkerGroup} for the service while holding the write lock.
 * Called when a service is bootstrapped. An entry already stored under the same
 * service id is replaced.
 *
 * @param serviceId id of the bootstrapped service
 */
public void createDefaultServiceWorkerGroup(String serviceId) {
    try (LockCloseable ignored = new LockCloseable(writeLock())) {
        ServiceWorkerGroup freshGroup = new ServiceWorkerGroup(serviceId);
        serviceWorkerGroups.put(serviceId, freshGroup);
    }
}
/**
 * Allocates a new group id and creates a worker group with that id in the given service.
 *
 * @param serviceId service that will own the new group
 * @return id of the newly created worker group
 */
public long createWorkerGroup(String serviceId) {
    try (LockCloseable ignored = new LockCloseable(writeLock())) {
        ServiceWorkerGroup owner = getServiceWorkerGroup(serviceId);
        final long newGroupId = idGenerator.getNextId();
        owner.createGroup(newGroupId);
        return newGroupId;
    }
}
/**
 * Registers a new worker in a worker group of a service.
 * <p>
 * The add-worker journal entry is written before the in-memory maps are updated. A first
 * heartbeat is attempted right away; a failure there is only logged.
 *
 * @param serviceId service the worker group belongs to
 * @param groupId target worker group
 * @param ipPort worker address; must not be used by any registered worker
 * @return worker id
 * @throws StarException NOT_EXIST if the group does not exist in the service,
 *         ALREADY_EXIST if a registered worker already uses {@code ipPort}
 */
public long addWorker(String serviceId, long groupId, String ipPort) throws StarException {
    try (LockCloseable ignored = new LockCloseable(writeLock())) {
        // the target group must already exist
        ServiceWorkerGroup owner = getServiceWorkerGroup(serviceId);
        boolean groupKnown = owner.existGroup(groupId);
        if (!groupKnown) {
            String reason = String.format("worker group %d not exist in service %s.", groupId, serviceId);
            throw new StarException(ExceptionCode.NOT_EXIST, reason);
        }

        // the address must be unique across all registered workers
        for (Worker existing : workers.values()) {
            boolean sameAddress = existing.getIpPort().equals(ipPort);
            if (sameAddress) {
                String reason = String.format("worker address %s already exist.", ipPort);
                throw new StarException(ExceptionCode.ALREADY_EXIST, reason);
            }
        }

        final long newWorkerId = idGenerator.getNextId();
        final Worker added = new Worker(serviceId, groupId, newWorkerId, ipPort);

        // persist first, then publish in memory
        journalSystem.write(Journal.logAddWorker(added));
        owner.addWorker(added);
        workers.put(newWorkerId, added);
        LOG.info("worker {} added to group {} in service {}.", newWorkerId, groupId, serviceId);

        try {
            added.heartbeat();
        } catch (Exception e) {
            // it's ok, later heartbeat manager will take care of this
            LOG.warn("worker {} fail to handle first heartbeat, {}.", newWorkerId, e.getMessage());
        }
        return newWorkerId;
    }
}
/**
 * Remove a worker from a worker group in a service.
 * <p>
 * The remove-worker journal entry is written before the in-memory maps are updated.
 *
 * @param serviceId service the worker is expected to belong to
 * @param groupId worker group the worker is expected to belong to
 * @param workerId id of the worker to remove
 * @throws StarException NOT_EXIST if the group or the worker does not exist,
 *         INVALID_ARGUMENT if the worker belongs to another service or group
 */
public void removeWorker(String serviceId, long groupId, long workerId) throws StarException {
    try (LockCloseable ignored = new LockCloseable(writeLock())) {
        // validate worker group
        ServiceWorkerGroup serviceWorkerGroup = getServiceWorkerGroup(serviceId);
        if (!serviceWorkerGroup.existGroup(groupId)) {
            throw new StarException(ExceptionCode.NOT_EXIST,
                    String.format("worker group %d not exist in service %s.", groupId, serviceId));
        }

        Worker worker = workers.get(workerId);
        if (worker == null) {
            throw new StarException(ExceptionCode.NOT_EXIST,
                    String.format("worker %d not exist.", workerId));
        }
        if (!worker.getServiceId().equals(serviceId)) {
            // %d must receive the numeric worker id; passing the service id here made
            // String.format itself fail instead of reporting the mismatch
            throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                    String.format("worker %d not belong to service %s.", workerId, serviceId));
        }
        if (worker.getGroupId() != groupId) {
            // report the worker id, not the group id the worker is actually in
            throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                    String.format("worker %d not belong to group %d.", workerId, groupId));
        }

        Journal journal = Journal.logRemoveWorker(serviceId, groupId, workerId);
        journalSystem.write(journal);

        // reuse the group looked up above instead of resolving it a second time
        serviceWorkerGroup.removeWorker(worker);
        workers.remove(workerId);
        LOG.info("worker {} removed from group {} in service {}.", workerId, groupId, serviceId);
    }
}
/**
 * Looks up a worker by id under the read lock and returns its protobuf representation.
 *
 * @param workerId id of the worker
 * @return protobuf info of the worker
 * @throws StarException NOT_EXIST if no worker is registered under {@code workerId}
 */
public WorkerInfo getWorkerInfo(long workerId) throws StarException {
    try (LockCloseable ignored = new LockCloseable(readLock())) {
        Worker found = workers.get(workerId);
        if (found != null) {
            return found.toProtobuf();
        }
        throw new StarException(ExceptionCode.NOT_EXIST,
                String.format("worker %d not exist.", workerId));
    }
}
/**
 * Scans the registered workers under the read lock and returns the protobuf representation
 * of the first one whose address equals {@code ipPort}.
 *
 * @param ipPort worker address to search for
 * @return protobuf info of the matching worker
 * @throws StarException NOT_EXIST if no registered worker has that address
 */
public WorkerInfo getWorkerInfo(String ipPort) throws StarException {
    try (LockCloseable ignored = new LockCloseable(readLock())) {
        for (Worker candidate : workers.values()) {
            boolean addressMatches = candidate.getIpPort().equals(ipPort);
            if (addressMatches) {
                return candidate.toProtobuf();
            }
        }
        String reason = String.format("worker %s not exist.", ipPort);
        throw new StarException(ExceptionCode.NOT_EXIST, reason);
    }
}
/**
 * Resolves the {@link ServiceWorkerGroup} of a service. The caller must already hold the lock.
 * A missing entry is reported through {@code LogUtils.fatal}; the looked-up value is returned
 * afterwards either way.
 *
 * @param serviceId id of the service
 * @return the entry stored for {@code serviceId}
 */
private ServiceWorkerGroup getServiceWorkerGroup(String serviceId) {
    ServiceWorkerGroup found = serviceWorkerGroups.get(serviceId);
    if (found == null) {
        LogUtils.fatal(LOG, "ServiceWorkerGroup not exist for service {}!", serviceId);
    }
    return found;
}
/**
 * Returns a default worker group of the service.
 *
 * @param serviceId service identity
 * @return the result of {@code ServiceWorkerGroup.getDefaultWorkerGroup()} for the service
 * @throws StarException NOT_EXIST if no service worker group could be resolved for the service
 */
public WorkerGroup getDefaultWorkerGroup(String serviceId) throws StarException {
    // go through readLock() like the other accessors of this class
    try (LockCloseable ignored = new LockCloseable(readLock())) {
        ServiceWorkerGroup swg = getServiceWorkerGroup(serviceId);
        // NOTE(review): getServiceWorkerGroup already reports a missing entry via LogUtils.fatal,
        // so this branch is only reached if that call returns normally -- confirm.
        if (swg == null) {
            throw new StarException(ExceptionCode.NOT_EXIST, String.format("No workerGroup in service %s", serviceId));
        }
        // reuse the instance resolved above instead of looking it up a second time
        return swg.getDefaultWorkerGroup();
    }
}
/**
 * Returns the worker group with the given id in the given service.
 * <p>
 * The lookup runs under the read lock, because {@code getServiceWorkerGroup} requires the lock
 * to be held. The lock is a {@code ReentrantReadWriteLock}, so a caller that already holds the
 * read or write lock on the same thread can still call this method.
 *
 * @param serviceId id of the service
 * @param groupId id of the worker group
 * @return whatever {@code ServiceWorkerGroup.getWorkerGroup(groupId)} returns for the service
 */
public WorkerGroup getWorkerGroup(String serviceId, long groupId) {
    try (LockCloseable ignored = new LockCloseable(readLock())) {
        return getServiceWorkerGroup(serviceId).getWorkerGroup(groupId);
    }
}
/**
 * Returns the read lock of this manager's {@link ReentrantReadWriteLock}.
 */
public Lock readLock() {
return lock.readLock();
}
/**
 * Returns the write lock of this manager's {@link ReentrantReadWriteLock}.
 */
public Lock writeLock() {
return lock.writeLock();
}
/**
 * Looks up a worker by id under the read lock.
 *
 * @param workerId id of the worker
 * @return the registered worker, or {@code null} if the map holds no entry for {@code workerId}
 */
// TODO: the returned worker is not safe, need look into it later
public Worker getWorker(long workerId) throws StarException {
    try (LockCloseable ignored = new LockCloseable(readLock())) {
        Worker found = workers.get(workerId);
        return found;
    }
}
/**
 * Calls {@code heartbeat()} on every registered worker while holding the write lock.
 */
public void sendHeartbeatForAll() {
    try (LockCloseable ignored = new LockCloseable(writeLock())) {
        for (Worker target : workers.values()) {
            target.heartbeat();
        }
    }
}
/**
 * Process worker heartbeat, update worker's info.
 * <p>
 * The worker's last-seen time is always refreshed. The worker is journaled only when the
 * value of the pair returned by {@code Worker.updateInfo} is {@code true}.
 *
 * @param serviceId service the worker is expected to belong to
 * @param workerId id of the worker the heartbeat is for
 * @param startTime forwarded to {@code Worker.updateInfo}
 * @param numOfShards forwarded to {@code Worker.updateInfo}
 * @param workerProperties forwarded to {@code Worker.updateInfo}
 *        (NOTE(review): declared without type arguments here; align with the parameter type
 *        of {@code Worker.updateInfo})
 * @param lastSeenTime forwarded to {@code Worker.updateLastSeenTime}
 * @return the key of the pair returned by {@code Worker.updateInfo}
 * @throws StarException NOT_EXIST if the worker is unknown,
 *         INVALID_ARGUMENT if the worker belongs to another service
 */
public boolean processWorkerHeartbeat(String serviceId, long workerId,
        long startTime, long numOfShards, Map workerProperties, long lastSeenTime) {
    try (LockCloseable ignored = new LockCloseable(writeLock())) {
        Worker worker = workers.get(workerId);
        if (worker == null) {
            throw new StarException(ExceptionCode.NOT_EXIST,
                    String.format("worker %d not exist.", workerId));
        }
        if (!worker.getServiceId().equals(serviceId)) {
            throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                    String.format("worker %d does not belong to service %s.", workerId, serviceId));
        }

        worker.updateLastSeenTime(lastSeenTime);

        // key: result handed back to the caller; value: whether the worker must be journaled
        Pair<Boolean, Boolean> pair = worker.updateInfo(startTime, workerProperties, numOfShards);
        if (pair.getValue()) {
            // NOTE: if fails to persist, worker info is not reverted, it should be fine
            Journal journal = Journal.logUpdateWorker(serviceId, Collections.singletonList(worker));
            journalSystem.write(journal);
        }
        return pair.getKey();
    }
}
/**
 * Replays an add-worker journal entry: registers the worker in its service worker group and
 * in the worker map, under the write lock. An already registered worker id is reported
 * through {@code LogUtils.fatal}.
 *
 * @param serviceId service the worker is added to
 * @param worker worker to register
 */
public void replayAddWorker(String serviceId, Worker worker) {
    try (LockCloseable ignored = new LockCloseable(writeLock())) {
        Worker alreadyThere = workers.get(worker.getWorkerId());
        if (alreadyThere != null) {
            LogUtils.fatal(LOG, "worker {} already exist when replay add worker, should not happen!", worker.getWorkerId());
        }

        ServiceWorkerGroup owner = getServiceWorkerGroup(serviceId);
        owner.addWorker(worker);
        workers.put(worker.getWorkerId(), worker);
        // TODO: state machine
    }
}
/**
 * Replays a remove-worker journal entry under the write lock.
 * <p>
 * An unknown worker id is logged and ignored. A service id or group id that does not match
 * the registered worker is reported through {@code LogUtils.fatal}.
 *
 * @param serviceId service recorded in the journal entry
 * @param groupId worker group recorded in the journal entry
 * @param workerId id of the worker to remove
 */
public void replayRemoveWorker(String serviceId, long groupId, long workerId) {
    try (LockCloseable ignored = new LockCloseable(writeLock())) {
        Worker worker = workers.get(workerId);
        if (worker == null) {
            LOG.warn("worker {} not exist when replay remove worker, just ignore.", workerId);
            return;
        }

        // The messages below use "{}" for every argument, like the other log calls in this
        // class; the former "%s"/"%d" markers were never substituted.
        if (!worker.getServiceId().equals(serviceId)) {
            LogUtils.fatal(LOG, "worker {} mismatch service id, [expected:{}, found:{}], when replay remove worker, "
                    + "should not happen!", workerId, serviceId, worker.getServiceId());
        }
        if (worker.getGroupId() != groupId) {
            LogUtils.fatal(LOG, "worker {} mismatch group id, [expected:{}, found:{}], when replay remove worker, "
                    + "should not happen!", workerId, groupId, worker.getGroupId());
        }

        getServiceWorkerGroup(serviceId).removeWorker(worker);
        workers.remove(workerId);
        // TODO: state machine
    }
}
/**
 * Replays an update-worker journal entry under the write lock.
 * <p>
 * All workers are checked first (an unknown worker id is reported through
 * {@code LogUtils.fatal}); only then is each worker updated in its service worker group and
 * replaced in the worker map.
 *
 * @param serviceId service the workers belong to
 * @param workers updated worker objects, keyed by their own worker id
 */
public void replayUpdateWorker(String serviceId, List<Worker> workers) {
    try (LockCloseable ignored = new LockCloseable(writeLock())) {
        // pass 1: every worker must already be registered
        for (Worker worker : workers) {
            if (this.workers.get(worker.getWorkerId()) == null) {
                LogUtils.fatal(LOG, "worker {} not exist when replay update worker, should not happen!",
                        worker.getWorkerId());
            }
        }

        // pass 2: apply the updates
        for (Worker worker : workers) {
            getServiceWorkerGroup(serviceId).updateWorker(worker);
            this.workers.put(worker.getWorkerId(), worker);
        }
        // TODO: state machine
    }
}
/**
 * FOR TEST: returns the number of registered workers, read under the read lock.
 */
public int getWorkerCount() {
    try (LockCloseable ignored = new LockCloseable(readLock())) {
        final int registered = workers.size();
        return registered;
    }
}
/**
 * FOR TEST: calls {@code cleanStarletAgent()} on every registered worker while holding the
 * write lock.
 */
public void cleanStarletAgentForAll() {
    try (LockCloseable ignored = new LockCloseable(writeLock())) {
        for (Worker target : workers.values()) {
            target.cleanStarletAgent();
        }
    }
}
/**
 * Serializes all registered workers into one {@link WorkerFileMeta} message and writes its
 * bytes to {@code out} through {@code Text.writeBytes}, under the read lock.
 *
 * @param out stream receiving the serialized meta
 * @throws IOException if writing to {@code out} fails
 */
public void dumpMeta(DataOutputStream out) throws IOException {
    try (LockCloseable ignored = new LockCloseable(readLock())) {
        LOG.debug("start dump worker manager meta data to file.");

        WorkerFileMeta.Builder metaBuilder = WorkerFileMeta.newBuilder();
        for (Worker each : workers.values()) {
            metaBuilder.addWorkerInfos(each.toProtobuf());
        }
        Text.writeBytes(out, metaBuilder.build().toByteArray());

        LOG.debug("end dump worker manager meta data to file.");
    }
}
/**
 * Reads one {@link WorkerFileMeta} message from {@code in} through {@code Text.readBytes} and
 * registers every worker it contains, under the write lock.
 * <p>
 * Each worker is added to the service worker group of its own service id, so those groups
 * must have been created before this method runs.
 *
 * @param in stream positioned at the serialized meta
 * @throws IOException if reading or parsing the meta fails
 */
public void loadMeta(DataInputStream in) throws IOException {
    try (LockCloseable ignored = new LockCloseable(writeLock())) {
        LOG.debug("start load worker manager meta data from file.");

        byte[] bytes = Text.readBytes(in);
        WorkerFileMeta meta = WorkerFileMeta.parseFrom(bytes);
        // typed list so the loop below can iterate WorkerInfo elements directly
        List<WorkerInfo> workerInfos = meta.getWorkerInfosList();
        for (WorkerInfo info : workerInfos) {
            Worker worker = Worker.fromProtobuf(info);
            getServiceWorkerGroup(worker.getServiceId()).addWorker(worker);
            workers.put(worker.getWorkerId(), worker);
        }

        LOG.debug("end load worker manager meta data from file.");
    }
}
/**
 * Writes every registered worker to {@code out} as one JSON document followed by a newline,
 * under the read lock.
 * <p>
 * The text is encoded as UTF-8. {@code DataOutputStream.writeBytes} keeps only the low eight
 * bits of each character, which would corrupt any non-ASCII character in the JSON; for pure
 * ASCII content the bytes written are the same as before.
 *
 * @param out stream receiving the JSON text
 * @throws IOException if printing a worker or writing to {@code out} fails
 */
public void dump(DataOutputStream out) throws IOException {
    try (LockCloseable ignored = new LockCloseable(readLock())) {
        // the printer does not depend on the worker, so build it once
        JsonFormat.Printer printer = JsonFormat.printer();
        for (Worker worker : workers.values()) {
            String s = printer.print((Message) worker.toProtobuf()) + "\n";
            out.write(s.getBytes(StandardCharsets.UTF_8));
        }
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy