All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.staros.shard.ShardManager Maven / Gradle / Ivy

There is a newer version: 3.4-rc2
Show newest version
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


package com.staros.shard;

import com.google.common.base.Preconditions;
import com.google.protobuf.util.JsonFormat;
import com.staros.exception.ExceptionCode;
import com.staros.exception.StarException;
import com.staros.filecache.FileCache;
import com.staros.filestore.FilePath;
import com.staros.filestore.FileStore;
import com.staros.filestore.FileStoreMgr;
import com.staros.filestore.FileStoreMgrs;
import com.staros.journal.Journal;
import com.staros.journal.JournalSystem;
import com.staros.proto.CreateMetaGroupInfo;
import com.staros.proto.CreateShardGroupInfo;
import com.staros.proto.CreateShardInfo;
import com.staros.proto.CreateShardJournalInfo;
import com.staros.proto.DeleteMetaGroupInfo;
import com.staros.proto.DeleteShardGroupInfo;
import com.staros.proto.FileStoreType;
import com.staros.proto.MetaGroupInfo;
import com.staros.proto.MetaGroupJournalInfo;
import com.staros.proto.PlacementPolicy;
import com.staros.proto.PlacementPreference;
import com.staros.proto.PlacementRelationship;
import com.staros.proto.ShardFileMetaFooter;
import com.staros.proto.ShardFileMetaHeader;
import com.staros.proto.ShardGroupInfo;
import com.staros.proto.ShardInfo;
import com.staros.proto.UpdateMetaGroupInfo;
import com.staros.schedule.Scheduler;
import com.staros.util.Config;
import com.staros.util.Constant;
import com.staros.util.IdGenerator;
import com.staros.util.LockCloseable;
import com.staros.util.LogUtils;
import com.staros.util.Text;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.BiFunction;
import java.util.stream.Collectors;

/**
 * Shard Manager manages all the shards of a single service, including
 * shard group operations and shard operations.
 */
public class ShardManager {
    private static final Logger LOG = LogManager.getLogger(ShardManager.class);

    // Id of the service this manager belongs to.
    private final String serviceId;

    // TODO: split shards to multiple group,
    //       some of the functions below will be too heavy when shard number grows
    private Map<Long, Shard> shards; // <shardId, Shard>
    private final Map<Long, ShardGroup> shardGroups; // <groupId, ShardGroup>
    private final Map<Long, MetaGroup> metaGroups; // <metaGroupId, MetaGroup>

    // Read/write lock taken by the public methods before they touch the maps above.
    private final ReentrantReadWriteLock lock;

    // Mutations are journaled here before they are applied in memory (replay paths skip the write).
    private final JournalSystem journalSystem;
    // Source of ids for shards, shard groups and meta groups when the caller did not assign one.
    private final IdGenerator idGenerator;

    // Used to schedule newly created shards.
    private final Scheduler shardScheduler;

    /**
     * Build an empty shard manager for one service and register its default shard group.
     *
     * @param serviceId      id of the owning service
     * @param journalSystem  journal that mutations are written to
     * @param idGenerator    generator for new shard / group ids
     * @param shardScheduler scheduler used for newly created shards
     */
    public ShardManager(String serviceId, JournalSystem journalSystem, IdGenerator idGenerator,
                        Scheduler shardScheduler) {
        // TODO: load from storage

        // collaborators
        this.journalSystem = journalSystem;
        this.idGenerator = idGenerator;
        this.shardScheduler = shardScheduler;

        // in-memory state, empty until something is created or replayed
        this.serviceId = serviceId;
        this.lock = new ReentrantReadWriteLock();
        this.metaGroups = new HashMap<>();
        this.shardGroups = new HashMap<>();
        this.shards = new HashMap<>();

        // the default shard group always exists
        ShardGroup defaultGroup = new ShardGroup(serviceId, Constant.DEFAULT_ID);
        this.shardGroups.put(Constant.DEFAULT_ID, defaultGroup);
    }

    /**
     * @return the id of the service this manager belongs to (read under the read lock)
     */
    public String getServiceId() {
        try (LockCloseable readGuard = new LockCloseable(lock.readLock())) {
            String id = serviceId;
            return id;
        }
    }

    /**
     * Create one shard group per request entry.
     * The groups get freshly generated ids, are journaled as a single record and only then
     * become visible in memory.
     *
     * @param createShardGroupInfos creation requests, must not be empty
     * @return protobuf description of every created group, in request order
     * @throws StarException INVALID_ARGUMENT if the request list is empty
     */
    public List<ShardGroupInfo> createShardGroup(List<CreateShardGroupInfo> createShardGroupInfos)
            throws StarException {
        if (createShardGroupInfos.isEmpty()) {
            throw new StarException(ExceptionCode.INVALID_ARGUMENT, "shard group info can not be empty.");
        }

        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            List<ShardGroup> shardGroupsToCreate = new ArrayList<>(createShardGroupInfos.size());
            for (CreateShardGroupInfo info : createShardGroupInfos) {
                ShardGroup shardGroup = new ShardGroup(serviceId, idGenerator.getNextId(), info.getPolicy(), false,
                        Constant.DEFAULT_ID /* metaGroupId */, info.getLabelsMap(), info.getPropertiesMap());
                shardGroupsToCreate.add(shardGroup);
            }

            // journal first, so nothing is published in memory if the write fails
            Journal journal = Journal.logCreateShardGroup(serviceId, shardGroupsToCreate);
            journalSystem.write(journal);

            List<ShardGroupInfo> shardGroupInfos = new ArrayList<>(shardGroupsToCreate.size());
            for (ShardGroup shardGroup : shardGroupsToCreate) {
                shardGroups.put(shardGroup.getGroupId(), shardGroup);
                shardGroupInfos.add(shardGroup.toProtobuf());
            }
            return shardGroupInfos;
        }
    }


    /**
     * Delete shard groups and journal the operation.
     *
     * @param groupIds     ids of the groups to delete, must not be empty
     * @param deleteShards whether the shards contained in the groups are deleted as well
     * @throws StarException INVALID_ARGUMENT if the list is empty or contains the default group id
     */
    public void deleteShardGroup(List<Long> groupIds, boolean deleteShards) throws StarException {
        deleteShardGroupInternal(groupIds, deleteShards, false /* isReplay */);
    }

    /**
     * Delete shard groups, optionally cascading to the shards they contain.
     * Unknown group ids are ignored; the default group can never be deleted.
     *
     * @param groupIds     ids of the groups to delete, must not be empty
     * @param deleteShards true to remove the member shards, false to only detach them from the group
     * @param isReplay     true when applying a journal record; no new journal entry is written then
     * @throws StarException INVALID_ARGUMENT if the list is empty or contains the default group id
     */
    private void deleteShardGroupInternal(List<Long> groupIds, boolean deleteShards, boolean isReplay)
            throws StarException {
        if (groupIds.isEmpty()) {
            throw new StarException(ExceptionCode.INVALID_ARGUMENT, "shard group id can not be empty.");
        }
        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            List<ShardGroup> groupsToDelete = new ArrayList<>(groupIds.size());

            for (Long groupId : groupIds) {
                if (groupId == Constant.DEFAULT_ID) {
                    throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                            String.format("default shard group %d can not be deleted in service %s.", groupId, serviceId));
                }
                ShardGroup shardGroup = shardGroups.get(groupId);
                if (shardGroup == null) {
                    continue;
                }
                groupsToDelete.add(shardGroup);
            }
            if (!isReplay) {
                // only the groups that really exist are journaled
                List<Long> groupIdsToDelete =
                        groupsToDelete.stream().map(ShardGroup::getGroupId).collect(Collectors.toList());
                DeleteShardGroupInfo info = DeleteShardGroupInfo.newBuilder()
                        .addAllGroupIds(groupIdsToDelete)
                        .setCascadeDeleteShard(deleteShards)
                        .build();

                Journal journal = Journal.logDeleteShardGroup(serviceId, info);
                journalSystem.write(journal);
            }

            for (ShardGroup shardGroup : groupsToDelete) {
                shardGroups.remove(shardGroup.getGroupId());
                List<Long> shardIds = shardGroup.getShardIds();
                for (long id : shardIds) {
                    Shard shard = shards.get(id);
                    if (shard == null) {
                        // shard can be removed by other shard group that is deleted prior to this shard group
                        continue;
                    }
                    if (deleteShards) {
                        // delete shards
                        removeShardInternalNoLock(shard);
                    } else {
                        // update shards' group_id list
                        shard.quitGroup(shardGroup.getGroupId());
                    }
                }
            }
        }
    }

    /**
     * List the shard groups of this service.
     *
     * @param includeAnonymousGroup whether anonymous groups are part of the result
     * @return protobuf description of every matching shard group
     */
    public List<ShardGroupInfo> listShardGroupInfo(boolean includeAnonymousGroup) throws StarException {
        try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
            List<ShardGroupInfo> shardGroupInfos = new ArrayList<>();
            for (ShardGroup shardGroup : shardGroups.values()) {
                if (!includeAnonymousGroup && shardGroup.isAnonymous()) {
                    continue;
                }
                shardGroupInfos.add(shardGroup.toProtobuf());
            }
            return shardGroupInfos;
        }
    }

    /**
     * Register a shard in the shard map and in every shard group it lists.
     * No lock is taken here; the caller must already hold ShardManager's lock.
     *
     * @param shard the shard to register.
     * @return the shard previously stored under the same id, if any
     */
    private Shard addShardInternalNoLock(Shard shard) {
        assert shard != null;
        Shard previous = shards.put(shard.getShardId(), shard);
        for (Long owningGroupId : shard.getGroupIds()) {
            ShardGroup owner = shardGroups.get(owningGroupId);
            assert owner != null;
            // the result of addShardId is deliberately not checked
            owner.addShardId(shard.getShardId());
        }
        return previous;
    }


    /**
     * Unregister a shard from the shard map and from every shard group it lists.
     * An anonymous group that is not owned by a meta group is dissolved once it holds fewer
     * than two shards. No lock is taken here; the caller must already hold ShardManager's lock.
     *
     * @param shard the shard to unregister.
     * @return the shard that was stored under that id
     */
    private Shard removeShardInternalNoLock(Shard shard) {
        assert shard != null;
        Shard removed = shards.remove(shard.getShardId());
        for (Long owningGroupId : shard.getGroupIds()) {
            ShardGroup group = shardGroups.get(owningGroupId);
            if (group == null) {
                // the group may be gone already
                continue;
            }
            // the result of removeShardId is deliberately not checked
            group.removeShardId(shard.getShardId());

            // groups created for a placement preference are anonymous and have no meta group
            boolean preferenceGroup = group.isAnonymous() && group.getMetaGroupId() == Constant.DEFAULT_ID;
            if (!preferenceGroup || group.getShardIds().size() >= 2) {
                continue;
            }
            // fewer than two members left: detach the remaining ones and drop the group
            for (long memberId : group.getShardIds()) {
                shards.get(memberId).quitGroup(group.getGroupId());
            }
            shardGroups.remove(group.getGroupId());
        }
        return removed;
    }

    /**
     * Publish prepared shards and their anonymous shard groups into the in-memory maps.
     * The caller is expected to hold the write lock.
     *
     * @param shardList            shards to register
     * @param anonymousShardGroups anonymous groups to register; every shard id they hold
     *                             joins the group
     */
    private void commitCreateShard(List<Shard> shardList, List<ShardGroup> anonymousShardGroups) {
        // TODO: optimize logic here
        // groups go first so that they are already known when shards are added
        for (ShardGroup anonymousShardGroup : anonymousShardGroups) {
            shardGroups.put(anonymousShardGroup.getGroupId(), anonymousShardGroup);
        }
        for (Shard shard : shardList) {
            // ignore return value on purpose
            addShardInternalNoLock(shard);
        }
        for (ShardGroup anonymousShardGroup : anonymousShardGroups) {
            for (Long sid : anonymousShardGroup.getShardIds()) {
                // ignore return value on purpose
                shards.get(sid).joinGroup(anonymousShardGroup.getGroupId());
            }
        }
    }

    /**
     * Validate a placement preference attached to a shard creation request.
     * Only the PACK policy with a WITH_SHARD relationship is accepted, the referenced shard
     * must exist, and it must expect the same replica number as the new shard.
     *
     * @param preference the preference to validate
     * @param replicaNum expected replica number of the shard being created
     * @throws StarException INVALID_ARGUMENT or NOT_EXIST when a rule above is violated
     */
    private void checkPlacementPreference(PlacementPreference preference, int replicaNum) throws StarException {
        PlacementPolicy policy = preference.getPlacementPolicy();
        if (policy != PlacementPolicy.PACK) {
            throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                    String.format("shard placement preference does not support %s policy in service %s.",
                            policy.name(), serviceId));
        }

        PlacementRelationship relationship = preference.getPlacementRelationship();
        if (relationship != PlacementRelationship.WITH_SHARD) {
            throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                    String.format("shard placement preference does not support %s relationship in service %s.",
                            relationship.name(), serviceId));
        }

        long referencedId = preference.getRelationshipTargetId();
        Shard referenced = shards.get(referencedId);
        if (referenced == null) {
            throw new StarException(ExceptionCode.NOT_EXIST,
                    String.format("shard placement preference target id %d not exist in service %s.",
                            referencedId, serviceId));
        }

        // Reminder: the replica number comparison below only applies to the PACK policy.
        Preconditions.checkState(preference.getPlacementPolicy() == PlacementPolicy.PACK);
        int referencedReplicaNum = referenced.getExpectedReplicaCount();
        if (referencedReplicaNum != replicaNum) {
            throw new StarException(ExceptionCode.INVALID_ARGUMENT, String.format(
                    "shard has a different replica num than target referenced shard. our:%d, target:%d",
                    replicaNum,
                    referencedReplicaNum));
        }
    }

    /**
     * Create a batch of shards.
     * All requests are validated first, then journaled as one record, then committed to memory.
     * A failure during validation leaves the in-memory state untouched.
     *
     * @param createShardInfos creation requests; a request without shard id gets a generated one
     * @param fsMgrs           file store managers used to resolve or allocate each shard's file store
     * @return protobuf description of the created shards, in request order
     * @throws StarException ALREADY_EXIST if a shard id is already used, either by an existing shard
     *                       or by an earlier entry of the same request; NOT_EXIST if a referenced
     *                       shard group or file store is unknown; INVALID_ARGUMENT on a replica
     *                       number or placement preference violation
     */
    public List<ShardInfo> createShard(List<CreateShardInfo> createShardInfos, FileStoreMgrs fsMgrs)
            throws StarException {
        List<Shard> shardsToCreate = new ArrayList<>(createShardInfos.size());
        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            // prepare shards
            List<ShardGroup> anonymousShardGroups = new ArrayList<>();
            // Shards prepared so far in this request, by id. Without this check a repeated id
            // would pass validation and the later shard would overwrite the earlier one on commit.
            Map<Long, Shard> pendingShards = new HashMap<>();
            for (CreateShardInfo info : createShardInfos) {
                long shardId = info.getShardId();
                if (shardId == Constant.DEFAULT_ID) {
                    // application did not assign one
                    shardId = idGenerator.getNextId();
                }

                if (shards.containsKey(shardId) || pendingShards.containsKey(shardId)) {
                    throw new StarException(ExceptionCode.ALREADY_EXIST,
                            String.format("shard %d already exist in service %s.", shardId, serviceId));
                }
                int shardReplicaNum =
                        info.getReplicaCount() == Constant.DEFAULT_PROTOBUF_INTEGER ? 1 : info.getReplicaCount();

                List<Long> groupIds = info.getGroupIdsList();
                for (Long groupId : groupIds) {
                    ShardGroup group = shardGroups.get(groupId);
                    if (group == null) {
                        throw new StarException(ExceptionCode.NOT_EXIST,
                                String.format("shard group %d not exist in service %s.", groupId, serviceId));
                    }
                    if (group.getPlacementPolicy() == PlacementPolicy.PACK) {
                        // shards in a PACK group must all expect the same replica number
                        int replicaNum = getFirstShardReplicaNumFromGroup(group, -1);
                        if (replicaNum != -1 && shardReplicaNum != replicaNum) {
                            throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                                    String.format("Inconsistent shard replica num:%d when join a PACK shard group:%d.",
                                            shardReplicaNum, groupId));
                        }
                    }
                }
                if (!info.getPlacementPreferencesList().isEmpty()) {
                    // TODO: support multi preferences
                    if (info.getPlacementPreferencesList().size() != 1) {
                        throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                                String.format("shard placement preference does not support multiple in service %s.",
                                        serviceId));
                    }
                    PlacementPreference preference = info.getPlacementPreferencesList().get(0);
                    checkPlacementPreference(preference, shardReplicaNum);
                    // the preference is modeled as an anonymous group holding the target and the new shard
                    ShardGroup anonymousShardGroup = new ShardGroup(serviceId, idGenerator.getNextId(),
                            preference.getPlacementPolicy(), true /* anonymous */, Constant.DEFAULT_ID /* metaGroupId */);
                    anonymousShardGroup.addShardId(preference.getRelationshipTargetId());
                    anonymousShardGroup.addShardId(shardId);
                    anonymousShardGroups.add(anonymousShardGroup);
                }

                FilePath path;
                if (!info.hasPathInfo()) {
                    // no path given: allocate an S3 file store and root the shard under the service id
                    FileStoreMgr fsMgr = fsMgrs.getFileStoreMgr(FileStoreType.S3);
                    FileStore fs = fsMgr.allocFileStore();
                    path = new FilePath(fs, String.format("%s/%s", serviceId, ""));
                } else {
                    FileStore fs = fsMgrs.getFileStore(info.getPathInfo().getFsInfo().getFsType(),
                            info.getPathInfo().getFsInfo().getFsKey());
                    if (fs == null) {
                        throw new StarException(ExceptionCode.NOT_EXIST,
                                String.format("file store with key '%s' not exist", info.getPathInfo().getFsInfo().getFsKey()));
                    }
                    path = FilePath.fromFullPath(fs, info.getPathInfo().getFullPath());
                }

                FileCache cache = FileCache.fromProtobuf(info.getCacheInfo());
                Shard shard = new Shard(serviceId, groupIds, shardId, path, cache);
                shard.setExpectedReplicaCount(shardReplicaNum);
                shard.setProperties(info.getShardPropertiesMap());

                pendingShards.put(shardId, shard);
                shardsToCreate.add(shard);
            }

            CreateShardJournalInfo.Builder builder = CreateShardJournalInfo.newBuilder();
            for (Shard shard : shardsToCreate) {
                builder.addShardInfos(shard.toProtobuf());
            }
            for (ShardGroup anonymousShardGroup : anonymousShardGroups) {
                builder.addShardGroupInfos(anonymousShardGroup.toProtobuf());
            }
            Journal journal = Journal.logCreateShard(serviceId, builder.build());
            journalSystem.write(journal);

            // commit shards
            commitCreateShard(shardsToCreate, anonymousShardGroups);
        }
        // scheduling happens outside of the lock
        List<Long> shardIds = shardsToCreate.stream().map(Shard::getShardId).collect(Collectors.toList());
        if (Config.SCHEDULER_TRIGGER_SCHEDULE_WHEN_CREATE_SHARD) {
            try {
                shardScheduler.scheduleAddToDefaultGroup(serviceId, shardIds);
            } catch (StarException e) {
                // Ignore the failure for now, shardChecker will take care.
                LOG.warn("Fail to schedule new created shards to default workerGroup for service: {}," +
                        "error: {}. Ignore the error for now.", serviceId, e.getMessage());
            }
        }
        return getShardInfo(shardIds);
    }

    /**
     * Delete shards by id. Ids that do not exist are ignored, and an id that appears more
     * than once in the request is deleted only once.
     * The full requested id list is journaled before the shards are removed from memory.
     *
     * @param shardIds ids of the shards to delete
     */
    public void deleteShard(List<Long> shardIds) throws StarException {
        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            // validate all shards
            List<Shard> shardsToDelete = new ArrayList<>(shardIds.size());
            for (Long shardId : shardIds) {
                Shard shard = shards.get(shardId);
                if (shard == null) {
                    continue; // it's ok to delete non-exist shard
                }
                shardsToDelete.add(shard);
            }

            Journal journal = Journal.logDeleteShard(serviceId, shardIds);
            journalSystem.write(journal);

            for (Shard shard : shardsToDelete) {
                if (!shards.containsKey(shard.getShardId())) {
                    // same id listed twice in the request, already removed by an earlier iteration
                    continue;
                }
                Shard old = removeShardInternalNoLock(shard);
                assert old != null;
            }
        }
    }

    /**
     * Get shard info in service
     *
     * @param shardIds ids of the shards to look up
     * @return protobuf description of each shard, in the order of {@code shardIds}
     * @throws StarException NOT_EXIST if any of the shards does not exist in this service
     */
    public List<ShardInfo> getShardInfo(List<Long> shardIds) throws StarException {
        try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
            List<ShardInfo> shardInfos = new ArrayList<>(shardIds.size());
            for (Long shardId : shardIds) {
                Shard shard = shards.get(shardId);
                if (shard == null) {
                    throw new StarException(ExceptionCode.NOT_EXIST,
                            String.format("shard %d not exist.", shardId));
                }
                shardInfos.add(shard.toProtobuf());
            }

            return shardInfos;
        }
    }

    public List> listShardInfo(List groupIds) throws StarException {
        if (groupIds.isEmpty()) {
            throw new StarException(ExceptionCode.INVALID_ARGUMENT, "shard group id can not be empty.");
        }

        try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
            List> shardInfos = new ArrayList<>(groupIds.size());

            for (Long groupId : groupIds) {
                ShardGroup shardGroup = shardGroups.get(groupId);
                if (shardGroup == null) {
                    throw new StarException(ExceptionCode.NOT_EXIST, String.format("shard group %d not exist.", groupId));
                }

                List infos = new ArrayList<>(shardGroup.getShardIds().size());
                for (Long shardId : shardGroup.getShardIds()) {
                    Shard shard = shards.get(shardId);
                    infos.add(shard.toProtobuf());
                }

                shardInfos.add(infos);
            }

            return shardInfos;
        }
    }

    // TODO: the returned shard is not safe, need look into it later
    /**
     * Look up a shard by id under the read lock.
     *
     * @param shardId id of the shard
     * @return the shard stored under that id, or null if there is none
     */
    public Shard getShard(long shardId) {
        try (LockCloseable readGuard = new LockCloseable(lock.readLock())) {
            Shard found = shards.get(shardId);
            return found;
        }
    }

    /**
     * Look up a shard group by id under the read lock.
     *
     * @param groupId id of the shard group
     * @return the group stored under that id, or null if there is none
     */
    public ShardGroup getShardGroup(long groupId) {
        try (LockCloseable readGuard = new LockCloseable(lock.readLock())) {
            ShardGroup found = shardGroups.get(groupId);
            return found;
        }
    }

    /**
     * @return a snapshot copy of the ids of all shards, taken under the read lock
     */
    public List<Long> getAllShardIds() {
        try (LockCloseable ignore = new LockCloseable(lock.readLock())) {
            return new ArrayList<>(shards.keySet());
        }
    }

    /**
     * @return a snapshot copy of the ids of all shard groups, taken under the read lock
     */
    public List<Long> getAllShardGroupIds() {
        try (LockCloseable ignore = new LockCloseable(lock.readLock())) {
            return new ArrayList<>(shardGroups.keySet());
        }
    }

    /**
     * Check that every given shard group exists and that all of them hold the same number of shards.
     *
     * @param shardGroupIds ids of the shard groups to check
     * @param expectSize    required shard count per group, or -1 to take the size of the first group
     * @return the common shard count, i.e. the number of anonymous groups needed;
     *         {@code expectSize} unchanged if {@code shardGroupIds} is empty
     * @throws StarException INVALID_ARGUMENT if a group does not exist or has a different shard count
     */
    private int verifyShardGroupInfoForMetaGroup(List<Long> shardGroupIds, int expectSize) throws StarException {
        // TODO: maybe also verify all shard id and group id?
        for (Long shardGroupId : shardGroupIds) {
            ShardGroup shardGroup = shardGroups.get(shardGroupId);
            if (shardGroup == null) {
                throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                        String.format("shard group %d not exist.", shardGroupId));
            }
            int actualSize = shardGroup.getShardIds().size();
            if (expectSize == -1) { // use the first group size
                expectSize = actualSize;
            }
            if (expectSize != actualSize) {
                throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                        String.format("shard size mismatch, expect %d, has %d.",
                                expectSize, actualSize));
            }
        }
        return expectSize;
    }

    /**
     * Create a meta group, optionally over a set of existing shard groups.
     * When shard groups are given, one anonymous shard group is prepared per shard position
     * (the i-th shard of every given group ends up in the i-th anonymous group).
     * The operation is journaled before it is committed to memory.
     *
     * @param createMetaGroupInfo creation request; a generated id is used when none is assigned
     * @return protobuf description of the created meta group
     * @throws StarException ALREADY_EXIST if the meta group id is taken; INVALID_ARGUMENT if the
     *                       placement policy is not PACK or the shard groups fail validation
     */
    public MetaGroupInfo createMetaGroup(CreateMetaGroupInfo createMetaGroupInfo) throws StarException {
        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            long metaGroupId = createMetaGroupInfo.getMetaGroupId();
            if (metaGroupId == Constant.DEFAULT_ID) {
                // application did not assign one
                metaGroupId = idGenerator.getNextId();
            }
            if (metaGroups.containsKey(metaGroupId)) {
                throw new StarException(ExceptionCode.ALREADY_EXIST,
                        String.format("meta group %d already exists.", metaGroupId));
            }
            PlacementPolicy placementPolicy = createMetaGroupInfo.getPlacementPolicy();
            if (placementPolicy != PlacementPolicy.PACK) {
                // TODO: support EXCLUDE policy
                throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                        String.format("meta group placement policy %s not allowed.",
                                placementPolicy.name()));
            }

            List<ShardGroup> anonymousShardGroups = new ArrayList<>();
            List<Long> shardGroupIds = createMetaGroupInfo.getShardGroupIdsList();
            if (!shardGroupIds.isEmpty()) {
                int size = verifyShardGroupInfoForMetaGroup(shardGroupIds, -1 /* expectSize */);

                anonymousShardGroups = prepareAnonymousShardGroup(metaGroupId, size, placementPolicy,
                        null /* anonymousShardGroupIds */);
                validateShardReplicaNumInGroup(placementPolicy, anonymousShardGroups, shardGroupIds);
            }

            List<Long> anonymousShardGroupIds =
                    anonymousShardGroups.stream().map(ShardGroup::getGroupId).collect(Collectors.toList());
            MetaGroup metaGroup = new MetaGroup(serviceId, metaGroupId,
                    anonymousShardGroupIds,
                    placementPolicy);

            MetaGroupJournalInfo journalInfo = MetaGroupJournalInfo.newBuilder()
                    .setMetaGroupInfo(metaGroup.toProtobuf())
                    .setCreateInfo(createMetaGroupInfo)
                    .build();
            Journal journal = Journal.logCreateMetaGroup(serviceId, journalInfo);
            journalSystem.write(journal);

            // publish in memory only after the journal write went through
            commitAnonymousShardGroup(metaGroup, anonymousShardGroups, shardGroupIds);

            metaGroups.put(metaGroupId, metaGroup);

            return metaGroup.toProtobuf();
        }
    }

    /**
     * Validate that the x-th shard of every source group can be added into targetGroups.get(x):
     * for the PACK policy, all shards sharing a target group must expect the same replica number.
     * Other policies are not checked.
     *
     * @param policy       placement policy of the target groups
     * @param targetGroups target pack group list
     * @param groupIds     source shard groups; each must exist and all of its shards must exist
     * @throws StarException INVALID_ARGUMENT if a replica number differs from the one expected at
     *                       its position, or if a source group has more shards than there are targets
     */
    private void validateShardReplicaNumInGroup(PlacementPolicy policy, List<ShardGroup> targetGroups,
            List<Long> groupIds) {
        if (policy != PlacementPolicy.PACK) {
            return;
        }
        final int invalidReplicaNum = -1;
        // expectReplicaNum.get(x) is the replica number required at position x,
        // or invalidReplicaNum while no shard has fixed it yet
        List<Integer> expectReplicaNum = new ArrayList<>(targetGroups.size());
        for (ShardGroup target : targetGroups) {
            expectReplicaNum.add(getFirstShardReplicaNumFromGroup(target, invalidReplicaNum));
        }

        for (long gid : groupIds) {
            ShardGroup group = shardGroups.get(gid);
            Preconditions.checkNotNull(group);
            int pos = 0;
            for (long sid : group.getShardIds()) {
                Shard shard = shards.get(sid);
                Preconditions.checkNotNull(shard);
                if (pos >= expectReplicaNum.size()) {
                    throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                            String.format("shard group %d has more shards than the %d target groups.",
                                    gid, expectReplicaNum.size()));
                }
                int replicaNum = shard.getExpectedReplicaCount();
                int expected = expectReplicaNum.get(pos);
                if (expected == invalidReplicaNum) {
                    // First shard seen at this position fixes the expectation. This must replace
                    // the slot: inserting would shift the expectations of all later positions.
                    expectReplicaNum.set(pos, replicaNum);
                } else if (expected != replicaNum) {
                    throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                            String.format("shard:%d replicaNum: %d, target group expected replica num: %d",
                                    sid, replicaNum, expected));
                }
                ++pos;
            }
        }
    }

    /**
     * Find the expected replica number of the first shard of the group that is present in the
     * shard map.
     *
     * @param group        shard group to inspect
     * @param valueIfEmpty fallback when the group has no shard present in the shard map
     * @return replica number of the first present shard, otherwise {@code valueIfEmpty}
     */
    private int getFirstShardReplicaNumFromGroup(ShardGroup group, int valueIfEmpty) {
        for (long candidateId : group.getShardIds()) {
            if (!shards.containsKey(candidateId)) {
                // id is recorded in the group but the shard is not known here, try the next one
                continue;
            }
            return shards.get(candidateId).getExpectedReplicaCount();
        }
        return valueIfEmpty;
    }

    /**
     * Delete a meta group as a regular (non-replay) operation.
     *
     * @param metaGroupId id of the meta group to delete
     */
    public void deleteMetaGroup(long metaGroupId) throws StarException {
        final boolean isReplay = false;
        deleteMetaGroupInternal(metaGroupId, isReplay);
    }

    /**
     * Delete a meta group together with its anonymous shard groups.
     * The member shards themselves are kept; they only leave the anonymous groups.
     * Deleting an unknown meta group is a no-op.
     *
     * @param metaGroupId id of the meta group to delete
     * @param isReplay    true when applying a journal record; no new journal entry is written then
     */
    public void deleteMetaGroupInternal(long metaGroupId, boolean isReplay) throws StarException {
        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            MetaGroup metaGroup = metaGroups.get(metaGroupId);

            if (metaGroup == null) {
                return;
            }

            if (!isReplay) {
                DeleteMetaGroupInfo deleteInfo = DeleteMetaGroupInfo.newBuilder()
                        .setMetaGroupId(metaGroupId)
                        .build();
                MetaGroupJournalInfo journalInfo = MetaGroupJournalInfo.newBuilder()
                        .setDeleteInfo(deleteInfo)
                        .build();
                Journal journal = Journal.logDeleteMetaGroup(serviceId, journalInfo);
                journalSystem.write(journal);
            }

            List<Long> anonymousShardGroupIds = metaGroup.getShardGroupIds();

            // The journal is already written at this point, so the cleanup below must run to the end.
            // The asserts flag inconsistencies when assertions are on; the null checks keep the
            // cleanup going when they are off.

            // 1. remove all shards from anonymous group
            for (Long groupId : anonymousShardGroupIds) {
                ShardGroup shardGroup = shardGroups.get(groupId);
                assert shardGroup != null;
                if (shardGroup == null) {
                    continue;
                }
                for (Long shardId : shardGroup.getShardIds()) {
                    Shard shard = shards.get(shardId);
                    assert shard != null;
                    if (shard == null) {
                        continue;
                    }
                    boolean v = shard.quitGroup(groupId);
                    assert v;
                }
            }

            // 2. remove anonymous group
            for (Long groupId : anonymousShardGroupIds) {
                ShardGroup old = shardGroups.remove(groupId);
                assert old != null;
            }

            // 3. remove meta group itself
            MetaGroup old = metaGroups.remove(metaGroupId);
            assert old != null;
        }
    }

    /**
     * Build (but do not register) the anonymous shard groups of a meta group.
     *
     * @param metaGroupId            id of the owning meta group
     * @param anonymousGroupSize     number of anonymous groups to build, must not be 0
     * @param placementPolicy        placement policy given to every anonymous group
     * @param anonymousShardGroupIds ids to use, in order (replay); null to generate new ids
     * @return the new anonymous shard groups, none of them holding a shard yet
     * @throws StarException INVALID_ARGUMENT if {@code anonymousGroupSize} is 0
     */
    private List<ShardGroup> prepareAnonymousShardGroup(long metaGroupId, int anonymousGroupSize,
            PlacementPolicy placementPolicy, List<Long> anonymousShardGroupIds) throws StarException {
        if (anonymousGroupSize == 0) {
            throw new StarException(ExceptionCode.INVALID_ARGUMENT, "anonymous shard group size can not be 0.");
        }
        // a negative size yields an empty result, so clamp the capacity hint
        List<ShardGroup> anonymousShardGroups = new ArrayList<>(Math.max(anonymousGroupSize, 0));
        for (int i = 0; i < anonymousGroupSize; ++i) {
            long groupId;
            if (anonymousShardGroupIds != null) { // for replay, id already assigned
                groupId = anonymousShardGroupIds.get(i);
            } else {
                groupId = idGenerator.getNextId();
            }
            ShardGroup anonymousShardGroup = new ShardGroup(serviceId, groupId,
                    placementPolicy, true /* anonymous */, metaGroupId);
            anonymousShardGroups.add(anonymousShardGroup);
        }
        return anonymousShardGroups;
    }

    /**
     * Wire the source shard groups into the anonymous groups of a meta group and register the result.
     * The i-th shard of every source group is added to the i-th anonymous group.
     * The caller is expected to hold the write lock and to have validated the inputs.
     *
     * @param metaGroup            meta group that owns the anonymous groups
     * @param anonymousShardGroups anonymous groups, at least as many as the largest source group has shards
     * @param shardGroupIds        ids of the existing source shard groups
     */
    private void commitAnonymousShardGroup(MetaGroup metaGroup, List<ShardGroup> anonymousShardGroups,
            List<Long> shardGroupIds) {
        // 1. add shard to anonymous group
        for (Long shardGroupId : shardGroupIds) {
            ShardGroup shardGroup = shardGroups.get(shardGroupId);
            // fetch the member list once per group instead of on every iteration
            List<Long> memberIds = shardGroup.getShardIds();
            for (int i = 0; i < memberIds.size(); ++i) {
                anonymousShardGroups.get(i).addShardId(memberIds.get(i));
            }
        }

        // 2. add anonymous group to shard
        for (ShardGroup shardGroup : anonymousShardGroups) {
            for (Long shardId : shardGroup.getShardIds()) {
                Shard shard = shards.get(shardId);
                shard.joinGroup(shardGroup.getGroupId());
            }
        }

        // 3. add anonymous group to shard group
        for (ShardGroup shardGroup : anonymousShardGroups) {
            shardGroups.put(shardGroup.getGroupId(), shardGroup);
        }

        // 4. add anonymous group to meta group
        List<Long> anonymousShardGroupIds =
                anonymousShardGroups.stream().map(ShardGroup::getGroupId).collect(Collectors.toList());
        metaGroup.setShardGroupIds(anonymousShardGroupIds);
    }

    /**
     * Resolve a meta group id to its {@link MetaGroup}.
     *
     * @param metaGroupId id to look up in {@code metaGroups}
     * @return the registered meta group, never {@code null}
     * @throws StarException INVALID_ARGUMENT when the id equals {@code Constant.DEFAULT_ID},
     *                       NOT_EXIST when no meta group is registered under the id
     */
    private MetaGroup verifyAndGetMetaGroup(long metaGroupId) throws StarException {
        if (metaGroupId != Constant.DEFAULT_ID) {
            MetaGroup found = metaGroups.get(metaGroupId);
            if (found != null) {
                return found;
            }
            throw new StarException(ExceptionCode.NOT_EXIST,
                    String.format("meta group id %d not exist.", metaGroupId));
        }
        throw new StarException(ExceptionCode.INVALID_ARGUMENT, "meta group id not set.");
    }

    /**
     * Resolve the source and destination meta groups named by an update request.
     * <ul>
     *   <li>JOIN_INFO: only the destination is resolved;</li>
     *   <li>QUIT_INFO: only the source is resolved;</li>
     *   <li>TRANSFER_INFO: both are resolved.</li>
     * </ul>
     *
     * @param updateMetaGroupInfo the update request
     * @return a pair of (source, destination); a side that does not apply to the request is {@code null}
     * @throws StarException if the request type is not set, or a referenced meta group fails verification
     */
    private Pair<MetaGroup, MetaGroup> verifyUpdateMetaGroupInfo(UpdateMetaGroupInfo updateMetaGroupInfo)
            throws StarException {
        MetaGroup src = null;
        MetaGroup dst = null;
        UpdateMetaGroupInfo.InfoCase icase = updateMetaGroupInfo.getInfoCase();
        switch (icase) {
            case JOIN_INFO: {
                dst = verifyAndGetMetaGroup(updateMetaGroupInfo.getJoinInfo().getMetaGroupId());
                break;
            }
            case QUIT_INFO: {
                src = verifyAndGetMetaGroup(updateMetaGroupInfo.getQuitInfo().getMetaGroupId());
                break;
            }
            case TRANSFER_INFO: {
                src = verifyAndGetMetaGroup(updateMetaGroupInfo.getTransferInfo().getSrcMetaGroupId());
                dst = verifyAndGetMetaGroup(updateMetaGroupInfo.getTransferInfo().getDstMetaGroupId());
                break;
            }
            case INFO_NOT_SET: {
                throw new StarException(ExceptionCode.INVALID_ARGUMENT, "update meta group info type not set.");
            }
        }
        return Pair.of(src, dst);
    }

    /**
     * Apply a meta group update request as a live (non-replay) operation.
     *
     * @param updateMetaGroupInfo the update request
     * @throws StarException propagated from the internal update
     */
    public void updateMetaGroup(UpdateMetaGroupInfo updateMetaGroupInfo) throws StarException {
        boolean isReplay = false;
        updateMetaGroupInternal(updateMetaGroupInfo, isReplay, null /* anonymousShardGroupIdsForReplay */);
    }

    /**
     * Apply a join / quit / transfer of shard groups to meta groups, under the write lock.
     * <p>
     * Order of work: resolve and verify the request, prepare the destination's anonymous shard groups,
     * write the journal entry (skipped on replay), detach the shards from the source meta group's anonymous
     * shard groups, then attach them to the destination. The journal is written before any shard or group
     * membership is changed.
     *
     * @param updateMetaGroupInfo the update request
     * @param isReplay {@code true} when applying a journal entry; no journal is written in that case
     * @param anonymousShardGroupIdsForReplay ids to reuse if the destination needs new anonymous shard groups;
     *                                        {@code null} lets the id generator allocate them
     * @throws StarException if the request is invalid (for example an empty shard group list) or any
     *                       verification step fails
     */
    private void updateMetaGroupInternal(UpdateMetaGroupInfo updateMetaGroupInfo,
                                         boolean isReplay,
                                         List<Long> anonymousShardGroupIdsForReplay) throws StarException {
        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            Pair<MetaGroup, MetaGroup> pair = verifyUpdateMetaGroupInfo(updateMetaGroupInfo);
            MetaGroup src = pair.getKey();
            MetaGroup dst = pair.getValue();

            // verify shard group
            List<Long> shardGroupIds = updateMetaGroupInfo.getShardGroupIdsList();
            if (shardGroupIds.isEmpty()) {
                throw new StarException(ExceptionCode.INVALID_ARGUMENT, "empty shard group list.");
            }
            if (src != null) {
                verifyShardGroupInfoForMetaGroup(shardGroupIds, src.getShardGroupIds().size() /* expectSize */);
            }
            int dstSize = -1;
            if (dst != null) {
                // -1 is passed as expectSize while the destination has no anonymous shard group yet
                int expectSize = -1;
                if (dst.getShardGroupIds().size() != 0) {
                    expectSize = dst.getShardGroupIds().size();
                }
                dstSize = verifyShardGroupInfoForMetaGroup(shardGroupIds, expectSize);
            }

            // prepare dst anonymous shard group
            List<ShardGroup> anonymousShardGroups = new ArrayList<>();
            if (dst != null) {
                if (dst.getShardGroupIds().size() == 0) {
                    anonymousShardGroups = prepareAnonymousShardGroup(dst.getMetaGroupId(), dstSize, dst.getPlacementPolicy(),
                            anonymousShardGroupIdsForReplay);
                } else {
                    for (Long groupId : dst.getShardGroupIds()) {
                        anonymousShardGroups.add(shardGroups.get(groupId));
                    }
                }
                validateShardReplicaNumInGroup(dst.getPlacementPolicy(), anonymousShardGroups, shardGroupIds);
            }

            if (!isReplay) {
                MetaGroupJournalInfo.Builder journalInfoBuilder = MetaGroupJournalInfo.newBuilder();
                if (dst != null) {
                    MetaGroupInfo.Builder infoBuilder = MetaGroupInfo.newBuilder().mergeFrom(dst.toProtobuf());
                    if (dst.getShardGroupIds().size() == 0) {
                        // record the freshly allocated ids so that replay can reuse them
                        List<Long> anonymousShardGroupIds =
                                anonymousShardGroups.stream().map(ShardGroup::getGroupId).collect(Collectors.toList());
                        infoBuilder.addAllShardGroupIds(anonymousShardGroupIds);
                    }
                    journalInfoBuilder.setMetaGroupInfo(infoBuilder.build());
                }
                journalInfoBuilder.setUpdateInfo(updateMetaGroupInfo);
                Journal journal = Journal.logUpdateMetaGroup(serviceId, journalInfoBuilder.build());
                journalSystem.write(journal);
            }

            // remove from src meta group
            if (src != null) {
                // TODO: maybe reset src anonymous group if all shards are removed
                for (Long shardGroupId : shardGroupIds) {
                    ShardGroup shardGroup = shardGroups.get(shardGroupId);
                    // idx pairs the idx-th shard of the group with the idx-th anonymous group of src
                    int idx = 0;
                    for (Long shardId : shardGroup.getShardIds()) {
                        ShardGroup anonymousShardGroup = shardGroups.get(src.getShardGroupIds().get(idx));
                        boolean v1 = anonymousShardGroup.removeShardId(shardId);
                        assert v1;
                        Shard shard = shards.get(shardId);
                        boolean v2 = shard.quitGroup(anonymousShardGroup.getGroupId());
                        assert v2;
                        idx++;
                    }
                }
            }

            // add to dst meta group
            if (dst != null) {
                commitAnonymousShardGroup(dst, anonymousShardGroups, shardGroupIds);
            }
        }
    }

    /**
     * Return the protobuf description of a meta group, read under the read lock.
     *
     * @param metaGroupId id of the meta group
     * @return the meta group converted with {@code toProtobuf()}
     * @throws StarException NOT_EXIST when no meta group is registered under the id
     */
    public MetaGroupInfo getMetaGroupInfo(long metaGroupId) throws StarException {
        try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
            MetaGroup found = metaGroups.get(metaGroupId);
            if (found != null) {
                return found.toProtobuf();
            }
            throw new StarException(ExceptionCode.NOT_EXIST,
                    String.format("meta group %d not exist.", metaGroupId));
        }
    }

    /**
     * Return the protobuf description of every registered meta group, read under the read lock.
     *
     * @return a new list with one {@link MetaGroupInfo} per meta group; empty when there is none
     * @throws StarException declared for callers; nothing in this method body throws it explicitly
     */
    public List<MetaGroupInfo> listMetaGroupInfo() throws StarException {
        try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
            List<MetaGroupInfo> metaGroupInfos = new ArrayList<>(metaGroups.size());
            for (MetaGroup metaGroup : metaGroups.values()) {
                metaGroupInfos.add(metaGroup.toProtobuf());
            }
            return metaGroupInfos;
        }
    }

    /**
     * Ask the shard scheduler to (asynchronously) add to a worker every shard that already records a
     * replica on that worker.
     * <p>
     * The shard ids are collected under the read lock; the scheduler is called after the lock is released.
     *
     * @param workerId id of the target worker
     */
    public void scheduleShardsBelongToWorker(long workerId) {
        List<Long> shardIds;
        try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
            shardIds = shards.entrySet()
                    .stream()
                    .filter(x -> x.getValue().hasReplica(workerId))
                    .map(Map.Entry::getKey)
                    .collect(Collectors.toList());
        }
        if (!shardIds.isEmpty()) {
            shardScheduler.scheduleAsyncAddToWorker(serviceId, shardIds, workerId);
        }
    }

    /**
     * Record a replica on the given worker for each listed shard.
     *
     * @param shardIds ids of the shards to update
     * @param workerId id of the worker that holds the new replicas
     */
    public void addShardReplicas(List<Long> shardIds, long workerId) {
        updateShardReplicaInfoInternal(shardIds, workerId, true /* isAdd */);
    }

    /**
     * Remove the replica on the given worker from each listed shard.
     *
     * @param shardIds ids of the shards to update
     * @param workerId id of the worker whose replicas are removed
     */
    public void removeShardReplicas(List<Long> shardIds, long workerId) {
        updateShardReplicaInfoInternal(shardIds, workerId, false /* isAdd */);
    }

    /**
     * Add/Remove shard replica of workerId, write journal if necessary.
     * <p>
     * Runs under the write lock. Shards that are not found are skipped and reported in a warning. Only the
     * shards for which the add/remove call returned {@code true} are journaled. A journal write failure is
     * logged and otherwise ignored.
     *
     * @param shardIds list of shards whose replicas will be changed
     * @param workerId target worker id
     * @param isAdd true: add replica operation, false: remove replica operation
     */
    private void updateShardReplicaInfoInternal(List<Long> shardIds, long workerId, boolean isAdd) {
        if (shardIds.isEmpty()) {
            return;
        }

        // Create a function object to wrap the actual shard operation
        BiFunction<Shard, Long, Boolean> updateObj = (Shard shard, Long id) -> {
            if (isAdd) {
                return shard.addReplica(id);
            } else {
                return shard.removeReplica(id);
            }
        };

        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            List<Shard> shardsToUpdate = new ArrayList<>();
            List<Long> missingIds = new ArrayList<>();

            for (Long shardId : shardIds) {
                Shard shard = shards.get(shardId);
                if (shard == null) {
                    // it's possible that shard is already deleted
                    missingIds.add(shardId);
                    continue;
                }
                if (updateObj.apply(shard, workerId)) {
                    shardsToUpdate.add(shard);
                }
            }

            // write shard info to disk after updating memory
            if (!shardsToUpdate.isEmpty()) {
                try {
                    Journal journal = Journal.logUpdateShard(serviceId, shardsToUpdate);
                    journalSystem.write(journal);
                } catch (StarException e) {
                    // NOTE: shard schedule does not offer strong consistency, if log shard info failed,
                    //       availability always precedes consistency, so here we consider
                    //       shard schedule succeed even if journal write failure happens
                    List<Long> shardIdsToPrint =
                            shardsToUpdate.stream().map(Shard::getShardId).collect(Collectors.toList());
                    LOG.error("log shard info after schedule failed, {}. shards:{}, service:{}.",
                            e.getMessage(), shardIdsToPrint, serviceId);
                }
            }

            if (!missingIds.isEmpty()) {
                LOG.warn("shard {} not exist when update shard info from shard scheduler!", missingIds);
            }
        }
    }

    /**
     * Check missing replicas from the list of shards for the specific worker id, and remove non-exist replicas.
     * <p>
     * A reported shard counts as missing when it is unknown to this manager or does not record a replica on
     * the worker. The check runs under the read lock; the removal is handed to the shard scheduler
     * afterwards, outside the lock.
     *
     * @param shardIds list of shard id
     * @param workerId target worker id
     */
    public void validateWorkerReportedReplicas(List<Long> shardIds, long workerId) {
        if (shardIds.isEmpty()) {
            return;
        }

        List<Long> missingIds = new ArrayList<>();
        try (LockCloseable ignore = new LockCloseable(lock.readLock())) {
            for (Long shardId : shardIds) {
                Shard shard = shards.get(shardId);
                // shard is deleted or shard does not have the replica
                if (shard == null || !shard.hasReplica(workerId)) {
                    missingIds.add(shardId);
                }
            }
        }
        // TODO: handle possible race condition.
        //   one of shardA's replica is scheduled to workerId, the RPC is made successful, but the updateShardInfo in
        //   scheduling is not yet completed, in this time window, worker's heartbeat reports the shard which can't be
        //   found in this shards check, and hence will ask worker to remove it again.
        if (!missingIds.isEmpty()) {
            LOG.warn("shard {} not exist or have outdated info when update shard info from worker heartbeat, " +
                    "schedule remove from worker {}.", missingIds, workerId);
            shardScheduler.scheduleAsyncRemoveFromWorker(serviceId, missingIds, workerId);
        }
    }

    /**
     * Replay a create-shard journal entry: rebuild the shards and shard groups from their protobuf form and
     * commit them under the write lock.
     *
     * @param info journal payload carrying the shard infos and shard group infos
     */
    public void replayCreateShard(CreateShardJournalInfo info) {
        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            List<Shard> shardList = new ArrayList<>();
            for (ShardInfo shardInfo : info.getShardInfosList()) {
                Shard shard = Shard.fromProtobuf(shardInfo);
                shardList.add(shard);
            }
            List<ShardGroup> anonymousShardGroups = new ArrayList<>();
            for (ShardGroupInfo shardGroupInfo : info.getShardGroupInfosList()) {
                ShardGroup shardGroup = ShardGroup.fromProtobuf(shardGroupInfo);
                anonymousShardGroups.add(shardGroup);
            }

            commitCreateShard(shardList, anonymousShardGroups);

            // TODO: state machine
        }
    }

    /**
     * Replay a delete-shard journal entry under the write lock. Ids that are not found are logged and
     * skipped.
     *
     * @param shardIds ids of the shards to remove
     */
    public void replayDeleteShard(List<Long> shardIds) {
        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            for (Long shardId : shardIds) {
                Shard shard = shards.get(shardId);
                if (shard == null) {
                    LOG.warn("shard {} not exist when replay delete shard, just ignore.", shardId);
                    continue;
                }
                Shard old = removeShardInternalNoLock(shard);
                assert old != null;
            }
            // TODO: state machine
        }
    }

    /**
     * Replay an update-shard journal entry under the write lock. Each shard is passed to
     * {@code addShardInternalNoLock}; a {@code null} result is reported through {@code LogUtils.fatal}.
     *
     * @param shardList shards carrying the updated state
     */
    public void replayUpdateShard(List<Shard> shardList) {
        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            for (Shard shard : shardList) {
                Shard old = addShardInternalNoLock(shard);
                if (old == null) {
                    LogUtils.fatal(LOG, "shard {} not exist when replay update shard, should not happen!", shard.getShardId());
                }
            }
            // TODO: state machine
        }
    }

    /**
     * Replay a create-shard-group journal entry under the write lock. Each group is stored under its group
     * id; replacing an existing entry is reported through {@code LogUtils.fatal}.
     *
     * @param groups shard groups to register
     */
    public void replayCreateShardGroup(List<ShardGroup> groups) {
        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            for (ShardGroup shardGroup : groups) {
                ShardGroup old = shardGroups.put(shardGroup.getGroupId(), shardGroup);
                if (old != null) {
                    LogUtils.fatal(LOG, "shard group {} already exist when replay create shard group, should not happen!",
                            shardGroup.getGroupId());
                }
            }
            // TODO: state machine
        }
    }

    /**
     * Replay a delete-shard-group journal entry by delegating to the internal delete in replay mode.
     *
     * @param info journal payload carrying the group ids and the cascade-delete flag
     */
    public void replayDeleteShardGroup(DeleteShardGroupInfo info) {
        deleteShardGroupInternal(
                info.getGroupIdsList(),
                info.getCascadeDeleteShard(),
                true /* isReplay */);
    }

    /**
     * Replay a create-meta-group journal entry.
     * <p>
     * The meta group is rebuilt from the journaled protobuf. When the create request listed shard groups,
     * the anonymous shard groups are rebuilt with the journaled ids (the meta group's own shard group ids)
     * and committed before the meta group is registered.
     *
     * @param info journal payload carrying the create request and the resulting meta group info
     * @throws StarException ALREADY_EXIST when the meta group id is already registered, or any error raised
     *                       while verifying or preparing the shard groups
     */
    public void replayCreateMetaGroup(MetaGroupJournalInfo info) throws StarException {
        CreateMetaGroupInfo createMetaGroupInfo = info.getCreateInfo();
        MetaGroup metaGroup = MetaGroup.fromProtobuf(info.getMetaGroupInfo());

        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            long metaGroupId = metaGroup.getMetaGroupId();
            if (metaGroups.containsKey(metaGroupId)) {
                throw new StarException(ExceptionCode.ALREADY_EXIST,
                        String.format("meta group %d already exists.", metaGroupId));
            }

            List<ShardGroup> anonymousShardGroups = new ArrayList<>();
            List<Long> shardGroupIds = createMetaGroupInfo.getShardGroupIdsList();
            if (!shardGroupIds.isEmpty()) {
                int size = verifyShardGroupInfoForMetaGroup(shardGroupIds, -1 /* expectSize */);

                anonymousShardGroups = prepareAnonymousShardGroup(metaGroupId, size, metaGroup.getPlacementPolicy(),
                        metaGroup.getShardGroupIds());
            }

            // skip the shard replica num check since this is a replay operation
            commitAnonymousShardGroup(metaGroup, anonymousShardGroups, shardGroupIds);

            metaGroups.put(metaGroupId, metaGroup);
        }
    }

    /**
     * Replay a delete-meta-group journal entry by delegating to the internal delete in replay mode.
     *
     * @param info journal payload whose delete info names the meta group
     * @throws StarException propagated from the internal delete
     */
    public void replayDeleteMetaGroup(MetaGroupJournalInfo info) throws StarException {
        deleteMetaGroupInternal(info.getDeleteInfo().getMetaGroupId(), true /* isReplay */);
    }

    /**
     * Replay an update-meta-group journal entry. The shard group ids of the journaled meta group info are
     * handed to the internal update as the anonymous shard group ids to reuse.
     *
     * @param info journal payload carrying the update request and the destination meta group info
     * @throws StarException propagated from the internal update
     */
    public void replayUpdateMetaGroup(MetaGroupJournalInfo info) throws StarException {
        UpdateMetaGroupInfo request = info.getUpdateInfo();
        updateMetaGroupInternal(
                request,
                true /* isReplay */,
                MetaGroup.fromProtobuf(info.getMetaGroupInfo()).getShardGroupIds());
    }

    // FOR TEST
    /** Returns the number of entries in {@code shards}, read under the read lock. */
    public int getShardCount() {
        int count;
        try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
            count = shards.size();
        }
        return count;
    }

    // FOR TEST
    /** Returns the number of entries in {@code shardGroups}, read under the read lock. */
    public int getShardGroupCount() {
        int count;
        try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
            count = shardGroups.size();
        }
        return count;
    }

    // FOR TEST
    /** Returns the number of entries in {@code metaGroups}, read under the read lock. */
    public int getMetaGroupCount() {
        int count;
        try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
            count = metaGroups.size();
        }
        return count;
    }

    // FOR TEST
    /**
     * Replace the shard map wholesale, under the write lock. {@code shardGroups} is cleared and rebuilt
     * from the group ids each given shard reports, then {@code this.shards} is pointed at the given map
     * (the map is not copied).
     *
     * @param shards the new shard map; lookups elsewhere in this class use the shard id as the key
     */
    public void overrideShards(Map<Long, Shard> shards) {
        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            // override shard group
            shardGroups.clear();
            for (Shard shard : shards.values()) {
                for (Long groupId : shard.getGroupIds()) {
                    ShardGroup shardGroup = shardGroups.get(groupId);
                    if (shardGroup == null) {
                        shardGroup = new ShardGroup(serviceId, groupId);
                        shardGroups.put(groupId, shardGroup);
                    }
                    shardGroup.addShardId(shard.getShardId());
                }
            }
            // override shard
            this.shards = shards;
        }
    }

    /**
     * Serialize the shard manager state to {@code out} under the read lock.
     * <p>
     * Layout: a header carrying the shard / shard group / meta group counts, then every meta group, every
     * shard group, every shard, and finally an empty footer message.
     *
     * @param out stream to write to
     * @throws IOException if writing fails
     */
    public void dumpMeta(DataOutputStream out) throws IOException {
        try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
            LOG.debug("start dump shard manager meta data to file.");

            // header: record counts for each section that follows
            ShardFileMetaHeader.Builder headerBuilder = ShardFileMetaHeader.newBuilder();
            headerBuilder.setShardCount(shards.size());
            headerBuilder.setShardGroupCount(shardGroups.size());
            headerBuilder.setMetaGroupCount(metaGroups.size());
            byte[] headerBytes = headerBuilder.build().toByteArray();
            Text.writeBytes(out, headerBytes);

            // section 1: meta groups
            for (MetaGroup group : metaGroups.values()) {
                group.write(out);
            }

            // section 2: shard groups
            for (ShardGroup group : shardGroups.values()) {
                group.write(out);
            }

            // section 3: shards
            for (Shard item : shards.values()) {
                item.write(out);
            }

            // footer: currently an empty message
            byte[] footerBytes = ShardFileMetaFooter.newBuilder().build().toByteArray();
            Text.writeBytes(out, footerBytes);

            LOG.debug("end dump shard manager meta data to file.");
        }
    }

    /**
     * Load the shard manager state from {@code in} under the write lock.
     * <p>
     * Reads the header for the three record counts, then that many meta groups, shard groups and shards in
     * this order, and finally parses the footer.
     *
     * @param in stream to read from
     * @throws IOException if reading or parsing fails
     */
    public void loadMeta(DataInputStream in) throws IOException {
        try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
            LOG.debug("start load shard manager meta data from file.");

            // header carries how many records of each kind follow
            byte[] headerBytes = Text.readBytes(in);
            ShardFileMetaHeader header = ShardFileMetaHeader.parseFrom(headerBytes);
            int shardsLeft = header.getShardCount();
            int shardGroupsLeft = header.getShardGroupCount();
            int metaGroupsLeft = header.getMetaGroupCount();

            // section 1: meta groups
            for (; metaGroupsLeft > 0; --metaGroupsLeft) {
                MetaGroup loaded = MetaGroup.read(in);
                metaGroups.put(loaded.getMetaGroupId(), loaded);
            }

            // section 2: shard groups
            for (; shardGroupsLeft > 0; --shardGroupsLeft) {
                ShardGroup loaded = ShardGroup.read(in);
                shardGroups.put(loaded.getGroupId(), loaded);
            }

            // section 3: shards
            for (; shardsLeft > 0; --shardsLeft) {
                Shard loaded = Shard.read(in);
                Shard old = addShardInternalNoLock(loaded);
                assert old == null;
            }

            // footer is parsed and the result discarded
            ShardFileMetaFooter.parseFrom(Text.readBytes(in));

            LOG.debug("end load shard manager meta data from file.");
        }
    }

    /**
     * Write a JSON rendering of every meta group, shard group and shard to {@code out}, each followed by a
     * newline, under the read lock.
     * <p>
     * The text is encoded as UTF-8. {@link DataOutputStream#writeBytes(String)} is deliberately not used:
     * it keeps only the low eight bits of each character, which corrupts any non-ASCII text in the JSON.
     *
     * @param out stream to write to
     * @throws IOException if JSON printing or writing fails
     */
    public void dump(DataOutputStream out) throws IOException {
        try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
            JsonFormat.Printer printer = JsonFormat.printer();

            for (MetaGroup metaGroup : metaGroups.values()) {
                String s = printer.print(metaGroup.toProtobuf()) + "\n";
                out.write(s.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            }

            for (ShardGroup shardGroup : shardGroups.values()) {
                String s = printer.print(shardGroup.toProtobuf()) + "\n";
                out.write(s.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            }

            for (Shard shard : shards.values()) {
                String s = printer.print(shard.toProtobuf()) + "\n";
                out.write(s.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            }
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy