// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.staros.shard;
import com.google.common.base.Preconditions;
import com.google.protobuf.util.JsonFormat;
import com.staros.exception.ExceptionCode;
import com.staros.exception.StarException;
import com.staros.filecache.FileCache;
import com.staros.filestore.FilePath;
import com.staros.filestore.FileStore;
import com.staros.filestore.FileStoreMgr;
import com.staros.filestore.FileStoreMgrs;
import com.staros.journal.Journal;
import com.staros.journal.JournalSystem;
import com.staros.proto.CreateMetaGroupInfo;
import com.staros.proto.CreateShardGroupInfo;
import com.staros.proto.CreateShardInfo;
import com.staros.proto.CreateShardJournalInfo;
import com.staros.proto.DeleteMetaGroupInfo;
import com.staros.proto.DeleteShardGroupInfo;
import com.staros.proto.FileStoreType;
import com.staros.proto.MetaGroupInfo;
import com.staros.proto.MetaGroupJournalInfo;
import com.staros.proto.PlacementPolicy;
import com.staros.proto.PlacementPreference;
import com.staros.proto.PlacementRelationship;
import com.staros.proto.ShardFileMetaFooter;
import com.staros.proto.ShardFileMetaHeader;
import com.staros.proto.ShardGroupInfo;
import com.staros.proto.ShardInfo;
import com.staros.proto.UpdateMetaGroupInfo;
import com.staros.schedule.Scheduler;
import com.staros.util.Config;
import com.staros.util.Constant;
import com.staros.util.IdGenerator;
import com.staros.util.LockCloseable;
import com.staros.util.LogUtils;
import com.staros.util.Text;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.BiFunction;
import java.util.stream.Collectors;
/**
* ShardManager manages all the shards for a single service, including
* shard group operations and shard operations.
*/
public class ShardManager {
private static final Logger LOG = LogManager.getLogger(ShardManager.class);
private final String serviceId;
// TODO: split shards into multiple groups;
// some of the functions below will become too heavy as the shard count grows
private Map<Long, Shard> shards; // <shard id, shard>
private final Map<Long, ShardGroup> shardGroups; // <shard group id, shard group>
private final Map<Long, MetaGroup> metaGroups; // <meta group id, meta group>
private final ReentrantReadWriteLock lock;
private final JournalSystem journalSystem;
private final IdGenerator idGenerator;
private final Scheduler shardScheduler;
public ShardManager(String serviceId, JournalSystem journalSystem, IdGenerator idGenerator,
Scheduler shardScheduler) {
// TODO: load from storage
this.serviceId = serviceId;
this.shards = new HashMap<>();
this.shardGroups = new HashMap<>();
this.metaGroups = new HashMap<>();
this.lock = new ReentrantReadWriteLock();
this.journalSystem = journalSystem;
this.idGenerator = idGenerator;
this.shardScheduler = shardScheduler;
// create default shard group
ShardGroup shardGroup = new ShardGroup(serviceId, Constant.DEFAULT_ID);
shardGroups.put(Constant.DEFAULT_ID, shardGroup);
}
public String getServiceId() {
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
return serviceId;
}
}
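/**
* Create shard groups from the given request infos, assign each a new group id, journal the
* creation, and register the new groups in memory.
*
* @param createShardGroupInfos shard group creation requests, must not be empty
* @return the protobuf infos of the newly created shard groups
*/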
public List<ShardGroupInfo> createShardGroup(List<CreateShardGroupInfo> createShardGroupInfos) throws StarException {
if (createShardGroupInfos.isEmpty()) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT, "shard group info can not be empty.");
}
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
List<ShardGroup> shardGroupsToCreate = new ArrayList<>(createShardGroupInfos.size());
for (CreateShardGroupInfo info : createShardGroupInfos) {
ShardGroup shardGroup = new ShardGroup(serviceId, idGenerator.getNextId(), info.getPolicy(), false,
Constant.DEFAULT_ID /* metaGroupId */, info.getLabelsMap(), info.getPropertiesMap());
shardGroupsToCreate.add(shardGroup);
}
Journal journal = Journal.logCreateShardGroup(serviceId, shardGroupsToCreate);
journalSystem.write(journal);
List<ShardGroupInfo> shardGroupInfos = new ArrayList<>(shardGroupsToCreate.size());
for (ShardGroup shardGroup : shardGroupsToCreate) {
shardGroups.put(shardGroup.getGroupId(), shardGroup);
shardGroupInfos.add(shardGroup.toProtobuf());
}
return shardGroupInfos;
}
}
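/**
* Delete the given shard groups. The default shard group can not be deleted; unknown group ids are ignored.
*
* @param groupIds ids of the shard groups to delete
* @param deleteShards if true, also delete the shards belonging to those groups,
*                     otherwise only detach the shards from the deleted groups
*/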
public void deleteShardGroup(List<Long> groupIds, boolean deleteShards) throws StarException {
deleteShardGroupInternal(groupIds, deleteShards, false /* isReplay */);
}
private void deleteShardGroupInternal(List<Long> groupIds, boolean deleteShards, boolean isReplay) throws StarException {
if (groupIds.isEmpty()) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT, "shard group id can not be empty.");
}
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
List<ShardGroup> groupsToDelete = new ArrayList<>(groupIds.size());
for (Long groupId : groupIds) {
if (groupId == Constant.DEFAULT_ID) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT,
String.format("default shard group %d can not be deleted in service %s.", groupId, serviceId));
}
ShardGroup shardGroup = shardGroups.get(groupId);
if (shardGroup == null) {
continue;
}
groupsToDelete.add(shardGroup);
}
if (!isReplay) {
List<Long> groupIdsToDelete = groupsToDelete.stream().map(ShardGroup::getGroupId).collect(Collectors.toList());
DeleteShardGroupInfo info = DeleteShardGroupInfo.newBuilder()
.addAllGroupIds(groupIdsToDelete)
.setCascadeDeleteShard(deleteShards)
.build();
Journal journal = Journal.logDeleteShardGroup(serviceId, info);
journalSystem.write(journal);
}
for (ShardGroup shardGroup : groupsToDelete) {
shardGroups.remove(shardGroup.getGroupId());
List<Long> shardIds = shardGroup.getShardIds();
if (deleteShards) {
// delete shards
for (long id : shardIds) {
Shard shard = shards.get(id);
if (shard == null) {
// the shard may have already been removed by another shard group deleted before this one
continue;
}
removeShardInternalNoLock(shard);
}
} else {
// update shards' group_id list
for (long id : shardIds) {
shards.get(id).quitGroup(shardGroup.getGroupId());
}
}
}
}
}
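/**
* List all shard groups of this service.
*
* @param includeAnonymousGroup whether anonymous shard groups (created internally for placement) are included
*/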
public List<ShardGroupInfo> listShardGroupInfo(boolean includeAnonymousGroup) throws StarException {
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
List<ShardGroupInfo> shardGroupInfos = new ArrayList<>();
for (ShardGroup shardGroup : shardGroups.values()) {
if (!includeAnonymousGroup && shardGroup.isAnonymous()) {
continue;
}
shardGroupInfos.add(shardGroup.toProtobuf());
}
return shardGroupInfos;
}
}
/**
* Add the shard into the shards map and update its shard groups as well.
* This method doesn't acquire the lock; the caller must hold ShardManager's write lock before calling it.
*
* @param shard the shard to be added.
* @return the previous shard mapped to the same shard id, or null if none existed
*/
private Shard addShardInternalNoLock(Shard shard) {
assert shard != null;
Shard old = shards.put(shard.getShardId(), shard);
for (Long groupId : shard.getGroupIds()) {
ShardGroup shardGroup = shardGroups.get(groupId);
assert shardGroup != null;
// ignore shard group return value on purpose
shardGroup.addShardId(shard.getShardId());
}
return old;
}
/**
* Remove the shard from the shards map and from its shard groups.
* This method doesn't acquire the lock; the caller must hold ShardManager's write lock before calling it.
*
* @param shard the shard to be removed.
* @return the removed shard, or null if it was not present
*/
private Shard removeShardInternalNoLock(Shard shard) {
assert shard != null;
Shard old = shards.remove(shard.getShardId());
for (Long groupId : shard.getGroupIds()) {
ShardGroup shardGroup = shardGroups.get(groupId);
if (shardGroup == null) {
// shardGroup could be removed already
continue;
}
// ignore shard group return value on purpose
shardGroup.removeShardId(shard.getShardId());
// shard groups created from a placement preference are anonymous;
// delete such a group once it references fewer than 2 shards
if (shardGroup.isAnonymous() &&
shardGroup.getMetaGroupId() == Constant.DEFAULT_ID &&
shardGroup.getShardIds().size() < 2) {
for (long id : shardGroup.getShardIds()) {
shards.get(id).quitGroup(shardGroup.getGroupId());
}
shardGroups.remove(shardGroup.getGroupId());
}
}
return old;
}
private void commitCreateShard(List<Shard> shardList, List<ShardGroup> anonymousShardGroups) {
// TODO: optimize logic here
for (ShardGroup anonymousShardGroup : anonymousShardGroups) {
shardGroups.put(anonymousShardGroup.getGroupId(), anonymousShardGroup);
}
for (Shard shard : shardList) {
// ignore return value on purpose
addShardInternalNoLock(shard);
}
for (ShardGroup anonymousShardGroup : anonymousShardGroups) {
for (Long sid : anonymousShardGroup.getShardIds()) {
// ignore return value on purpose
shards.get(sid).joinGroup(anonymousShardGroup.getGroupId());
}
}
}
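/**
* Validate a placement preference for a shard being created: only the PACK policy with a WITH_SHARD
* relationship is supported, the referenced target shard must exist, and its expected replica count
* must match the new shard's replica count.
*/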
private void checkPlacementPreference(PlacementPreference preference, int replicaNum) throws StarException {
if (preference.getPlacementPolicy() != PlacementPolicy.PACK) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT,
String.format("shard placement preference does not support %s policy in service %s.",
preference.getPlacementPolicy().name(), serviceId));
}
if (preference.getPlacementRelationship() != PlacementRelationship.WITH_SHARD) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT,
String.format("shard placement preference does not support %s relationship in service %s.",
preference.getPlacementRelationship().name(), serviceId));
}
long targetId = preference.getRelationshipTargetId();
Shard targetShard = shards.get(targetId);
if (targetShard == null) {
throw new StarException(ExceptionCode.NOT_EXIST,
String.format("shard placement preference target id %d not exist in service %s.",
targetId, serviceId));
}
// The following check is a reminder that the replica number check is only needed for the PACK placement policy.
Preconditions.checkState(preference.getPlacementPolicy() == PlacementPolicy.PACK);
if (targetShard.getExpectedReplicaCount() != replicaNum) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT, String.format(
"shard has a different replica num than target referenced shard. our:%d, target:%d",
replicaNum,
targetShard.getExpectedReplicaCount()));
}
}
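/**
* Create shards from the given request infos: validate group membership and placement preferences,
* resolve the file path and cache settings for each shard, journal the creation, commit the shards
* (and any anonymous shard groups) in memory, and optionally trigger scheduling to the default worker group.
*
* <p>A minimal caller-side sketch (hypothetical values; the builder methods follow the protobuf
* accessors used below and are not defined in this class):
* <pre>{@code
* CreateShardInfo info = CreateShardInfo.newBuilder()
*         .setShardId(Constant.DEFAULT_ID) // let the manager allocate an id
*         .setReplicaCount(3)
*         .addGroupIds(groupId)
*         .build();
* List<ShardInfo> created = shardManager.createShard(Collections.singletonList(info), fileStoreMgrs);
* }</pre>
*
* @param createShardInfos shard creation requests
* @param fsMgrs file store managers used to allocate or look up the backing file store
* @return the infos of the created shards
*/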
public List<ShardInfo> createShard(List<CreateShardInfo> createShardInfos, FileStoreMgrs fsMgrs) throws StarException {
List<Shard> shardsToCreate = new ArrayList<>(createShardInfos.size());
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
// prepare shards
List<ShardGroup> anonymousShardGroups = new ArrayList<>();
for (CreateShardInfo info : createShardInfos) {
long shardId = info.getShardId();
if (shardId == Constant.DEFAULT_ID) {
// application did not assign one
shardId = idGenerator.getNextId();
}
if (shards.containsKey(shardId)) {
throw new StarException(ExceptionCode.ALREADY_EXIST,
String.format("shard %d already exist in service %s.", shardId, serviceId));
}
int shardReplicaNum =
info.getReplicaCount() == Constant.DEFAULT_PROTOBUF_INTEGER ? 1 : info.getReplicaCount();
List<Long> groupIds = info.getGroupIdsList();
for (Long groupId : groupIds) {
ShardGroup group = shardGroups.get(groupId);
if (group == null) {
throw new StarException(ExceptionCode.NOT_EXIST,
String.format("shard group %d not exist in service %s.", groupId, serviceId));
}
if (group.getPlacementPolicy() == PlacementPolicy.PACK) {
int replicaNum = getFirstShardReplicaNumFromGroup(group, -1);
if (replicaNum != -1 && shardReplicaNum != replicaNum) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT,
String.format("Inconsistent shard replica num:%d when join a PACK shard group:%d.",
shardReplicaNum, groupId));
}
}
}
ShardGroup anonymousShardGroup;
if (!info.getPlacementPreferencesList().isEmpty()) {
// TODO: support multi preferences
if (info.getPlacementPreferencesList().size() != 1) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT,
String.format("shard placement preference does not support multiple in service %s.",
serviceId));
}
PlacementPreference preference = info.getPlacementPreferencesList().get(0);
checkPlacementPreference(preference, shardReplicaNum);
anonymousShardGroup = new ShardGroup(serviceId, idGenerator.getNextId(),
preference.getPlacementPolicy(), true /* anonymous */, Constant.DEFAULT_ID /* metaGroupId */);
anonymousShardGroup.addShardId(preference.getRelationshipTargetId());
anonymousShardGroup.addShardId(shardId);
anonymousShardGroups.add(anonymousShardGroup);
}
FilePath path;
if (!info.hasPathInfo()) {
FileStoreMgr fsMgr = fsMgrs.getFileStoreMgr(FileStoreType.S3);
FileStore fs = fsMgr.allocFileStore();
path = new FilePath(fs, String.format("%s/%s", serviceId, ""));
} else {
FileStore fs = fsMgrs.getFileStore(info.getPathInfo().getFsInfo().getFsType(),
info.getPathInfo().getFsInfo().getFsKey());
if (fs == null) {
throw new StarException(ExceptionCode.NOT_EXIST,
String.format("file store with key '%s' not exist", info.getPathInfo().getFsInfo().getFsKey()));
}
path = FilePath.fromFullPath(fs, info.getPathInfo().getFullPath());
}
FileCache cache = FileCache.fromProtobuf(info.getCacheInfo());
Shard shard = new Shard(serviceId, groupIds, shardId, path, cache);
shard.setExpectedReplicaCount(shardReplicaNum);
Map<String, String> shardProperties = info.getShardPropertiesMap();
shard.setProperties(shardProperties);
shardsToCreate.add(shard);
}
CreateShardJournalInfo.Builder builder = CreateShardJournalInfo.newBuilder();
for (Shard shard : shardsToCreate) {
builder.addShardInfos(shard.toProtobuf());
}
for (ShardGroup anonymousShardGroup : anonymousShardGroups) {
builder.addShardGroupInfos(anonymousShardGroup.toProtobuf());
}
Journal journal = Journal.logCreateShard(serviceId, builder.build());
journalSystem.write(journal);
// commit shards
commitCreateShard(shardsToCreate, anonymousShardGroups);
}
List<Long> shardIds = shardsToCreate.stream().map(Shard::getShardId).collect(Collectors.toList());
if (Config.SCHEDULER_TRIGGER_SCHEDULE_WHEN_CREATE_SHARD) {
try {
shardScheduler.scheduleAddToDefaultGroup(serviceId, shardIds);
} catch (StarException e) {
// Ignore the failure for now, shardChecker will take care.
LOG.warn("Fail to schedule new created shards to default workerGroup for service: {}," +
"error: {}. Ignore the error for now.", serviceId, e.getMessage());
}
}
return getShardInfo(shardIds);
}
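/**
* Delete the given shards. Non-existent shard ids are silently ignored; the deletion is journaled
* before the shards are removed from memory and from their shard groups.
*/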
public void deleteShard(List<Long> shardIds) throws StarException {
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
// validate all shards
List<Shard> shardsToDelete = new ArrayList<>(shardIds.size());
for (Long shardId : shardIds) {
Shard shard = shards.get(shardId);
if (shard == null) {
continue; // it's ok to delete a non-existent shard
}
shardsToDelete.add(shard);
}
Journal journal = Journal.logDeleteShard(serviceId, shardIds);
journalSystem.write(journal);
for (Shard shard : shardsToDelete) {
Shard old = removeShardInternalNoLock(shard);
assert old != null;
}
}
}
/**
* Get shard info in this service.
*
* @throws StarException if a shard does not exist or does not belong to this service
*/
public List<ShardInfo> getShardInfo(List<Long> shardIds) throws StarException {
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
List<ShardInfo> shardInfos = new ArrayList<>(shardIds.size());
for (Long shardId : shardIds) {
Shard shard = shards.get(shardId);
if (shard == null) {
throw new StarException(ExceptionCode.NOT_EXIST,
String.format("shard %d not exist.", shardId));
}
shardInfos.add(shard.toProtobuf());
}
return shardInfos;
}
}
public List<List<ShardInfo>> listShardInfo(List<Long> groupIds) throws StarException {
if (groupIds.isEmpty()) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT, "shard group id can not be empty.");
}
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
List<List<ShardInfo>> shardInfos = new ArrayList<>(groupIds.size());
for (Long groupId : groupIds) {
ShardGroup shardGroup = shardGroups.get(groupId);
if (shardGroup == null) {
throw new StarException(ExceptionCode.NOT_EXIST, String.format("shard group %d not exist.", groupId));
}
List<ShardInfo> infos = new ArrayList<>(shardGroup.getShardIds().size());
for (Long shardId : shardGroup.getShardIds()) {
Shard shard = shards.get(shardId);
infos.add(shard.toProtobuf());
}
shardInfos.add(infos);
}
return shardInfos;
}
}
// TODO: the returned shard is not protected by the manager's lock once returned; needs a closer look later
public Shard getShard(long shardId) {
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
return shards.get(shardId);
}
}
public ShardGroup getShardGroup(long groupId) {
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
return shardGroups.get(groupId);
}
}
public List<Long> getAllShardIds() {
try (LockCloseable ignore = new LockCloseable(lock.readLock())) {
return new ArrayList<>(shards.keySet());
}
}
public List<Long> getAllShardGroupIds() {
try (LockCloseable ignore = new LockCloseable(lock.readLock())) {
return new ArrayList<>(shardGroups.keySet());
}
}
// returns the verified per-group shard count, which is also the number of anonymous groups to create
private int verifyShardGroupInfoForMetaGroup(List<Long> shardGroupIds, int expectSize) throws StarException {
// TODO: maybe also verify all shard id and group id?
for (Long shardGroupId : shardGroupIds) {
ShardGroup shardGroup = shardGroups.get(shardGroupId);
if (shardGroup == null) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT,
String.format("shard group %d not exist.", shardGroupId));
}
if (expectSize == -1) { // use the first group size
expectSize = shardGroup.getShardIds().size();
}
if (expectSize != shardGroup.getShardIds().size()) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT,
String.format("shard size mismatch, expect %d, has %d.",
expectSize, shardGroup.getShardIds().size()));
}
}
return expectSize;
}
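/**
* Create a meta group. If shard groups are supplied, one anonymous shard group is prepared per shard
* position so that shards at the same position across the source groups are packed together; the
* creation is journaled before the anonymous groups and the meta group are committed in memory.
* Only the PACK placement policy is currently supported.
*/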
public MetaGroupInfo createMetaGroup(CreateMetaGroupInfo createMetaGroupInfo) throws StarException {
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
long metaGroupId = createMetaGroupInfo.getMetaGroupId();
if (metaGroupId == Constant.DEFAULT_ID) {
// application did not assign one
metaGroupId = idGenerator.getNextId();
}
if (metaGroups.containsKey(metaGroupId)) {
throw new StarException(ExceptionCode.ALREADY_EXIST,
String.format("meta group %d already exists.", metaGroupId));
}
PlacementPolicy placementPolicy = createMetaGroupInfo.getPlacementPolicy();
if (placementPolicy != PlacementPolicy.PACK) {
// TODO: support EXCLUDE policy
throw new StarException(ExceptionCode.INVALID_ARGUMENT,
String.format("meta group placement policy %s not allowed.",
placementPolicy.name()));
}
List<ShardGroup> anonymousShardGroups = new ArrayList<>();
List<Long> shardGroupIds = createMetaGroupInfo.getShardGroupIdsList();
if (!shardGroupIds.isEmpty()) {
int size = verifyShardGroupInfoForMetaGroup(shardGroupIds, -1 /* expectSize */);
anonymousShardGroups = prepareAnonymousShardGroup(metaGroupId, size, placementPolicy,
null /* anonymousShardGroupIds */);
validateShardReplicaNumInGroup(placementPolicy, anonymousShardGroups, shardGroupIds);
}
List<Long> anonymousShardGroupIds =
anonymousShardGroups.stream().map(ShardGroup::getGroupId).collect(Collectors.toList());
MetaGroup metaGroup = new MetaGroup(serviceId, metaGroupId,
anonymousShardGroupIds,
createMetaGroupInfo.getPlacementPolicy());
MetaGroupJournalInfo journalInfo = MetaGroupJournalInfo.newBuilder()
.setMetaGroupInfo(metaGroup.toProtobuf())
.setCreateInfo(createMetaGroupInfo)
.build();
Journal journal = Journal.logCreateMetaGroup(serviceId, journalInfo);
journalSystem.write(journal);
commitAnonymousShardGroup(metaGroup, anonymousShardGroups, shardGroupIds);
metaGroups.put(metaGroupId, metaGroup);
return metaGroup.toProtobuf();
}
}
/**
* Validate that the shards at position x of every source group can be added into targetGroups.get(x)
* with a consistent replica number. Only the PACK policy is validated.
*
* @param policy placement policy of the target groups
* @param targetGroups target pack group list
* @param groupIds source shard groups
*/
private void validateShardReplicaNumInGroup(PlacementPolicy policy, List<ShardGroup> targetGroups, List<Long> groupIds) {
if (policy != PlacementPolicy.PACK) {
return;
}
final int INVALID_REPLICA_NUM = -1;
List<Integer> expectReplicaNum = new ArrayList<>(targetGroups.size());
targetGroups.forEach(x -> {
int replicaNum = getFirstShardReplicaNumFromGroup(x, INVALID_REPLICA_NUM);
expectReplicaNum.add(replicaNum);
});
for (long gid : groupIds) {
ShardGroup group = shardGroups.get(gid);
Preconditions.checkNotNull(group);
int pos = 0;
for (long sid : group.getShardIds()) {
Shard shard = shards.get(sid);
Preconditions.checkNotNull(shard);
int replicaNum = shard.getExpectedReplicaCount();
if (expectReplicaNum.get(pos) == INVALID_REPLICA_NUM) {
expectReplicaNum.add(pos, replicaNum);
} else if (expectReplicaNum.get(pos) != replicaNum) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT,
String.format("shard:%d replicaNum: %d, target group expected replica num: %d",
sid, replicaNum, expectReplicaNum.get(pos)));
}
++pos;
}
}
}
/**
* Get the replica number of the first shard in the shard group, return valueIfEmpty if no valid shard in group.
*
* @param group target shard group
* @param valueIfEmpty value returned if group is empty
* @return the first valid shard replica number or `valueIfEmpty` if no valid shard can be found.
*/
private int getFirstShardReplicaNumFromGroup(ShardGroup group, int valueIfEmpty) {
int result = valueIfEmpty;
for (long shardId : group.getShardIds()) {
if (shards.containsKey(shardId)) {
result = shards.get(shardId).getExpectedReplicaCount();
break;
}
}
return result;
}
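/**
* Delete a meta group: detach all shards from its anonymous shard groups, remove those groups,
* and finally remove the meta group itself. Deleting an unknown meta group is a no-op.
*/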
public void deleteMetaGroup(long metaGroupId) throws StarException {
deleteMetaGroupInternal(metaGroupId, false /* isReplay */);
}
public void deleteMetaGroupInternal(long metaGroupId, boolean isReplay) throws StarException {
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
MetaGroup metaGroup = metaGroups.get(metaGroupId);
if (metaGroup == null) {
return;
}
if (!isReplay) {
DeleteMetaGroupInfo deleteInfo = DeleteMetaGroupInfo.newBuilder()
.setMetaGroupId(metaGroupId)
.build();
MetaGroupJournalInfo journalInfo = MetaGroupJournalInfo.newBuilder()
.setDeleteInfo(deleteInfo)
.build();
Journal journal = Journal.logDeleteMetaGroup(serviceId, journalInfo);
journalSystem.write(journal);
}
List<Long> anonymousShardGroupIds = metaGroup.getShardGroupIds();
// 1. remove all shards from anonymous group
for (Long groupId : anonymousShardGroupIds) {
ShardGroup shardGroup = shardGroups.get(groupId);
assert shardGroup != null;
for (Long shardId : shardGroup.getShardIds()) {
Shard shard = shards.get(shardId);
assert shard != null;
boolean v = shard.quitGroup(groupId);
assert v;
}
}
// 2. remove anonymous group
for (Long groupId : anonymousShardGroupIds) {
ShardGroup old = shardGroups.remove(groupId);
assert old != null;
}
// 3. remove meta group itself
MetaGroup old = metaGroups.remove(metaGroupId);
assert old != null;
}
}
private List<ShardGroup> prepareAnonymousShardGroup(long metaGroupId, int anonymousGroupSize, PlacementPolicy placementPolicy,
List<Long> anonymousShardGroupIds) throws StarException {
if (anonymousGroupSize == 0) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT, "anonymous shard group size can not be 0.");
}
List<ShardGroup> anonymousShardGroups = new ArrayList<>();
for (int i = 0; i < anonymousGroupSize; ++i) {
long groupId;
if (anonymousShardGroupIds != null) { // for replay, id already assigned
groupId = anonymousShardGroupIds.get(i);
} else {
groupId = idGenerator.getNextId();
}
ShardGroup anonymousShardGroup = new ShardGroup(serviceId, groupId,
placementPolicy, true /* anonymous */, metaGroupId);
anonymousShardGroups.add(anonymousShardGroup);
}
return anonymousShardGroups;
}
private void commitAnonymousShardGroup(MetaGroup metaGroup, List<ShardGroup> anonymousShardGroups, List<Long> shardGroupIds) {
// 1. add shard to anonymous group
for (Long shardGroupId : shardGroupIds) {
ShardGroup shardGroup = shardGroups.get(shardGroupId);
for (int i = 0; i < shardGroup.getShardIds().size(); ++i) {
anonymousShardGroups.get(i).addShardId(shardGroup.getShardIds().get(i));
}
}
// 2. add anonymous group to shard
for (ShardGroup shardGroup : anonymousShardGroups) {
for (Long shardId : shardGroup.getShardIds()) {
Shard shard = shards.get(shardId);
shard.joinGroup(shardGroup.getGroupId());
}
}
// 3. add anonymous group to shard group
for (ShardGroup shardGroup : anonymousShardGroups) {
shardGroups.put(shardGroup.getGroupId(), shardGroup);
}
// 4. add anonymous group to meta group
List<Long> anonymousShardGroupIds =
anonymousShardGroups.stream().map(ShardGroup::getGroupId).collect(Collectors.toList());
metaGroup.setShardGroupIds(anonymousShardGroupIds);
}
private MetaGroup verifyAndGetMetaGroup(long metaGroupId) throws StarException {
if (metaGroupId == Constant.DEFAULT_ID) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT, "meta group id not set.");
}
MetaGroup metaGroup = metaGroups.get(metaGroupId);
if (metaGroup == null) {
throw new StarException(ExceptionCode.NOT_EXIST,
String.format("meta group id %d not exist.", metaGroupId));
}
return metaGroup;
}
private Pair<MetaGroup, MetaGroup> verifyUpdateMetaGroupInfo(UpdateMetaGroupInfo updateMetaGroupInfo) throws StarException {
MetaGroup src = null;
MetaGroup dst = null;
UpdateMetaGroupInfo.InfoCase icase = updateMetaGroupInfo.getInfoCase();
switch (icase) {
case JOIN_INFO: {
dst = verifyAndGetMetaGroup(updateMetaGroupInfo.getJoinInfo().getMetaGroupId());
break;
}
case QUIT_INFO: {
src = verifyAndGetMetaGroup(updateMetaGroupInfo.getQuitInfo().getMetaGroupId());
break;
}
case TRANSFER_INFO: {
src = verifyAndGetMetaGroup(updateMetaGroupInfo.getTransferInfo().getSrcMetaGroupId());
dst = verifyAndGetMetaGroup(updateMetaGroupInfo.getTransferInfo().getDstMetaGroupId());
break;
}
case INFO_NOT_SET: {
throw new StarException(ExceptionCode.INVALID_ARGUMENT, "update meta group info type not set.");
}
}
return Pair.of(src, dst);
}
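/**
* Update meta group membership. Depending on the request, the given shard groups join a meta group,
* quit one, or transfer between two meta groups; shards are re-mapped to the destination meta group's
* anonymous shard groups accordingly.
*/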
public void updateMetaGroup(UpdateMetaGroupInfo updateMetaGroupInfo) throws StarException {
updateMetaGroupInternal(updateMetaGroupInfo, false /* isReplay */, null /* anonymousShardGroupIdsForReplay */);
}
private void updateMetaGroupInternal(UpdateMetaGroupInfo updateMetaGroupInfo,
boolean isReplay,
List<Long> anonymousShardGroupIdsForReplay) throws StarException {
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
Pair pair = verifyUpdateMetaGroupInfo(updateMetaGroupInfo);
MetaGroup src = pair.getKey();
MetaGroup dst = pair.getValue();
// verify shard group
List<Long> shardGroupIds = updateMetaGroupInfo.getShardGroupIdsList();
if (shardGroupIds.isEmpty()) {
throw new StarException(ExceptionCode.INVALID_ARGUMENT, "empty shard group list.");
}
if (src != null) {
verifyShardGroupInfoForMetaGroup(shardGroupIds, src.getShardGroupIds().size() /* expectSize */);
}
int dstSize = -1;
if (dst != null) {
int expectSize = -1;
if (dst.getShardGroupIds().size() != 0) {
expectSize = dst.getShardGroupIds().size();
}
dstSize = verifyShardGroupInfoForMetaGroup(shardGroupIds, expectSize);
}
// prepare dst anonymous shard group
List<ShardGroup> anonymousShardGroups = new ArrayList<>();
if (dst != null) {
if (dst.getShardGroupIds().size() == 0) {
anonymousShardGroups = prepareAnonymousShardGroup(dst.getMetaGroupId(), dstSize, dst.getPlacementPolicy(),
anonymousShardGroupIdsForReplay);
} else {
for (Long groupId : dst.getShardGroupIds()) {
anonymousShardGroups.add(shardGroups.get(groupId));
}
}
validateShardReplicaNumInGroup(dst.getPlacementPolicy(), anonymousShardGroups, shardGroupIds);
}
if (!isReplay) {
MetaGroupJournalInfo.Builder journalInfoBuilder = MetaGroupJournalInfo.newBuilder();
if (dst != null) {
MetaGroupInfo.Builder infoBuilder = MetaGroupInfo.newBuilder().mergeFrom(dst.toProtobuf());
if (dst.getShardGroupIds().size() == 0) {
List<Long> anonymousShardGroupIds =
anonymousShardGroups.stream().map(ShardGroup::getGroupId).collect(Collectors.toList());
infoBuilder.addAllShardGroupIds(anonymousShardGroupIds);
}
journalInfoBuilder.setMetaGroupInfo(infoBuilder.build());
}
journalInfoBuilder.setUpdateInfo(updateMetaGroupInfo);
Journal journal = Journal.logUpdateMetaGroup(serviceId, journalInfoBuilder.build());
journalSystem.write(journal);
}
// remove from src meta group
if (src != null) {
// TODO: maybe reset src anonymous group if all shards are removed
for (Long shardGroupId : shardGroupIds) {
ShardGroup shardGroup = shardGroups.get(shardGroupId);
int idx = 0;
for (Long shardId : shardGroup.getShardIds()) {
ShardGroup anonymousShardGroup = shardGroups.get(src.getShardGroupIds().get(idx));
boolean v1 = anonymousShardGroup.removeShardId(shardId);
assert v1;
Shard shard = shards.get(shardId);
boolean v2 = shard.quitGroup(anonymousShardGroup.getGroupId());
assert v2;
idx++;
}
}
}
// add to dst meta group
if (dst != null) {
commitAnonymousShardGroup(dst, anonymousShardGroups, shardGroupIds);
}
}
}
public MetaGroupInfo getMetaGroupInfo(long metaGroupId) throws StarException {
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
MetaGroup metaGroup = metaGroups.get(metaGroupId);
if (metaGroup == null) {
throw new StarException(ExceptionCode.NOT_EXIST,
String.format("meta group %d not exist.", metaGroupId));
}
return metaGroup.toProtobuf();
}
}
public List<MetaGroupInfo> listMetaGroupInfo() throws StarException {
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
List<MetaGroupInfo> metaGroupInfos = new ArrayList<>();
for (MetaGroup metaGroup : metaGroups.values()) {
metaGroupInfos.add(metaGroup.toProtobuf());
}
return metaGroupInfos;
}
}
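/**
* Collect all shards that have a replica on the given worker and schedule an asynchronous
* add of those shards to that worker.
*/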
public void scheduleShardsBelongToWorker(long workerId) {
List<Long> shardIds;
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
shardIds = shards.entrySet()
.stream()
.filter(x -> x.getValue().hasReplica(workerId))
.map(Map.Entry::getKey)
.collect(Collectors.toList());
}
if (!shardIds.isEmpty()) {
shardScheduler.scheduleAsyncAddToWorker(serviceId, shardIds, workerId);
}
}
public void addShardReplicas(List<Long> shardIds, long workerId) {
updateShardReplicaInfoInternal(shardIds, workerId, true);
}
public void removeShardReplicas(List<Long> shardIds, long workerId) {
updateShardReplicaInfoInternal(shardIds, workerId, false);
}
/**
* Add or remove the replica on workerId for each of the given shards, writing a journal entry if necessary.
* @param shardIds list of shards whose replicas will be changed
* @param workerId target worker id
* @param isAdd true: add replica operation, false: remove replica operation
*/
private void updateShardReplicaInfoInternal(List<Long> shardIds, long workerId, boolean isAdd) {
if (shardIds.isEmpty()) {
return;
}
// Create a function object that wraps the actual shard operation
BiFunction<Shard, Long, Boolean> updateObj = (Shard shard, Long id) -> {
if (isAdd) {
return shard.addReplica(id);
} else {
return shard.removeReplica(id);
}
};
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
List<Shard> shardsToUpdate = new ArrayList<>();
List<Long> missingIds = new ArrayList<>();
for (Long shardId : shardIds) {
Shard shard = shards.get(shardId);
if (shard == null) {
// it's possible that shard is already deleted
missingIds.add(shardId);
continue;
}
if (updateObj.apply(shard, workerId)) {
shardsToUpdate.add(shard);
}
}
// write shard info to disk after updating memory
if (!shardsToUpdate.isEmpty()) {
try {
Journal journal = Journal.logUpdateShard(serviceId, shardsToUpdate);
journalSystem.write(journal);
} catch (StarException e) {
// NOTE: shard scheduling does not offer strong consistency. If logging the shard info fails,
// availability takes precedence over consistency, so the shard schedule is still considered
// successful even when the journal write fails.
List<Long> shardIdsToPrint = shardsToUpdate.stream().map(Shard::getShardId).collect(Collectors.toList());
LOG.error("log shard info after schedule failed, {}. shards:{}, service:{}.",
e.getMessage(), shardIdsToPrint, serviceId);
}
}
if (!missingIds.isEmpty()) {
LOG.warn("shard {} not exist when update shard info from shard scheduler!", missingIds);
}
}
}
/**
* Check the shards reported by the specific worker, and schedule removal of replicas that no longer exist.
* @param shardIds list of shard ids reported by the worker
* @param workerId target worker id
*/
public void validateWorkerReportedReplicas(List<Long> shardIds, long workerId) {
if (shardIds.isEmpty()) {
return;
}
List<Long> missingIds = new ArrayList<>();
try (LockCloseable ignore = new LockCloseable(lock.readLock())) {
shardIds.forEach(x -> {
Shard shard = shards.get(x);
// shard is deleted or shard does not have the replica
if (shard == null || !shard.hasReplica(workerId)) {
missingIds.add(x);
}
});
}
// TODO: handle a possible race condition.
// One of shardA's replicas is scheduled to workerId and the RPC succeeds, but the updateShardInfo in
// scheduling has not yet completed. In this time window, the worker's heartbeat reports the shard, which
// can't be found by this check, so the worker will be asked to remove it again.
if (!missingIds.isEmpty()) {
LOG.warn("shard {} not exist or have outdated info when update shard info from worker heartbeat, " +
"schedule remove from worker {}.", missingIds, workerId);
shardScheduler.scheduleAsyncRemoveFromWorker(serviceId, missingIds, workerId);
}
}
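// The replay entry points below re-apply journaled operations (e.g. during journal recovery);
// they update in-memory state only and do not write new journal entries.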
public void replayCreateShard(CreateShardJournalInfo info) {
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
List<Shard> shardList = new ArrayList<>();
for (ShardInfo shardInfo : info.getShardInfosList()) {
Shard shard = Shard.fromProtobuf(shardInfo);
shardList.add(shard);
}
List<ShardGroup> anonymousShardGroups = new ArrayList<>();
for (ShardGroupInfo shardGroupInfo : info.getShardGroupInfosList()) {
ShardGroup shardGroup = ShardGroup.fromProtobuf(shardGroupInfo);
anonymousShardGroups.add(shardGroup);
}
commitCreateShard(shardList, anonymousShardGroups);
// TODO: state machine
}
}
public void replayDeleteShard(List<Long> shardIds) {
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
for (Long shardId : shardIds) {
Shard shard = shards.get(shardId);
if (shard == null) {
LOG.warn("shard {} not exist when replay delete shard, just ignore.", shardId);
continue;
}
Shard old = removeShardInternalNoLock(shard);
assert old != null;
}
// TODO: state machine
}
}
public void replayUpdateShard(List<Shard> shardList) {
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
for (Shard shard : shardList) {
Shard old = addShardInternalNoLock(shard);
if (old == null) {
LogUtils.fatal(LOG, "shard {} not exist when replay update shard, should not happen!", shard.getShardId());
}
}
// TODO: state machine
}
}
public void replayCreateShardGroup(List<ShardGroup> groups) {
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
for (ShardGroup shardGroup : groups) {
ShardGroup old = shardGroups.put(shardGroup.getGroupId(), shardGroup);
if (old != null) {
LogUtils.fatal(LOG, "shard group {} already exist when replay create shard group, should not happen!",
shardGroup.getGroupId());
}
}
// TODO: state machine
}
}
public void replayDeleteShardGroup(DeleteShardGroupInfo info) {
deleteShardGroupInternal(info.getGroupIdsList(), info.getCascadeDeleteShard(), true /* isReplay */);
}
public void replayCreateMetaGroup(MetaGroupJournalInfo info) throws StarException {
CreateMetaGroupInfo createMetaGroupInfo = info.getCreateInfo();
MetaGroup metaGroup = MetaGroup.fromProtobuf(info.getMetaGroupInfo());
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
long metaGroupId = metaGroup.getMetaGroupId();
if (metaGroups.containsKey(metaGroupId)) {
throw new StarException(ExceptionCode.ALREADY_EXIST,
String.format("meta group %d already exists.", metaGroupId));
}
List<ShardGroup> anonymousShardGroups = new ArrayList<>();
List<Long> shardGroupIds = createMetaGroupInfo.getShardGroupIdsList();
if (!shardGroupIds.isEmpty()) {
int size = verifyShardGroupInfoForMetaGroup(shardGroupIds, -1 /* expectSize */);
anonymousShardGroups = prepareAnonymousShardGroup(metaGroupId, size, metaGroup.getPlacementPolicy(),
metaGroup.getShardGroupIds());
}
// skip the shard replica num check since this is a replay operation
commitAnonymousShardGroup(metaGroup, anonymousShardGroups, shardGroupIds);
metaGroups.put(metaGroupId, metaGroup);
}
}
public void replayDeleteMetaGroup(MetaGroupJournalInfo info) throws StarException {
long metaGroupId = info.getDeleteInfo().getMetaGroupId();
deleteMetaGroupInternal(metaGroupId, true /* isReplay */);
}
public void replayUpdateMetaGroup(MetaGroupJournalInfo info) throws StarException {
UpdateMetaGroupInfo updateMetaGroupInfo = info.getUpdateInfo();
MetaGroup dstMetaGroup = MetaGroup.fromProtobuf(info.getMetaGroupInfo());
updateMetaGroupInternal(updateMetaGroupInfo, true /* isReplay */, dstMetaGroup.getShardGroupIds());
}
// FOR TEST
public int getShardCount() {
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
return shards.size();
}
}
// FOR TEST
public int getShardGroupCount() {
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
return shardGroups.size();
}
}
// FOR TEST
public int getMetaGroupCount() {
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
return metaGroups.size();
}
}
// FOR TEST
public void overrideShards(Map<Long, Shard> shards) {
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
// override shard group
shardGroups.clear();
for (Shard shard : shards.values()) {
for (Long groupId : shard.getGroupIds()) {
ShardGroup shardGroup = shardGroups.get(groupId);
if (shardGroup == null) {
shardGroup = new ShardGroup(serviceId, groupId);
shardGroups.put(groupId, shardGroup);
}
shardGroup.addShardId(shard.getShardId());
}
}
// override shard
this.shards = shards;
}
}
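/**
* Serialize the shard manager meta data: a header with element counts, followed by all meta groups,
* then all shard groups, then all shards, and finally a footer. {@link #loadMeta} reads the same layout back.
*/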
public void dumpMeta(DataOutputStream out) throws IOException {
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
LOG.debug("start dump shard manager meta data to file.");
// write header
ShardFileMetaHeader header = ShardFileMetaHeader.newBuilder()
.setShardCount(shards.size())
.setShardGroupCount(shardGroups.size())
.setMetaGroupCount(metaGroups.size())
.build();
Text.writeBytes(out, header.toByteArray());
// write meta group
for (MetaGroup metaGroup : metaGroups.values()) {
metaGroup.write(out);
}
// write shard group
for (ShardGroup shardGroup : shardGroups.values()) {
shardGroup.write(out);
}
// write shard
for (Shard shard : shards.values()) {
shard.write(out);
}
// write footer
ShardFileMetaFooter footer = ShardFileMetaFooter.newBuilder().build();
Text.writeBytes(out, footer.toByteArray());
LOG.debug("end dump shard manager meta data to file.");
}
}
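/**
* Load the shard manager meta data previously written by {@link #dumpMeta}, restoring meta groups,
* shard groups and shards (and rebuilding the shard-to-group links) in that order.
*/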
public void loadMeta(DataInputStream in) throws IOException {
try (LockCloseable ignored = new LockCloseable(lock.writeLock())) {
LOG.debug("start load shard manager meta data from file.");
// read header
ShardFileMetaHeader header = ShardFileMetaHeader.parseFrom(Text.readBytes(in));
int shardCount = header.getShardCount();
int shardGroupCount = header.getShardGroupCount();
int metaGroupCount = header.getMetaGroupCount();
// load meta group
for (int i = 0; i < metaGroupCount; ++i) {
MetaGroup metaGroup = MetaGroup.read(in);
metaGroups.put(metaGroup.getMetaGroupId(), metaGroup);
}
// load shard group
for (int i = 0; i < shardGroupCount; ++i) {
ShardGroup shardGroup = ShardGroup.read(in);
shardGroups.put(shardGroup.getGroupId(), shardGroup);
}
// load shard
for (int i = 0; i < shardCount; ++i) {
Shard shard = Shard.read(in);
Shard old = addShardInternalNoLock(shard);
assert old == null;
}
// read footer
ShardFileMetaFooter.parseFrom(Text.readBytes(in));
LOG.debug("end load shard manager meta data from file.");
}
}
public void dump(DataOutputStream out) throws IOException {
try (LockCloseable ignored = new LockCloseable(lock.readLock())) {
for (MetaGroup metaGroup : metaGroups.values()) {
String s = JsonFormat.printer().print(metaGroup.toProtobuf()) + "\n";
out.writeBytes(s);
}
for (ShardGroup shardGroup : shardGroups.values()) {
String s = JsonFormat.printer().print(shardGroup.toProtobuf()) + "\n";
out.writeBytes(s);
}
for (Shard shard : shards.values()) {
String s = JsonFormat.printer().print(shard.toProtobuf()) + "\n";
out.writeBytes(s);
}
}
}
}