// Source listing: com.yelp.nrtsearch.tools.nrt_utils.cleanup.CleanupDataCommand
// Artifact: server - GRPC lucene server using near-real-time replication
/*
* Copyright 2024 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.tools.nrt_utils.cleanup;
import static com.yelp.nrtsearch.tools.nrt_utils.backup.BackupCommandUtils.deleteObjects;

import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.ListObjectsV2Request;
import com.amazonaws.services.s3.model.ListObjectsV2Result;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Sets;
import com.yelp.nrtsearch.server.nrt.state.NrtPointState;
import com.yelp.nrtsearch.server.remote.RemoteBackend;
import com.yelp.nrtsearch.server.remote.RemoteUtils;
import com.yelp.nrtsearch.server.remote.s3.S3Backend;
import com.yelp.nrtsearch.server.utils.TimeStringUtils;
import com.yelp.nrtsearch.tools.nrt_utils.backup.BackupCommandUtils;
import com.yelp.nrtsearch.tools.nrt_utils.state.StateCommandUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.stream.Collectors;
import picocli.CommandLine;
/**
 * Command line tool that removes old point state files and index data files that are no longer
 * needed from S3 for a single index.
 *
 * <p>The command is typed as {@code Callable<Integer>} so the value returned by {@code call()} is
 * an integer status code rather than a raw {@code Object}.
 */
@CommandLine.Command(
    name = CleanupDataCommand.CLEANUP_DATA,
    description = "Cleanup unneeded index data in S3")
public class CleanupDataCommand implements Callable<Integer> {
  /** Name under which this command is registered. */
  public static final String CLEANUP_DATA = "cleanupData";

  /** Number of keys accumulated before a batch delete request is issued. */
  private static final int DELETE_BATCH_SIZE = 1000;

  @CommandLine.Option(
      names = {"-s", "--serviceName"},
      description = "Name of nrtsearch cluster",
      required = true)
  private String serviceName;

  @CommandLine.Option(
      names = {"-i", "--indexName"},
      description = "Name of cluster index",
      required = true)
  private String indexName;

  @CommandLine.Option(
      names = {"--exactResourceName"},
      description = "If index resource name already has unique identifier")
  private boolean exactResourceName;

  @CommandLine.Option(
      names = {"-b", "--bucketName"},
      description = "Name of bucket containing state files",
      required = true)
  private String bucketName;

  @CommandLine.Option(
      names = {"--region"},
      description = "AWS region name, such as us-west-1, us-west-2, us-east-1")
  private String region;

  @CommandLine.Option(
      names = {"-c", "--credsFile"},
      description =
          "File holding AWS credentials; Will use DefaultCredentialProvider if this is unset.")
  private String credsFile;

  @CommandLine.Option(
      names = {"-p", "--credsProfile"},
      description = "Profile to use from creds file; Neglected when credsFile is unset.",
      defaultValue = "default")
  private String credsProfile;

  @CommandLine.Option(
      names = {"-d", "--deleteAfter"},
      description =
          "Delete unneeded files older than this, in the form <#> "
              + "with valid units (s)econds, (m)inutes, (h)ours, (d)ays. (60m, 7h, 3d, etc.)",
      required = true)
  private String deleteAfter;

  @CommandLine.Option(
      names = {"--gracePeriod"},
      description =
          "Keep files within this grace period from the oldest index version creation, in the form <#> "
              + "with valid units (s)econds, (m)inutes, (h)ours, (d)ays. (60m, 7h, 3d, etc.) default: ${DEFAULT-VALUE}",
      defaultValue = "3h")
  private String gracePeriod;

  @CommandLine.Option(
      names = {"--dryRun"},
      description = "Print file deletions, instead of applying to S3")
  private boolean dryRun;

  @CommandLine.Option(
      names = {"--maxRetry"},
      description = "Maximum number of retry attempts for S3 failed requests",
      defaultValue = "20")
  private int maxRetry;

  /** S3 client used by the command; created lazily in {@code call()} when not injected. */
  private AmazonS3 s3Client;
/**
 * Inject the S3 client to use. When a client has been set, {@code call()} uses it instead of
 * creating one from the command line options.
 *
 * @param s3Client s3 client
 */
@VisibleForTesting
void setS3Client(AmazonS3 s3Client) {
this.s3Client = s3Client;
}
/**
 * Run the cleanup. Point state files older than the retention threshold are removed first, then
 * index data files that are older than the resulting lower bound (minus the grace period) and
 * are not referenced by the current or the oldest retained point state are removed.
 *
 * @return 0 on success; 1 when no point state data exists for the index, or when the oldest
 *     retained point state file could not be determined
 * @throws Exception on failure to parse options, access S3, or parse point state data
 */
@Override
public Integer call() throws Exception {
  long deleteAfterMs = BackupCommandUtils.getTimeIntervalMs(deleteAfter);
  long gracePeriodMs = BackupCommandUtils.getTimeIntervalMs(gracePeriod);

  // only build a client when one was not injected through setS3Client
  if (s3Client == null) {
    s3Client =
        StateCommandUtils.createS3Client(bucketName, region, credsFile, credsProfile, maxRetry);
  }
  S3Backend s3Backend = new S3Backend(bucketName, false, s3Client);

  String resolvedIndexResource =
      StateCommandUtils.getResourceName(s3Backend, serviceName, indexName, exactResourceName);

  // Cleanup point state files
  String pointStatePrefix =
      S3Backend.getIndexResourcePrefix(
          serviceName, resolvedIndexResource, RemoteBackend.IndexResourceType.POINT_STATE);
  if (!s3Backend.exists(
      serviceName, resolvedIndexResource, RemoteBackend.IndexResourceType.POINT_STATE)) {
    System.out.println("No data found for index: " + indexName);
    return 1;
  }

  String currentPointStateVersion = s3Backend.getCurrentResourceName(pointStatePrefix);
  long currentPointStateTimestampMs = validatePSNameAndGetTimestampMs(currentPointStateVersion);
  System.out.println(
      "Current index point state version: "
          + currentPointStateVersion
          + ", timestamp: "
          + currentPointStateTimestampMs);

  // never use a threshold newer than the current point state, so the current version is
  // always retained
  long currentTimeMs = System.currentTimeMillis();
  long minPointStateTimestampMs =
      Math.min(currentTimeMs - deleteAfterMs, currentPointStateTimestampMs);
  System.out.println(
      "Cleaning up version files, minPointStateTimestampMs: " + minPointStateTimestampMs);
  PointStateDeletionDecider pointStateDeletionDecider =
      new PointStateDeletionDecider(minPointStateTimestampMs);
  cleanupS3Files(s3Client, bucketName, pointStatePrefix, pointStateDeletionDecider, dryRun);

  String oldestRetainedPointFile = pointStateDeletionDecider.getOldestRetainedFile();
  System.out.println("Oldest point state version retained: " + oldestRetainedPointFile);
  if (oldestRetainedPointFile == null) {
    System.out.println("Could not determine oldest retained point state file");
    return 1;
  }

  // find the min of current time, current point state time, and lowest point state
  // time. This conservatively determines the lower bounds, in case there is an issue with
  // one of the timestamps
  long lowestVersionTimestampMs = validatePSNameAndGetTimestampMs(oldestRetainedPointFile);
  // subtract grace period for safety, this will be more important when there is pre copied
  // merge data in S3
  long dataMinTimestampMs =
      Math.min(Math.min(currentPointStateTimestampMs, currentTimeMs), lowestVersionTimestampMs)
          - gracePeriodMs;

  // get all the S3 files referenced by the first and last retained point state versions
  NrtPointState currentPointState =
      RemoteUtils.pointStateFromUtf8(
          readObjectBytes(s3Client, bucketName, pointStatePrefix + currentPointStateVersion));
  NrtPointState oldestPointState =
      RemoteUtils.pointStateFromUtf8(
          readObjectBytes(s3Client, bucketName, pointStatePrefix + oldestRetainedPointFile));
  Set<String> currentPointStateFiles = getPointStateFiles(currentPointState);
  Set<String> oldestPointStateFiles = getPointStateFiles(oldestPointState);

  // Cleanup index data files
  String dataPrefix = S3Backend.getIndexDataPrefix(serviceName, resolvedIndexResource);
  // uses union of current version and lowest version, this is done to conservatively
  // protect the lowest version index files to ensure it can be restored
  Set<String> activeIndexFiles = Sets.union(currentPointStateFiles, oldestPointStateFiles);
  System.out.println(
      "Cleaning up index data files, minTimestampMs: "
          + dataMinTimestampMs
          + ", activeIndexFiles: "
          + activeIndexFiles);
  // clean up all data files that are not needed by the retained index versions
  cleanupS3Files(
      s3Client,
      bucketName,
      dataPrefix,
      new IndexDataDeletionDecider(dataMinTimestampMs, activeIndexFiles),
      dryRun);
  return 0;
}

/**
 * Read the full content of an S3 object, closing the object (and its underlying content stream)
 * once the data has been read.
 *
 * @param s3Client s3 client
 * @param bucketName s3 bucket name
 * @param key full object key
 * @return object content
 * @throws IOException on error reading or closing the object content
 */
private static byte[] readObjectBytes(AmazonS3 s3Client, String bucketName, String key)
    throws IOException {
  try (S3Object s3Object = s3Client.getObject(bucketName, key)) {
    return s3Object.getObjectContent().readAllBytes();
  }
}
/**
 * Check that a point state file name has the expected shape and extract its timestamp. The name
 * must split on '-' into exactly seven parts: a time string, the five groups of a UUID, and a
 * numeric suffix.
 *
 * @param pointStateFileName point state file name
 * @return timestamp parsed from the leading time string, in milliseconds
 * @throws IllegalArgumentException if the name does not match the expected format
 */
@VisibleForTesting
static long validatePSNameAndGetTimestampMs(String pointStateFileName) {
  String[] nameParts = pointStateFileName.split("-");
  if (nameParts.length == 7) {
    try {
      // parsing is done purely for validation; the UUID and numeric suffix values are unused
      UUID.fromString(String.join("-", List.of(nameParts).subList(1, 6)));
      Long.parseLong(nameParts[6]);
      return TimeStringUtils.parseTimeStringSec(nameParts[0]).toEpochMilli();
    } catch (Exception e) {
      throw new IllegalArgumentException("Invalid point state name: " + pointStateFileName, e);
    }
  }
  throw new IllegalArgumentException("Invalid point state name: " + pointStateFileName);
}
/**
 * Check that an index data file name has the expected shape and extract its timestamp. The name
 * must split on '-' into exactly seven parts: a time string, the five groups of a UUID, and a
 * file name component starting with '_'.
 *
 * @param dataFileName data file name
 * @return timestamp parsed from the leading time string, in milliseconds
 * @throws IllegalArgumentException if the name does not match the expected format
 */
@VisibleForTesting
static long validateDataNameAndGetTimestampMs(String dataFileName) {
  String[] nameParts = dataFileName.split("-");
  // lucene index files start with '_'
  boolean wellFormed = nameParts.length == 7 && nameParts[6].startsWith("_");
  if (!wellFormed) {
    throw new IllegalArgumentException("Invalid data name: " + dataFileName);
  }
  try {
    // the UUID is parsed purely for validation; its value is unused
    UUID.fromString(String.join("-", List.of(nameParts).subList(1, 6)));
    return TimeStringUtils.parseTimeStringSec(nameParts[0]).toEpochMilli();
  } catch (Exception e) {
    throw new IllegalArgumentException("Invalid data name: " + dataFileName, e);
  }
}
/**
 * Get the set of S3 index file names referenced by the point state. Each entry of the point
 * state file map is converted to its backend file name with {@link
 * S3Backend#getIndexBackendFileName}.
 *
 * @param pointState point state
 * @return set of S3 index file names
 */
private static Set<String> getPointStateFiles(NrtPointState pointState) {
  return pointState.files.entrySet().stream()
      .map(e -> S3Backend.getIndexBackendFileName(e.getKey(), e.getValue()))
      .collect(Collectors.toSet());
}
/**
 * Decider for deleting index data files based on the min timestamp and active index files. Files
 * in the active set are never deleted. The decider reports done as soon as it sees a non-active
 * file whose name timestamp is not older than the min timestamp.
 */
static class IndexDataDeletionDecider implements FileDeletionDecider {
  private final long minTimestampMs;
  private final Set<String> activeIndexFiles;
  private boolean done = false;

  /**
   * Constructor.
   *
   * @param minTimestampMs files at or after this timestamp are retained
   * @param activeIndexFiles file base names that must always be retained
   */
  IndexDataDeletionDecider(long minTimestampMs, Set<String> activeIndexFiles) {
    this.minTimestampMs = minTimestampMs;
    this.activeIndexFiles = activeIndexFiles;
  }

  /**
   * {@inheritDoc}
   *
   * @throws IllegalArgumentException if a non-active file name does not match the data file
   *     name format
   */
  @Override
  public boolean shouldDelete(String fileBaseName, long timestampMs) {
    // active files are skipped before validation, and do not affect the done state
    if (activeIndexFiles.contains(fileBaseName)) {
      return false;
    }
    long fileNameTimestampMs = validateDataNameAndGetTimestampMs(fileBaseName);
    // once we hit the first time string that is newer than the min timestamp, we are done
    // since the files are sorted by time string
    if (fileNameTimestampMs >= minTimestampMs) {
      done = true;
      return false;
    }
    // extra sanity check to only delete files if both the file time string and the s3 timestamp
    // are older than the min timestamp
    return timestampMs < minTimestampMs;
  }

  @Override
  public boolean isDone() {
    return done;
  }
}
/**
 * Deletion decider for point state files. Files whose name timestamp is older than the min
 * timestamp are candidates for deletion; the first file at or after the min timestamp is
 * recorded as the oldest retained file and marks the decider as done.
 */
static class PointStateDeletionDecider implements FileDeletionDecider {
  private final long minPointStateTimestampMs;
  private String oldestRetainedFile = null;
  private boolean done = false;

  PointStateDeletionDecider(long minPointStateTimestampMs) {
    this.minPointStateTimestampMs = minPointStateTimestampMs;
  }

  @Override
  public boolean shouldDelete(String fileBaseName, long timestampMs) {
    boolean isCurrentVersionMarker = S3Backend.CURRENT_VERSION.equals(fileBaseName);
    if (!isCurrentVersionMarker) {
      long nameTimestampMs = validatePSNameAndGetTimestampMs(fileBaseName);
      if (nameTimestampMs < minPointStateTimestampMs) {
        // delete only when the s3 modification time agrees that the file is old enough
        return timestampMs < minPointStateTimestampMs;
      }
      // files are listed in time string order, so this is the oldest file to keep and
      // everything after it is kept as well
      oldestRetainedFile = fileBaseName;
      done = true;
    }
    return false;
  }

  @Override
  public boolean isDone() {
    return done;
  }

  /**
   * Get the name of the file that caused the decider to finish.
   *
   * @return oldest retained point state file name, or null if none has been seen
   */
  public String getOldestRetainedFile() {
    return oldestRetainedFile;
  }
}
/**
 * Interface for deciding if an object should be deleted from s3 given its base name and
 * modification time. Used by cleanupS3Files, which calls {@link #shouldDelete} for each listed
 * object and checks {@link #isDone} after each call.
 */
interface FileDeletionDecider {
/**
 * Determine if an object should be deleted.
 *
 * @param fileBaseName file name after the key prefix
 * @param timestampMs modification timestamp of the s3 object, in milliseconds
 * @return if object should be deleted
 */
boolean shouldDelete(String fileBaseName, long timestampMs);
/**
 * Determine if the deletion process is done. This can be used to short circuit the cleanup
 * process when all subsequent files will be retained.
 *
 * @return if the deletion process is done
 */
boolean isDone();
}
/**
 * Clean up files in s3. Checks all keys matching the given prefix, and uses the given {@link
 * FileDeletionDecider} to determine if they should be deleted. Listing stops early once the
 * decider reports it is done. Deletions are sent in batches of {@code DELETE_BATCH_SIZE} keys.
 *
 * <p>The file base name passed to the decider is the object key with the leading {@code
 * keyPrefix} removed. The prefix is stripped by length rather than with {@code String.split},
 * since {@code split} interprets its argument as a regular expression and would mishandle
 * prefixes containing regex metacharacters.
 *
 * @param s3Client s3 client
 * @param bucketName s3 bucket name
 * @param keyPrefix key prefix to clean up
 * @param deletionDecider deletion decider
 * @param dryRun skip sending actual deletion requests to s3
 */
static void cleanupS3Files(
    AmazonS3 s3Client,
    String bucketName,
    String keyPrefix,
    FileDeletionDecider deletionDecider,
    boolean dryRun) {
  ListObjectsV2Request req =
      new ListObjectsV2Request().withBucketName(bucketName).withPrefix(keyPrefix);
  ListObjectsV2Result result;
  List<String> deleteList = new ArrayList<>(DELETE_BATCH_SIZE);

  do {
    result = s3Client.listObjectsV2(req);

    for (S3ObjectSummary objectSummary : result.getObjectSummaries()) {
      String objectKey = objectSummary.getKey();
      // every listed key starts with keyPrefix, the remainder is the file base name
      String objFileName = objectKey.substring(keyPrefix.length());
      long versionTimestampMs = objectSummary.getLastModified().getTime();
      if (deletionDecider.shouldDelete(objFileName, versionTimestampMs)) {
        System.out.println(
            "Deleting object - key: " + objectKey + ", timestampMs: " + versionTimestampMs);
        deleteList.add(objectKey);
        if (deleteList.size() == DELETE_BATCH_SIZE) {
          if (!dryRun) {
            deleteObjects(s3Client, bucketName, deleteList);
          }
          // the batch is dropped in dry run mode too, so the list never exceeds the batch size
          deleteList.clear();
        }
      }
      if (deletionDecider.isDone()) {
        break;
      }
    }

    req.setContinuationToken(result.getNextContinuationToken());
  } while (!deletionDecider.isDone() && result.isTruncated());

  // flush the final partial batch
  if (!deleteList.isEmpty() && !dryRun) {
    deleteObjects(s3Client, bucketName, deleteList);
  }
}
}