com.expediagroup.beekeeper.vacuum.BeekeeperVacuumToolApplication
/**
* Copyright (C) ${license.git.copyrightYears} Expedia, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.expediagroup.beekeeper.vacuum;
import static com.expediagroup.beekeeper.core.model.HousekeepingStatus.SCHEDULED;
import static com.expediagroup.beekeeper.core.model.LifecycleEventType.UNREFERENCED;
import java.io.IOException;
import java.net.URISyntaxException;
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.parquet.hadoop.util.HiddenFileFilter;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.ApplicationArguments;
import org.springframework.boot.ApplicationRunner;
import org.springframework.stereotype.Component;
import com.google.common.base.Supplier;
import com.expediagroup.beekeeper.core.model.HousekeepingPath;
import com.expediagroup.beekeeper.scheduler.service.SchedulerService;
import com.expediagroup.beekeeper.vacuum.repository.BeekeeperRepository;
import com.hotels.hcommon.hive.metastore.client.api.CloseableMetaStoreClient;
import com.hotels.hcommon.hive.metastore.paths.PathUtils;
import com.hotels.hcommon.hive.metastore.paths.TablePathResolver;
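/**
 * Vacuum tool for a single Hive table: lists everything under the table's base location on the
 * filesystem, keeps paths that are referenced by the metastore or already known to Beekeeper's
 * housekeeping repository, and schedules any remaining unreferenced paths for deletion. With
 * {@code dry-run=true} it only reports what would be scheduled. Typically launched as a Spring
 * Boot application with the {@code database} and {@code table} properties set.
 */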
@Component
public class BeekeeperVacuumToolApplication implements ApplicationRunner {
private static final Logger log = LoggerFactory.getLogger(BeekeeperVacuumToolApplication.class);
private final SchedulerService schedulerService;
private final BeekeeperRepository beekeeperRepository;
private final Supplier<CloseableMetaStoreClient> clientSupplier;
private final boolean isDryRun;
private final short batchSize;
private final String databaseName;
private final String tableName;
private final String cleanupDelay;
private final HiveConf conf;
private Set<String> housekeepingPaths;
private IMetaStoreClient metastore;
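/**
 * Configuration is injected via Spring properties: the target {@code database} and {@code table},
 * an ISO-8601 {@code default-cleanup-delay} applied to newly scheduled paths (default P3D),
 * a {@code dry-run} flag (default false) and a {@code partition-batch-size} used when fetching
 * paths from the metastore (default 1000).
 */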
@Autowired
BeekeeperVacuumToolApplication(
Supplier<CloseableMetaStoreClient> clientSupplier,
SchedulerService schedulerService,
BeekeeperRepository beekeeperRepository,
HiveConf conf,
@Value("${database}") String databaseName,
@Value("${table}") String tableName,
@Value("${default-cleanup-delay:P3D}") String cleanupDelay,
@Value("${dry-run:false}") boolean isDryRun,
@Value("${partition-batch-size:1000}") short batchSize) {
this.clientSupplier = clientSupplier;
this.schedulerService = schedulerService;
this.beekeeperRepository = beekeeperRepository;
this.conf = conf;
this.databaseName = databaseName;
this.tableName = tableName;
this.cleanupDelay = cleanupDelay;
this.isDryRun = isDryRun;
this.batchSize = batchSize;
}
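/**
 * Opens a metastore client, loads the set of paths already scheduled for housekeeping, vacuums
 * the configured table and closes the client when done. Checked exceptions are wrapped in a
 * {@link RuntimeException}.
 */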
@Override
public void run(ApplicationArguments args) {
log.warn("Do not run this tool at the same time as replicating to the target table!");
if (isDryRun) {
log.warn("Dry-run only!");
}
try {
metastore = clientSupplier.get();
housekeepingPaths = fetchHousekeepingPaths(beekeeperRepository);
vacuumTable(databaseName, tableName);
} catch (URISyntaxException | TException | IOException e) {
throw new RuntimeException(e);
} finally {
if (metastore != null) {
metastore.close();
}
}
}
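/** Returns the string form of every path currently scheduled in the Beekeeper repository. */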
private Set<String> fetchHousekeepingPaths(BeekeeperRepository repository) {
log.info("Fetching scheduled paths");
Set<String> paths = new HashSet<>();
Iterable<HousekeepingPath> repositoryPaths = repository.findAllScheduledPaths();
for (HousekeepingPath repositoryPath : repositoryPaths) {
paths.add(repositoryPath.getPath());
}
return paths;
}
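/**
 * Resolves the table's base location and partition glob path, lists matching locations on the
 * filesystem (skipping hidden files) and classifies each one as metastore-referenced,
 * housekeeping-referenced or unreferenced. Unreferenced paths are scheduled for removal and a
 * summary of counts and reclaimable bytes is logged.
 */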
private void vacuumTable(String databaseName, String tableName) throws TException, URISyntaxException, IOException {
log.info("Vacuuming table '{}.{}'.", databaseName, tableName);
Table table = metastore.getTable(databaseName, tableName);
TablePathResolver pathResolver = TablePathResolver.Factory.newTablePathResolver(metastore, table);
Path tableBaseLocation = pathResolver.getTableBaseLocation();
Path globPath = pathResolver.getGlobPath();
log.debug("Table base location: '{}'", tableBaseLocation);
log.debug("Glob path: '{}'", globPath);
Set<Path> metastorePaths = pathResolver.getMetastorePaths(batchSize);
ConsistencyCheck.checkMetastorePaths(metastorePaths, globPath.depth());
Set<Path> unvisitedMetastorePaths = new HashSet<>(metastorePaths);
FileSystem fs = tableBaseLocation.getFileSystem(conf);
FileStatus[] listStatus = fs.globStatus(globPath, HiddenFileFilter.INSTANCE);
Set<Path> pathsToRemove = new HashSet<>();
int metaStorePathCount = 0;
int housekeepingPathCount = 0;
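// Classify every listed path: keep it if the metastore or the housekeeping repository
// references it, otherwise mark it for removal.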
for (FileStatus fileStatus : listStatus) {
Path path = removeTrailingSlash(fileStatus.getPath());
Path normalisedPath = PathUtils.normalise(path);
if (metastorePaths.contains(normalisedPath)) {
log.info("KEEP path '{}', referenced in the metastore.", path);
unvisitedMetastorePaths.remove(normalisedPath);
metaStorePathCount++;
} else if (housekeepingPaths.contains(path.toString())) {
log.info("KEEP path '{}', referenced in housekeeping.", path);
housekeepingPathCount++;
} else {
pathsToRemove.add(path);
}
}
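// Any metastore path not seen on the filesystem points at missing data; warn and run a
// consistency check on it.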
for (Path unvisitedMetastorePath : unvisitedMetastorePaths) {
log.warn("Metastore path '{}' references non-existent data!", unvisitedMetastorePath);
ConsistencyCheck.checkUnvisitedPath(fs, unvisitedMetastorePath);
}
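// Sum the space consumed by each unreferenced path, then schedule it for deletion.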
long totalBytesConsumed = 0;
for (Path toRemove : pathsToRemove) {
ContentSummary contentSummary = fs.getContentSummary(toRemove);
totalBytesConsumed += contentSummary.getSpaceConsumed();
removePath(toRemove, databaseName, tableName);
}
log.info("Vacuum summary; filesystem: {}, metastore: {}, housekeeping: {}, to remove: {}, bytes: {}.",
listStatus.length, metaStorePathCount, housekeepingPathCount, pathsToRemove.size(), totalBytesConsumed);
}
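/** Strips trailing slashes so a listed path compares consistently with metastore and housekeeping paths. */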
private Path removeTrailingSlash(Path path) {
return new Path(StringUtils.stripEnd(path.toString(), "/"));
}
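/**
 * Schedules an unreferenced path for deletion via the {@link SchedulerService} using the
 * configured cleanup delay, or only logs it when dry-run is enabled.
 */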
private void removePath(Path toRemove, String databaseName, String tableName) {
log.info("REMOVE path '{}'; it is not referenced and can be deleted.", toRemove);
if (!isDryRun) {
schedulerService.scheduleForHousekeeping(HousekeepingPath.builder().databaseName(databaseName)
.tableName(tableName)
.path(toRemove.toString())
.housekeepingStatus(SCHEDULED)
.lifecycleType(UNREFERENCED.name())
.creationTimestamp(LocalDateTime.now())
.cleanupDelay(Duration.parse(cleanupDelay))
.clientId("beekeeper-vacuum-tool")
.build());
log.info("Scheduled path '{}' for deletion.", toRemove.toString());
} else {
log.warn("DRY RUN ENABLED: path '{}' left as is.", toRemove);
}
}
}